tools.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. import datetime
  2. import hashlib
  3. import re
  4. import time
  5. from bs4 import BeautifulSoup
  6. from lxml.html import HtmlElement, fromstring, tostring
  7. def element2html(element: HtmlElement) -> str:
  8. return tostring(element, encoding="utf-8").decode()
  9. def html2element(html_str: str, base_url=None) -> HtmlElement:
  10. html_str = re.sub('\ufeff|\xa0|\u3000|\x00', '', html_str)
  11. html_str = re.sub('</?br.*?>', '', html_str)
  12. html_str = re.sub(r'<\?xml.*?>', '', html_str)
  13. html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
  14. return fromstring(html_str, base_url=base_url)
  15. def valid_element(node: HtmlElement, feature: str):
  16. if len(node.xpath(feature)) > 0:
  17. return True
  18. else:
  19. return False
  20. def remove_node(node: HtmlElement):
  21. """
  22. this is a in-place operation, not necessary to return
  23. :param node:
  24. :return:
  25. """
  26. parent = node.getparent()
  27. if parent is not None:
  28. parent.remove(node)
  29. def clean_html(html_str: str):
  30. html_str = re.sub(r'<!--[\s\S]*?-->', '', html_str)
  31. html_str = re.sub(r'<html>|<html [^>]*>|</html>', '', html_str)
  32. html_str = re.sub(r'<head>[\s\S]*?</head>', '', html_str)
  33. html_str = re.sub(r'<script[^<>]*>[\s\S]*?</script>|</script>', '', html_str)
  34. html_str = re.sub(r'<link[^<>]*>[\s\S]*?', '', html_str)
  35. html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)
  36. html_str = re.sub(r'<img[^>]*>', '', html_str)
  37. return html_str
  38. def extract_text(html_str: str):
  39. soup = BeautifulSoup(html_str, "lxml")
  40. return soup.get_text()
  41. def verify_text(val: str, length=50):
  42. """检查数字、字母、中文的个数"""
  43. if val is None:
  44. return False
  45. sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
  46. for pattern in sub_pattern:
  47. val = re.sub(pattern, '', val)
  48. # 若文本长度小于指定文本长度(length),表示页面内容无详情内容
  49. if len(val) < length:
  50. '''无效文本'''
  51. return False
  52. '''有效文本'''
  53. return True
  54. def sha1(text: str):
  55. """
  56. 十六进制数字字符串形式摘要值
  57. @param text: 字符串文本
  58. @return: 摘要值
  59. """
  60. _sha1 = hashlib.sha1()
  61. _sha1.update(text.encode("utf-8"))
  62. return _sha1.hexdigest()
  63. def get_ms() -> int:
  64. return int(round(time.time() * 1000))
  65. def get_current_date():
  66. return datetime.datetime.now().strftime("%Y-%m-%d")
  67. def ms2date(ms: int, fmt="%Y-%m-%d %H:%M:%S"):
  68. """毫秒转日期"""
  69. timestamp = float(ms / 1000)
  70. time_array = time.localtime(timestamp)
  71. return time.strftime(fmt, time_array)
  72. def convert2type(ts_str):
  73. """字符串类型时间戳转成整型"""
  74. return int(float(ts_str) / 1000)
  75. def ts2date(ts_str, fmt="%Y-%m-%d %H:%M:%S") -> str:
  76. """
  77. 时间戳转成日期
  78. :param ts_str: 毫秒级时间戳
  79. :param fmt: 日期格式
  80. :return: 日期
  81. """
  82. timestamp = int(float(ts_str) / 1000)
  83. time_array = time.localtime(timestamp)
  84. return time.strftime(fmt, time_array)
  85. def date2ts(date_str: str, fmt="%Y-%m-%d"):
  86. """日期转成时间戳"""
  87. time_array = time.strptime(date_str, fmt)
  88. timestamp = int(time.mktime(time_array))
  89. return timestamp
  90. def delay_by(delay=0, method='seconds', fmt="%Y-%m-%d %H:%M:%S"):
  91. """按指定方式获得顺延时间"""
  92. _current_now = datetime.datetime.now()
  93. if method == 'days':
  94. _timedelta = datetime.timedelta(days=delay)
  95. elif method == 'hours':
  96. _timedelta = datetime.timedelta(hours=delay)
  97. elif method == 'minutes':
  98. _timedelta = datetime.timedelta(minutes=delay)
  99. elif method == 'microseconds':
  100. _timedelta = datetime.timedelta(microseconds=delay)
  101. else:
  102. _timedelta = datetime.timedelta(seconds=delay)
  103. return (_current_now + _timedelta).strftime(fmt)