tools.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. import datetime
  2. import hashlib
  3. import re
  4. import time
  5. from lxml.html import HtmlElement, fromstring, tostring
  6. def element2html(element: HtmlElement) -> str:
  7. return tostring(element, encoding="utf-8").decode()
  8. def html2element(html_str: str) -> HtmlElement:
  9. html_str = re.sub('\ufeff|\xa0|\u3000', '', html_str)
  10. html_str = re.sub('</?br.*?>', '', html_str)
  11. html_str = re.sub(r'<\?xml.*?>', '', html_str)
  12. html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
  13. return fromstring(html_str)
  14. def valid_element(node: HtmlElement, feature: str):
  15. if len(node.xpath(feature)) > 0:
  16. return True
  17. else:
  18. return False
  19. def remove_node(node: HtmlElement):
  20. """
  21. this is a in-place operation, not necessary to return
  22. :param node:
  23. :return:
  24. """
  25. parent = node.getparent()
  26. if parent is not None:
  27. parent.remove(node)
  28. def verify_text(val: str):
  29. """检查数字、字母、中文的个数"""
  30. if val is None:
  31. return False
  32. sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
  33. for pattern in sub_pattern:
  34. val = re.sub(pattern, '', val)
  35. # 若文本长度小于50,表示页面内容无详情内容
  36. if len(val) < 50:
  37. '''无效文本'''
  38. return False
  39. '''有效文本'''
  40. return True
  41. def sha1(text: str):
  42. """
  43. 十六进制数字字符串形式摘要值
  44. @param text: 字符串文本
  45. @return: 摘要值
  46. """
  47. _sha1 = hashlib.sha1()
  48. _sha1.update(text.encode("utf-8"))
  49. return _sha1.hexdigest()
  50. def get_ms() -> int:
  51. return int(round(time.time() * 1000))
  52. def get_current_date():
  53. return datetime.datetime.now().strftime("%Y-%m-%d")
  54. def ms2date(ms: int, fmt="%Y-%m-%d %H:%M:%S"):
  55. """毫秒转日期"""
  56. timestamp = float(ms / 1000)
  57. time_array = time.localtime(timestamp)
  58. return time.strftime(fmt, time_array)
  59. def convert2type(ts_str):
  60. """字符串类型时间戳转成整型"""
  61. return int(float(ts_str) / 1000)
  62. def ts2date(ts_str, fmt="%Y-%m-%d %H:%M:%S") -> str:
  63. """
  64. 时间戳转成日期
  65. :param ts_str: 毫秒级时间戳
  66. :param fmt: 日期格式
  67. :return: 日期
  68. """
  69. timestamp = int(float(ts_str) / 1000)
  70. time_array = time.localtime(timestamp)
  71. return time.strftime(fmt, time_array)
  72. def date2ts(date_str: str, fmt="%Y-%m-%d"):
  73. """日期转成时间戳"""
  74. time_array = time.strptime(date_str, fmt)
  75. timestamp = int(time.mktime(time_array))
  76. return timestamp
  77. def delay_by(delay=0, method='seconds', fmt="%Y-%m-%d %H:%M:%S"):
  78. """按指定方式获得顺延时间"""
  79. _current_now = datetime.datetime.now()
  80. if method == 'days':
  81. _timedelta = datetime.timedelta(days=delay)
  82. elif method == 'hours':
  83. _timedelta = datetime.timedelta(hours=delay)
  84. elif method == 'minutes':
  85. _timedelta = datetime.timedelta(minutes=delay)
  86. elif method == 'microseconds':
  87. _timedelta = datetime.timedelta(microseconds=delay)
  88. else:
  89. _timedelta = datetime.timedelta(seconds=delay)
  90. return (_current_now + _timedelta).strftime(fmt)