tools.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. import datetime
  2. import hashlib
  3. import re
  4. import time
  5. from lxml.html import HtmlElement, fromstring, tostring
  6. def element2html(element: HtmlElement) -> str:
  7. return tostring(element, encoding="utf-8").decode()
  8. def html2element(html_str: str) -> HtmlElement:
  9. html_str = re.sub('</?br.*?>', '', html_str)
  10. html_str = re.sub(r'<\?xml.*?>', '', html_str)
  11. html_str = re.sub(r'<DOCTYPE.*?>', '', html_str)
  12. return fromstring(html_str)
  13. def valid_element(node: HtmlElement, feature: str):
  14. if len(node.xpath(feature)) > 0:
  15. return True
  16. else:
  17. return False
  18. def remove_node(node: HtmlElement):
  19. """
  20. this is a in-place operation, not necessary to return
  21. :param node:
  22. :return:
  23. """
  24. parent = node.getparent()
  25. if parent is not None:
  26. parent.remove(node)
  27. def verify_text(val: str):
  28. """检查数字、字母、中文的个数"""
  29. if val is None:
  30. return False
  31. sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
  32. for pattern in sub_pattern:
  33. val = re.sub(pattern, '', val)
  34. # 若文本长度小于50,表示页面内容无详情内容
  35. if len(val) < 50:
  36. '''无效文本'''
  37. return False
  38. '''有效文本'''
  39. return True
  40. def sha1(text: str):
  41. """
  42. 十六进制数字字符串形式摘要值
  43. @param text: 字符串文本
  44. @return: 摘要值
  45. """
  46. _sha1 = hashlib.sha1()
  47. _sha1.update(text.encode("utf-8"))
  48. return _sha1.hexdigest()
  49. def get_ms() -> int:
  50. return int(round(time.time() * 1000))
  51. def get_current_date():
  52. return datetime.datetime.now().strftime("%Y-%m-%d")
  53. def ms2date(ms: int, fmt="%Y-%m-%d %H:%M:%S"):
  54. """毫秒转日期"""
  55. timestamp = float(ms / 1000)
  56. time_array = time.localtime(timestamp)
  57. return time.strftime(fmt, time_array)
  58. def convert2type(ts_str):
  59. """字符串类型时间戳转成整型"""
  60. return int(float(ts_str) / 1000)
  61. def ts2date(ts_str, fmt="%Y-%m-%d %H:%M:%S") -> str:
  62. """
  63. 时间戳转成日期
  64. :param ts_str: 毫秒级时间戳
  65. :param fmt: 日期格式
  66. :return: 日期
  67. """
  68. timestamp = int(float(ts_str) / 1000)
  69. time_array = time.localtime(timestamp)
  70. return time.strftime(fmt, time_array)
  71. def date2ts(date_str: str, fmt="%Y-%m-%d"):
  72. """日期转成时间戳"""
  73. time_array = time.strptime(date_str, fmt)
  74. timestamp = int(time.mktime(time_array))
  75. return timestamp
  76. def delay_by(delay=0, method='seconds', fmt="%Y-%m-%d %H:%M:%S"):
  77. """按指定方式获得顺延时间"""
  78. _current_now = datetime.datetime.now()
  79. if method == 'days':
  80. _timedelta = datetime.timedelta(days=delay)
  81. elif method == 'hours':
  82. _timedelta = datetime.timedelta(hours=delay)
  83. elif method == 'minutes':
  84. _timedelta = datetime.timedelta(minutes=delay)
  85. elif method == 'microseconds':
  86. _timedelta = datetime.timedelta(microseconds=delay)
  87. else:
  88. _timedelta = datetime.timedelta(seconds=delay)
  89. return (_current_now + _timedelta).strftime(fmt)