tools.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. import datetime
  2. import hashlib
  3. import re
  4. import time
  5. from collections import namedtuple
  6. from urllib.parse import urlparse, urljoin
  7. from lxml.html import HtmlElement, fromstring, tostring
  8. SearchText = namedtuple('SearchText', ['total'])
  9. def element2html(element: HtmlElement) -> str:
  10. return tostring(element, encoding="utf-8").decode()
  11. def html2element(html: str) -> HtmlElement:
  12. return fromstring(html)
  13. def valid_element(node: HtmlElement, feature: str):
  14. if len(node.xpath(feature)) > 0:
  15. return True
  16. else:
  17. return False
  18. def remove_node(node: HtmlElement):
  19. """
  20. this is a in-place operation, not necessary to return
  21. :param node:
  22. :return:
  23. """
  24. parent = node.getparent()
  25. if parent is not None:
  26. parent.remove(node)
  27. def text_search(text: str) -> SearchText:
  28. """
  29. 中文检索
  30. :param text: 文本
  31. :return: 中文数量
  32. """
  33. if not text:
  34. return SearchText(0)
  35. results = re.findall('[\u4e00-\u9fa5]', text, re.S)
  36. # 列表长度即是中文的字数
  37. return SearchText(len(results))
  38. def verify_text(val: str):
  39. """检查数字、字母、中文的个数"""
  40. if val is None:
  41. return False
  42. sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
  43. for pattern in sub_pattern:
  44. val = re.sub(pattern, '', val)
  45. # 若文本长度小于50,表示页面内容无详情内容
  46. if len(val) < 50:
  47. '''无效文本'''
  48. return False
  49. '''有效文本'''
  50. return True
  51. def sha1(text: str):
  52. """
  53. 十六进制数字字符串形式摘要值
  54. @param text: 字符串文本
  55. @return: 摘要值
  56. """
  57. _sha1 = hashlib.sha1()
  58. _sha1.update(text.encode("utf-8"))
  59. return _sha1.hexdigest()
  60. def get_ms() -> int:
  61. return int(round(time.time() * 1000))
  62. def get_current_date():
  63. return datetime.datetime.now().strftime("%Y-%m-%d")
  64. def ms2date(ms: int, fmt="%Y-%m-%d %H:%M:%S"):
  65. """毫秒转日期"""
  66. timestamp = float(ms / 1000)
  67. time_array = time.localtime(timestamp)
  68. return time.strftime(fmt, time_array)
  69. def convert2type(ts_str):
  70. """字符串类型时间戳转成整型"""
  71. return int(float(ts_str) / 1000)
  72. def ts2date(ts_str, fmt="%Y-%m-%d %H:%M:%S") -> str:
  73. """
  74. 时间戳转成日期
  75. :param ts_str: 毫秒级时间戳
  76. :param fmt: 日期格式
  77. :return: 日期
  78. """
  79. timestamp = int(float(ts_str) / 1000)
  80. time_array = time.localtime(timestamp)
  81. return time.strftime(fmt, time_array)
  82. def date2ts(date_str: str, fmt="%Y-%m-%d"):
  83. """日期转成时间戳"""
  84. time_array = time.strptime(date_str, fmt)
  85. timestamp = int(time.mktime(time_array))
  86. return timestamp
  87. def delay_by_hour(hour, fmt="%Y-%m-%d %H:%M:%S"):
  88. """按小时延时"""
  89. _hour = int(hour)
  90. _current_now = datetime.datetime.now()
  91. return (_current_now + datetime.timedelta(hours=_hour)).strftime(fmt)
  92. def delay_by_minutes(minutes, fmt="%Y-%m-%d %H:%M:%S"):
  93. """按分钟延时"""
  94. _minutes = int(minutes)
  95. _current_now = datetime.datetime.now()
  96. return (_current_now + datetime.timedelta(minutes=_minutes)).strftime(fmt)
  97. def delay_by_day(days, fmt="%Y-%m-%d %H:%M:%S"):
  98. """按天延时"""
  99. _days = int(days)
  100. _current_now = datetime.datetime.now()
  101. return (_current_now + datetime.timedelta(days=_days)).strftime(fmt)
  102. def compliance_href(href: str):
  103. if href in [None, ''] or re.match("^((https|http|ftp|rtsp|mms)?://|/\w+\\?)", href) is None:
  104. return False
  105. return True
  106. def is_href(href: str):
  107. result = urlparse(href)
  108. if all([len(result.scheme) == 0 and len(result.netloc) == 0]):
  109. return False
  110. return True
  111. def join_url(base, url):
  112. return urljoin(base, url)