tools.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. import datetime
  2. import hashlib
  3. import re
  4. import time
  5. from collections import namedtuple
  6. from lxml.html import HtmlElement, fromstring, tostring
  7. SearchText = namedtuple('SearchText', ['total'])
  8. def element2html(element: HtmlElement) -> str:
  9. return tostring(element, encoding="utf-8").decode()
  10. def html2element(html: str) -> HtmlElement:
  11. return fromstring(html)
  12. def valid_element(node: HtmlElement, feature: str):
  13. if len(node.xpath(feature)) > 0:
  14. return True
  15. else:
  16. return False
  17. def remove_node(node: HtmlElement):
  18. """
  19. this is a in-place operation, not necessary to return
  20. :param node:
  21. :return:
  22. """
  23. parent = node.getparent()
  24. if parent is not None:
  25. parent.remove(node)
  26. def text_search(text: str) -> SearchText:
  27. """
  28. 中文检索
  29. :param text: 文本
  30. :return: 中文数量
  31. """
  32. if not text:
  33. return SearchText(0)
  34. results = re.findall('[\u4e00-\u9fa5]', text, re.S)
  35. # 列表长度即是中文的字数
  36. return SearchText(len(results))
  37. def verify_text(val: str):
  38. if val is None:
  39. return False
  40. """检查数字、字母、中文的个数"""
  41. sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
  42. for pattern in sub_pattern:
  43. val = re.sub(pattern, '', val)
  44. # 若文本长度小于50,表示页面内容无详情内容
  45. if len(val) < 50:
  46. '''无效文本'''
  47. return False
  48. '''有效文本'''
  49. return True
  50. def sha1(text: str):
  51. """
  52. 十六进制数字字符串形式摘要值
  53. @param text: 字符串文本
  54. @return: 摘要值
  55. """
  56. _sha1 = hashlib.sha1()
  57. _sha1.update(text.encode("utf-8"))
  58. return _sha1.hexdigest()
  59. def get_ms() -> int:
  60. return int(round(time.time() * 1000))
  61. def get_current_date():
  62. return datetime.datetime.now().strftime("%Y-%m-%d")
  63. def ms2date(ms: int, fmt="%Y-%m-%d %H:%M:%S"):
  64. """毫秒转日期"""
  65. timestamp = float(ms / 1000)
  66. time_array = time.localtime(timestamp)
  67. return time.strftime(fmt, time_array)
  68. def convert2type(ts_str):
  69. """字符串类型时间戳转成整型"""
  70. return int(float(ts_str) / 1000)
  71. def ts2date(ts_str, fmt="%Y-%m-%d %H:%M:%S") -> str:
  72. """
  73. 时间戳转成日期
  74. :param ts_str: 毫秒级时间戳
  75. :param fmt: 日期格式
  76. :return: 日期
  77. """
  78. timestamp = int(float(ts_str) / 1000)
  79. time_array = time.localtime(timestamp)
  80. return time.strftime(fmt, time_array)
  81. def date2ts(date_str: str, fmt="%Y-%m-%d"):
  82. """日期转成时间戳"""
  83. time_array = time.strptime(date_str, fmt)
  84. timestamp = int(time.mktime(time_array))
  85. return timestamp