# utils.py

import re
from urllib.parse import urlencode, urljoin

from bs4 import BeautifulSoup
from lxml.html import HtmlElement, fromstring, tostring
from urllib3 import get_host

from common.log import logger


def err_details(worker):
    """Done-callback for a future: log any exception the worker raised."""
    worker_exception = worker.exception()
    if worker_exception:
        logger.exception("Worker raised exception: {}".format(worker_exception))
    return worker
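
# Usage sketch (not in the original module): err_details matches the signature
# of a concurrent.futures done-callback, so a plausible wiring looks like:
#
#   from concurrent.futures import ThreadPoolExecutor
#   with ThreadPoolExecutor() as pool:
#       future = pool.submit(some_task)        # some_task: hypothetical callable
#       future.add_done_callback(err_details)  # logs the traceback if it failed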


def extract_host(url):
    """Return the ``scheme://host[:port]/`` prefix of *url*.

    >>> extract_host('http://192.168.3.207:8080/')
    'http://192.168.3.207:8080/'
    """
    _s, _h, _p = get_host(url)
    return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"


def split_domain(val: str):
    # Values that do not start with a digit (i.e. not IPs) are split on
    # '\\', '.' and ':'; anything starting with a digit is returned whole.
    if re.match(r'\d+', val) is None:
        return re.split(r'[\\.:]', val)
    return [val]
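
# Illustrative calls, derived from the two branches above (not in the original file):
#
#   split_domain('www.example.com')  # -> ['www', 'example', 'com']
#   split_domain('192.168.3.207')    # -> ['192.168.3.207']  (leading digit: kept whole)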


def extract_domain(url):
    """Return the ``host[:port]`` part of *url*.

    >>> extract_domain('http://192.168.3.207:8080/')
    '192.168.3.207:8080'
    """
    _, host, port = get_host(url)
    return f"{host}" if port is None else f"{host}:{port}"


def extract_page_title(html):
    element = html2element(html)
    nodes = element.xpath('/html/head/title/text()|//title/text()')
    if len(nodes) > 1:
        # Multiple <title> nodes: join them with ';' and strip all whitespace.
        return "".join(";".join(nodes).split())
    return "".join("".join(nodes).split())
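
# Illustrative call (all whitespace is stripped; multiple titles join with ';'):
#
#   extract_page_title('<html><head><title>News Center</title></head></html>')
#   # -> 'NewsCenter'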


def is_url(url):
    """Check whether *url* is a well-formed URL."""
    _regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(_regex, url) is not None
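
# Illustrative checks against the pattern above (not in the original file):
#
#   is_url('https://example.com/path?q=1')  # -> True
#   is_url('http://192.168.3.207:8080/')    # -> True
#   is_url('example.com')                   # -> False (missing scheme)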


def is_domain(domain):
    """Check whether *domain* looks like a domain name or an IP, optionally with a port."""
    _regex = re.compile(
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?', re.IGNORECASE)
    return re.match(_regex, domain) is not None


def label_split(val):
    # '~`!#$%^&*()_+-=|\';"":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》{《}】【\n\]\[ '
    # Split on the common separators; the second '-' in the original class
    # accidentally formed a character range (|-「) and has been dropped.
    result = re.split(r'[- _,,\\.|「」【】??!!/、] *', val)
    result = [v for v in result if len(v) > 0]
    return result
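
# Illustrative call with the corrected pattern (assumed input, not from the file):
#
#   label_split('招标公告-2024, 第1期|正文')
#   # -> ['招标公告', '2024', '第1期', '正文']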


def get_url(url: str, parameters: dict):
    """
    Join *url* with its query parameters.
    :param url: the base URL
    :param parameters: the query parameters
    :return: the combined URL
    """
    _data = '?' + urlencode(parameters)
    return urljoin(url, _data)
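
# Illustrative call; note that urljoin drops any query string already on *url*:
#
#   get_url('http://example.com/list', {'page': 2, 'size': 10})
#   # -> 'http://example.com/list?page=2&size=10'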


def iter_node(element: HtmlElement, depth=1):
    """Depth-first walk over *element*, yielding ``(node, depth)`` pairs."""
    yield element, depth
    depth += 1
    for sub_element in element:
        if isinstance(sub_element, HtmlElement):
            yield from iter_node(sub_element, depth)
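
# Illustrative depth-first walk (relies on html2element defined below):
#
#   root = html2element('<div><p>hi<b>!</b></p></div>')
#   [(n.tag, d) for n, d in iter_node(root)]
#   # -> [('div', 1), ('p', 2), ('b', 3)]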


def element2html(element: HtmlElement) -> str:
    return tostring(element, encoding="utf-8").decode()


def html2element(html_str: str, base_url=None) -> HtmlElement:
    # Strip BOM/NBSP/ideographic-space/NUL characters, <br> tags, and the
    # XML/DOCTYPE declarations that trip up lxml, then parse.
    html_str = re.sub('\ufeff|\xa0|\u3000|\x00', '', html_str)
    html_str = re.sub('</?br.*?>', '', html_str)
    html_str = re.sub(r'<\?xml.*?>', '', html_str)
    html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
    return fromstring(html_str, base_url=base_url)
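
# Illustrative round-trip (the NBSP, <br> tag and XML declaration are stripped):
#
#   el = html2element('<?xml version="1.0"?><p>a\xa0b<br>c</p>')
#   element2html(el)  # -> '<p>abc</p>'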


def valid_element(node: HtmlElement, feature: str):
    return len(node.xpath(feature)) > 0


def remove_node(node: HtmlElement):
    """
    Remove *node* (and its subtree) from the tree.
    This is an in-place operation; there is no need to return anything.
    :param node:
    :return:
    """
    parent = node.getparent()
    if parent is not None:
        parent.remove(node)


def drop_tag(node: HtmlElement):
    """
    Delete only the tag itself, merging its text and children into the parent.
    :param node:
    :return:
    """
    parent = node.getparent()
    if parent is not None:
        node.drop_tag()
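
# Illustrative contrast with remove_node above: drop_tag keeps the tag's text,
# while remove_node discards the whole subtree (and, in lxml, its tail text):
#
#   el = html2element('<p>a<b>b</b>c</p>')
#   drop_tag(el.xpath('//b')[0])
#   element2html(el)  # -> '<p>abc</p>'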


def clean_html(html_str: str):
    # Strip comments, <html>/<head> wrappers, scripts, link tags, styles and images.
    html_str = re.sub(r'<!--[\s\S]*?-->', '', html_str)
    html_str = re.sub(r'<html>|<html [^>]*>|</html>', '', html_str)
    html_str = re.sub(r'<head>[\s\S]*?</head>', '', html_str)
    html_str = re.sub(r'<script[^<>]*>[\s\S]*?</script>|</script>', '', html_str)
    html_str = re.sub(r'<link[^<>]*>', '', html_str)
    html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)
    html_str = re.sub(r'<img[^>]*>', '', html_str)
    return html_str
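
# Illustrative call (not in the original file):
#
#   clean_html('<html><head><title>t</title></head>'
#              '<body><!-- ad --><p>text</p><img src="a.png"></body></html>')
#   # -> '<body><p>text</p></body>'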


def extract_text(html_str: str):
    soup = BeautifulSoup(html_str, "lxml")
    return soup.get_text()


def verify_text(val: str, length=50):
    """Count the digits, ASCII letters and Chinese characters in *val*."""
    if val is None:
        return False
    sub_pattern = [r'<[^>]+>', r'[^0-9a-zA-Z\u4e00-\u9fa5]+']
    for pattern in sub_pattern:
        val = re.sub(pattern, '', val)
    # If the remaining text is shorter than *length*, the page has no detail content.
    if len(val) < length:
        # invalid text
        return False
    # valid text
    return True
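
# Illustrative checks (only digits, ASCII letters and CJK ideographs count):
#
#   verify_text('<p>太短</p>')  # -> False (2 countable chars < 50)
#   verify_text('详情' * 30)    # -> True  (60 countable chars)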