# utils.py

import io
import operator
import re
import zlib
from html import unescape
from urllib.parse import urlencode, urljoin

import tldextract
from bs4 import BeautifulSoup
from lxml.html import etree, HtmlElement, fromstring, tostring
from urllib3 import get_host

from common.log import logger
from crawler.defaults import (
    USELESS_TAG,
    USELESS_ATTR,
    TAGS_CAN_BE_REMOVE_IF_EMPTY,
    VALID_WORDS,
    VOID_WORDS,
    PAGE_TEXT_CHECK_WORDS,
    PAGE_TEXT_FILTER_WORDS,
)
from predict_bidding_model import exists_ztb


def err_details(worker):
    worker_exception = worker.exception()
    if worker_exception:
        logger.exception("Worker return exception: {}".format(worker_exception))
    return worker


def split_domain(val: str):
    if re.match(r'\d+', val) is None:
        return re.split(r'[\\.:]', val)
    return [val]
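
# Usage sketch for split_domain (illustrative values): a host string is split
# on dots, backslashes and colons, while a purely numeric string is returned
# as-is.
#     >>> split_domain('www.example.com:8080')
#     ['www', 'example', 'com', '8080']
#     >>> split_domain('8080')
#     ['8080']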


def extract_host(url):
    """
    >>> extract_host('http://192.168.3.207:8080/')
    'http://192.168.3.207:8080/'
    """
    _s, _h, _p = get_host(url)
    return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"


def extract_domain(url):
    """
    Originally this extracted the first-level (registrable) domain, joining
    the domain and suffix with a dot, and returned IPv4 hosts unchanged. The
    current implementation returns the full host name instead, so that
    domain deduplication works on the complete name:

    >>> extract_domain('http://192.168.3.207:8080/')
    '192.168.3.207:8080'
    >>> extract_domain('http://forums.bbc.co.uk')
    'forums.bbc.co.uk'
    """
    # ext = tldextract.extract(url)
    # return ext.registered_domain or ext.ipv4
    _, h, p = get_host(url)
    return f"{h}:{p}" if p else h


def extract_fqdn(url):
    """Return the fully qualified domain name (or the IPv4 address)."""
    ext = tldextract.extract(url)
    return ext.fqdn or ext.ipv4
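
# Comparison sketch for the three extractors above (illustrative URL):
#     >>> extract_host('http://forums.bbc.co.uk/path')
#     'http://forums.bbc.co.uk/'
#     >>> extract_domain('http://forums.bbc.co.uk/path')
#     'forums.bbc.co.uk'
#     >>> extract_fqdn('http://forums.bbc.co.uk/path')
#     'forums.bbc.co.uk'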


def extract_page_title(source):
    node = []
    try:
        element = html2element(source)
        node = element.xpath('/html/head/title/text()|//title/text()')
    except etree.ParserError:
        pass
    if len(node) > 1:
        return "".join(";".join(node).split())
    return "".join("".join(node).split())


def is_url(url):
    """Check whether the string is a well-formed URL."""
    _regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return re.match(_regex, url) is not None
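
# Usage sketch for is_url (illustrative values): a scheme is required.
#     >>> is_url('https://example.com/path?q=1')
#     True
#     >>> is_url('example.com')
#     False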


def is_contains(val: str, feature: str):
    return operator.contains(val, feature)


def is_domain(domain):
    ext = tldextract.extract(domain)
    return bool(ext.domain)


def label_split(val):
    # Full candidate separator set:
    # '~`!#$%^&*()_+-=|\';"":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》{《}】【\n\]\[ '
    result = re.split(r'[- _,,\\.|-「」【】??!!/、] *', val)
    return [v for v in result if len(v) > 0]
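
# Usage sketch for label_split (illustrative label): separators such as
# '_', '-', ',' and '|' delimit the fields, and empty fields are dropped.
#     >>> label_split('Notice_2024-01-01,Result')
#     ['Notice', '2024', '01', '01', 'Result']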


def join_url(url: str, parameters: dict):
    """
    Join a URL with its query parameters.
    :param url: the link
    :param parameters: the query parameters
    :return: the joined URL
    """
    _data = '?' + urlencode(parameters)
    return urljoin(url, _data)
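
# Usage sketch for join_url (illustrative values). Note that urljoin replaces
# any existing query string on the base URL.
#     >>> join_url('https://example.com/list', {'page': 2, 'kw': 'bid'})
#     'https://example.com/list?page=2&kw=bid'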


def extract_text(source: str):
    soup = BeautifulSoup(source, "lxml")
    return soup.get_text()


def verify_text(val: str, length=50):
    """Check the count of digits, letters and Chinese characters."""
    if val is None:
        return False
    sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
    for pattern in sub_pattern:
        val = re.sub(pattern, '', val)
    # If the remaining text is shorter than `length`, the page carries no
    # detail content.
    if len(val) < length:
        # invalid text
        return False
    # valid text
    return True
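
# Usage sketch for verify_text (illustrative values): markup and punctuation
# are stripped before the length check.
#     >>> verify_text('<p>short</p>', length=10)
#     False
#     >>> verify_text('x' * 60)
#     True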


def clean_whitespace(text: str):
    r"""Strip whitespace characters: \n newline, \r carriage return, \v vertical tab, \f form feed."""
    obj = io.StringIO()
    for i in text:
        # Keep ' ' and '\t' so that tag names and attributes stay separated.
        if i not in '\n\r\v\f':
            obj.write(i)
    return obj.getvalue()


def clean_html(source):
    html_str = re.sub('\ufeff|\xa0|\u3000|\x00', '', source)
    html_str = re.sub(r'<!--[\s\S]*?-->', '', html_str)  # strip comments
    html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)  # strip styles
    html_str = re.sub(r'<script[^<>]*>[\s\S]*?</script>', '', html_str)  # strip scripts
    html_str = re.sub(r'</?br.*?>', '', html_str)
    html_str = re.sub(r'<\?xml.*?>', '', html_str)
    html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
    return clean_whitespace(html_str)


def html2element(source: str, base_url=None) -> HtmlElement:
    html_str = clean_html(source)
    if len(html_str) == 0:
        # Cleaning may strip the whole page; fall back to a stub document so
        # that fromstring does not raise lxml.etree.ParserError.
        html_str = '<html lang="en"></html>'
    return fromstring(html_str, base_url=base_url)


def element2html(element: HtmlElement) -> str:
    return unescape(tostring(element, encoding="utf-8").decode())
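
# Round-trip sketch (illustrative fragment): clean_html drops the comment and
# the <script> block before parsing, and element2html serializes the result.
#     >>> el = html2element('<div><!-- ad --><script>x()</script><p>text</p></div>')
#     >>> element2html(el)
#     '<div><p>text</p></div>'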


def iter_node(element: HtmlElement, depth=1):
    """Depth-first traversal, yielding (element, depth) pairs."""
    yield element, depth
    depth += 1
    for sub_element in element:
        if isinstance(sub_element, HtmlElement):
            yield from iter_node(sub_element, depth)
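
# Traversal sketch for iter_node (illustrative fragment): elements are
# yielded depth-first together with their depth.
#     >>> root = html2element('<div><p><b>x</b></p><p>y</p></div>')
#     >>> [(n.tag, d) for n, d in iter_node(root)]
#     [('div', 1), ('p', 2), ('b', 3), ('p', 2)]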


def remove_node(node: HtmlElement):
    """
    This is an in-place operation; there is nothing to return.
    :param node: node to remove, together with its subtree
    """
    parent = node.getparent()
    if parent is not None:
        node.drop_tree()
        # parent.remove(node)


def drop_tag(node: HtmlElement):
    """
    Delete only the tag itself, merging its text into the parent.
    :param node: node whose tag is dropped
    """
    parent = node.getparent()
    if parent is not None:
        node.drop_tag()


def is_empty_element(node: HtmlElement):
    return not node.getchildren() and not node.text


def normalize_node(element: HtmlElement):
    etree.strip_elements(element, *USELESS_TAG, with_tail=False)
    # Node pre-processing. When a node is removed and updated in the same
    # loop, the update does not take effect (reason unclear), so the passes
    # below are kept separate.
    # Pass 1: merge empty nodes and drop noise nodes.
    for node, _ in iter_node(element):
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')
        # A div tag without any sub-node can be converted to a p node.
        if node.tag.lower() == 'div' and not node.getchildren():
            node.tag = 'p'
        if node.tag.lower() == 'span' and not node.getchildren():
            node.tag = 'p'
        # Remove empty p tags.
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)
        # Delete inline styles.
        style = node.get('style')
        if style:
            del node.attrib['style']
        # Obsolete scrolling element.
        if node.tag.lower() == 'marquee':
            remove_node(node)
    # Pass 2: remove nodes whose id/class exactly matches a noise attribute.
    for node, _ in iter_node(element):
        attr = (node.get('id') or node.get('class'))
        if attr and attr.lower() in USELESS_ATTR:
            remove_node(node)
            break


def pre_parse(element):
    normalize_node(element)
    return element
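
# Pipeline sketch for pre_parse (illustrative fragment; assumes div/p are not
# in the project's USELESS_TAG/USELESS_ATTR noise lists): a childless <div>
# is rewritten to <p> in place.
#     >>> el = pre_parse(html2element('<div><div>text</div></div>'))
#     >>> element2html(el)
#     '<div><p>text</p></div>'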


def check_text_by_words(val: str):
    for word in VOID_WORDS:
        if re.search(word, val) is not None:
            return False
    for keyword in VALID_WORDS:
        if re.search(keyword, val) is not None:
            return True
    return False


def check_page_by_words(val: str):
    if 7 < len(val) < 100:
        for word in PAGE_TEXT_FILTER_WORDS:
            if re.search(word, val) is not None:
                return False
        for keyword in PAGE_TEXT_CHECK_WORDS:
            if re.search(keyword, val) is not None:
                return True
    return False


def predict_bidding_model(item: dict):
    result = {**item}
    predict_result = exists_ztb(item)
    predict = any(predict_result.values())
    result['predict'] = int(predict)
    return result


def compress_str(content, level=9):
    return zlib.compress(content.encode(encoding='utf-8'), level=level)
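
# Round-trip sketch: compress_str pairs with zlib.decompress for recovery.
#     >>> raw = 'page html ' * 100
#     >>> blob = compress_str(raw)
#     >>> zlib.decompress(blob).decode('utf-8') == raw
#     True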