channel.py

import json
import os
import pathlib
from urllib.parse import urljoin

from lxml.html import etree, HtmlElement

from common.tools import sha1, detect_encoding
from crawler.analysis import TimeExtractor
from crawler.defaults import PAGE_TEXTS, LOGIN_TEXTS, NAV_TEXTS
from crawler.download import Downloader
from crawler.utils import (
    element2html,
    html2element,
    iter_node,
    drop_tag,
    remove_node,
    pre_parse,
    is_empty_element,
    check_text_by_words,
)

_base_path = pathlib.Path(__file__).parent


def analysis(origin_lst, target_lst):
    results = []
    for target_ in target_lst:
        source: str = target_['contenthtml']
        _c = 0
        for item in origin_lst:
            href, channel = item['href'], item['channel']
            if source.count(channel) > 0 or source.count(href) > 0:
                _c += 1
        if _c > 0:
            results.append({
                'similarity': _c,
                'contenthtml': source,
                'depth': target_['depth']
            })
    if not results:
        # Guard: max() would raise ValueError on an empty list.
        return {}
    # Sort by similarity first so that, among candidates tied on depth,
    # max() returns the most similar one.
    results = sorted(results, key=lambda x: x['similarity'], reverse=True)
    _t = max(results, key=lambda dic: dic['depth'])
    return _t
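
# A minimal sketch of the shapes analysis() expects; the field values are
# illustrative only:
#   origin_lst -> [{'href': 'https://example.com/news/', 'channel': '新闻中心'}, ...]
#   target_lst -> [{'contenthtml': '<div>...新闻中心...</div>', 'depth': 2}, ...]
#   return     -> the deepest, then most similar, candidate, e.g.
#                 {'similarity': 1, 'contenthtml': '<div>...</div>', 'depth': 2}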


def extract_text(node: HtmlElement):
    """Return the node's text with all whitespace stripped out."""
    text = node.text or node.tail or node.xpath('string(.)')
    return "".join(str(text).split())


def tag_date_total(node: HtmlElement, tag=None):
    """Count the leaf descendants (optionally limited to ``tag``) that carry date text."""
    count = 0
    # Leaf descendants that contain date text, collected for deduplication
    contains_date_nodes = []
    if tag is not None:
        descendants = list(node.iterdescendants(tag))
    else:
        descendants = list(node.iterdescendants())
    for descendant in descendants:
        pt = TimeExtractor().extractor(descendant)
        has_children = len(list(descendant.iterchildren())) > 0
        if pt != '' and not has_children and descendant not in contains_date_nodes:
            contains_date_nodes.append(descendant)
            count += 1
    return count


def remove_ancestors_date_tag(node: HtmlElement):
    prev_node = node
    # Case 1: an ancestor directly holds all of the date nodes; delete it outright.
    remove_count = 0
    for ancestor in node.iterancestors():
        if ancestor.tag.lower() in ['body', 'html']:
            continue
        is_remove = False
        total = tag_date_total(ancestor)
        if total > 3:
            # Case 2: the ancestor does not directly hold all of the date nodes.
            # Deleting an ancestor with more than 3 date texts outright would
            # widen the boundary too far and drop nodes we want to keep, so
            # first walk prev_node's siblings (descendants of this ancestor)
            # looking for a sibling that itself holds more than 3 date texts.
            for sibling in prev_node.itersiblings():
                sibling_tag_date_total = tag_date_total(sibling)
                if sibling_tag_date_total > 3:
                    remove_node(sibling)
                    is_remove = True
            # No qualifying sibling found: fall back to deleting the ancestor,
            # accepting the wider boundary.
            if not is_remove:
                remove_node(ancestor)
                is_remove = True
        elif 1 < total <= 2:
            # Delete piece by piece: check each child's date-text count and
            # prune at the child boundary to avoid removing too much.
            for child in ancestor.iterchildren():
                child_tag_date_total = tag_date_total(child)
                if child_tag_date_total > 0:
                    remove_node(child)
                    is_remove = True
            if not is_remove:
                remove_node(ancestor)
                is_remove = True
        else:
            # Remember the last ancestor inspected (its date-text count is small).
            prev_node = ancestor
        if is_remove:
            remove_count += 1
    return remove_count > 0
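
# Illustration with hypothetical markup: starting from one <li> in
#   <div><ul><li>2023-01-02</li><li>2023-01-03</li><li>2023-01-04</li><li>2023-01-05</li></ul></div>
# the <ul> ancestor counts 4 date-bearing leaves (> 3) and no sibling <li>
# qualifies on its own, so the whole <ul> is removed in a single step instead
# of widening the boundary to the surrounding <div>.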


def show_html(page, *, file=None, base_url=None):
    if base_url is None:
        base_url = ''
    if isinstance(page, HtmlElement):
        source = element2html(page)
    elif isinstance(page, bytes):
        source = page.decode(detect_encoding(page), 'surrogatepass')
    else:
        source = page
    if file is None:
        dirname = 'html'
        os.makedirs(dirname, mode=0o777, exist_ok=True)
        file = f'{dirname}/{sha1(base_url)}.html'
    with open(file, 'w') as fp:
        fp.write(source)


def trim_node(element: HtmlElement):
    """
    Tidy nodes (body children are reduced to their first div; child text is
    merged into the parent while tags are dropped).
    :param element:
    :return:
    """
    children = element.xpath('/html/body/child::*')
    for child in children:
        for node, _ in iter_node(child):
            if node.tag.lower() == 'div':
                break
            drop_tag(node)
    return element


def strip_node(element: HtmlElement):
    """
    Strip nodes: anything that is not an <a> tag and has no date text, no
    keyword title and no <a> descendants is removed or merged away.
    :param element:
    :return:
    """
    for node, _ in iter_node(element):
        # Drop nodes (and parents) that lack text, a publish time, a title
        # and an href attribute.
        if node.tag.lower() != 'a':
            # Node text (surrounding whitespace, newlines and carriage
            # returns stripped)
            text = "".join(node.xpath('./text()')).strip()
            # No keyword title text
            non_title = check_text_by_words(text) is False
            # No descendant <a> nodes
            no_sub_links = len(list(node.iterdescendants('a'))) == 0
            # Date text
            publish_time = TimeExtractor().extractor(node)
            if non_title and no_sub_links and publish_time == '':
                parent = node.getparent()
                if parent is not None:
                    if parent.tag.lower() == 'a':
                        etree.strip_tags(parent, node.tag)
                    elif parent.tag.lower() == 'td':
                        if not node.getchildren():
                            if len(text) == 0:
                                remove_node(parent)
                            else:
                                etree.strip_tags(parent, node.tag)
                        else:
                            name = [child.tag for child in node.getchildren()]
                            etree.strip_tags(parent, *name)
                    else:
                        remove_node(node)


def remove_nav_node(element: HtmlElement):
    for node, _ in iter_node(element):
        text = extract_text(node)
        # Section / footer / navigation text
        for item in NAV_TEXTS:
            if item in text:
                remove_node(node)
        # Login labels
        if text in LOGIN_TEXTS:
            remove_node(node)
        # Pagination bar
        if text in PAGE_TEXTS:
            tag = node.tag.lower()
            siblings = list(node.itersiblings(tag))
            # Pagination links that share one ancestor show up as same-tag
            # siblings; otherwise search the ancestors for the container.
            if len(siblings) > 3:
                remove_node(node.getparent())
            else:
                for ancestor in node.iterancestors():
                    # An ancestor holding more than two nodes of this tag is
                    # treated as the container of all the labels to delete.
                    if len(list(ancestor.iterdescendants(tag))) > 2:
                        remove_node(ancestor)
                        break
        # Breadcrumb ("位置" / current-location) labels
        if '位置' in text:
            remove_node(node)


def remove_date_node(element: HtmlElement):
    retrieve = False
    for node, _ in iter_node(element):
        if retrieve:
            break
        publish_time = TimeExtractor().extractor(node)
        # First find the deepest node owning date text (date text, no children)
        if publish_time != '' and len(list(node.iterchildren())) == 0:
            # The date text may come from a sibling rather than the node itself
            if len(list(node.itersiblings())) > 0:
                # Several siblings exist, so analyse the shared parent
                parent = node.getparent()
                # Count the parent's descendants that own date text
                total = tag_date_total(parent)
                if total > 3:
                    # Simple case: each sibling owns a single date text
                    # (a publish time), so drop the whole parent
                    remove_node(parent)
                else:
                    # Complex case: siblings own several date texts (start
                    # time, deadline, ...). Starting from the parent, find
                    # and delete the ancestors that own date text.
                    retrieve = remove_ancestors_date_tag(parent)
            else:
                # Case 2: no siblings; starting from the node itself, find
                # and delete the ancestors that own date text.
                retrieve = remove_ancestors_date_tag(node)


def clean_node(element):
    for node, _ in iter_node(element):
        if is_empty_element(node):
            remove_node(node)
        if node.tag.lower() == 'a' and list(node.iterchildren()):
            # Strip the phrase/text tags wrapped inside the <a> tag while
            # keeping their inner text.
            for child in node.iterchildren():
                etree.strip_tags(node, child.tag)


def extract_data(source, base_url):
    element = html2element(source)
    children = element.xpath('/html/body/child::*')
    result = {}
    for index, child in enumerate(children):
        data = []
        for node in child.iterdescendants('a'):
            title = extract_text(node)
            # Default to '' so a missing href resolves to base_url
            href = urljoin(base_url, node.attrib.get('href', ''))
            if check_text_by_words(title) and len(title) <= 15:
                item = (title, href)
                data.append(item)
        key = "{}_{}".format(child.tag.lower(), index)
        result[key] = data
    print(result)
    for key, items in result.items():
        print(f"=============== {base_url} && {key} ===============")
        for val in items:
            print(val)
        print()
    return result
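
# extract_data() groups the candidate links by top-level body child, e.g.
# (illustrative values only):
#   {'div_0': [('新闻标题', 'https://example.com/a.html'), ...], 'ul_1': [...]}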


def process_page(source):
    element = html2element(source)
    # Web page pre-processing (denoising; mutates the original DOM structure)
    element = pre_parse(element)
    # show_html(element, file='2-preprocessed.html')
    # Tidy nodes
    element = trim_node(element)
    # show_html(element, file='3-tidied-body.html')
    # Strip nodes
    strip_node(element)
    # show_html(element, file='4-stripped-nodes.html')
    # Remove navigation nodes
    remove_nav_node(element)
    # show_html(element, file='5-removed-nav.html')
    # Remove date nodes
    remove_date_node(element)
    # show_html(element, file='6-removed-date-blocks.html')
    # Clean nodes
    clean_node(element)
    # show_html(element, file='7-cleaned-dom.html')
    return element2html(element)


def bfs(response, base_url):
    try:
        source = response.json().get('html', '')
    except json.decoder.JSONDecodeError:
        source = response.text
    # show_html(source, file='1-original-page.html')
    if len(source) == 0:
        return {}
    source = process_page(source)
    items = extract_data(source, base_url)
    return items
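

if __name__ == '__main__':
    # Minimal usage sketch: assumes the target URL serves either JSON of the
    # form {"html": "..."} or a plain HTML page, and that `requests` is
    # installed. The URL below is illustrative only.
    import requests

    demo_url = 'https://www.example.com/news/'
    resp = requests.get(demo_url, timeout=10)
    channels = bfs(resp, demo_url)
    for section, links in channels.items():
        print(section, len(links), 'links')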