# channel.py

import re
from urllib.parse import urljoin
from lxml.html import etree, HtmlElement
from crawler.analysis import TimeExtractor
from crawler.defaults import USELESS_ATTR, FOOTER_TEXTS, CATEGORY_TEXTS, \
    PAGE_TEXTS, LOGIN_TEXTS
from crawler.download import Downloader
from crawler.utils import (
    element2html,
    html2element,
    iter_node,
    drop_tag,
    remove_node,
    pre_parse,
    is_empty_element,
    is_title,
)
import pathlib
import os
from common.tools import sha1, detect_encoding

_base_path = pathlib.Path(__file__).parent


def analysis(origin_lst, target_lst):
    """Pick the candidate page whose HTML references the most known
    channels/links, preferring the deepest match."""
    results = []
    for target_ in target_lst:
        source: str = target_['contenthtml']
        _c = 0
        for item in origin_lst:
            href, channel = item['href'], item['channel']
            if source.count(channel) > 0 or source.count(href) > 0:
                _c += 1
        if _c > 0:
            results.append({
                'similarity': _c,
                'contenthtml': source,
                'depth': target_['depth']
            })
    results = sorted(results, key=lambda x: x['similarity'], reverse=True)
    # NOTE: raises ValueError when no candidate matched (results is empty).
    _t = max(results, key=lambda dic: dic['depth'])
    return _t
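
# Illustrative sketch of the expected shapes (hypothetical data, not from the
# real crawler): `origin_lst` holds known channel entries, `target_lst` holds
# downloaded candidates; the deepest best-matching candidate wins.
#
#   origin_lst = [{'href': '/zbgg/', 'channel': '招标公告'}]
#   target_lst = [{'contenthtml': '<a href="/zbgg/">招标公告</a>', 'depth': 2}]
#   analysis(origin_lst, target_lst)
#   # -> {'similarity': 1, 'contenthtml': '...', 'depth': 2}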


def extract_text(node: HtmlElement):
    """Return the node's text with all internal whitespace removed."""
    text = (node.text or node.tail or node.xpath('string(.)'))
    return "".join(f"{text}".split())


def query_descendants_tag_date_total(node: HtmlElement, tag=None):
    """Count leaf descendants (optionally restricted to `tag`) that carry date text."""
    count = 0
    # Descendants that share the target node's tag name and contain date text
    contains_date_nodes = []
    if tag is not None:
        descendants = list(node.iterdescendants(tag))
    else:
        descendants = list(node.iterdescendants())
    for descendant in descendants:
        pt = TimeExtractor().extractor(descendant)
        has_children = len(list(descendant.iterchildren())) > 0
        if pt != '' and not has_children and descendant not in contains_date_nodes:
            contains_date_nodes.append(descendant)
            count += 1
    return count


def remove_ancestors_date_tag(node: HtmlElement):
    break_loop = False
    tag = node.tag.lower()  # TODO: is the descendant node's tag type really required?
    prev_node = node
    # Case 1: an ancestor directly holds all of the date nodes; delete it outright.
    for ancestor in node.iterancestors():
        # total = query_descendants_tag_date_total(ancestor, tag)  # TODO: when node is a parent this skews detection; when node is itself, can it still raise an exception?
        total = query_descendants_tag_date_total(ancestor)
        # print("ancestors_date_tag >>> ", ancestor.tag, ancestor.attrib, total)
        if total > 3:
            # Case 2: the ancestor does not directly hold all of the date nodes.
            # Deleting an ancestor with more than three date texts right away would
            # widen its boundary and remove nodes we want to keep. So first walk
            # prev_node's siblings (the ancestor's children) looking for one with
            # more than three date texts; only if none qualifies do we delete the
            # ancestor itself, deliberately widening the boundary.
            for sibling in prev_node.itersiblings():
                # count = query_descendants_tag_date_total(sibling, tag)
                count = query_descendants_tag_date_total(sibling)
                if count > 3:
                    remove_node(sibling)
                    break_loop = True
                    print("remove tag >>> ", sibling.tag, sibling.attrib)
                    break
            else:
                remove_node(ancestor)
                print("remove tag >>> ", ancestor.tag, ancestor.attrib)
                break_loop = True
        else:
            # Remember the ancestor inspected in the previous iteration.
            prev_node = ancestor
    return break_loop
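
# Shape this targets (hypothetical markup): dates spread across a column block
# rather than sitting directly under one parent, e.g.
#
#   <div>                       <- ancestor with more than three dated leaves
#     <div>news list ...</div>  <- prev_node (kept)
#     <ul>                      <- sibling with more than three dated leaves: removed
#       <li>2023-01-02</li><li>2023-01-03</li>
#       <li>2023-01-04</li><li>2023-01-05</li>
#     </ul>
#   </div>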


def show_html(page, *, file=None, bash_url=None):
    """Write a page snapshot (HtmlElement, bytes, or str) to an HTML file for debugging."""
    if bash_url is None:
        bash_url = ''
    if isinstance(page, HtmlElement):
        source = element2html(page)
    elif isinstance(page, bytes):
        source = page.decode(detect_encoding(page), 'surrogatepass')
    else:
        source = page
    if file is None:
        dirname = 'html'
        os.makedirs(dirname, mode=0o777, exist_ok=True)
        file = f'{dirname}/{sha1(bash_url)}.html'
    with open(file, 'w') as fp:
        fp.write(source)
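
# Usage sketch (hypothetical values): dump any intermediate snapshot while
# debugging. With `file` omitted, the snapshot lands in html/<sha1(bash_url)>.html.
#
#   show_html(element, file='snapshot.html')
#   show_html(raw_bytes, bash_url='http://example.com/list')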


def trim_node(element: HtmlElement):
    """
    Tidy nodes: reduce each body child down to its first div, dropping the
    wrapper tags in between (their text is merged upward as tags are dropped)
    :param element:
    :return:
    """
    children = element.xpath('/html/body/child::*')
    for child in children:
        for node, _ in iter_node(child):
            # print('trim_node >>> ', node.tag, node.attrib)
            if node.tag.lower() == 'div':
                break
            drop_tag(node)
    return element
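
# Rough before/after sketch (hypothetical markup, exact result depends on
# iter_node's traversal): wrappers above the first <div> are dropped and their
# text merges upward.
#
#   <body><section><span>hi</span><div>kept</div></section></body>
#   -> <body>hi<div>kept</div></body>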


def strip_node(element: HtmlElement):
    """
    Strip nodes: anything that is not an <a> tag and has no date text, no
    keyword title, and no descendant <a> tags gets removed or stripped
    :param element:
    :return:
    """
    for node, _ in iter_node(element):
        # Node text (whitespace, newlines and carriage returns removed)
        text = "".join("".join(node.xpath('./text()')).split())
        # Remove nodes (and parents) carrying no text, publish time, title, or href
        if node.tag.lower() != 'a':
            # Keyword title text
            non_title = is_title(text) is False
            # No descendant <a> nodes
            no_anchor_descendants = len(list(node.iterdescendants('a'))) == 0
            # Date text
            publish_time = TimeExtractor().extractor(node)
            # print('>>> ', node.tag, node.attrib, text)
            if non_title and no_anchor_descendants and publish_time == '':
                # print('strip_node >>> ', node.tag, node.attrib, text)
                parent = node.getparent()
                if parent is not None and parent.tag.lower() == 'a':
                    etree.strip_tags(parent, node.tag)
                elif parent is not None and parent.tag.lower() == 'td':
                    remove_node(parent)
                else:
                    remove_node(node)


def remove_nav_node(element: HtmlElement):
    for node, _ in iter_node(element):
        text = extract_text(node)
        # print('nav_node >>> ', node.tag, text)
        # Category and footer labels
        texts = {*CATEGORY_TEXTS, *FOOTER_TEXTS}
        for item in texts:
            if item in text:
                remove_node(node)
        # Login labels
        if text in LOGIN_TEXTS:
            remove_node(node)
        # Pagination bar
        if text in PAGE_TEXTS:
            tag = node.tag.lower()
            siblings = list(node.itersiblings(tag))
            # Same-tag siblings normally sit together under one shared ancestor;
            # if the pagination labels are not under the same ancestor, search upward
            if len(siblings) > 3:
                remove_node(node.getparent())
            else:
                for ancestor in node.iterancestors():
                    # An ancestor with more than two nodes of this kind is assumed
                    # to contain every label we want to delete
                    if len(list(ancestor.iterdescendants(tag))) > 2:
                        remove_node(ancestor)
                        break
        # Breadcrumb labels ("当前位置", current position)
        if '位置' in text:
            remove_node(node)


def remove_date_node(element: HtmlElement):
    break_loop = False
    for node, _ in iter_node(element):
        if break_loop:
            break
        publish_time = TimeExtractor().extractor(node)
        # print('date_node >>> ', node.tag, node.attrib, publish_time)
        # First find the deepest nodes holding date text (date text, no children)
        if publish_time != '' and len(list(node.iterchildren())) == 0:
            # print("date_node >>> ", node.tag, node.attrib, len(list(node.itersiblings())))
            # The date text may come from a sibling node rather than the node itself
            if len(list(node.itersiblings())) > 0:
                # Several siblings exist, so analyse the shared parent
                parent = node.getparent()
                # Count the parent's descendants that hold date text
                total = query_descendants_tag_date_total(parent)
                if total > 3:
                    # Simple case: each sibling holds a single date text (the publish time)
                    remove_node(parent)
                else:
                    # Complex case: siblings hold several date texts (start time, deadline, ...)
                    # print("parent_date_node >>> ", parent.tag, parent.attrib, len(list(parent.itersiblings())))
                    # Starting from the parent, find and delete the ancestors holding date text
                    break_loop = remove_ancestors_date_tag(parent)
            else:
                # Case 2: no siblings; start from the node itself, then find and
                # delete the ancestors holding date text
                break_loop = remove_ancestors_date_tag(node)
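
# Sketch of the simple case (hypothetical markup): the dated leaf has siblings,
# and their shared parent directly holds more than three dated leaves, so the
# parent is removed in one step.
#
#   <div>                              <- parent: removed (total = 4 > 3)
#     <span>2023-01-01</span><span>2023-01-02</span>
#     <span>2023-01-03</span><span>2023-01-04</span>
#   </div>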


def clean_node(element):
    for node, _ in iter_node(element):
        if is_empty_element(node):
            # print(' clean_node >>> ', node.tag, node.attrib, extract_text(node))
            remove_node(node)
        if node.tag.lower() == 'a' and list(node.iterchildren()):
            # Strip the phrase/text tags nested inside an <a>, keeping the inner text
            for child in node.iterchildren():
                etree.strip_tags(node, child.tag)


def extract_data(source, base_url):
    element = html2element(source)
    children = element.xpath('/html/body/child::*')
    result = {}
    for index, child in enumerate(children):
        data = []
        for node in child.iterdescendants('a'):
            title = extract_text(node)
            href = node.attrib.get('href')
            # if href is None or href.find('javascript') == 0 or href.find('//') == 0:
            #     continue
            href = urljoin(base_url, href)
            if is_title(title):
                item = (title, href)
                data.append(item)
        key = "{}_{}".format(child.tag.lower(), index)
        result[key] = data
    print(result)
    for key, items in result.items():
        print(f"=============== {base_url} && {key} ===============")
        for val in items:
            print(val)
        print()
    return result
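
# Result shape (hypothetical values): one key per body child, each mapping to
# the (title, href) pairs found beneath it, e.g.
#
#   {'div_0': [('某某招标公告', 'http://example.com/notice/1.html')],
#    'div_1': []}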


def process_page(source):
    element = html2element(source)
    # Pre-process the web page (denoising; this mutates the original DOM structure)
    element = pre_parse(element)
    show_html(element, file='2预处理.html')
    # Tidy nodes
    element = trim_node(element)
    show_html(element, file='3整理body节点.html')
    # Strip nodes
    strip_node(element)
    show_html(element, file='4剔除节点.html')
    # Remove navigation nodes
    remove_nav_node(element)
    show_html(element, file='5删除导航条.html')
    # Remove date nodes
    remove_date_node(element)
    show_html(element, file='6删除时间块.html')
    # Clean up the DOM
    clean_node(element)
    show_html(element, file='7清理dom.html')
    return element2html(element)


def bfs(response, base_url):
    source = response.text
    show_html(source, file='1原始页.html')
    source = process_page(source)
    items = extract_data(source, base_url)
    return items


if __name__ == '__main__':
    d = Downloader()
    # url = 'http://zbpt.zycqjy.com/rest/sub_list_nav.cs#'
    # url = 'http://fgw.hubei.gov.cn/fbjd/xxgkml/xkfw/xzxkjg/xmbaqk/'
    # url = 'https://fzggw.zj.gov.cn/col/col1599544/index.html'
    # url = 'http://113.200.193.24:8009/Main/Projects#'
    # url = 'http://jjc.usx.edu.cn/zbxx.htm#'
    # url = 'https://www.xxggzy.cn/jyxx/089003/089003001/moreinfo_len6.html'
    # url = 'http://www.hdzbgs.com/List.aspx?id=12'
    # url = 'https://ggzy.qiannan.gov.cn/zfcg_500203/zbgg_5060411/index.html'
    # url = 'http://www.lzlcgroup.com/cms/column/index/id/57.html'
    # url = 'http://ggzy.zjlg.gov.cn:86/TPFront/jyxx/004002/'
    # url = 'https://www.elongbiao.com/List/NoticeP/9'
    # url = 'https://www.elongbiao.com/List/Notice/12'  # multiple date texts; algorithm tuned once
    # url = 'http://lytjj.longyan.gov.cn/xxgk/tjgg/'
    # url = 'http://www.lydeyy.com/plus/list.php?tid=36'  # date text; algorithm tuned once
    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007004/moreinfo.html'  # algorithm tuned once
    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007002/007002004/moreinfo.html'
    # url = 'http://www.qdwater.com.cn:8010/bid2/front/toWinBidIndexPage'
    # url = 'http://ly.fjycw.com/NewsList.aspx?GUID=48-48-55'
    # url = 'http://www.hljcg.gov.cn/welcome.jsp?dq=2302'  # TODO: home-page extraction, needs optimizing
    url = 'https://ggzy.longyan.gov.cn/lyztb/zqcg/008004/moreinfo.html'
    # JavaScript-rendered pages
    # url = 'http://zhaobiao.elongcheng.com:82/'  # detail link lives in an onclick handler
    r = d.get(url)
    print(r)
    bfs(r, url)