@@ -1,11 +1,12 @@
-import re
+import os
+import pathlib
 from urllib.parse import urljoin
 
 from lxml.html import etree, HtmlElement
 
+from common.tools import sha1, detect_encoding
 from crawler.analysis import TimeExtractor
-from crawler.defaults import USELESS_ATTR, FOOTER_TEXTS, CATEGORY_TEXTS, \
-    PAGE_TEXTS, LOGIN_TEXTS
+from crawler.defaults import PAGE_TEXTS, LOGIN_TEXTS, NAV_TEXTS
 from crawler.download import Downloader
 from crawler.utils import (
     element2html,
@@ -17,9 +18,6 @@ from crawler.utils import (
     is_empty_element,
     is_title,
 )
-import pathlib
-import os
-from common.tools import sha1, detect_encoding
 
 _base_path = pathlib.Path(__file__).parent
 
@@ -51,7 +49,7 @@ def extract_text(node: HtmlElement):
     return "".join(f"{text}".split())
 
 
-def query_descendants_tag_date_total(node: HtmlElement, tag=None):
+def tag_date_total(node: HtmlElement, tag=None):
     count = 0
     # count of nodes with the same tag as the target node that contain date text
     contains_date_nodes = []
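
For review context, a rough sketch of what the renamed helper computes; its real body sits outside this hunk, so the loop below is only an approximation built from names visible in the patch (TimeExtractor, iterdescendants), with `_sketch` marking it as illustrative.

def tag_date_total_sketch(node, tag=None):
    count = 0
    descendants = node.iterdescendants(tag) if tag else node.iterdescendants()
    for descendant in descendants:
        # assumes TimeExtractor().extractor() returns an empty value when no date text is found
        if TimeExtractor().extractor(descendant):
            count += 1
    return count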
@@ -70,36 +68,51 @@ def query_descendants_tag_date_total(node: HtmlElement, tag=None):
 
 
 def remove_ancestors_date_tag(node: HtmlElement):
-    break_loop = False
-    tag = node.tag.lower()  # TODO is the descendant node's tag type really needed?
+    # tag = node.tag.lower()  # TODO is the descendant node's tag type really needed?
     prev_node = node
     # Case 1: the ancestor node directly holds all of the date nodes, so delete it outright
+    remove_count = 0
     for ancestor in node.iterancestors():
-        # total = query_descendants_tag_date_total(ancestor, tag)  # TODO when node is the parent node this affects recognition; if node is itself, can an exception still be raised?
-        total = query_descendants_tag_date_total(ancestor)
+        is_remove = False
+        total = tag_date_total(ancestor)
         # print("ancestors_date_tag >>> ", ancestor.tag, ancestor.attrib, total)
         if total > 3:
             # Case 2: the ancestor node does not directly hold all of the date nodes;
             # first find the nodes with more than 3 date texts; deleting the ancestor right away would
             # enlarge its boundary and also delete nodes we want to keep. So walk prev_node's
-            # siblings (children of ancestor) looking for a sibling with more than 3 date texts; if no
+            # siblings (descendants of ancestor) looking for a sibling with more than 3 date texts; if no
             # sibling has more than 3 date texts, handle the ancestor itself to widen the boundary and delete it
             for sibling in prev_node.itersiblings():
-                # count = query_descendants_tag_date_total(sibling, tag)
-                count = query_descendants_tag_date_total(sibling)
-                if count > 3:
+                sibling_tag_date_total = tag_date_total(sibling)
+                if sibling_tag_date_total > 3:
                     remove_node(sibling)
-                    break_loop = True
-                    print("remove tag >>> ", sibling.tag, sibling.attrib, )
-                    break
-            else:
+                    is_remove = True
+                    # print("remove sibling tag >>> ", sibling.tag, sibling.attrib, )
+            # more than 3 date texts were found, but prev_node has no such sibling, so delete this ancestor directly
+            if not is_remove:
+                remove_node(ancestor)
+                # print("remove ancestor tag >>> ", ancestor.tag, ancestor.attrib, )
+                is_remove = True
+        elif 1 < total <= 2:
+            # delete entry by entry (count the date texts, then query and strip at the child-node boundary to avoid removing too much)
+            for child in ancestor.iterchildren():
+                child_tag_date_total = tag_date_total(child)
+                if child_tag_date_total > 0:
+                    remove_node(child)
+                    # print("remove child tag >>> ", child.tag, child.attrib, )
+                    is_remove = True
+            if not is_remove:
                 remove_node(ancestor)
-            print("remove tag >>> ", ancestor.tag, ancestor.attrib, )
-            break_loop = True
+                # print("remove ancestor tag >>> ", ancestor.tag, ancestor.attrib, )
+                is_remove = True
         else:
-            # remember the ancestor from the previous query
+            # remember the ancestor from the previous query (date-text count <= 3)
             prev_node = ancestor
-    return break_loop
+
+        if is_remove:
+            remove_count += 1
+
+    return True if remove_count > 0 else False
 
 
 def show_html(page, *, file=None, bash_url=None):
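
The rewrite swaps the early `break` and `break_loop` pair for a per-ancestor `is_remove` flag plus a `remove_count`, so a single call can prune several date blocks and still report whether anything was removed. A self-contained illustration of the kind of sibling list the `total > 3` threshold is meant to catch; the regex is only a stand-in for TimeExtractor and is not part of the patch.

import re
from lxml import html

DATE_RE_DEMO = re.compile(r"\d{4}-\d{2}-\d{2}")  # stand-in for TimeExtractor

def date_text_total_demo(node):
    # counts descendants whose own text contains a date, like tag_date_total
    return sum(1 for d in node.iterdescendants() if DATE_RE_DEMO.search(d.text or ""))

page = html.fromstring(
    "<div><h1>Notices</h1><ul>"
    "<li><a href='/a'>First notice</a><span>2024-01-02</span></li>"
    "<li><a href='/b'>Second notice</a><span>2024-01-03</span></li>"
    "<li><a href='/c'>Third notice</a><span>2024-01-04</span></li>"
    "<li><a href='/d'>Fourth notice</a><span>2024-01-05</span></li>"
    "</ul></div>"
)
ul = page.find(".//ul")
print(date_text_total_demo(ul))  # 4 -> total > 3, so the whole block would be dropped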
@@ -175,9 +188,8 @@ def remove_nav_node(element: HtmlElement):
     for node, _ in iter_node(element):
         text = extract_text(node)
         # print('nav_node >>> ', node.tag, text)
-        # category tags & footer tags
-        texts = {*CATEGORY_TEXTS, *FOOTER_TEXTS}
-        for item in texts:
+        # (category text | footer text | navigation text)
+        for item in NAV_TEXTS:
             if item in text:
                 remove_node(node)
         # login tags
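
The two constant sets are folded into a single NAV_TEXTS in crawler.defaults; its actual contents are not shown in this patch, so the values below are placeholders. The check itself is plain substring containment on the whitespace-stripped text that extract_text produces.

NAV_TEXTS_DEMO = {"网站首页", "版权所有", "上一页", "下一页"}   # placeholder values

text = "".join("首页 | 版权所有 © 2024".split())            # what extract_text would return
print(any(item in text for item in NAV_TEXTS_DEMO))          # True -> the node gets removed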
@@ -202,9 +214,9 @@ def remove_nav_node(element: HtmlElement):
 
 
 def remove_date_node(element: HtmlElement):
-    break_loop = False
+    retrieve = False
     for node, _ in iter_node(element):
-        if break_loop:
+        if retrieve:
             break
         publish_time = TimeExtractor().extractor(node)
         # print('date_node >>> ', node.tag, node.attrib, publish_time)
@@ -216,7 +228,7 @@ def remove_date_node(element: HtmlElement):
             # several sibling nodes may exist, so analyse the common parent node
             parent = node.getparent()
             # count the parent's descendant nodes that contain date text
-            total = query_descendants_tag_date_total(parent)
+            total = tag_date_total(parent)
             if total > 3:
                 # simple case: only a single sibling node contains date text (publish time)
                 remove_node(parent)
@@ -224,10 +236,10 @@ def remove_date_node(element: HtmlElement):
                 # complex case: several sibling nodes contain date text (start time, deadline, ...)
                 # print("parent_date_node >>> ", parent.tag, parent.attrib, len(list(parent.itersiblings())))
                 # starting from the parent, find and remove ancestor nodes that contain date text
-                break_loop = remove_ancestors_date_tag(parent)
+                retrieve = remove_ancestors_date_tag(parent)
         else:
             # Case 2: no sibling nodes; starting from the node itself, find and remove ancestor nodes that contain date text
-            break_loop = remove_ancestors_date_tag(node)
+            retrieve = remove_ancestors_date_tag(node)
 
 
 def clean_node(element):
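
Taken together, the three hunks above only rename `break_loop` to `retrieve` and switch to the shorter helper name; the condition separating the sibling and no-sibling cases falls between hunks, so the condensed sketch below approximates it and is not part of the patch.

def remove_date_node_sketch(element):
    retrieve = False
    for node, _ in iter_node(element):
        if retrieve:                              # one successful removal pass ends the scan
            break
        publish_time = TimeExtractor().extractor(node)
        if not publish_time:                      # assumption: date-less nodes are skipped
            continue
        if len(list(node.itersiblings())) > 0:    # assumption: the "has siblings" branch
            parent = node.getparent()
            if tag_date_total(parent) > 3:
                remove_node(parent)               # simple case: drop the shared parent
            else:
                retrieve = remove_ancestors_date_tag(parent)
        else:
            retrieve = remove_ancestors_date_tag(node)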
@@ -251,10 +263,8 @@ def extract_data(source, base_url):
         for node in child.iterdescendants('a'):
             title = extract_text(node)
             href = node.attrib.get('href')
-            # if href is None or href.find('javascript') == 0 or href.find('//') == 0:
-            #     continue
             href = urljoin(base_url, href)
-            if is_title(title):
+            if is_title(title) and len(title) <= 15:
                 item = (title, href)
                 data.append(item)
         key = "{}_{}".format(child.tag.lower(), index)
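
The commented-out href guard is dropped and the title check gains a length cap. A quick check of what now passes, using a permissive stand-in for is_title (its real rules live in crawler.utils); note that extract_text has already stripped all whitespace, so the cap applies to the compacted link text.

def is_title_demo(text):          # permissive stand-in, not the real is_title
    return bool(text)

titles = ["采购公告", "关于2024年度办公设备及耗材采购项目的中标结果公示"]
kept = [t for t in titles if is_title_demo(t) and len(t) <= 15]
print(kept)                        # only the short link text survives the new length cap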
@@ -273,61 +283,30 @@ def process_page(source):
     element = html2element(source)
     # web page pre-processing (de-noises the page; changes the original DOM structure)
     element = pre_parse(element)
-    show_html(element, file='2预处理.html')
+    # show_html(element, file='2预处理.html')
     # tidy up nodes
     element = trim_node(element)
-    show_html(element, file='3整理body节点.html')
+    # show_html(element, file='3整理body节点.html')
     # strip nodes
     strip_node(element)
-    show_html(element, file='4剔除节点.html')
+    # show_html(element, file='4剔除节点.html')
     # remove navigation nodes
     remove_nav_node(element)
-    show_html(element, file='5删除导航条.html')
+    # show_html(element, file='5删除导航条.html')
    # remove date nodes
     remove_date_node(element)
-    show_html(element, file='6删除时间块.html')
+    # show_html(element, file='6删除时间块.html')
     # clean nodes
     clean_node(element)
-    show_html(element, file='7清理dom.html')
+    # show_html(element, file='7清理dom.html')
     return element2html(element)
 
 
 def bfs(response, base_url):
     source = response.text
-    show_html(source, file='1原始页.html')
+    # show_html(source, file='1原始页.html')
+    if len(source) == 0:
+        return {}
     source = process_page(source)
     items = extract_data(source, base_url)
     return items
-
-
-if __name__ == '__main__':
-    d = Downloader()
-    # url = 'http://zbpt.zycqjy.com/rest/sub_list_nav.cs#'
-    # url = 'http://fgw.hubei.gov.cn/fbjd/xxgkml/xkfw/xzxkjg/xmbaqk/'
-    # url = 'https://fzggw.zj.gov.cn/col/col1599544/index.html'
-    # url = 'http://113.200.193.24:8009/Main/Projects#'
-    # url = 'http://jjc.usx.edu.cn/zbxx.htm#'
-    # url = 'https://www.xxggzy.cn/jyxx/089003/089003001/moreinfo_len6.html'
-    # url = 'http://www.hdzbgs.com/List.aspx?id=12'
-    # url = 'https://ggzy.qiannan.gov.cn/zfcg_500203/zbgg_5060411/index.html'
-    # url = 'http://www.lzlcgroup.com/cms/column/index/id/57.html'
-    # url = 'http://ggzy.zjlg.gov.cn:86/TPFront/jyxx/004002/'
-    # url = 'https://www.elongbiao.com/List/NoticeP/9'
-    # url = 'https://www.elongbiao.com/List/Notice/12'  # multiple date texts, one round of algorithm tuning
-    # url = 'http://lytjj.longyan.gov.cn/xxgk/tjgg/'
-    # url = 'http://www.lydeyy.com/plus/list.php?tid=36'  # date text, one round of algorithm tuning
-    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007004/moreinfo.html'  # one round of algorithm tuning
-    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007002/007002004/moreinfo.html'
-    # url = 'http://www.qdwater.com.cn:8010/bid2/front/toWinBidIndexPage'
-    # url = 'http://ly.fjycw.com/NewsList.aspx?GUID=48-48-55'
-
-    # url = 'http://www.hljcg.gov.cn/welcome.jsp?dq=2302'  # TODO home-page extraction, to be optimised
-    url = 'https://ggzy.longyan.gov.cn/lyztb/zqcg/008004/moreinfo.html '
-
-
-    # JavaScript-rendered pages
-    # url = 'http://zhaobiao.elongcheng.com:82/'  # detail link lives in onclick
-
-    r = d.get(url)
-    print(r)
-    bfs(r, url)
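
With the ad-hoc `__main__` demo removed, a minimal usage sketch that mirrors the deleted calls (run from this module; the URL is the example kept from the old demo).

from crawler.download import Downloader

d = Downloader()
url = 'https://ggzy.longyan.gov.cn/lyztb/zqcg/008004/moreinfo.html'
r = d.get(url)
items = bfs(r, url)   # {} for an empty page, otherwise the mapping built by extract_data
print(items)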