Browse Source

栏目提取 - 增加标题长度限制(15个字符以内)

dongzhaorui 3 năm trước
mục cha
commit
7c3ce60b45
1 tập tin đã thay đổi với 53 bổ sung và 74 xóa
  1. 53 74
      find_source/crawler/services/channel.py

+ 53 - 74
find_source/crawler/services/channel.py

@@ -1,11 +1,12 @@
-import re
+import os
+import pathlib
 from urllib.parse import urljoin
 
 from lxml.html import etree, HtmlElement
 
+from common.tools import sha1, detect_encoding
 from crawler.analysis import TimeExtractor
-from crawler.defaults import USELESS_ATTR, FOOTER_TEXTS, CATEGORY_TEXTS, \
-    PAGE_TEXTS, LOGIN_TEXTS
+from crawler.defaults import PAGE_TEXTS, LOGIN_TEXTS, NAV_TEXTS
 from crawler.download import Downloader
 from crawler.utils import (
     element2html,
@@ -17,9 +18,6 @@ from crawler.utils import (
     is_empty_element,
     is_title,
 )
-import pathlib
-import os
-from common.tools import sha1, detect_encoding
 
 _base_path = pathlib.Path(__file__).parent
 
@@ -51,7 +49,7 @@ def extract_text(node: HtmlElement):
     return "".join(f"{text}".split())
 
 
-def query_descendants_tag_date_total(node: HtmlElement, tag=None):
+def tag_date_total(node: HtmlElement, tag=None):
     count = 0
     # 先辈节点与目标节点名称相同并且包含时间文本的个数
     contains_date_nodes = []
@@ -70,36 +68,51 @@ def query_descendants_tag_date_total(node: HtmlElement, tag=None):
 
 
 def remove_ancestors_date_tag(node: HtmlElement):
-    break_loop = False
-    tag = node.tag.lower()  # TODO 是否一定需要后裔节点的类型
+    # tag = node.tag.lower()  # TODO 是否一定需要后裔节点的类型
     prev_node = node
     # 情况1: 先辈节点下直接放置全部的时间节点,直接删除
+    remove_count = 0
     for ancestor in node.iterancestors():
-        # total = query_descendants_tag_date_total(ancestor, tag) #TODO 当node为父节点时,会影响识别,?若是node为其本身的时候,是否会议还能发异常?
-        total = query_descendants_tag_date_total(ancestor)
+        is_remove = False
+        total = tag_date_total(ancestor)
         # print("ancestors_date_tag >>> ", ancestor.tag, ancestor.attrib, total)
         if total > 3:
             # 情况2: ancestor节点下面不直接放置全部的时间节点;
             # 首先查询出时间文本大于3及以上的节点,此时直接删除ancestor节点,会导致
             # ancestor节点边界变大,包括不想期望保留节点一起被删除。此处遍历查询prev_node
-            # 兄弟节点(ancestor节点)寻找时间文本大于3及以上的兄弟节点,若不存
+            # 兄弟节点(ancestor后裔节点)寻找时间文本大于3及以上的兄弟节点,若不存
             # 时间文本大于3及以上的情况,此时再处理ancestor节点,达到增大边界效果,删除即可
             for sibling in prev_node.itersiblings():
-                # count = query_descendants_tag_date_total(sibling, tag)
-                count = query_descendants_tag_date_total(sibling)
-                if count > 3:
+                sibling_tag_date_total = tag_date_total(sibling)
+                if sibling_tag_date_total > 3:
                     remove_node(sibling)
-                    break_loop = True
-                    print("remove tag >>> ", sibling.tag, sibling.attrib, )
-                    break
-            else:
+                    is_remove = True
+                    # print("remove sibling tag >>> ", sibling.tag, sibling.attrib, )
+            # 查询出时间文本大于3及以上,若prev_node的没有兄弟节点,此时直接删除该ancestor
+            if not is_remove:
+                remove_node(ancestor)
+                # print("remove ancestor tag >>> ", ancestor.tag, ancestor.attrib, )
+                is_remove = True
+        elif 1 < total <= 2:
+            # 逐条删除(查询出时间文本条数,从子节点边界范围进行查询、剔除,防止)
+            for child in ancestor.iterchildren():
+                child_tag_date_total = tag_date_total(child)
+                if child_tag_date_total > 0:
+                    remove_node(child)
+                    # print("remove child tag >>> ", child.tag, child.attrib, )
+                    is_remove = True
+            if not is_remove:
                 remove_node(ancestor)
-                print("remove tag >>> ", ancestor.tag, ancestor.attrib, )
-                break_loop = True
+                # print("remove ancestor tag >>> ", ancestor.tag, ancestor.attrib, )
+                is_remove = True
         else:
-            # 保存上一次查询的先辈节点
+            # 保存上一次查询的先辈节点(时间文本小于等于3)
             prev_node = ancestor
-    return break_loop
+
+        if is_remove:
+            remove_count += 1
+
+    return True if remove_count > 0 else False
 
 
 def show_html(page, *, file=None, bash_url=None):
@@ -175,9 +188,8 @@ def remove_nav_node(element: HtmlElement):
     for node, _ in iter_node(element):
         text = extract_text(node)
         # print('nav_node >>> ', node.tag, text)
-        # 板块标签&页脚标签
-        texts = {*CATEGORY_TEXTS, *FOOTER_TEXTS}
-        for item in texts:
+        # (板块文本|页脚文本|导航文本)
+        for item in NAV_TEXTS:
             if item in text:
                 remove_node(node)
         # 登录标签
@@ -202,9 +214,9 @@ def remove_nav_node(element: HtmlElement):
 
 
 def remove_date_node(element: HtmlElement):
-    break_loop = False
+    retrieve = False
     for node, _ in iter_node(element):
-        if break_loop:
+        if retrieve:
             break
         publish_time = TimeExtractor().extractor(node)
         # print('date_node >>> ', node.tag, node.attrib, publish_time)
@@ -216,7 +228,7 @@ def remove_date_node(element: HtmlElement):
                 # 存在多个兄弟节点,所以通过分析共同的父节点
                 parent = node.getparent()
                 # 统计父节点的拥有时间文本的后裔节点个数
-                total = query_descendants_tag_date_total(parent)
+                total = tag_date_total(parent)
                 if total > 3:
                     # 简单场景:仅仅单个兄弟节点拥有时间文本(发布时间)
                     remove_node(parent)
@@ -224,10 +236,10 @@ def remove_date_node(element: HtmlElement):
                     # 复杂场景:多个兄弟节点拥有时间文本(开始时间,截止时间...)
                     # print("parent_date_node >>> ", parent.tag, parent.attrib, len(list(parent.itersiblings())))
                     # 从父节点开始,查询且删除先辈节点中拥有时间文本的先辈节点
-                    break_loop = remove_ancestors_date_tag(parent)
+                    retrieve = remove_ancestors_date_tag(parent)
             else:
                 # 情况2:无兄弟节点,从自身开始,查询且删除先辈节点中拥有时间文本的先辈节点
-                break_loop = remove_ancestors_date_tag(node)
+                retrieve = remove_ancestors_date_tag(node)
 
 
 def clean_node(element):
@@ -251,10 +263,8 @@ def extract_data(source, base_url):
         for node in child.iterdescendants('a'):
             title = extract_text(node)
             href = node.attrib.get('href')
-            # if href is None or href.find('javascript') == 0 or href.find('//') == 0:
-            #     continue
             href = urljoin(base_url, href)
-            if is_title(title):
+            if is_title(title) and len(title) <= 15:
                 item = (title, href)
                 data.append(item)
         key = "{}_{}".format(child.tag.lower(), index)
@@ -273,61 +283,30 @@ def process_page(source):
     element = html2element(source)
     # web网页预处理(web页面去噪,会改变原始dom结构)
     element = pre_parse(element)
-    show_html(element, file='2预处理.html')
+    # show_html(element, file='2预处理.html')
     # 整理节点
     element = trim_node(element)
-    show_html(element, file='3整理body节点.html')
+    # show_html(element, file='3整理body节点.html')
     # 剔除节点
     strip_node(element)
-    show_html(element, file='4剔除节点.html')
+    # show_html(element, file='4剔除节点.html')
     # 删除导航节点
     remove_nav_node(element)
-    show_html(element, file='5删除导航条.html')
+    # show_html(element, file='5删除导航条.html')
     # 删除时间节点
     remove_date_node(element)
-    show_html(element, file='6删除时间块.html')
+    # show_html(element, file='6删除时间块.html')
     # 清理节点
     clean_node(element)
-    show_html(element, file='7清理dom.html')
+    # show_html(element, file='7清理dom.html')
     return element2html(element)
 
 
 def bfs(response, base_url):
     source = response.text
-    show_html(source, file='1原始页.html')
+    # show_html(source, file='1原始页.html')
+    if len(source) == 0:
+        return {}
     source = process_page(source)
     items = extract_data(source, base_url)
     return items
-
-
-if __name__ == '__main__':
-    d = Downloader()
-    # url = 'http://zbpt.zycqjy.com/rest/sub_list_nav.cs#'
-    # url = 'http://fgw.hubei.gov.cn/fbjd/xxgkml/xkfw/xzxkjg/xmbaqk/'
-    # url = 'https://fzggw.zj.gov.cn/col/col1599544/index.html'
-    # url = 'http://113.200.193.24:8009/Main/Projects#'
-    # url = 'http://jjc.usx.edu.cn/zbxx.htm#'
-    # url = 'https://www.xxggzy.cn/jyxx/089003/089003001/moreinfo_len6.html'
-    # url = 'http://www.hdzbgs.com/List.aspx?id=12'
-    # url = 'https://ggzy.qiannan.gov.cn/zfcg_500203/zbgg_5060411/index.html'
-    # url = 'http://www.lzlcgroup.com/cms/column/index/id/57.html'
-    # url = 'http://ggzy.zjlg.gov.cn:86/TPFront/jyxx/004002/'
-    # url = 'https://www.elongbiao.com/List/NoticeP/9'
-    # url = 'https://www.elongbiao.com/List/Notice/12'  # 多时间文本 算法优化一次
-    # url = 'http://lytjj.longyan.gov.cn/xxgk/tjgg/'
-    # url = 'http://www.lydeyy.com/plus/list.php?tid=36'  # 时间文本 算法优化一次
-    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007004/moreinfo.html' # 算法优化一次
-    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007002/007002004/moreinfo.html'
-    # url = 'http://www.qdwater.com.cn:8010/bid2/front/toWinBidIndexPage'
-    # url = 'http://ly.fjycw.com/NewsList.aspx?GUID=48-48-55'
-
-    # url = 'http://www.hljcg.gov.cn/welcome.jsp?dq=2302'  # TODO 首页抽取,待优化
-    url = 'https://ggzy.longyan.gov.cn/lyztb/zqcg/008004/moreinfo.html '
-
-
-    # javascript 渲染页面
-    # url = 'http://zhaobiao.elongcheng.com:82/'  # 详情所在 onclick
-
-    r = d.get(url)
-    print(r)
-    bfs(r, url)