@@ -1,11 +1,12 @@
-import re
+import os
+import pathlib
 from urllib.parse import urljoin
 
 from lxml.html import etree, HtmlElement
 
+from common.tools import sha1, detect_encoding
 from crawler.analysis import TimeExtractor
-from crawler.defaults import USELESS_ATTR, FOOTER_TEXTS, CATEGORY_TEXTS, \
-    PAGE_TEXTS, LOGIN_TEXTS
+from crawler.defaults import PAGE_TEXTS, LOGIN_TEXTS, NAV_TEXTS
 from crawler.download import Downloader
 from crawler.utils import (
     element2html,
@@ -17,9 +18,6 @@ from crawler.utils import (
     is_empty_element,
     is_title,
 )
-import pathlib
-import os
-from common.tools import sha1, detect_encoding
 
 _base_path = pathlib.Path(__file__).parent
 
@@ -51,7 +49,7 @@ def extract_text(node: HtmlElement):
     return "".join(f"{text}".split())
 
 
-def query_descendants_tag_date_total(node: HtmlElement, tag=None):
+def tag_date_total(node: HtmlElement, tag=None):
     count = 0
     # count of nodes with the same tag as the target node that contain date text
     contains_date_nodes = []
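
For review context, a rough sketch of what the renamed helper computes; its real body sits outside this hunk, so the loop below is only an approximation built from names visible in the patch (TimeExtractor, iterdescendants), with `_sketch` marking it as illustrative.

def tag_date_total_sketch(node, tag=None):
    count = 0
    descendants = node.iterdescendants(tag) if tag else node.iterdescendants()
    for descendant in descendants:
        # assumes TimeExtractor().extractor() returns an empty value when no date text is found
        if TimeExtractor().extractor(descendant):
            count += 1
    return count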
@@ -70,36 +68,51 @@ def query_descendants_tag_date_total(node: HtmlElement, tag=None):
 
 
 def remove_ancestors_date_tag(node: HtmlElement):
-    break_loop = False
-    tag = node.tag.lower()  # TODO is the descendant node's tag type really needed?
+    # tag = node.tag.lower()  # TODO is the descendant node's tag type really needed?
     prev_node = node
     # Case 1: the ancestor node directly holds all of the date nodes, so delete it outright
+    remove_count = 0
     for ancestor in node.iterancestors():
-        # total = query_descendants_tag_date_total(ancestor, tag)  # TODO when node is the parent node this affects recognition; if node is itself, can an exception still be raised?
-        total = query_descendants_tag_date_total(ancestor)
+        is_remove = False
+        total = tag_date_total(ancestor)
         # print("ancestors_date_tag >>> ", ancestor.tag, ancestor.attrib, total)
         if total > 3:
             # Case 2: the ancestor node does not directly hold all of the date nodes;
             # first find the nodes with more than 3 date texts; deleting the ancestor right away would
             # enlarge its boundary and also delete nodes we want to keep. So walk prev_node's
-            # siblings (children of ancestor) looking for a sibling with more than 3 date texts; if no
+            # siblings (descendants of ancestor) looking for a sibling with more than 3 date texts; if no
             # sibling has more than 3 date texts, handle the ancestor itself to widen the boundary and delete it
             for sibling in prev_node.itersiblings():
-                # count = query_descendants_tag_date_total(sibling, tag)
-                count = query_descendants_tag_date_total(sibling)
-                if count > 3:
+                sibling_tag_date_total = tag_date_total(sibling)
+                if sibling_tag_date_total > 3:
                     remove_node(sibling)
-                    break_loop = True
-                    print("remove tag >>> ", sibling.tag, sibling.attrib, )
-                    break
-            else:
+                    is_remove = True
+                    # print("remove sibling tag >>> ", sibling.tag, sibling.attrib, )
+            # more than 3 date texts were found, but prev_node has no such sibling, so delete this ancestor directly
+            if not is_remove:
+                remove_node(ancestor)
+                # print("remove ancestor tag >>> ", ancestor.tag, ancestor.attrib, )
+                is_remove = True
+        elif 1 < total <= 2:
+            # delete entry by entry (count the date texts, then query and strip at the child-node boundary to avoid removing too much)
+            for child in ancestor.iterchildren():
+                child_tag_date_total = tag_date_total(child)
+                if child_tag_date_total > 0:
+                    remove_node(child)
+                    # print("remove child tag >>> ", child.tag, child.attrib, )
+                    is_remove = True
+            if not is_remove:
                 remove_node(ancestor)
-            print("remove tag >>> ", ancestor.tag, ancestor.attrib, )
-            break_loop = True
+                # print("remove ancestor tag >>> ", ancestor.tag, ancestor.attrib, )
+                is_remove = True
         else:
-            # remember the ancestor from the previous query
+            # remember the ancestor from the previous query (date-text count <= 3)
             prev_node = ancestor
-    return break_loop
+
+        if is_remove:
+            remove_count += 1
+
+    return True if remove_count > 0 else False
 
 
 def show_html(page, *, file=None, bash_url=None):
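
The rewrite swaps the early `break` and `break_loop` pair for a per-ancestor `is_remove` flag plus a `remove_count`, so a single call can prune several date blocks and still report whether anything was removed. A self-contained illustration of the kind of sibling list the `total > 3` threshold is meant to catch; the regex is only a stand-in for TimeExtractor and is not part of the patch.

import re
from lxml import html

DATE_RE_DEMO = re.compile(r"\d{4}-\d{2}-\d{2}")  # stand-in for TimeExtractor

def date_text_total_demo(node):
    # counts descendants whose own text contains a date, like tag_date_total
    return sum(1 for d in node.iterdescendants() if DATE_RE_DEMO.search(d.text or ""))

page = html.fromstring(
    "<div><h1>Notices</h1><ul>"
    "<li><a href='/a'>First notice</a><span>2024-01-02</span></li>"
    "<li><a href='/b'>Second notice</a><span>2024-01-03</span></li>"
    "<li><a href='/c'>Third notice</a><span>2024-01-04</span></li>"
    "<li><a href='/d'>Fourth notice</a><span>2024-01-05</span></li>"
    "</ul></div>"
)
ul = page.find(".//ul")
print(date_text_total_demo(ul))  # 4 -> total > 3, so the whole block would be dropped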
@@ -175,9 +188,8 @@ def remove_nav_node(element: HtmlElement):
     for node, _ in iter_node(element):
         text = extract_text(node)
         # print('nav_node >>> ', node.tag, text)
-        # category tags & footer tags
-        texts = {*CATEGORY_TEXTS, *FOOTER_TEXTS}
-        for item in texts:
+        # (category text | footer text | navigation text)
+        for item in NAV_TEXTS:
             if item in text:
                 remove_node(node)
         # login tags
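
The two constant sets are folded into a single NAV_TEXTS in crawler.defaults; its actual contents are not shown in this patch, so the values below are placeholders. The check itself is plain substring containment on the whitespace-stripped text that extract_text produces.

NAV_TEXTS_DEMO = {"网站首页", "版权所有", "上一页", "下一页"}   # placeholder values

text = "".join("首页 | 版权所有 © 2024".split())            # what extract_text would return
print(any(item in text for item in NAV_TEXTS_DEMO))          # True -> the node gets removed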
@@ -202,9 +214,9 @@ def remove_nav_node(element: HtmlElement):
 
 
 def remove_date_node(element: HtmlElement):
-    break_loop = False
+    retrieve = False
     for node, _ in iter_node(element):
-        if break_loop:
+        if retrieve:
             break
         publish_time = TimeExtractor().extractor(node)
         # print('date_node >>> ', node.tag, node.attrib, publish_time)
@@ -216,7 +228,7 @@ def remove_date_node(element: HtmlElement):
             # several sibling nodes may exist, so analyse the common parent node
             parent = node.getparent()
             # count the parent's descendant nodes that contain date text
-            total = query_descendants_tag_date_total(parent)
+            total = tag_date_total(parent)
             if total > 3:
                 # simple case: only a single sibling node contains date text (publish time)
                 remove_node(parent)
@@ -224,10 +236,10 @@ def remove_date_node(element: HtmlElement):
                 # complex case: several sibling nodes contain date text (start time, deadline, ...)
                 # print("parent_date_node >>> ", parent.tag, parent.attrib, len(list(parent.itersiblings())))
                 # starting from the parent, find and remove ancestor nodes that contain date text
-                break_loop = remove_ancestors_date_tag(parent)
+                retrieve = remove_ancestors_date_tag(parent)
         else:
             # Case 2: no sibling nodes; starting from the node itself, find and remove ancestor nodes that contain date text
-            break_loop = remove_ancestors_date_tag(node)
+            retrieve = remove_ancestors_date_tag(node)
 
 
 def clean_node(element):
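
Taken together, the three hunks above only rename `break_loop` to `retrieve` and switch to the shorter helper name; the condition separating the sibling and no-sibling cases falls between hunks, so the condensed sketch below approximates it and is not part of the patch.

def remove_date_node_sketch(element):
    retrieve = False
    for node, _ in iter_node(element):
        if retrieve:                              # one successful removal pass ends the scan
            break
        publish_time = TimeExtractor().extractor(node)
        if not publish_time:                      # assumption: date-less nodes are skipped
            continue
        if len(list(node.itersiblings())) > 0:    # assumption: the "has siblings" branch
            parent = node.getparent()
            if tag_date_total(parent) > 3:
                remove_node(parent)               # simple case: drop the shared parent
            else:
                retrieve = remove_ancestors_date_tag(parent)
        else:
            retrieve = remove_ancestors_date_tag(node)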
@@ -251,10 +263,8 @@ def extract_data(source, base_url):
         for node in child.iterdescendants('a'):
             title = extract_text(node)
             href = node.attrib.get('href')
-            # if href is None or href.find('javascript') == 0 or href.find('//') == 0:
-            #     continue
             href = urljoin(base_url, href)
-            if is_title(title):
+            if is_title(title) and len(title) <= 15:
                 item = (title, href)
                 data.append(item)
         key = "{}_{}".format(child.tag.lower(), index)
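
The commented-out href guard is dropped and the title check gains a length cap. A quick check of what now passes, using a permissive stand-in for is_title (its real rules live in crawler.utils); note that extract_text has already stripped all whitespace, so the cap applies to the compacted link text.

def is_title_demo(text):          # permissive stand-in, not the real is_title
    return bool(text)

titles = ["采购公告", "关于2024年度办公设备及耗材采购项目的中标结果公示"]
kept = [t for t in titles if is_title_demo(t) and len(t) <= 15]
print(kept)                        # only the short link text survives the new length cap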
@@ -273,61 +283,30 @@ def process_page(source):
     element = html2element(source)
     # web page pre-processing (de-noises the page; changes the original DOM structure)
     element = pre_parse(element)
-    show_html(element, file='2预处理.html')
+    # show_html(element, file='2预处理.html')
     # tidy up nodes
     element = trim_node(element)
-    show_html(element, file='3整理body节点.html')
+    # show_html(element, file='3整理body节点.html')
     # strip nodes
     strip_node(element)
-    show_html(element, file='4剔除节点.html')
+    # show_html(element, file='4剔除节点.html')
     # remove navigation nodes
     remove_nav_node(element)
-    show_html(element, file='5删除导航条.html')
+    # show_html(element, file='5删除导航条.html')
    # remove date nodes
     remove_date_node(element)
-    show_html(element, file='6删除时间块.html')
+    # show_html(element, file='6删除时间块.html')
     # clean nodes
     clean_node(element)
-    show_html(element, file='7清理dom.html')
+    # show_html(element, file='7清理dom.html')
     return element2html(element)
 
 
 def bfs(response, base_url):
     source = response.text
-    show_html(source, file='1原始页.html')
+    # show_html(source, file='1原始页.html')
+    if len(source) == 0:
+        return {}
     source = process_page(source)
     items = extract_data(source, base_url)
     return items
-
-
-if __name__ == '__main__':
-    d = Downloader()
-    # url = 'http://zbpt.zycqjy.com/rest/sub_list_nav.cs#'
-    # url = 'http://fgw.hubei.gov.cn/fbjd/xxgkml/xkfw/xzxkjg/xmbaqk/'
-    # url = 'https://fzggw.zj.gov.cn/col/col1599544/index.html'
-    # url = 'http://113.200.193.24:8009/Main/Projects#'
-    # url = 'http://jjc.usx.edu.cn/zbxx.htm#'
-    # url = 'https://www.xxggzy.cn/jyxx/089003/089003001/moreinfo_len6.html'
-    # url = 'http://www.hdzbgs.com/List.aspx?id=12'
-    # url = 'https://ggzy.qiannan.gov.cn/zfcg_500203/zbgg_5060411/index.html'
-    # url = 'http://www.lzlcgroup.com/cms/column/index/id/57.html'
-    # url = 'http://ggzy.zjlg.gov.cn:86/TPFront/jyxx/004002/'
-    # url = 'https://www.elongbiao.com/List/NoticeP/9'
-    # url = 'https://www.elongbiao.com/List/Notice/12'  # multiple date texts, one round of algorithm tuning
-    # url = 'http://lytjj.longyan.gov.cn/xxgk/tjgg/'
-    # url = 'http://www.lydeyy.com/plus/list.php?tid=36'  # date text, one round of algorithm tuning
-    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007004/moreinfo.html'  # one round of algorithm tuning
-    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007002/007002004/moreinfo.html'
-    # url = 'http://www.qdwater.com.cn:8010/bid2/front/toWinBidIndexPage'
-    # url = 'http://ly.fjycw.com/NewsList.aspx?GUID=48-48-55'
-
-    # url = 'http://www.hljcg.gov.cn/welcome.jsp?dq=2302'  # TODO home-page extraction, to be optimised
-    url = 'https://ggzy.longyan.gov.cn/lyztb/zqcg/008004/moreinfo.html '
-
-
-    # JavaScript-rendered pages
-    # url = 'http://zhaobiao.elongcheng.com:82/'  # detail link lives in onclick
-
-    r = d.get(url)
-    print(r)
-    bfs(r, url)
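
With the ad-hoc `__main__` demo removed, a minimal usage sketch that mirrors the deleted calls (run from this module; the URL is the example kept from the old demo).

from crawler.download import Downloader

d = Downloader()
url = 'https://ggzy.longyan.gov.cn/lyztb/zqcg/008004/moreinfo.html'
r = d.get(url)
items = bfs(r, url)   # {} for an empty page, otherwise the mapping built by extract_data
print(items)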