@@ -0,0 +1,333 @@
+import os
+import pathlib
+import re
+from urllib.parse import urljoin
+
+from lxml.html import etree, HtmlElement
+
+from common.tools import sha1, detect_encoding
+from crawler.analysis import TimeExtractor
+from crawler.defaults import USELESS_ATTR, FOOTER_TEXTS, CATEGORY_TEXTS, \
+    PAGE_TEXTS, LOGIN_TEXTS
+from crawler.download import Downloader
+from crawler.utils import (
+    element2html,
+    html2element,
+    iter_node,
+    drop_tag,
+    remove_node,
+    pre_parse,
+    is_empty_element,
+    is_title,
+)
+
+_base_path = pathlib.Path(__file__).parent
+
+
+def analysis(origin_lst, target_lst):
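+    """
+    Score each candidate page by how many of the origin hrefs/channels its
+    HTML contains, then return the deepest of the matching candidates
+    (None when nothing matches).
+
+    :param origin_lst: list of dicts with 'href' and 'channel' keys
+    :param target_lst: list of dicts with 'contenthtml' and 'depth' keys
+    :return: the best-matching dict, or None
+    """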
+    results = []
+    for target_ in target_lst:
+        source: str = target_['contenthtml']
+        _c = 0
+        for item in origin_lst:
+            href, channel = item['href'], item['channel']
+            if source.count(channel) > 0 or source.count(href) > 0:
+                _c += 1
+
+        if _c > 0:
+            results.append({
+                'similarity': _c,
+                'contenthtml': source,
+                'depth': target_['depth']
+            })
+
+    # Guard against an empty result set; max() would raise ValueError
+    if not results:
+        return None
+    # Rank by similarity, then prefer the deepest candidate
+    results = sorted(results, key=lambda x: x['similarity'], reverse=True)
+    _t = max(results, key=lambda dic: dic['depth'])
+    return _t
+
+
+def extract_text(node: HtmlElement):
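+    """
+    Extract a node's text (falling back to its tail, then to its full
+    string value) with all whitespace removed.
+
+    :param node:
+    :return: the condensed text
+    """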
+    text = (node.text or node.tail or node.xpath('string(.)'))
+    return "".join(f"{text}".split())
+
+
+def query_descendants_tag_date_total(node: HtmlElement, tag=None):
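+    """
+    Count the leaf descendants of ``node`` (optionally restricted to ``tag``)
+    whose text contains a date, as detected by TimeExtractor.
+
+    :param node:
+    :param tag: optional tag name to filter descendants
+    :return: number of leaf descendants carrying date text
+    """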
+    count = 0
+    # Number of descendants (matching the target tag) that contain date text
+    contains_date_nodes = []
+    if tag is not None:
+        descendants = list(node.iterdescendants(tag))
+    else:
+        descendants = list(node.iterdescendants())
+
+    for descendant in descendants:
+        pt = TimeExtractor().extractor(descendant)
+        has_children = len(list(descendant.iterchildren())) > 0
+        if pt != '' and not has_children and descendant not in contains_date_nodes:
+            contains_date_nodes.append(descendant)
+            count += 1
+    return count
+
+
+def remove_ancestors_date_tag(node: HtmlElement):
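+    """
+    Walk ``node``'s ancestors and remove the block that aggregates the date
+    nodes: either a sibling subtree holding more than three date texts, or,
+    when no such sibling exists, the ancestor itself.
+
+    :param node:
+    :return: True when a removal happened, else False
+    """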
+    break_loop = False
+    tag = node.tag.lower()  # TODO: is the descendant tag type actually required?
+    prev_node = node
+    # Case 1: the ancestor holds all the date nodes directly; remove it outright
+    for ancestor in node.iterancestors():
+        # total = query_descendants_tag_date_total(ancestor, tag)  # TODO: filtering by tag hurts detection when node is the parent; unclear whether it can still misbehave when node is itself
+        total = query_descendants_tag_date_total(ancestor)
+        # print("ancestors_date_tag >>> ", ancestor.tag, ancestor.attrib, total)
+        if total > 3:
+            # Case 2: the ancestor does not hold all the date nodes directly.
+            # Removing the ancestor as soon as it contains more than three date
+            # texts would widen the boundary too far and delete nodes we want
+            # to keep. Instead, walk prev_node's siblings (the ancestor's
+            # children) looking for one with more than three date texts; only
+            # if none exists do we fall back to removing the ancestor itself,
+            # which widens the boundary as intended.
+            for sibling in prev_node.itersiblings():
+                # count = query_descendants_tag_date_total(sibling, tag)
+                count = query_descendants_tag_date_total(sibling)
+                if count > 3:
+                    remove_node(sibling)
+                    break_loop = True
+                    print("remove tag >>> ", sibling.tag, sibling.attrib)
+                    break
+            else:
+                remove_node(ancestor)
+                print("remove tag >>> ", ancestor.tag, ancestor.attrib)
+                break_loop = True
+        else:
+            # Remember the ancestor checked in the previous iteration
+            prev_node = ancestor
+    return break_loop
+
+
+def show_html(page, *, file=None, base_url=None):
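+    """
+    Serialize ``page`` (HtmlElement, bytes, or str) and write it to ``file``
+    for debugging; when ``file`` is omitted, write to html/<sha1(base_url)>.html.
+
+    :param page:
+    :param file: optional output path
+    :param base_url: used to derive the default file name
+    :return:
+    """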
+    if base_url is None:
+        base_url = ''
+
+    if isinstance(page, HtmlElement):
+        source = element2html(page)
+    elif isinstance(page, bytes):
+        source = page.decode(detect_encoding(page), 'surrogatepass')
+    else:
+        source = page
+
+    if file is None:
+        dirname = 'html'
+        os.makedirs(dirname, mode=0o777, exist_ok=True)
+        file = f'{dirname}/{sha1(base_url)}.html'
+
+    with open(file, 'w') as fp:
+        fp.write(source)
+
+
+def trim_node(element: HtmlElement):
+    """
+    Tidy nodes: under each body child, drop wrapper tags until a div is
+    reached (dropped tags merge their text into the parent).
+
+    :param element:
+    :return:
+    """
+    children = element.xpath('/html/body/child::*')
+    for child in children:
+        for node, _ in iter_node(child):
+            # print('trim_node >>> ', node.tag, node.attrib)
+            if node.tag.lower() == 'div':
+                break
+            drop_tag(node)
+    return element
+
+
+def strip_node(element: HtmlElement):
+    """
+    Strip nodes: if a node is not an <a> tag and carries no date text, no
+    title-like keyword text, and no descendant <a> tags, remove or strip it.
+
+    :param element:
+    :return:
+    """
+    for node, _ in iter_node(element):
+        # Node text (whitespace, newlines, and carriage returns stripped)
+        text = "".join("".join(node.xpath('./text()')).split())
+        # Remove nodes (and parents) that lack text, publish time, title text, and href
+        if node.tag.lower() != 'a':
+            # title-like keyword text
+            non_title = is_title(text) is False
+            # number of descendant <a> nodes
+            no_a_descendants = len(list(node.iterdescendants('a'))) == 0
+            # date text
+            publish_time = TimeExtractor().extractor(node)
+            # print('>>> ', node.tag, node.attrib, text)
+            if non_title and no_a_descendants and publish_time == '':
+                # print('strip_node >>> ', node.tag, node.attrib, text)
+                parent = node.getparent()
+                if parent is not None and parent.tag.lower() == 'a':
+                    etree.strip_tags(parent, node.tag)
+                elif parent is not None and parent.tag.lower() == 'td':
+                    remove_node(parent)
+                else:
+                    remove_node(node)
+
+
+def remove_nav_node(element: HtmlElement):
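+    """
+    Remove navigation chrome: category/footer labels, login links,
+    pagination bars, and breadcrumb ("current position") nodes.
+
+    :param element:
+    :return:
+    """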
+    # Category & footer labels
+    texts = {*CATEGORY_TEXTS, *FOOTER_TEXTS}
+    for node, _ in iter_node(element):
+        text = extract_text(node)
+        # print('nav_node >>> ', node.tag, text)
+        for item in texts:
+            if item in text:
+                remove_node(node)
+        # Login labels
+        if text in LOGIN_TEXTS:
+            remove_node(node)
+        # Pagination bar
+        if text in PAGE_TEXTS:
+            tag = node.tag.lower()
+            siblings = list(node.itersiblings(tag))
+            # Pagination links usually sit as same-tag siblings under one
+            # ancestor. If enough such siblings exist, drop the shared parent;
+            # otherwise walk ancestors to find the pagination block.
+            if len(siblings) > 3:
+                remove_node(node.getparent())
+            else:
+                for ancestor in node.iterancestors():
+                    # An ancestor with more than two same-tag descendants is
+                    # assumed to contain all the tags we want to delete
+                    if len(list(ancestor.iterdescendants(tag))) > 2:
+                        remove_node(ancestor)
+                        break
+        # Breadcrumb ("current position") labels
+        if '位置' in text:
+            remove_node(node)
+
+
+def remove_date_node(element: HtmlElement):
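+    """
+    Locate the deepest nodes carrying date text and remove the surrounding
+    date block (the shared parent or a date-bearing ancestor).
+
+    :param element:
+    :return:
+    """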
+    break_loop = False
+    for node, _ in iter_node(element):
+        if break_loop:
+            break
+        publish_time = TimeExtractor().extractor(node)
+        # print('date_node >>> ', node.tag, node.attrib, publish_time)
+        # First locate the deepest nodes carrying date text (date text, no children)
+        if publish_time != '' and len(list(node.iterchildren())) == 0:
+            # print("date_node >>> ", node.tag, node.attrib, len(list(node.itersiblings())))
+            # The date text may come from a sibling rather than the node itself
+            if len(list(node.itersiblings())) > 0:
+                # Several siblings exist, so analyze the shared parent
+                parent = node.getparent()
+                # Count the parent's descendants that carry date text
+                total = query_descendants_tag_date_total(parent)
+                if total > 3:
+                    # Simple case: only a single sibling carries date text (the publish time)
+                    remove_node(parent)
+                else:
+                    # Complex case: several siblings carry date text (start time, deadline, ...)
+                    # print("parent_date_node >>> ", parent.tag, parent.attrib, len(list(parent.itersiblings())))
+                    # Starting from the parent, find and remove ancestors that carry date text
+                    break_loop = remove_ancestors_date_tag(parent)
+            else:
+                # Case 2: no siblings; start from the node itself and remove
+                # ancestors that carry date text
+                break_loop = remove_ancestors_date_tag(node)
+
+
+def clean_node(element):
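+    """
+    Drop empty elements and flatten wrapper tags inside <a> nodes.
+
+    :param element:
+    :return:
+    """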
+    for node, _ in iter_node(element):
+        if is_empty_element(node):
+            # print(' clean_node >>> ', node.tag, node.attrib, extract_text(node))
+            remove_node(node)
+
+        if node.tag.lower() == 'a' and list(node.iterchildren()):
+            # Strip the phrase/text tags nested inside an <a>, keeping their inner text
+            for child in node.iterchildren():
+                etree.strip_tags(node, child.tag)
+
+
+def extract_data(source, base_url):
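+    """
+    Group candidate (title, href) pairs by each body child: keep <a>
+    descendants whose text looks like a title, resolving hrefs against
+    ``base_url``.
+
+    :param source: cleaned page HTML
+    :param base_url:
+    :return: dict mapping '<tag>_<index>' to lists of (title, href) tuples
+    """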
+    element = html2element(source)
+    children = element.xpath('/html/body/child::*')
+    result = {}
+    for index, child in enumerate(children):
+        data = []
+        for node in child.iterdescendants('a'):
+            title = extract_text(node)
+            # Default to '' so urljoin does not fail on a missing href
+            href = node.attrib.get('href', '')
+            # if href is None or href.find('javascript') == 0 or href.find('//') == 0:
+            #     continue
+            href = urljoin(base_url, href)
+            if is_title(title):
+                item = (title, href)
+                data.append(item)
+        key = "{}_{}".format(child.tag.lower(), index)
+        result[key] = data
+
+    print(result)
+    for key, items in result.items():
+        print(f"=============== {base_url} && {key} ===============")
+        for val in items:
+            print(val)
+        print()
+    return result
+
+
+def process_page(source):
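+    """
+    Run the cleaning pipeline (pre-parse, trim, strip, remove navigation,
+    remove date blocks, clean) and return the resulting HTML. Each stage is
+    dumped to a numbered debug file via show_html.
+
+    :param source: raw page HTML
+    :return: cleaned HTML string
+    """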
+    element = html2element(source)
+    # Preprocess the page (denoising mutates the original DOM)
+    element = pre_parse(element)
+    show_html(element, file='2_preprocessed.html')
+    # Tidy nodes
+    element = trim_node(element)
+    show_html(element, file='3_trimmed_body.html')
+    # Strip nodes
+    strip_node(element)
+    show_html(element, file='4_stripped.html')
+    # Remove navigation nodes
+    remove_nav_node(element)
+    show_html(element, file='5_nav_removed.html')
+    # Remove date nodes
+    remove_date_node(element)
+    show_html(element, file='6_dates_removed.html')
+    # Clean up nodes
+    clean_node(element)
+    show_html(element, file='7_cleaned_dom.html')
+    return element2html(element)
+
+
+def bfs(response, base_url):
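+    """
+    Entry point for a single page: dump the raw HTML, clean it, and extract
+    the grouped (title, href) candidates.
+
+    :param response: downloader response with a .text attribute
+    :param base_url: base URL for resolving relative links
+    :return: dict of grouped (title, href) tuples
+    """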
+    source = response.text
+    show_html(source, file='1_original.html')
+    source = process_page(source)
+    items = extract_data(source, base_url)
+    return items
+
+
+if __name__ == '__main__':
+    d = Downloader()
+    # url = 'http://zbpt.zycqjy.com/rest/sub_list_nav.cs#'
+    # url = 'http://fgw.hubei.gov.cn/fbjd/xxgkml/xkfw/xzxkjg/xmbaqk/'
+    # url = 'https://fzggw.zj.gov.cn/col/col1599544/index.html'
+    # url = 'http://113.200.193.24:8009/Main/Projects#'
+    # url = 'http://jjc.usx.edu.cn/zbxx.htm#'
+    # url = 'https://www.xxggzy.cn/jyxx/089003/089003001/moreinfo_len6.html'
+    # url = 'http://www.hdzbgs.com/List.aspx?id=12'
+    # url = 'https://ggzy.qiannan.gov.cn/zfcg_500203/zbgg_5060411/index.html'
+    # url = 'http://www.lzlcgroup.com/cms/column/index/id/57.html'
+    # url = 'http://ggzy.zjlg.gov.cn:86/TPFront/jyxx/004002/'
+    # url = 'https://www.elongbiao.com/List/NoticeP/9'
+    # url = 'https://www.elongbiao.com/List/Notice/12'  # multiple date texts; one algorithm tuning pass
+    # url = 'http://lytjj.longyan.gov.cn/xxgk/tjgg/'
+    # url = 'http://www.lydeyy.com/plus/list.php?tid=36'  # date text; one algorithm tuning pass
+    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007004/moreinfo.html'  # one algorithm tuning pass
+    # url = 'https://ggzy.longyan.gov.cn/lyztb/gcjs/007002/007002004/moreinfo.html'
+    # url = 'http://www.qdwater.com.cn:8010/bid2/front/toWinBidIndexPage'
+    # url = 'http://ly.fjycw.com/NewsList.aspx?GUID=48-48-55'
+
+    # url = 'http://www.hljcg.gov.cn/welcome.jsp?dq=2302'  # TODO homepage extraction; needs further tuning
+    url = 'https://ggzy.longyan.gov.cn/lyztb/zqcg/008004/moreinfo.html'
+
+
+    # JavaScript-rendered pages
+    # url = 'http://zhaobiao.elongcheng.com:82/'  # detail link lives in an onclick handler
+
+    r = d.get(url)
+    print(r)
+    bfs(r, url)