import json import os import pathlib from urllib.parse import urljoin from lxml.html import etree, HtmlElement from common.tools import sha1, detect_encoding from crawler.analysis import TimeExtractor from crawler.defaults import PAGE_TEXTS, LOGIN_TEXTS, NAV_TEXTS from crawler.download import Downloader from crawler.utils import ( element2html, html2element, iter_node, drop_tag, remove_node, pre_parse, is_empty_element, check_text_by_words, ) _base_path = pathlib.Path(__file__).parent def analysis(origin_lst, target_lst): results = [] for target_ in target_lst: source: str = target_['contenthtml'] _c = 0 for item in origin_lst: href, channel = item['href'], item['channel'] if source.count(channel) > 0 or source.count(href) > 0: _c += 1 if _c > 0: results.append({ 'similarity': _c, 'contenthtml': source, 'depth': target_['depth'] }) results = sorted(results, key=lambda x: x['similarity'], reverse=True) _t = max(results, key=lambda dic: dic['depth']) return _t def extract_text(node: HtmlElement): text = (node.text or node.tail or node.xpath('string(.)')) return "".join(f"{text}".split()) def tag_date_total(node: HtmlElement, tag=None): count = 0 # 先辈节点与目标节点名称相同并且包含时间文本的个数 contains_date_nodes = [] if tag is not None: descendants = list(node.iterdescendants(tag)) else: descendants = list(node.iterdescendants()) for descendant in descendants: pt = TimeExtractor().extractor(descendant) children = len(list(descendant.iterchildren())) > 0 if pt != '' and not children and descendant not in contains_date_nodes: contains_date_nodes.append(descendant) count += 1 return count def remove_ancestors_date_tag(node: HtmlElement): prev_node = node # 情况1: 先辈节点下直接放置全部的时间节点,直接删除 remove_count = 0 for ancestor in node.iterancestors(): if ancestor.tag.lower() in ['body', 'html']: continue is_remove = False total = tag_date_total(ancestor) # print("ancestors_date_tag >>> ", ancestor.tag, ancestor.attrib, total) if total > 3: # 情况2: ancestor节点下面不直接放置全部的时间节点; # 首先查询出时间文本大于3及以上的节点,此时直接删除ancestor节点,会导致 # ancestor节点边界变大,包括不想期望保留节点一起被删除。此处遍历查询prev_node # 兄弟节点(ancestor后裔节点)寻找时间文本大于3及以上的兄弟节点,若不存 # 时间文本大于3及以上的情况,此时再处理ancestor节点,达到增大边界效果,删除即可 for sibling in prev_node.itersiblings(): sibling_tag_date_total = tag_date_total(sibling) if sibling_tag_date_total > 3: remove_node(sibling) is_remove = True # print("remove sibling tag >>> ", sibling.tag, sibling.attrib, ) # 查询出时间文本大于3及以上,若prev_node的没有兄弟节点,此时直接删除该ancestor if not is_remove: remove_node(ancestor) # print("remove ancestor tag >>> ", ancestor.tag, ancestor.attrib, ) is_remove = True elif 1 < total <= 2: # 逐条删除(查询出时间文本条数,从子节点边界范围进行查询、剔除,防止) for child in ancestor.iterchildren(): child_tag_date_total = tag_date_total(child) if child_tag_date_total > 0: remove_node(child) # print("remove child tag >>> ", child.tag, child.attrib, ) is_remove = True if not is_remove: remove_node(ancestor) # print("remove ancestor tag >>> ", ancestor.tag, ancestor.attrib, ) is_remove = True else: # 保存上一次查询的先辈节点(时间文本小于等于3) prev_node = ancestor if is_remove: remove_count += 1 return True if remove_count > 0 else False def show_html(page, *, file=None, bash_url=None): if bash_url is None: bash_url = '' if isinstance(page, HtmlElement): source = element2html(page) elif isinstance(page, bytes): source = page.decode(detect_encoding(page), 'surrogatepass') else: source = page if file is None: dirname = 'html' os.makedirs(dirname, mode=0o777, exist_ok=True) file = f'{dirname}/{sha1(bash_url)}.html' else: file = file with open(file, 'w') as fp: fp.write(source) def trim_node(element: HtmlElement): """ 整理节点 (body子节点更新为div,在更新过程中子节点会合并文本) :param element: :return: """ children = element.xpath('/html/body/child::*') for child in children: for node, _ in iter_node(child): # print('trim_node >>> ', node.tag, node.attrib) if node.tag.lower() == 'div': break drop_tag(node) return element def strip_node(element: HtmlElement): """ 剔除节点,若不是标签a且无时间文本,关键词标题,标签a个数,移除或者剔除 :param element: :return: """ for node, _ in iter_node(element): # 删除掉没有文本、发布时间、发布标题、href属性的节点及父节点 if node.tag.lower() != 'a': # 节点文本(剔除左右空白、换行、回车符号) text = "".join("".join(node.xpath('./text()')).strip()) # 关键词文本 non_title = check_text_by_words(text) is False # 后裔a节点数量 sub_tag_gt_0 = len(list(node.iterdescendants('a'))) == 0 # 时间文本 publish_time = TimeExtractor().extractor(node) # print('>>> ', node.tag, node.attrib, text) if non_title and sub_tag_gt_0 and publish_time == '': # print('strip_node >>> ', node.tag, node.attrib, text) parent = node.getparent() if parent is not None: if parent.tag.lower() == 'a': etree.strip_tags(parent, node.tag) elif parent.tag.lower() == 'td': if not node.getchildren(): if len(text) == 0: remove_node(parent) else: etree.strip_tags(parent, node.tag) else: name = [child.tag for child in node.getchildren()] etree.strip_tags(parent, *name) else: remove_node(node) def remove_nav_node(element: HtmlElement): for node, _ in iter_node(element): text = extract_text(node) # print('nav_node >>> ', node.tag, text) # (板块文本|页脚文本|导航文本) for item in NAV_TEXTS: if item in text: remove_node(node) # 登录标签 if text in LOGIN_TEXTS: remove_node(node) # 翻页标签栏 if text in PAGE_TEXTS: tag = node.tag.lower() siblings = list(node.itersiblings(tag)) # 通过查询相同标签兄弟节点,兄弟节点包含在一个先辈节点内;翻页标签不包含在同一先辈节点 if len(siblings) > 3 and node.tag.lower() == tag: remove_node(node.getparent()) else: for ancestor in node.iterancestors(): # 先辈节点包含两个以上同类节点,则认为该先辈节点包含全部想要删除标签 if len(list(ancestor.iterdescendants(tag))) > 2: remove_node(ancestor) break # 位置标签 if '位置' in text: remove_node(node) def remove_date_node(element: HtmlElement): retrieve = False for node, _ in iter_node(element): if retrieve: break publish_time = TimeExtractor().extractor(node) # print('date_node >>> ', node.tag, node.attrib, publish_time) # 首先找拥有时间文本最深层级节点(拥有时间文本,不存在子节点) if publish_time != '' and len(list(node.iterchildren())) == 0: # print("date_node >>> ", node.tag, node.attrib, len(list(node.itersiblings()))) # 时间文本有可能来自兄弟节点,而不是自身 if len(list(node.itersiblings())) > 0: # 存在多个兄弟节点,所以通过分析共同的父节点 parent = node.getparent() # 统计父节点的拥有时间文本的后裔节点个数 total = tag_date_total(parent) if total > 3: # 简单场景:仅仅单个兄弟节点拥有时间文本(发布时间) remove_node(parent) else: # 复杂场景:多个兄弟节点拥有时间文本(开始时间,截止时间...) # print("parent_date_node >>> ", parent.tag, parent.attrib, len(list(parent.itersiblings()))) # 从父节点开始,查询且删除先辈节点中拥有时间文本的先辈节点 retrieve = remove_ancestors_date_tag(parent) else: # 情况2:无兄弟节点,从自身开始,查询且删除先辈节点中拥有时间文本的先辈节点 retrieve = remove_ancestors_date_tag(node) def clean_node(element): for node, _ in iter_node(element): if is_empty_element(node): # print(' clean_node >>> ', node.tag, node.attrib, extract_text(node)) remove_node(node) if node.tag.lower() == 'a' and list(node.iterchildren()): # 剔除a标签包含的单个或多个短语标签、文本标签,保留内部文本 for child in node.iterchildren(): etree.strip_tags(node, child.tag) def extract_data(source, base_url): element = html2element(source) children = element.xpath('/html/body/child::*') result = {} for index, child in enumerate(children): data = [] for node in child.iterdescendants('a'): title = extract_text(node) href = urljoin(base_url, node.attrib.get('href')) if check_text_by_words(title) and len(title) <= 15: item = (title, href) data.append(item) key = "{}_{}".format(child.tag.lower(), index) result[key] = data print(result) for key, items in result.items(): print(f"=============== {base_url} && {key} ===============") for val in items: print(val) print() return result def process_page(source): element = html2element(source) # web网页预处理(web页面去噪,会改变原始dom结构) element = pre_parse(element) # show_html(element, file='2预处理.html') # 整理节点 element = trim_node(element) # show_html(element, file='3整理body节点.html') # 剔除节点 strip_node(element) # show_html(element, file='4剔除节点.html') # 删除导航节点 remove_nav_node(element) # show_html(element, file='5删除导航条.html') # 删除时间节点 remove_date_node(element) # show_html(element, file='6删除时间块.html') # 清理节点 clean_node(element) # show_html(element, file='7清理dom.html') return element2html(element) def bfs(response, base_url): try: source = response.json().get('html', '') except json.decoder.JSONDecodeError: source = response.text # show_html(source, file='1原始页.html') if len(source) == 0: return {} source = process_page(source) items = extract_data(source, base_url) return items