import os
import pathlib
from urllib.parse import urljoin

from lxml.html import etree, HtmlElement

from common.tools import sha1, detect_encoding
from crawler.analysis import TimeExtractor
from crawler.defaults import PAGE_TEXTS, LOGIN_TEXTS, NAV_TEXTS
from crawler.download import Downloader
from crawler.utils import (
    element2html,
    html2element,
    iter_node,
    drop_tag,
    remove_node,
    pre_parse,
    is_empty_element,
    is_title,
)

_base_path = pathlib.Path(__file__).parent


def analysis(origin_lst, target_lst):
    """Pick the candidate whose HTML references the most known hrefs/channels;
    among the matching candidates the deepest page wins."""
    results = []
    for target_ in target_lst:
        source: str = target_['contenthtml']
        _c = 0
        for item in origin_lst:
            href, channel = item['href'], item['channel']
            if source.count(channel) > 0 or source.count(href) > 0:
                _c += 1
        if _c > 0:
            results.append({
                'similarity': _c,
                'contenthtml': source,
                'depth': target_['depth'],
            })
    if not results:
        # No candidate referenced any known link or channel.
        return None
    results.sort(key=lambda x: x['similarity'], reverse=True)
    return max(results, key=lambda dic: dic['depth'])
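
# A minimal sketch of how `analysis` might be called. The dict shapes are
# inferred from the lookups above; the literal values are hypothetical:
#
#   origin = [{'href': 'http://example.com/list', 'channel': '新闻'}]
#   targets = [
#       {'contenthtml': '<a href="http://example.com/list">新闻</a>', 'depth': 2},
#       {'contenthtml': '<p>no links here</p>', 'depth': 1},
#   ]
#   analysis(origin, targets)
#   # -> {'similarity': 1, 'contenthtml': '<a href=...>新闻</a>', 'depth': 2}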


def extract_text(node: HtmlElement):
    """Return the node's text content with all whitespace removed."""
    text = node.text or node.tail or node.xpath('string(.)')
    return "".join(str(text).split())


def tag_date_total(node: HtmlElement, tag=None):
    # Count descendant leaf nodes (optionally restricted to the given tag
    # name) that carry date text.
    count = 0
    contains_date_nodes = []
    if tag is not None:
        descendants = list(node.iterdescendants(tag))
    else:
        descendants = list(node.iterdescendants())
    for descendant in descendants:
        pt = TimeExtractor().extractor(descendant)
        has_children = len(list(descendant.iterchildren())) > 0
        if pt != '' and not has_children and descendant not in contains_date_nodes:
            contains_date_nodes.append(descendant)
            count += 1
    return count
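
# Sketch (hypothetical markup), assuming TimeExtractor().extractor returns a
# non-empty string for nodes whose text looks like a date:
#
#   page = html2element(
#       '<div><span>2023-01-02</span><span>2023-01-03</span><b>x</b></div>')
#   tag_date_total(page)          # -> 2 (both <span> leaves carry a date)
#   tag_date_total(page, 'span')  # -> 2 (restricted to <span> descendants)
#
# The wrapping <div>/<body> nodes have children, so they are not counted.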


def remove_ancestors_date_tag(node: HtmlElement):
    # tag = node.tag.lower()  # TODO: do we really need the descendant node's tag type?
    prev_node = node
    # Case 1: an ancestor directly holds all the date nodes -- delete it outright.
    remove_count = 0
    for ancestor in node.iterancestors():
        is_remove = False
        total = tag_date_total(ancestor)
        # print("ancestors_date_tag >>> ", ancestor.tag, ancestor.attrib, total)
        if total > 3:
            # Case 2: the date nodes are not placed directly under `ancestor`.
            # Deleting `ancestor` as soon as it reports more than 3 date texts
            # would widen the deletion boundary and drag nodes we want to keep
            # into the deletion. So first walk prev_node's siblings
            # (descendants of `ancestor`) looking for one that holds more than
            # 3 date texts; only if no such sibling exists do we widen the
            # boundary and delete `ancestor` itself.
            for sibling in prev_node.itersiblings():
                sibling_tag_date_total = tag_date_total(sibling)
                if sibling_tag_date_total > 3:
                    remove_node(sibling)
                    is_remove = True
                    # print("remove sibling tag >>> ", sibling.tag, sibling.attrib)
            # More than 3 date texts were found, but none of prev_node's
            # siblings held them: delete the ancestor directly.
            if not is_remove:
                remove_node(ancestor)
                # print("remove ancestor tag >>> ", ancestor.tag, ancestor.attrib)
                is_remove = True
        elif 1 < total <= 2:
            # Remove piecemeal: probe at child granularity and strip only the
            # children carrying date text, keeping the deletion boundary tight.
            for child in ancestor.iterchildren():
                child_tag_date_total = tag_date_total(child)
                if child_tag_date_total > 0:
                    remove_node(child)
                    # print("remove child tag >>> ", child.tag, child.attrib)
                    is_remove = True
            if not is_remove:
                remove_node(ancestor)
                # print("remove ancestor tag >>> ", ancestor.tag, ancestor.attrib)
                is_remove = True
        else:
            # Remember the last inspected ancestor (too little date text to act on).
            prev_node = ancestor
        if is_remove:
            remove_count += 1
    return remove_count > 0
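
# Sketch of the intended effect (hypothetical markup), assuming TimeExtractor
# recognizes the date strings: starting from a date leaf, the common ancestor
# holding the date blocks is removed while unrelated content survives.
#
#   page = html2element(
#       '<div><div>'
#       '<div><p>2023-01-01</p></div><div><p>2023-01-02</p></div>'
#       '<div><p>2023-01-03</p></div><div><p>2023-01-04</p></div>'
#       '</div><p>keep me</p></div>')
#   leaf = page.xpath('//p')[0]
#   remove_ancestors_date_tag(leaf)
#   # -> True; the middle <div> holding the four date blocks is dropped,
#   #    <p>keep me</p> survives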


def show_html(page, *, file=None, base_url=None):
    """Write `page` (HtmlElement, bytes, or str) out as an HTML file for inspection."""
    if base_url is None:
        base_url = ''
    if isinstance(page, HtmlElement):
        source = element2html(page)
    elif isinstance(page, bytes):
        source = page.decode(detect_encoding(page), 'surrogatepass')
    else:
        source = page
    if file is None:
        dirname = 'html'
        os.makedirs(dirname, mode=0o777, exist_ok=True)
        file = f'{dirname}/{sha1(base_url)}.html'
    with open(file, 'w') as fp:
        fp.write(source)
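
# Usage sketch: dump any intermediate tree to a file for inspection. With no
# `file` argument the output lands in ./html/<sha1 of base_url>.html:
#
#   show_html('<html><body><p>hi</p></body></html>', file='debug.html')
#   show_html(page, base_url='http://example.com/')  # -> html/<sha1>.html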


def trim_node(element: HtmlElement):
    """
    Tidy nodes (body children collapse into <div> blocks; tags unwrapped along
    the way merge their text into the parent).
    :param element:
    :return:
    """
    children = element.xpath('/html/body/child::*')
    for child in children:
        for node, _ in iter_node(child):
            # print('trim_node >>> ', node.tag, node.attrib)
            if node.tag.lower() == 'div':
                break
            drop_tag(node)
    return element
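
# Sketch (hypothetical markup): non-div wrappers under a body child are
# unwrapped, merging their text upward. The exact merge behaviour depends on
# iter_node/drop_tag from crawler.utils:
#
#   page = html2element('<html><body><span><b>t</b>ext</span></body></html>')
#   trim_node(page)
#   # the <span>/<b> wrappers are dropped; their text merges into the parent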


def strip_node(element: HtmlElement):
    """
    Strip nodes: if a node is not an <a> tag and has no date text, no
    title-like keyword text, and no descendant <a> tags, remove or unwrap it.
    :param element:
    :return:
    """
    for node, _ in iter_node(element):
        # Node text (whitespace, newlines and carriage returns stripped).
        text = "".join("".join(node.xpath('./text()')).split())
        # Remove nodes (and certain parents) lacking text, publish time,
        # title, and href.
        if node.tag.lower() != 'a':
            # No title-like keyword text.
            non_title = is_title(text) is False
            # No descendant <a> nodes.
            no_a_descendants = len(list(node.iterdescendants('a'))) == 0
            # Date text.
            publish_time = TimeExtractor().extractor(node)
            # print('>>> ', node.tag, node.attrib, text)
            if non_title and no_a_descendants and publish_time == '':
                # print('strip_node >>> ', node.tag, node.attrib, text)
                parent = node.getparent()
                if parent is not None and parent.tag.lower() == 'a':
                    etree.strip_tags(parent, node.tag)
                elif parent is not None and parent.tag.lower() == 'td':
                    remove_node(parent)
                else:
                    remove_node(node)
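
# Sketch (hypothetical markup): a node with no title-like text, no date, and
# no descendant links is removed, while <a> nodes are left alone. Assumes
# is_title() rejects the decoration text:
#
#   page = html2element(
#       '<div><span>decoration</span><a href="/n/1">Breaking news</a></div>')
#   strip_node(page)
#   # <span>decoration</span> is removed; the <a> node survives untouched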


def remove_nav_node(element: HtmlElement):
    for node, _ in iter_node(element):
        text = extract_text(node)
        # print('nav_node >>> ', node.tag, text)
        # Section / footer / navigation texts.
        for item in NAV_TEXTS:
            if item in text:
                remove_node(node)
                break
        # Login labels.
        if text in LOGIN_TEXTS:
            remove_node(node)
        # Pagination bar.
        if text in PAGE_TEXTS:
            tag = node.tag.lower()
            siblings = list(node.itersiblings(tag))
            # Look for same-tag siblings sharing one ancestor; pagination
            # labels may or may not sit under the same immediate parent.
            if len(siblings) > 3:
                remove_node(node.getparent())
            else:
                for ancestor in node.iterancestors():
                    # An ancestor with more than two same-tag descendants is
                    # assumed to contain the whole block we want to delete.
                    if len(list(ancestor.iterdescendants(tag))) > 2:
                        remove_node(ancestor)
                        break
        # Breadcrumb ("位置", i.e. "location") labels.
        if '位置' in text:
            remove_node(node)
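
# Sketch: nodes whose text matches the keyword lists from crawler.defaults
# (NAV_TEXTS / LOGIN_TEXTS / PAGE_TEXTS) are removed together with the
# surrounding block where appropriate. For example, a pagination bar like
#
#   <div><a>1</a><a>2</a><a>3</a><a>4</a><a>下一页</a></div>
#
# would be dropped wholesale, assuming one of its labels (e.g. '下一页')
# appears in PAGE_TEXTS.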


def remove_date_node(element: HtmlElement):
    retrieve = False
    for node, _ in iter_node(element):
        if retrieve:
            break
        publish_time = TimeExtractor().extractor(node)
        # print('date_node >>> ', node.tag, node.attrib, publish_time)
        # First find the deepest node holding date text (date text present,
        # no children).
        if publish_time != '' and len(list(node.iterchildren())) == 0:
            # print("date_node >>> ", node.tag, node.attrib, len(list(node.itersiblings())))
            # The date text may come from a sibling rather than the node itself.
            if len(list(node.itersiblings())) > 0:
                # Several siblings exist, so analyse the shared parent.
                parent = node.getparent()
                # Count the parent's descendants that carry date text.
                total = tag_date_total(parent)
                if total > 3:
                    # Simple case: the sibling nodes carry plain date text
                    # (publish times); drop the parent wholesale.
                    remove_node(parent)
                else:
                    # Complex case: several siblings carry date text (start
                    # time, deadline, ...).
                    # print("parent_date_node >>> ", parent.tag, parent.attrib, len(list(parent.itersiblings())))
                    # Starting from the parent, find and delete the ancestor
                    # that holds the date texts.
                    retrieve = remove_ancestors_date_tag(parent)
            else:
                # Case 2: no siblings; start from the node itself and delete
                # the date-holding ancestor.
                retrieve = remove_ancestors_date_tag(node)
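
# Sketch (hypothetical markup): when a flat block of sibling date leaves is
# found, the simple `total > 3` path removes their parent directly. Assumes
# TimeExtractor matches the date strings:
#
#   page = html2element(
#       '<div><ul>'
#       '<li>2023-01-01</li><li>2023-01-02</li>'
#       '<li>2023-01-03</li><li>2023-01-04</li>'
#       '</ul><p>keep me</p></div>')
#   remove_date_node(page)  # the <ul> goes; <p>keep me</p> survives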


def clean_node(element):
    for node, _ in iter_node(element):
        if is_empty_element(node):
            # print(' clean_node >>> ', node.tag, node.attrib, extract_text(node))
            remove_node(node)
        if node.tag.lower() == 'a' and list(node.iterchildren()):
            # Unwrap phrase/text tags nested inside an <a>, keeping the inner text.
            for child in node.iterchildren():
                etree.strip_tags(node, child.tag)
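
# Sketch (hypothetical markup), assuming is_empty_element from crawler.utils
# treats a childless, textless node as empty:
#
#   page = html2element('<div><p></p><a href="/x"><span>Title</span></a></div>')
#   clean_node(page)
#   # <p></p> is removed; the link flattens to <a href="/x">Title</a>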


def extract_data(source, base_url):
    element = html2element(source)
    children = element.xpath('/html/body/child::*')
    result = {}
    for index, child in enumerate(children):
        data = []
        for node in child.iterdescendants('a'):
            title = extract_text(node)
            href = node.attrib.get('href')
            href = urljoin(base_url, href)
            if is_title(title) and len(title) <= 15:
                item = (title, href)
                data.append(item)
        key = "{}_{}".format(child.tag.lower(), index)
        result[key] = data
    for key, items in result.items():
        print(f"=============== {base_url} && {key} ===============")
        for val in items:
            print(val)
        print()
    return result
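
# Sketch of the returned shape (hypothetical markup): keys are
# "<tag>_<index>" per body child; values are (title, absolute-href) pairs.
# Assumes is_title() accepts the text; note extract_text strips all inner
# whitespace and titles longer than 15 characters are filtered out:
#
#   extract_data(
#       '<html><body><div><a href="/n/1">Breaking news</a></div></body></html>',
#       'http://example.com/')
#   # -> {'div_0': [('Breakingnews', 'http://example.com/n/1')]}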


def process_page(source):
    element = html2element(source)
    # Pre-process the web page (de-noising; this mutates the original DOM).
    element = pre_parse(element)
    # show_html(element, file='2预处理.html')
    # Tidy nodes.
    element = trim_node(element)
    # show_html(element, file='3整理body节点.html')
    # Strip nodes.
    strip_node(element)
    # show_html(element, file='4剔除节点.html')
    # Remove navigation nodes.
    remove_nav_node(element)
    # show_html(element, file='5删除导航条.html')
    # Remove date nodes.
    remove_date_node(element)
    # show_html(element, file='6删除时间块.html')
    # Clean nodes.
    clean_node(element)
    # show_html(element, file='7清理dom.html')
    return element2html(element)
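
# Usage sketch: the full cleaning pipeline, raw HTML in, cleaned HTML out.
# Uncomment the show_html calls above to dump each intermediate stage:
#
#   cleaned = process_page(raw_html)
#   items = extract_data(cleaned, 'http://example.com/')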


def bfs(response, base_url):
    source = response.text
    # show_html(source, file='1原始页.html')
    if len(source) == 0:
        return {}
    source = process_page(source)
    items = extract_data(source, base_url)
    return items
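
# Usage sketch: `response` can be any object exposing a `.text` attribute,
# e.g. a requests.Response or (presumably) what this project's Downloader
# returns:
#
#   class FakeResponse:
#       text = '<html><body>...</body></html>'
#
#   bfs(FakeResponse(), 'http://example.com/')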