# -*- coding: utf-8 -*-
# data_spider / poc_qlm
import re

from lxml.html import fromstring, HtmlElement, tostring

__all__ = ['cleaner', 'drop_tree_by_lxml']

# Rule tables used by cleaner(): each maps a regular-expression pattern to its
# replacement text. cleaner() merges the tables and applies the rules with
# re.sub in insertion order, so the order of entries matters.
# Patterns containing regex escapes are raw strings (or escape the backslash
# explicitly) so that no invalid string escape sequence such as '\s' is used.

# Stand-alone elements
INDEPENDENT_TAGS = {
    r'<head>[\s\S]*?</head>': '',
    '<html>|<html [^>]*>|</html>': '',
    '<body>|<body [^>]*>|</body>': '',
    r'<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
    '\\xa0|\\u3000': '',  # no-break space / ideographic space
    r'<!--[\s\S]*?-->': '',  # comments
    r'<style[^<>]*>[\s\S]*?</style>': '',  # styles
    r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    '<input>': '',  # input box
    '</input>': '',  # input box
    '<img[^>]*>': '<br>',  # image
}
# Inline elements
INLINE_TAGS = {
    '<a>|<a [^>]*>|</a>': '',  # hyperlink
    '<link>|<link [^>]*>|</link>': '',  # link
    '<span>|<span [^>]*>|</span>': '',  # span
    '<label>|<label [^>]*>|</label>': '<br>',  # label
    '<font>|<font [^>]*>|</font>': '',  # font
    'data:image(.*?) ': '',  # base64 image data
}
# Block-level elements
BLOCK_TAGS = {
    r'<div>\s*?</div>': '',
    '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
    '<p>|<p [^>]*>': '<br>',  # paragraph
    '</p>': '',  # paragraph
    '<div>|<div [^>]*>': '<br>',  # division
    '</div>': '',  # division
    '<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Office Word paragraph
}
# Miscellaneous
OTHER = {
    # The '?' must be escaped: unescaped, '<?xml' means "optional '<' then xml",
    # which leaves '<?' behind and also deletes plain text like 'xml ... >'.
    r'<\?xml[^>]*>|<\?xml [^>]*>|<\?xml:.*?>': '',
    '<epointform>': '',
    '<!doctype html>|<!doctype html [^>]*>': '',
    '【关闭】|关闭': '',
    '【打印】|打印本页': '',
    r'【字体：[\s\S]*】': '',
    '文章来源：[\u4e00-\u9fa5]+': '',
    '浏览次数：.*[<]+': '',
    '（责任编辑：.*?）': '',
    '分享到[：]': '',
}
# Presentation attributes
CSS_STYLE = {
    r'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    r'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    r'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    r'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    r'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    r'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}
# Whitespace normalisation (keys contain real newline characters, hence the
# explicit '\\s' / '\\S' instead of raw strings)
BLANKS = {
    '\n\\s*\n': '\n',
    '\\s*\n\\s*': '\n',
    '[^\\S\n]': ' ',
    r'\s+': ' ',
}
# Tag names combined with ATTRS by _repair_tag()
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
# Attribute names combined with TAGS by _repair_tag()
ATTRS = {'id', 'class', 'style', 'width'}
# Tags carrying special inline styles: compiled pattern -> replacement
SPECIAL_TAGS = {
    re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): '<br>',
}


def _repair_tag():
    """Build the table used to split a tag name glued to an attribute name.

    Non-standard pages may contain markup such as ``<divclass=...>`` where the
    space between the tag and its first attribute is missing. Every
    combination of TAGS and ATTRS is mapped from the glued spelling to the
    spaced one.

    Keys include the opening ``<`` so that a plain ``str.replace`` only
    touches the start of a tag. Without the anchor, keys like ``pid`` or
    ``divid`` also match ordinary text ("rapid", "divide") and URLs.

    :return: dict mapping e.g. ``'<divclass'`` to ``'<div class'``
    """
    # sorted() only makes the dict order reproducible; the replacements are
    # independent of each other.
    return {
        '<{}{}'.format(tag, attr): '<{} {}'.format(tag, attr)
        for tag in sorted(TAGS)
        for attr in sorted(ATTRS)
    }


def _escape_character(html):
    """Turn a fixed set of HTML entities back into their literal characters.

    ``&amp;`` is handled last, so ``&amp;lt;`` becomes ``&lt;`` rather than
    being unescaped twice.

    :param html: text that may contain ``&lt;``, ``&gt;``, ``&quot;``, ``&amp;``
    :return: text with those four entities replaced
    """
    entity_map = (
        ('&lt;', '<'),
        ('&gt;', '>'),
        ('&quot;', '"'),
        ('&amp;', '&'),
    )
    for entity, literal in entity_map:
        html = html.replace(entity, literal)
    return html


def _lowercase_tag(html):
    """Lower-case every ``<...>`` tag in *html*, then repair glued tag names.

    The whole text between ``<`` and ``>`` is lower-cased, so attribute
    names and attribute values are affected as well; text outside tags is
    left alone by this step.

    :param html: page source
    :return: page source with lower-cased tags and repaired tag/attribute pairs
    """
    tag_pattern = re.compile("<[^>]+>")
    distinct_tags = set(tag_pattern.findall(html))

    if len(distinct_tags) > 10000:
        # Very large pages are normalised by BeautifulSoup instead.
        # NOTE(review): this keeps only the first node under <body>; confirm
        # that dropping the remaining siblings is intended.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'lxml')
        html = str(soup.body.next_element)
    else:
        # One pass over the document. Replacing tag by tag from a set gave an
        # order-dependent result when one matched tag contained another
        # (e.g. '<X<A>' and '<A>') and rescanned the page once per tag.
        html = tag_pattern.sub(lambda match: match.group(0).lower(), html)

    # Tag repair: split names such as 'divclass' produced by broken pages.
    for broken, fixed in _repair_tag().items():
        html = html.replace(broken, fixed)

    return html


def _clear_special_tag(html):
    """Replace every match of the SPECIAL_TAGS patterns with its substitute.

    :param html: page source
    :return: page source after all SPECIAL_TAGS substitutions
    """
    cleaned = html
    for pattern in SPECIAL_TAGS:
        cleaned = pattern.sub(SPECIAL_TAGS[pattern], cleaned)
    return cleaned


def _clear_input_tag(html, display=False):
    """Replace visible ``<input ...>`` tags with the text of their ``value``.

    An input is left in place when it has no quoted ``value`` attribute or
    when its markup contains ``hidden``, ``hide`` or ``display: none``.

    :param html: page source
    :param display: when false, a transparent-border style is first added to
        every ``<input`` so remaining input boxes render without a border
    :return: page source with visible inputs replaced by their value text
    """
    if not display:
        # Hide the border of input boxes.
        html = html.replace('<input', '<input style="border-color: transparent;"')

    input_pattern = re.compile(r'<input .*?>', re.S)
    # The closing quote must be the same character as the opening one, so a
    # value such as "it's" is captured whole. The former class ["|\'] also
    # accepted '|' as a quote character.
    value_pattern = re.compile(r'value=(["\'])(.*?)\1')
    hidden_markers = ('hidden', 'hide', 'display: none')

    # dict.fromkeys drops duplicates while keeping first-seen order; one
    # str.replace already rewrites every occurrence of the same tag.
    for ipt in dict.fromkeys(input_pattern.findall(html)):
        found = value_pattern.search(ipt)
        if found is None:
            continue
        if any(marker in ipt for marker in hidden_markers):
            continue
        html = html.replace(ipt, found.group(2))
    return html


def drop_tree_by_lxml(html, feature):
    """Remove every node selected by an XPath expression and re-serialise.

    :param html: page source, parsed with ``lxml.html.fromstring``
    :param feature: XPath expression selecting the elements to drop
    :return: the remaining document serialised back to a ``str``
    """
    root: HtmlElement = fromstring(html)
    for node in root.xpath(feature):
        node.drop_tree()

    serialized = tostring(root, encoding='utf8')
    return serialized.decode('utf8')


def cleaner(html, special=None, completely=False, del_tag=False, **kwargs):
    """
    Clean page source with the module's regex rule tables.

    Steps, in order: lower-case and repair tags, optionally replace the
    SPECIAL_TAGS matches, apply every pattern -> replacement rule, optionally
    strip canvas/iframe blocks and remaining tags, unescape basic entities,
    and finally replace visible input tags with their value text.

    :param html: page source to clean
    :param special: extra rules for this call only, given as a mapping (or an
        iterable of pairs) of regex pattern -> replacement; they are merged
        on top of OTHER
    :param completely: when true, also remove canvas and iframe blocks and
        strip remaining tags
    :param del_tag: when true, apply the SPECIAL_TAGS substitutions
    :param kwargs: forwarded to ``_clear_input_tag`` (e.g. ``display``)
    :return: cleaned page source
    """
    # Merge into a private copy. Updating the module-level OTHER in place made
    # one call's special rules leak into every later call.
    other_rules = dict(OTHER)
    other_rules.update({} if special is None else special)

    remove_tags = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **other_rules,
        **CSS_STYLE,
        **BLANKS,
    }

    html = _lowercase_tag(html)
    if del_tag:
        html = _clear_special_tag(html)

    # Rules run in insertion order; later rules see the output of earlier ones.
    for pattern, repl in remove_tags.items():
        html = re.sub(pattern, repl, html)

    if completely:
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # inline frame
        # Remaining tags: no CJK characters inside, apart from three font names.
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)

    html = _escape_character(html)
    html = _clear_input_tag(html, **kwargs)
    return html