9 月之前 · ddc8ed7109
--- a/zbytb/crawler/clean_html.py
+++ b/zbytb/crawler/clean_html.py
@@ -1,3 +1,4 @@
 
				+# -*- coding: utf-8 -*-
			
 
				 import re
			
 
				 
			
 
				 __all__ = ['cleaner', 'clean_js']
			
@@ -14,21 +15,26 @@ INDEPENDENT_TAGS = {
 
				     '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
			
 
				     '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
			
 
				     '<input>': '',  # 输入框
			
 
				+    '</input>': '',  # 输入框
			
 
				     '<img[^>]*>': '<br>',  # 图片
			
 
				 }
			
 
				 # 行内元素
			
 
				 INLINE_TAGS = {
			
 
				     '<a>|<a [^>]*>|</a>': '',  # 超链接
			
 
				+    '<link>|<link [^>]*>|</link>': '',  # 超链接
			
 
				     '<span>|<span [^>]*>|</span>': '',  # span
			
 
				     '<label>|<label [^>]*>|</label>': '<br>',  # label
			
 
				     '<font>|<font [^>]*>|</font>': '',  # font
			
 
				+    'data:image(.*?) ': '',            # 图片base64
			
 
				 }
			
 
				 # 块级元素
			
 
				 BLOCK_TAGS = {
			
 
				-    '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # 标题
			
 
				-    # '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
			
 
				-    '<p>|<p [^>]*>|</p>': '<br>',  # 段落
			
 
				-    '<div>|<div [^>]*>|</div>': '<br>',  # 分割 division
			
 
				+    '<div>\s*?</div>':'',
			
 
				+    '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
			
 
				+    '<p>|<p [^>]*>': '<br>',  # 段落
			
 
				+    '</p>': '',  # 段落
			
 
				+    '<div>|<div [^>]*>': '<br>',  # 分割 division
			
 
				+    '</div>': '',  # 分割 division
			
 
				     '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
			
 
				 }
			
 
				 # 其他
			
@@ -53,6 +59,7 @@ CSS_STYLE = {
 
				     'class="[\s\S]*?"|class ="[\s\S]*?"': '',
			
 
				     'align="[\s\S]*?"|align ="[\s\S]*?"': '',
			
 
				     'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
			
 
				+
			
 
				 }
			
 
				 # 空白符
			
 
				 BLANKS = {
			
@@ -65,6 +72,10 @@ BLANKS = {
 
				 TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
			
 
				 # css属性集合
			
 
				 ATTRS = {'id', 'class', 'style', 'width'}
			
 
# Tags selected by a special inline style (consumed by _del_tag)
SPECIAL_TAGS = {
    # Case-insensitive. Starts at a tag whose attributes contain
    # style="display: none", then the greedy `.*` extends the match to the
    # last `>` reachable on the same line (`.` does not cross newlines here).
    # At least one character must follow the closing quote before that `>`.
    # NOTE(review): presumably meant to drop the hidden element together with
    # its content — confirm the greedy match is intended.
    re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): "<br>",
}
			
 
				 
			
 
				 
			
 
				 def _repair_tag():
			
@@ -84,14 +95,36 @@ def _escape_character(html):
 
				     html = html.replace('&gt;', '>')
			
 
				     html = html.replace('&quot;', '"')
			
 
				     html = html.replace('&amp;', '&')
			
 
				+    # 不显示输入框边框
			
 
				+    html = html.replace('<input', '<input style="border-color: transparent;"')
			
 
				+    return html
			
 
				+
			
 
				+
			
 
				+def _clean_input(html):
			
 
				+    """提取value值，替换input标签"""
			
 
				+    inputTxt = re.compile(r'<input .*?>', re.S)
			
 
				+    valueTxt = re.compile(r'value=["|\'](.*?)["|\']')
			
 
				+
			
 
				+    input_list = re.findall(inputTxt, html) or []
			
 
				+    for ipt in input_list:
			
 
				+        val = re.findall(valueTxt, ipt)
			
 
				+        if val and "hidden" not in ipt and "hide" not in ipt and "display: none" not in ipt:
			
 
				+            html = html.replace(ipt,val[0])
			
 
				     return html
			
 
				 
			
 
				 
			
 
				 def _lowercase_tag(html):
			
 
				-    """标签归一化处理（全部小写）"""
			
 
				+    """标签归一化处理（全部小写 + 标签修复）"""
			
 
				     tags = re.findall("<[^>]+>", html)
			
 
				-    for tag in tags:
			
 
				-        html = html.replace(tag, str(tag).lower())
			
 
				+    tag_sets = set(tags)
			
 
				+
			
 
				+    if len(tag_sets) > 10000:
			
 
				+        from bs4 import BeautifulSoup
			
 
				+        soup = BeautifulSoup(html, "lxml")
			
 
				+        html = str(soup.body.next_element)
			
 
				+    else:
			
 
				+        for tag in tag_sets:
			
 
				+            html = html.replace(tag, str(tag).lower())
			
 
				 
			
 
				     repair_tags = _repair_tag()
			
 
				     for err, right in repair_tags.items():
			
@@ -100,6 +133,13 @@ def _lowercase_tag(html):
 
				     return html
			
 
				 
			
 
				 
			
 
				+def _del_tag(html):
			
 
				+    """删除特殊+指定样式的标签"""
			
 
				+    for tag, repl in SPECIAL_TAGS.items():
			
 
				+        html = tag.sub(repl, html)
			
 
				+    return html
			
 
				+
			
 
				+
			
 
				 def cleaner(html, special=None, completely=False):
			
 
				     """
			
 
				     数据清洗
			
@@ -121,6 +161,7 @@ def cleaner(html, special=None, completely=False):
 
				         **BLANKS,
			
 
				     }
			
 
				     html = _lowercase_tag(html)
			
 
				+    # html = _del_tag(html)  # 优先处理
			
 
				     for tag, repl in remove_tags.items():
			
 
				         html = re.sub(tag, repl, html)
			
 
				 
			
@@ -133,6 +174,7 @@ def cleaner(html, special=None, completely=False):
 
				         html = re.sub(r'([,|.|。|，|；|;|?|&|$|#|@|!|！|%|*|\'|"|‘|’|“|￥|？| ]*?)$', "", html.strip())  # 清除文本末尾符号
			
 
				 
			
 
				     html = _escape_character(html)
			
 
				+    html = _clean_input(html)  # 处理 input
			
 
				     return html
			
 
				 
			
 
				 
			
@@ -147,3 +189,4 @@ def clean_js(html):
 
				     for tag, repl in remove_tags.items():
			
 
				         html = re.sub(tag, repl, html)
			
 
				     return html
			
 
				+
			
--- a/zbytb/crawler/clean_html_old.py
+++ b/zbytb/crawler/clean_html_old.py
@@ -0,0 +1,149 @@
 
				+import re
			
 
				+
			
 
				+__all__ = ['cleaner', 'clean_js']
			
 
				+
			
 
				+# 独立元素
			
 
				+INDEPENDENT_TAGS = {
			
 
				+    '<head>[\s\S]*?</head>': '',
			
 
				+    '<html>|<html [^>]*>|</html>': '',
			
 
				+    '<body>|<body [^>]*>|</body>': '',
			
 
				+    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # 元数据
			
 
				+    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
			
 
				+    '\\xa0|\\u3000': '',  # 空格
			
 
				+    '<!--[\s\S]*?-->': '',  # 注释
			
 
				+    '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
			
 
				+    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
			
 
				+    '<input>': '',  # 输入框
			
 
				+    '<img[^>]*>': '<br>',  # 图片
			
 
				+}
			
 
				+# 行内元素
			
 
				+INLINE_TAGS = {
			
 
				+    '<a>|<a [^>]*>|</a>': '',  # 超链接
			
 
				+    '<span>|<span [^>]*>|</span>': '',  # span
			
 
				+    '<label>|<label [^>]*>|</label>': '<br>',  # label
			
 
				+    '<font>|<font [^>]*>|</font>': '',  # font
			
 
				+}
			
 
				+# 块级元素
			
 
				+BLOCK_TAGS = {
			
 
				+    '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # 标题
			
 
				+    # '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
			
 
				+    '<p>|<p [^>]*>|</p>': '<br>',  # 段落
			
 
				+    '<div>|<div [^>]*>|</div>': '<br>',  # 分割 division
			
 
				+    '<o:p>|<o:p [^>]*>|</o:p>': ''  # OFFICE微软WORD段落
			
 
				+}
			
 
				+# 其他
			
 
				+OTHER = {
			
 
				+    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
			
 
				+    '<epointform>': '',
			
 
				+    '<!doctype html>|<!doctype html [^>]*>': '',
			
 
				+    '【关闭】|关闭': '',
			
 
				+    '【打印】|打印本页': '',
			
 
				+    '【字体：[\s\S]*】': '',
			
 
				+    '文章来源：[\u4e00-\u9fa5]+': '',
			
 
				+    '浏览次数：.*[<]+': '',
			
 
				+    '（责任编辑：.*?）': '',
			
 
				+    '分享到[：]': '',
			
 
				+    '阅读数[:：]\d+': '',
			
 
				+}
			
 
				+# 样式
			
 
				+CSS_STYLE = {
			
 
				+    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
			
 
				+    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
			
 
				+    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
			
 
				+    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
			
 
				+    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
			
 
				+    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
			
 
				+}
			
 
				+# 空白符
			
 
				+BLANKS = {
			
 
				+    '\n\s*\n': '\n',
			
 
				+    '\s*\n\s*': '\n',
			
 
				+    '[^\S\n]': ' ',
			
 
				+    '\s+': ' ',
			
 
				+}
			
 
# Set of tag names; _repair_tag() pairs each one with every entry of ATTRS
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
# Set of attribute names; _repair_tag() pairs each one with every entry of TAGS
ATTRS = {'id', 'class', 'style', 'width'}
			
 
				+
			
 
				+
			
 
				+def _repair_tag():
			
 
				+    """异常的标签组合,用来替换非标准页面的标签"""
			
 
				+    _repairs = {}
			
 
				+    for tag in TAGS:
			
 
				+        for attr in ATTRS:
			
 
				+            key = '{}{}'.format(tag, attr)
			
 
				+            val = '{} {}'.format(tag, attr)
			
 
				+            _repairs[key] = val
			
 
				+    return _repairs
			
 
				+
			
 
				+
			
 
				+def _escape_character(html):
			
 
				+    """转义字符"""
			
 
				+    html = html.replace('&lt;', '<')
			
 
				+    html = html.replace('&gt;', '>')
			
 
				+    html = html.replace('&quot;', '"')
			
 
				+    html = html.replace('&amp;', '&')
			
 
				+    return html
			
 
				+
			
 
				+
			
 
				+def _lowercase_tag(html):
			
 
				+    """标签归一化处理（全部小写）"""
			
 
				+    tags = re.findall("<[^>]+>", html)
			
 
				+    for tag in tags:
			
 
				+        html = html.replace(tag, str(tag).lower())
			
 
				+
			
 
				+    repair_tags = _repair_tag()
			
 
				+    for err, right in repair_tags.items():
			
 
				+        html = html.replace(err, right)
			
 
				+
			
 
				+    return html
			
 
				+
			
 
				+
			
 
				+def cleaner(html, special=None, completely=False):
			
 
				+    """
			
 
				+    数据清洗
			
 
				+
			
 
				+    :param html: 清洗的页面
			
 
				+    :param special: 额外指定页面清洗规则
			
 
				+    :param completely: 是否完全清洗页面
			
 
				+    :return: 清洗后的页面源码
			
 
				+    """
			
 
				+    if special is None:
			
 
				+        special = {}
			
 
				+    OTHER.update(special)
			
 
				+    remove_tags = {
			
 
				+        **INDEPENDENT_TAGS,
			
 
				+        **INLINE_TAGS,
			
 
				+        **BLOCK_TAGS,
			
 
				+        **OTHER,
			
 
				+        **CSS_STYLE,
			
 
				+        **BLANKS,
			
 
				+    }
			
 
				+    html = _lowercase_tag(html)
			
 
				+    for tag, repl in remove_tags.items():
			
 
				+        html = re.sub(tag, repl, html)
			
 
				+
			
 
				+    if completely:
			
 
				+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
			
 
				+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
			
 
				+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
			
 
				+
			
 
				+    if html:
			
 
				+        html = re.sub(r'([,|.|。|，|；|;|?|&|$|#|@|!|！|%|*|\'|"|‘|’|“|￥|？| ]*?)$', "", html.strip())  # 清除文本末尾符号
			
 
				+
			
 
				+    html = _escape_character(html)
			
 
				+    return html
			
 
				+
			
 
				+
			
 
				+def clean_js(html):
			
 
				+    remove_tags = {
			
 
				+        '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # 空格
			
 
				+        '\\xa0|\\u3000': '',  # 空格
			
 
				+        '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
			
 
				+        **BLANKS,
			
 
				+    }
			
 
				+    html = _lowercase_tag(html)
			
 
				+    for tag, repl in remove_tags.items():
			
 
				+        html = re.sub(tag, repl, html)
			
 
				+    return html