|
@@ -1,3 +1,4 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
import re
|
|
|
|
|
|
__all__ = ['cleaner', 'clean_js']
|
|
@@ -14,21 +15,26 @@ INDEPENDENT_TAGS = {
|
|
|
'<style[^<>]*>[\s\S]*?</style>': '', # 样式
|
|
|
'<script[^<>]*>[\s\S]*?</script>': '', # JavaScript
|
|
|
'<input>': '', # 输入框
|
|
|
+ '</input>': '', # 输入框
|
|
|
'<img[^>]*>': '<br>', # 图片
|
|
|
}
|
|
|
# 行内元素
|
|
|
INLINE_TAGS = {
|
|
|
'<a>|<a [^>]*>|</a>': '', # 超链接
|
|
|
+ '<link>|<link [^>]*>|</link>': '', # 超链接
|
|
|
'<span>|<span [^>]*>|</span>': '', # span
|
|
|
'<label>|<label [^>]*>|</label>': '<br>', # label
|
|
|
'<font>|<font [^>]*>|</font>': '', # font
|
|
|
+ 'data:image(.*?) ': '', # 图片base64
|
|
|
}
|
|
|
# 块级元素
|
|
|
BLOCK_TAGS = {
|
|
|
- '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '', # 标题
|
|
|
- # '<h[1-6][^>]*>|</h[1-6]>': '', # 标题
|
|
|
- '<p>|<p [^>]*>|</p>': '<br>', # 段落
|
|
|
- '<div>|<div [^>]*>|</div>': '<br>', # 分割 division
|
|
|
+ '<div>\s*?</div>':'',
|
|
|
+ '<h[1-6][^>]*>|</h[1-6]>': '', # 标题
|
|
|
+ '<p>|<p [^>]*>': '<br>', # 段落
|
|
|
+ '</p>': '', # 段落
|
|
|
+ '<div>|<div [^>]*>': '<br>', # 分割 division
|
|
|
+ '</div>': '', # 分割 division
|
|
|
'<o:p>|<o:p [^>]*>|</o:p>': '' # OFFICE微软WORD段落
|
|
|
}
|
|
|
# 其他
|
|
@@ -53,6 +59,7 @@ CSS_STYLE = {
|
|
|
'class="[\s\S]*?"|class ="[\s\S]*?"': '',
|
|
|
'align="[\s\S]*?"|align ="[\s\S]*?"': '',
|
|
|
'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
|
|
|
+
|
|
|
}
|
|
|
# 空白符
|
|
|
BLANKS = {
|
|
@@ -65,6 +72,10 @@ BLANKS = {
|
|
|
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
|
|
|
# css属性集合
|
|
|
ATTRS = {'id', 'class', 'style', 'width'}
|
|
|
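+# Tags selected by a specific inline style (currently style="display: none"), mapped to their replacement text.
+# NOTE(review): the `.*` in the pattern is greedy, so on a single line a match can extend from the hidden tag to the last `>` on that line - confirm this is intended.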
|
|
|
+SPECIAL_TAGS = {
|
|
|
+ re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): "<br>",
|
|
|
+}
|
|
|
|
|
|
|
|
|
def _repair_tag():
|
|
@@ -84,14 +95,36 @@ def _escape_character(html):
|
|
|
html = html.replace('>', '>')
|
|
|
html = html.replace('"', '"')
|
|
|
html = html.replace('&', '&')
|
|
|
+ # 不显示输入框边框
|
|
|
+ html = html.replace('<input', '<input style="border-color: transparent;"')
|
|
|
+ return html
|
|
|
+
|
|
|
+
|
|
|
+def _clean_input(html):
|
|
|
+ """提取value值,替换input标签"""
|
|
|
+ inputTxt = re.compile(r'<input .*?>', re.S)
|
|
|
+ valueTxt = re.compile(r'value=["|\'](.*?)["|\']')
|
|
|
+
|
|
|
+ input_list = re.findall(inputTxt, html) or []
|
|
|
+ for ipt in input_list:
|
|
|
+ val = re.findall(valueTxt, ipt)
|
|
|
+ if val and "hidden" not in ipt and "hide" not in ipt and "display: none" not in ipt:
|
|
|
+ html = html.replace(ipt,val[0])
|
|
|
return html
|
|
|
|
|
|
|
|
|
def _lowercase_tag(html):
|
|
|
- """标签归一化处理(全部小写)"""
|
|
|
+ """标签归一化处理(全部小写 + 标签修复)"""
|
|
|
tags = re.findall("<[^>]+>", html)
|
|
|
- for tag in tags:
|
|
|
- html = html.replace(tag, str(tag).lower())
|
|
|
+ tag_sets = set(tags)
|
|
|
+
|
|
|
+ if len(tag_sets) > 10000:
|
|
|
+ from bs4 import BeautifulSoup
|
|
|
+ soup = BeautifulSoup(html, "lxml")
|
|
|
+ html = str(soup.body.next_element)
|
|
|
+ else:
|
|
|
+ for tag in tag_sets:
|
|
|
+ html = html.replace(tag, str(tag).lower())
|
|
|
|
|
|
repair_tags = _repair_tag()
|
|
|
for err, right in repair_tags.items():
|
|
@@ -100,6 +133,13 @@ def _lowercase_tag(html):
|
|
|
return html
|
|
|
|
|
|
|
|
|
def _del_tag(html):
    """Apply every ``SPECIAL_TAGS`` substitution to *html* and return the result.

    ``SPECIAL_TAGS`` maps compiled patterns to replacement strings; each
    pattern is applied in turn to the output of the previous one.
    """
    cleaned = html
    for pattern in SPECIAL_TAGS:
        replacement = SPECIAL_TAGS[pattern]
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
|
|
|
+
|
|
|
+
|
|
|
def cleaner(html, special=None, completely=False):
|
|
|
"""
|
|
|
数据清洗
|
|
@@ -121,6 +161,7 @@ def cleaner(html, special=None, completely=False):
|
|
|
**BLANKS,
|
|
|
}
|
|
|
html = _lowercase_tag(html)
|
|
|
+ # html = _del_tag(html) # 优先处理
|
|
|
for tag, repl in remove_tags.items():
|
|
|
html = re.sub(tag, repl, html)
|
|
|
|
|
@@ -133,6 +174,7 @@ def cleaner(html, special=None, completely=False):
|
|
|
html = re.sub(r'([,|.|。|,|;|;|?|&|$|#|@|!|!|%|*|\'|"|‘|’|“|¥|?| ]*?)$', "", html.strip()) # 清除文本末尾符号
|
|
|
|
|
|
html = _escape_character(html)
|
|
|
+ html = _clean_input(html) # 处理 input
|
|
|
return html
|
|
|
|
|
|
|
|
@@ -147,3 +189,4 @@ def clean_js(html):
|
|
|
for tag, repl in remove_tags.items():
|
|
|
html = re.sub(tag, repl, html)
|
|
|
return html
|
|
|
+
|