|
@@ -15,6 +15,7 @@ INDEPENDENT_TAGS = {
|
|
|
'<style[^<>]*>[\s\S]*?</style>': '', # 样式
|
|
|
'<script[^<>]*>[\s\S]*?</script>': '', # JavaScript
|
|
|
'<input>': '', # 输入框
|
|
|
+ '</input>': '', # 输入框
|
|
|
'<img[^>]*>': '<br>', # 图片
|
|
|
}
|
|
|
# 行内元素
|
|
@@ -73,7 +74,7 @@ TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
|
|
|
ATTRS = {'id', 'class', 'style', 'width'}
|
|
|
# 特殊样式+指定样式的标签
|
|
|
SPECIAL_TAGS = {
|
|
|
- # re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): "<br>",
|
|
|
+ re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): "<br>",
|
|
|
}
|
|
|
|
|
|
|
|
@@ -99,6 +100,19 @@ def _escape_character(html):
|
|
|
return html
|
|
|
|
|
|
|
|
|
+def _clean_input(html):
|
|
|
+ """提取value值,替换input标签"""
|
|
|
+ inputTxt = re.compile(r'<input .*?>', re.S)
|
|
|
+ valueTxt = re.compile(r'value=["|\'](.*?)["|\']')
|
|
|
+
|
|
|
+ input_list = re.findall(inputTxt, html) or []
|
|
|
+ for ipt in input_list:
|
|
|
+ val = re.findall(valueTxt, ipt)
|
|
|
+ if val and "hidden" not in ipt and "hide" not in ipt and "display: none" not in ipt:
|
|
|
+ html = html.replace(ipt,val[0])
|
|
|
+ return html
|
|
|
+
|
|
|
+
|
|
|
def _lowercase_tag(html):
|
|
|
"""标签归一化处理(全部小写 + 标签修复)"""
|
|
|
tags = re.findall("<[^>]+>", html)
|
|
@@ -147,7 +161,7 @@ def cleaner(html, special=None, completely=False):
|
|
|
**BLANKS,
|
|
|
}
|
|
|
html = _lowercase_tag(html)
|
|
|
- html = _del_tag(html) # 优先处理
|
|
|
+ # html = _del_tag(html) # 优先处理
|
|
|
for tag, repl in remove_tags.items():
|
|
|
html = re.sub(tag, repl, html)
|
|
|
|
|
@@ -157,4 +171,5 @@ def cleaner(html, special=None, completely=False):
|
|
|
html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
|
|
|
|
|
|
html = _escape_character(html)
|
|
|
+ html = _clean_input(html) # 处理 input
|
|
|
return html
|