|
@@ -1,4 +1,6 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
import re
|
|
|
+
|
|
|
__all__ = ['cleaner']
|
|
|
|
|
|
# 独立元素
|
|
@@ -69,6 +71,10 @@ BLANKS = {
|
|
|
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
|
|
|
# css属性集合
|
|
|
ATTRS = {'id', 'class', 'style', 'width'}
|
|
|
# Elements selected by a specific inline style, mapped to the text that
# replaces them.  Keys are compiled patterns; values are replacement strings.
SPECIAL_TAGS = {
    # An element whose opening tag contains the literal attribute text
    # style="display: none" is replaced by a line break.
    #
    # The match covers the opening tag and, when one follows, everything up
    # to the first closing tag with the same name (group 1).  The closing
    # part is optional so that void elements such as <input> are matched
    # too.  The lazy ``.*?`` keeps the match from running on into unrelated
    # markup that follows the hidden element on the same line.
    #
    # Limitation: a hidden element that nests another element with the same
    # tag name ends at the inner closing tag; a regex cannot balance tags.
    re.compile(
        r'<(\w+)\s[^>]*style="display: none"[^>]*>(?:.*?</\1\s*>)?',
        re.IGNORECASE | re.DOTALL,
    ): "<br>",
}
|
|
|
|
|
|
|
|
|
def _repair_tag():
|
|
@@ -113,6 +119,13 @@ def _lowercase_tag(html):
|
|
|
return html
|
|
|
|
|
|
|
|
|
def _del_tag(html):
    """Apply every SPECIAL_TAGS substitution to ``html`` and return the result.

    Each key of SPECIAL_TAGS is a compiled pattern and each value is the
    string that replaces its matches; the substitutions are applied one
    after another in the dictionary's iteration order.
    """
    cleaned = html
    for pattern in SPECIAL_TAGS:
        cleaned = pattern.sub(SPECIAL_TAGS[pattern], cleaned)
    return cleaned
|
|
|
+
|
|
|
+
|
|
|
def cleaner(html, special=None, completely=False):
|
|
|
"""
|
|
|
数据清洗
|
|
@@ -124,7 +137,6 @@ def cleaner(html, special=None, completely=False):
|
|
|
"""
|
|
|
if special is None:
|
|
|
special = {}
|
|
|
-
|
|
|
OTHER.update(special)
|
|
|
remove_tags = {
|
|
|
**INDEPENDENT_TAGS,
|
|
@@ -135,6 +147,7 @@ def cleaner(html, special=None, completely=False):
|
|
|
**BLANKS,
|
|
|
}
|
|
|
html = _lowercase_tag(html)
|
|
|
+ html = _del_tag(html) # 优先处理
|
|
|
for tag, repl in remove_tags.items():
|
|
|
html = re.sub(tag, repl, html)
|
|
|
|