|
@@ -21,11 +21,11 @@ INLINE_TAGS = {
|
|
|
'<a>|<a [^>]*>|</a>': '', # 超链接
|
|
|
'<span>|<span [^>]*>|</span>': '', # span
|
|
|
'<label>|<label [^>]*>|</label>': '<br>', # label
|
|
|
- '<font>|<font [^>]*>|</font>': '', # font
|
|
|
+ '<font>|<font [^>]*>|<font[\s\S][^>]*>|</font>': '', # font
|
|
|
}
|
|
|
# 块级元素
|
|
|
BLOCK_TAGS = {
|
|
|
- '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '', # 标题
|
|
|
+ # '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '', # 标题
|
|
|
# '<h[1-6][^>]*>|</h[1-6]>': '', # 标题
|
|
|
'<p>|<p [^>]*>|</p>': '<br>', # 段落
|
|
|
'<div>|<div [^>]*>|</div>': '<br>', # 分割 division
|
|
@@ -100,6 +100,12 @@ def _lowercase_tag(html):
|
|
|
return html
|
|
|
|
|
|
|
|
|
+def _clean_whitespace(html):
|
|
|
+ html = html.replace('\\n', '')
|
|
|
+ # html = html.replace(' ', '')
|
|
|
+ return html
|
|
|
+
|
|
|
+
|
|
|
def cleaner(html, special=None, completely=False):
|
|
|
"""
|
|
|
数据清洗
|
|
@@ -130,4 +136,5 @@ def cleaner(html, special=None, completely=False):
|
|
|
html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
|
|
|
|
|
|
html = _escape_character(html)
|
|
|
+ html = _clean_whitespace(html)
|
|
|
return html
|