|
@@ -100,12 +100,6 @@ def _lowercase_tag(html):
|
|
|
return html
|
|
|
|
|
|
|
|
|
-def _clean_whitespace(html):
|
|
|
- html = html.replace('\\n', '')
|
|
|
- # html = html.replace(' ', '')
|
|
|
- return html
|
|
|
-
|
|
|
-
|
|
|
def cleaner(html, special=None, completely=False):
|
|
|
"""
|
|
|
数据清洗
|
|
@@ -136,5 +130,4 @@ def cleaner(html, special=None, completely=False):
|
|
|
html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
|
|
|
|
|
|
html = _escape_character(html)
|
|
|
- html = _clean_whitespace(html)
|
|
|
return html
|