Эх сурвалжийг харах

取消html清洗 - 换行符清洗

dongzhaorui@topnet.net.cn 3 жил өмнө
parent
commit
a3f575b4bf

+ 0 - 7
ybw/crawler/clean_html.py

@@ -100,12 +100,6 @@ def _lowercase_tag(html):
     return html
 
 
-def _clean_whitespace(html):
-    html = html.replace('\\n', '')
-    # html = html.replace('  ', '')
-    return html
-
-
 def cleaner(html, special=None, completely=False):
     """
     数据清洗
@@ -136,5 +130,4 @@ def cleaner(html, special=None, completely=False):
         html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
 
     html = _escape_character(html)
-    html = _clean_whitespace(html)
     return html