Browse Source

取消html清洗 - 换行符清洗

dongzhaorui@topnet.net.cn 3 years ago
parent
commit
a3f575b4bf
1 changed file with 0 additions and 7 deletions
  1. 0 7
      ybw/crawler/clean_html.py

+ 0 - 7
ybw/crawler/clean_html.py

@@ -100,12 +100,6 @@ def _lowercase_tag(html):
     return html
     return html
 
 
 
 
-def _clean_whitespace(html):
-    html = html.replace('\\n', '')
-    # html = html.replace('  ', '')
-    return html
-
-
 def cleaner(html, special=None, completely=False):
 def cleaner(html, special=None, completely=False):
     """
     """
     数据清洗
     数据清洗
@@ -136,5 +130,4 @@ def cleaner(html, special=None, completely=False):
         html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
         html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
 
 
     html = _escape_character(html)
     html = _escape_character(html)
-    html = _clean_whitespace(html)
     return html
     return html