Răsfoiți Sursa

去除html清洗 - 标题标签清洗

dongzhaorui@topnet.net.cn 3 ani în urmă
părinte
commit
31fba16f23
2 fișiere modificate, cu 17 adăugiri și 5 ștergeri
  1. 9 2
      ybw/crawler/clean_html.py
  2. 8 3
      ybw/detail_spider.py

+ 9 - 2
ybw/crawler/clean_html.py

@@ -21,11 +21,11 @@ INLINE_TAGS = {
     '<a>|<a [^>]*>|</a>': '',  # 超链接
     '<span>|<span [^>]*>|</span>': '',  # span
     '<label>|<label [^>]*>|</label>': '<br>',  # label
-    '<font>|<font [^>]*>|</font>': '',  # font
+    '<font>|<font [^>]*>|<font[\s\S][^>]*>|</font>': '',  # font
 }
 # 块级元素
 BLOCK_TAGS = {
-    '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # 标题
+    # '<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # 标题
     # '<h[1-6][^>]*>|</h[1-6]>': '',  # 标题
     '<p>|<p [^>]*>|</p>': '<br>',  # 段落
     '<div>|<div [^>]*>|</div>': '<br>',  # 分割 division
@@ -100,6 +100,12 @@ def _lowercase_tag(html):
     return html
 
 
+def _clean_whitespace(html):
+    html = html.replace('\\n', '')
+    # html = html.replace('  ', '')
+    return html
+
+
 def cleaner(html, special=None, completely=False):
     """
     数据清洗
@@ -130,4 +136,5 @@ def cleaner(html, special=None, completely=False):
         html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
 
     html = _escape_character(html)
+    html = _clean_whitespace(html)
     return html

+ 8 - 3
ybw/detail_spider.py

@@ -158,12 +158,17 @@ class DetailSpider:
             valid_node = node
 
         html = page_source(valid_node)
-
-        '''检查文本内容'''
+        '''检查原始页面内容'''
         CheckText(html)
         item["contenthtml"] = html
-        item["detail"] = cleaner(html)
+        special = {
+            '若附件无法下载,你可以尝试使用360极速浏览器进行下载!': '',
+            'DD000E;|EE000F;|FF000E;': '',
+        }
+        item["detail"] = cleaner(html, special)
         item["comeintime"] = int2long(int(time.time()))
+        '''检查清洗之后的详情'''
+        CheckText(item["detail"])
         del item['count'], item['crawl']
         if 'crawl_status' in item:
             del item['crawl_status']