瀏覽代碼

update:添加删除标签display:none及其子集规则

dongzhaorui 2 年之前
父節點
當前提交
365bea6f75
共有 1 個文件被更改,包括 14 次插入 和 1 次删除
  1. 14 1
      FworkSpider/untils/clean_html.py

+ 14 - 1
FworkSpider/untils/clean_html.py

@@ -1,4 +1,6 @@
+# -*- coding: utf-8 -*-
 import re
 import re
+
 __all__ = ['cleaner']
 __all__ = ['cleaner']
 
 
 # 独立元素
 # 独立元素
@@ -69,6 +71,10 @@ BLANKS = {
 TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
 TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
 # css属性集合
 # css属性集合
 ATTRS = {'id', 'class', 'style', 'width'}
 ATTRS = {'id', 'class', 'style', 'width'}
+# 特殊样式+指定样式的标签
+SPECIAL_TAGS = {
+    re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): "<br>",
+}
 
 
 
 
 def _repair_tag():
 def _repair_tag():
@@ -113,6 +119,13 @@ def _lowercase_tag(html):
     return html
     return html
 
 
 
 
+def _del_tag(html):
+    """删除特殊+指定样式的标签"""
+    for tag, repl in SPECIAL_TAGS.items():
+        html = tag.sub(repl, html)
+    return html
+
+
 def cleaner(html, special=None, completely=False):
 def cleaner(html, special=None, completely=False):
     """
     """
     数据清洗
     数据清洗
@@ -124,7 +137,6 @@ def cleaner(html, special=None, completely=False):
     """
     """
     if special is None:
     if special is None:
         special = {}
         special = {}
-
     OTHER.update(special)
     OTHER.update(special)
     remove_tags = {
     remove_tags = {
         **INDEPENDENT_TAGS,
         **INDEPENDENT_TAGS,
@@ -135,6 +147,7 @@ def cleaner(html, special=None, completely=False):
         **BLANKS,
         **BLANKS,
     }
     }
     html = _lowercase_tag(html)
     html = _lowercase_tag(html)
+    html = _del_tag(html)  # 优先处理
     for tag, repl in remove_tags.items():
     for tag, repl in remove_tags.items():
         html = re.sub(tag, repl, html)
         html = re.sub(tag, repl, html)