소스 검색

提取value值,替换input标签

lizongze 1 년 전
부모
커밋
8abb4bc522
1개의 파일이 변경되었습니다: 17개의 추가 및 2개의 삭제
  1. 17 2
      FworkSpider/untils/clean_html.py

+ 17 - 2
FworkSpider/untils/clean_html.py

@@ -15,6 +15,7 @@ INDEPENDENT_TAGS = {
     '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
     '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
     '<input>': '',  # 输入框
+    '</input>': '',  # 输入框
     '<img[^>]*>': '<br>',  # 图片
 }
 # 行内元素
@@ -73,7 +74,7 @@ TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
 ATTRS = {'id', 'class', 'style', 'width'}
 # 特殊样式+指定样式的标签
 SPECIAL_TAGS = {
-    # re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): "<br>",
+    re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): "<br>",
 }
 
 
@@ -99,6 +100,19 @@ def _escape_character(html):
     return html
 
 
+def _clean_input(html):
+    """提取value值,替换input标签"""
+    inputTxt = re.compile(r'<input .*?>', re.S)
+    valueTxt = re.compile(r'value=["|\'](.*?)["|\']')
+
+    input_list = re.findall(inputTxt, html) or []
+    for ipt in input_list:
+        val = re.findall(valueTxt, ipt)
+        if val and "hidden" not in ipt and "hide" not in ipt and "display: none" not in ipt:
+            html = html.replace(ipt,val[0])
+    return html
+
+
 def _lowercase_tag(html):
     """标签归一化处理(全部小写 + 标签修复)"""
     tags = re.findall("<[^>]+>", html)
@@ -147,7 +161,7 @@ def cleaner(html, special=None, completely=False):
         **BLANKS,
     }
     html = _lowercase_tag(html)
-    html = _del_tag(html)  # 优先处理
+    # html = _del_tag(html)  # 优先处理
     for tag, repl in remove_tags.items():
         html = re.sub(tag, repl, html)
 
@@ -157,4 +171,5 @@ def cleaner(html, special=None, completely=False):
         html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
 
     html = _escape_character(html)
+    html = _clean_input(html)  # 处理 input
     return html