1 anno fa · 8abb4bc522
--- a/FworkSpider/untils/clean_html.py
+++ b/FworkSpider/untils/clean_html.py
@@ -15,6 +15,7 @@ INDEPENDENT_TAGS = {
 
				     '<style[^<>]*>[\s\S]*?</style>': '',  # 样式
			
 
				     '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
			
 
				     '<input>': '',  # 输入框
			
 
				+    '</input>': '',  # 输入框
			
 
				     '<img[^>]*>': '<br>',  # 图片
			
 
				 }
			
 
				 # 行内元素
			
@@ -73,7 +74,7 @@ TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
 
# HTML attribute names; how this set is consumed is not visible in this hunk.
ATTRS = {'id', 'class', 'style', 'width'}
# Tags carrying a special / specific inline style, mapped to their replacement.
SPECIAL_TAGS = {
    # Case-insensitive match for a tag containing the literal attribute text
    # `style="display: none"`; the mapped value is "<br>".
    # NOTE(review): the `.*` following the attribute is greedy, so the match
    # can extend to the last `>` on the same line instead of stopping at the
    # end of the opening tag, and `[^>]+` requires at least one character
    # before that closing `>` — confirm this reach is intended.
    re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): "<br>",
}
			
 
				 
			
 
				 
			
@@ -99,6 +100,19 @@ def _escape_character(html):
 
				     return html
			
 
				 
			
 
				 
			
 
				+def _clean_input(html):
			
 
				+    """提取value值，替换input标签"""
			
 
				+    inputTxt = re.compile(r'<input .*?>', re.S)
			
 
				+    valueTxt = re.compile(r'value=["|\'](.*?)["|\']')
			
 
				+
			
 
				+    input_list = re.findall(inputTxt, html) or []
			
 
				+    for ipt in input_list:
			
 
				+        val = re.findall(valueTxt, ipt)
			
 
				+        if val and "hidden" not in ipt and "hide" not in ipt and "display: none" not in ipt:
			
 
				+            html = html.replace(ipt,val[0])
			
 
				+    return html
			
 
				+
			
 
				+
			
 
				 def _lowercase_tag(html):
			
 
				     """标签归一化处理（全部小写 + 标签修复）"""
			
 
				     tags = re.findall("<[^>]+>", html)
			
@@ -147,7 +161,7 @@ def cleaner(html, special=None, completely=False):
 
				         **BLANKS,
			
 
				     }
			
 
				     html = _lowercase_tag(html)
			
 
				-    html = _del_tag(html)  # 优先处理
			
 
				+    # html = _del_tag(html)  # 优先处理
			
 
				     for tag, repl in remove_tags.items():
			
 
				         html = re.sub(tag, repl, html)
			
 
				 
			
@@ -157,4 +171,5 @@ def cleaner(html, special=None, completely=False):
 
				         html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
			
 
				 
			
 
				     html = _escape_character(html)
			
 
				+    html = _clean_input(html)  # 处理 input
			
 
				     return html