
fixbug - handle the lxml.etree.ParserError caused by cleaning page tags and attributes

dongzhaorui 2 years ago
parent
commit
d464f4e7cb
2 changed files with 24 additions and 18 deletions
  1. find_source/crawler/engines.py  +2 -2
  2. find_source/crawler/utils.py  +22 -16

+ 2 - 2
find_source/crawler/engines.py

@@ -7,7 +7,7 @@ from constants import (
 )
 from crawler.analysis import parse_urls
 from crawler.download import Downloader
-from crawler.utils import get_url, html2element
+from crawler.utils import join_url, html2element
 from settings import ENGINE_FEATURE_RETRIEVES
 
 
@@ -71,7 +71,7 @@ class BingSearchEngine(JySearchEngine):
             'FORM': 'PERE',
             'pq': 'intitle:'
         }
-        url = get_url(base_url, params)
+        url = join_url(base_url, params)
         # download
         response = self.downloader(url)
         # parse

+ 22 - 16
find_source/crawler/utils.py

@@ -1,3 +1,4 @@
+import io
 import operator
 import re
 import zlib
@@ -107,7 +108,7 @@ def label_split(val):
     return result
 
 
-def get_url(url: str, parameters: dict):
+def join_url(url: str, parameters: dict):
     """
     Join the url with its query parameters
 
@@ -119,17 +120,6 @@ def get_url(url: str, parameters: dict):
     return urljoin(url, _data)
 
 
-def clean_html(source: str):
-    html_str = re.sub(r'<!--[\s\S]*?-->', '', source)
-    html_str = re.sub(r'<html>|<html [^>]*>|</html>', '', html_str)
-    html_str = re.sub(r'<head>[\s\S]*?</head>', '', html_str)
-    html_str = re.sub(r'<script[^<>]*>[\s\S]*?</script>|</script>', '', html_str)
-    html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)
-    html_str = re.sub(r'<link[^<>]*>[\s\S]*?', '', html_str)
-    html_str = re.sub(r'<img[^>]*>', '', html_str)
-    return html_str
-
-
 def extract_text(source: str):
     soup = BeautifulSoup(source, "lxml")
     return soup.get_text()
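
A note on the rename above: get_url is now join_url, which better describes what the helper does. Only its final line, return urljoin(url, _data), survives the hunk's context window, so the construction of _data is elided. A minimal sketch consistent with that return statement, assuming _data is a '?'-prefixed urlencoded query string (the Bing search URL below is a placeholder, not taken from the commit):

    from urllib.parse import urlencode, urljoin

    def join_url(url: str, parameters: dict) -> str:
        # assumption: _data is the urlencoded query string prefixed with '?'
        _data = '?' + urlencode(parameters)
        return urljoin(url, _data)

    join_url('https://cn.bing.com/search', {'q': 'intitle:abc', 'FORM': 'PERE'})
    # -> 'https://cn.bing.com/search?q=intitle%3Aabc&FORM=PERE'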
@@ -150,11 +140,17 @@ def verify_text(val: str, length=50):
     return True
 
 
-def element2html(element: HtmlElement) -> str:
-    return unescape(tostring(element, encoding="utf-8").decode())
+def clean_whitespace(text: str):
+    """清洗空白符 \n=换行 \r=回车 \v=垂直制表符 \f=换页"""
+    obj = io.StringIO()
+    for i in text:
+        # keep " " and "\t" so tag names stay separated from their attributes
+        if i not in '\n\r\v\f':
+            obj.write(i)
+    return obj.getvalue()
 
 
-def html2element(source: str, base_url=None) -> HtmlElement:
+def clean_html(source):
     html_str = re.sub('\ufeff|\xa0|\u3000|\x00', '', source)
     html_str = re.sub('<!--[\s\S]*?-->', '', html_str)  # strip comments
     html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)  # strip styles
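
The new clean_whitespace helper writes characters into an io.StringIO buffer, skipping \n, \r, \v, and \f one character at a time. Space and \t are deliberately kept: stripping them would fuse a tag name with its attributes (e.g. <div class="x"> would collapse into <divclass="x">). A quick illustration of the behavior:

    clean_whitespace('<div class="a">\r\n\ttext\n</div>')
    # -> '<div class="a">\ttext</div>'  (\r and \n removed; the space and \t survive)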
@@ -162,12 +158,22 @@ def html2element(source: str, base_url=None) -> HtmlElement:
     html_str = re.sub('</?br.*?>', '', html_str)
     html_str = re.sub(r'<\?xml.*?>', '', html_str)
     html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
+    html_str = clean_whitespace(html_str)
+    return html_str
+
+
+def html2element(source: str, base_url=None) -> HtmlElement:
+    html_str = clean_html(source)
     if len(html_str) == 0:
-        # guard against errors when instantiating the HtmlElement after cleaning
+        # prevent fromstring from raising lxml.etree.ParserError after cleaning strips the junk tags
         html_str = '''<html lang="en"></html>'''
     return fromstring(html_str, base_url=base_url)
 
 
+def element2html(element: HtmlElement) -> str:
+    return unescape(tostring(element, encoding="utf-8").decode())
+
+
 def iter_node(element: HtmlElement, depth=1):
     yield element, depth
     depth += 1
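
With the reordering above, html2element now delegates all string cleanup to clean_html and keeps only the parsing concern. The length guard is the actual fix: lxml.html.fromstring raises lxml.etree.ParserError ("Document is empty") when handed an empty string, which can happen when a page consists entirely of strippable content. A sketch of the failure mode and the fallback, assuming nothing else in the input survives cleaning:

    from lxml.html import fromstring
    from lxml.etree import ParserError

    try:
        fromstring('')   # the pre-guard behavior
    except ParserError as e:
        print(e)         # Document is empty

    el = html2element('<?xml version="1.0"?><!-- comment only -->')
    print(el.tag)        # 'html' -- the stub document was substituted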