|
@@ -1,3 +1,4 @@
|
|
|
+import io
|
|
|
import operator
|
|
|
import re
|
|
|
import zlib
|
|
@@ -107,7 +108,7 @@ def label_split(val):
|
|
|
return result
|
|
|
|
|
|
|
|
|
-def get_url(url: str, parameters: dict):
|
|
|
+def join_url(url: str, parameters: dict):
|
|
|
"""
|
|
|
拼接url与所带参数
|
|
|
|
|
@@ -119,17 +120,6 @@ def get_url(url: str, parameters: dict):
|
|
|
return urljoin(url, _data)
|
|
|
|
|
|
|
|
|
-def clean_html(source: str):
|
|
|
- html_str = re.sub(r'<!--[\s\S]*?-->', '', source)
|
|
|
- html_str = re.sub(r'<html>|<html [^>]*>|</html>', '', html_str)
|
|
|
- html_str = re.sub(r'<head>[\s\S]*?</head>', '', html_str)
|
|
|
- html_str = re.sub(r'<script[^<>]*>[\s\S]*?</script>|</script>', '', html_str)
|
|
|
- html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)
|
|
|
- html_str = re.sub(r'<link[^<>]*>[\s\S]*?', '', html_str)
|
|
|
- html_str = re.sub(r'<img[^>]*>', '', html_str)
|
|
|
- return html_str
|
|
|
-
|
|
|
-
|
|
|
def extract_text(source: str):
    """Parse *source* with BeautifulSoup's "lxml" parser and return its text."""
    return BeautifulSoup(source, "lxml").get_text()
|
|
@@ -150,11 +140,17 @@ def verify_text(val: str, length=50):
|
|
|
return True
|
|
|
|
|
|
|
|
|
-def element2html(element: HtmlElement) -> str:
|
|
|
- return unescape(tostring(element, encoding="utf-8").decode())
|
|
|
# Deletion table for line feed, carriage return, vertical tab and form feed.
_LINE_BREAK_DELETIONS = str.maketrans('', '', '\n\r\v\f')


def clean_whitespace(text: str):
    """Remove line-breaking control characters from *text*.

    The characters removed are line feed, carriage return, vertical tab and
    form feed. Spaces and tabs are deliberately kept so that tag names stay
    separated from their attributes in the cleaned markup.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without the four characters listed above.
    """
    # A single translate pass replaces the per-character StringIO loop.
    return text.translate(_LINE_BREAK_DELETIONS)
|
|
|
|
|
|
|
|
|
-def html2element(source: str, base_url=None) -> HtmlElement:
|
|
|
+def clean_html(source):
|
|
|
html_str = re.sub('\ufeff|\xa0|\u3000|\x00', '', source)
|
|
|
html_str = re.sub('<!--[\s\S]*?-->', '', html_str) # 清除注释
|
|
|
html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str) # 清除样式
|
|
@@ -162,12 +158,22 @@ def html2element(source: str, base_url=None) -> HtmlElement:
|
|
|
html_str = re.sub('</?br.*?>', '', html_str)
|
|
|
html_str = re.sub(r'<\?xml.*?>', '', html_str)
|
|
|
html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
|
|
|
+ html_str = clean_whitespace(html_str)
|
|
|
+ return html_str
|
|
|
+
|
|
|
+
|
|
|
def html2element(source: str, base_url=None) -> HtmlElement:
    """Clean *source* with clean_html and parse the result with fromstring.

    *base_url* is forwarded unchanged to fromstring.
    """
    cleaned = clean_html(source)
    if not cleaned:
        # Cleaning can strip the whole page; substitute a minimal document so
        # that fromstring does not raise lxml.etree.ParserError.
        cleaned = '''<html lang="en"></html>'''
    return fromstring(cleaned, base_url=base_url)
|
|
|
|
|
|
|
|
|
def element2html(element: HtmlElement) -> str:
    """Serialize *element* as UTF-8, decode it, and return it unescaped."""
    serialized = tostring(element, encoding="utf-8")
    return unescape(serialized.decode())
|
|
|
+
|
|
|
+
|
|
|
def iter_node(element: HtmlElement, depth=1):
|
|
|
yield element, depth
|
|
|
depth += 1
|