|
@@ -3,6 +3,7 @@ import hashlib
|
|
|
import re
|
|
|
import time
|
|
|
|
|
|
+from bs4 import BeautifulSoup
|
|
|
from lxml.html import HtmlElement, fromstring, tostring
|
|
|
|
|
|
|
|
@@ -10,12 +11,12 @@ def element2html(element: HtmlElement) -> str:
|
|
|
return tostring(element, encoding="utf-8").decode()
|
|
|
|
|
|
|
|
|
def html2element(html_str: str, base_url=None) -> HtmlElement:
    """Parse an HTML string into an lxml element after light sanitising.

    Before parsing, the following are stripped from ``html_str``:

    * BOM (U+FEFF), no-break space (U+00A0), ideographic space (U+3000)
      and NUL characters;
    * ``<br>`` tags in any spelling (``<br>``, ``<BR/>``, ``</br>``);
    * XML declarations (``<?xml ...>``);
    * DOCTYPE declarations, regardless of letter case.

    Args:
        html_str: Raw HTML markup.
        base_url: Optional base URL, forwarded unchanged to
            ``lxml.html.fromstring``.

    Returns:
        The element produced by ``lxml.html.fromstring``.
    """
    # A character class does the same job as the alternation in one pass.
    html_str = re.sub('[\ufeff\xa0\u3000\x00]', '', html_str)
    # HTML tag names are case-insensitive; ``\b`` keeps the match to real
    # <br> tags and ``[^>]*`` lets the tag span a line break.
    html_str = re.sub(r'</?br\b[^>]*>', '', html_str, flags=re.IGNORECASE)
    # NOTE(review): presumably removed because lxml rejects str input that
    # carries an encoding declaration -- confirm against lxml's behaviour.
    html_str = re.sub(r'<\?xml.*?>', '', html_str)
    # "<!doctype html>" is the common lowercase spelling; match any case.
    html_str = re.sub(r'<!DOCTYPE.*?>', '', html_str, flags=re.IGNORECASE)
    return fromstring(html_str, base_url=base_url)
|
|
|
|
|
|
|
|
|
def valid_element(node: HtmlElement, feature: str):
|
|
@@ -36,6 +37,22 @@ def remove_node(node: HtmlElement):
|
|
|
parent.remove(node)
|
|
|
|
|
|
|
|
|
# Patterns are applied in this order; each match is deleted from the markup.
# Compiled once at import time and matched case-insensitively because HTML
# tag names are not case-sensitive.
_CLEAN_HTML_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'<!--[\s\S]*?-->',                             # comments
        r'<html(?:\s[^>]*)?>|</html>',                  # <html> open/close tags
        r'<head(?:\s[^<>]*)?>[\s\S]*?</head>',          # whole <head> section
        r'<script[^<>]*>[\s\S]*?</script>|</script>',   # scripts, stray closers
        r'<link[^<>]*>',                                # <link> tags
        r'<style[^<>]*>[\s\S]*?</style>',               # inline stylesheets
        r'<img[^>]*>',                                  # images
    )
)


def clean_html(html_str: str):
    """Remove non-content markup from an HTML string using regexes.

    Deletes, in order: comments, the ``<html>`` open/close tags, the entire
    ``<head>`` section, ``<script>`` elements (plus stray ``</script>``
    closers), ``<link>`` tags, ``<style>`` elements and ``<img>`` tags.
    Matching is case-insensitive, and ``<html>``/``<head>`` tags are
    recognised with or without attributes.

    Args:
        html_str: Raw HTML markup.

    Returns:
        The markup with the elements listed above removed.
    """
    for pattern in _CLEAN_HTML_PATTERNS:
        html_str = pattern.sub('', html_str)
    return html_str
|
|
|
+
|
|
|
+
|
|
|
def extract_text(html_str: str):
    """Return the text content of ``html_str``.

    The markup is parsed by BeautifulSoup with the ``"lxml"`` parser and the
    result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(html_str, "lxml").get_text()
|
|
|
+
|
|
|
+
|
|
|
def verify_text(val: str, length=50):
|
|
|
"""检查数字、字母、中文的个数"""
|
|
|
if val is None:
|