|
@@ -1,10 +1,11 @@
|
|
|
import re
|
|
|
from urllib.parse import urlencode, urljoin
|
|
|
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+from lxml.html import HtmlElement, fromstring, tostring
|
|
|
from urllib3 import get_host
|
|
|
|
|
|
from common.log import logger
|
|
|
-from common.tools import html2element
|
|
|
|
|
|
|
|
|
def err_details(worker):
|
|
@@ -83,3 +84,83 @@ def get_url(url: str, parameters: dict):
|
|
|
"""
|
|
|
_data = '?' + urlencode(parameters)
|
|
|
return urljoin(url, _data)
|
|
|
+
|
|
|
+
|
|
|
def iter_node(element: HtmlElement, depth=1):
    """Walk *element* depth-first, yielding ``(node, depth)`` pairs.

    The root itself is yielded first at *depth*; each child level is one
    deeper. Non-element children (comments, PIs) are skipped.
    """
    yield element, depth
    child_depth = depth + 1
    for child in element:
        if isinstance(child, HtmlElement):
            yield from iter_node(child, child_depth)
|
|
|
+
|
|
|
+
|
|
|
def element2html(element: HtmlElement) -> str:
    """Serialize an lxml element back into a unicode HTML string."""
    raw: bytes = tostring(element, encoding="utf-8")
    return raw.decode()
|
|
|
+
|
|
|
+
|
|
|
def html2element(html_str: str, base_url=None) -> HtmlElement:
    """Parse an HTML string into an lxml ``HtmlElement`` after scrubbing noise.

    Removes BOM/odd whitespace characters, ``<br>`` tags, XML declarations
    and DOCTYPE headers before handing the text to ``lxml.html.fromstring``.

    :param html_str: raw HTML text
    :param base_url: optional base URL forwarded to the parser
    :return: parsed root element
    """
    cleanup_patterns = (
        '\ufeff|\xa0|\u3000|\x00',   # BOM / nbsp / ideographic space / NUL
        '</?br.*?>',                 # line-break tags
        r'<\?xml.*?>',               # XML declaration
        r'<[!]DOCTYPE.*?>',          # DOCTYPE header
    )
    for pattern in cleanup_patterns:
        html_str = re.sub(pattern, '', html_str)
    return fromstring(html_str, base_url=base_url)
|
|
|
+
|
|
|
+
|
|
|
def valid_element(node: HtmlElement, feature: str) -> bool:
    """Return True when the XPath expression *feature* matches anything
    under *node*.

    :param node: element to query
    :param feature: XPath expression expected to return a node-set
    :return: True if the expression selects at least one result
    """
    # Collapses the verbose if/else True/False chain into one expression.
    # NOTE(review): assumes the XPath yields a list (node-set) — confirm
    # callers never pass count()/boolean() expressions, which return scalars.
    return len(node.xpath(feature)) > 0
|
|
|
+
|
|
|
+
|
|
|
def remove_node(node: HtmlElement):
    """Detach *node* from its tree.

    This is an in-place operation, not necessary to return; a root element
    (no parent) is left untouched.

    :param node: element to remove
    :return: None
    """
    parent = node.getparent()
    if parent is None:
        return
    parent.remove(node)
|
|
|
+
|
|
|
+
|
|
|
def drop_tag(node: HtmlElement):
    """Delete only the tag itself, merging its text/children into the parent
    (lxml's ``drop_tag``). A root element (no parent) is left untouched.

    :param node: element whose tag should be dropped
    :return: None
    """
    if node.getparent() is None:
        return
    node.drop_tag()
|
|
|
+
|
|
|
+
|
|
|
def clean_html(html_str: str) -> str:
    """Strip non-content markup from a raw HTML string.

    Removes HTML comments, the ``<html>`` wrapper tags, the whole ``<head>``
    block, ``<script>`` blocks (including orphan closing tags), ``<link>``
    tags, ``<style>`` blocks and ``<img>`` tags.

    :param html_str: raw HTML text
    :return: the text with the noise markup removed
    """
    noise_patterns = (
        r'<!--[\s\S]*?-->',                            # HTML comments
        r'<html>|<html [^>]*>|</html>',                # <html> wrapper tags
        r'<head>[\s\S]*?</head>',                      # entire head block
        r'<script[^<>]*>[\s\S]*?</script>|</script>',  # scripts + stray closers
        # Fixed: original pattern had a trailing lazy `[\s\S]*?` that could
        # only ever match the empty string — dead weight, behavior unchanged.
        r'<link[^<>]*>',                               # link tags
        r'<style[^<>]*>[\s\S]*?</style>',              # style blocks
        r'<img[^>]*>',                                 # images
    )
    for pattern in noise_patterns:
        html_str = re.sub(pattern, '', html_str)
    return html_str
|
|
|
+
|
|
|
+
|
|
|
def extract_text(html_str: str):
    """Return the plain text of *html_str*, parsed with BeautifulSoup's
    lxml backend."""
    return BeautifulSoup(html_str, "lxml").get_text()
|
|
|
+
|
|
|
+
|
|
|
def verify_text(val: str, length=50):
    """Check whether *val* carries enough real content to count as a
    detail page.

    HTML tags are stripped first, then every character that is not a
    digit, an ASCII letter or a CJK ideograph; the remainder must be at
    least *length* characters long.

    :param val: text (possibly HTML) to check; may be None
    :param length: minimum number of content characters required
    :return: True for valid text, False otherwise
    """
    if val is None:
        return False
    # Strip tags, then everything outside [0-9a-zA-Z / CJK ideographs].
    for pattern in (r'<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+'):
        val = re.sub(pattern, '', val)
    # Shorter than `length` means the page has no real detail content.
    # (Original used bare-string statements as comments — replaced with
    # real comments; the no-op string literals did nothing at runtime.)
    return len(val) >= length
|