dongzhaorui 3 years ago
parent
commit
8ff4c2363d
2 changed files with 82 additions and 66 deletions
  1. 0 65
      find_source/common/tools.py
  2. 82 1
      find_source/crawler/utils.py

+ 0 - 65
find_source/common/tools.py

@@ -1,72 +1,7 @@
 import datetime
 import hashlib
-import re
 import time
 
-from bs4 import BeautifulSoup
-from lxml.html import HtmlElement, fromstring, tostring
-
-
-def element2html(element: HtmlElement) -> str:
-    return tostring(element, encoding="utf-8").decode()
-
-
-def html2element(html_str: str, base_url=None) -> HtmlElement:
-    html_str = re.sub('\ufeff|\xa0|\u3000|\x00', '', html_str)
-    html_str = re.sub('</?br.*?>', '', html_str)
-    html_str = re.sub(r'<\?xml.*?>', '', html_str)
-    html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
-    return fromstring(html_str, base_url=base_url)
-
-
-def valid_element(node: HtmlElement, feature: str):
-    if len(node.xpath(feature)) > 0:
-        return True
-    else:
-        return False
-
-
-def remove_node(node: HtmlElement):
-    """
-    this is a in-place operation, not necessary to return
-    :param node:
-    :return:
-    """
-    parent = node.getparent()
-    if parent is not None:
-        parent.remove(node)
-
-
-def clean_html(html_str: str):
-    html_str = re.sub(r'<!--[\s\S]*?-->', '', html_str)
-    html_str = re.sub(r'<html>|<html [^>]*>|</html>', '', html_str)
-    html_str = re.sub(r'<head>[\s\S]*?</head>', '', html_str)
-    html_str = re.sub(r'<script[^<>]*>[\s\S]*?</script>|</script>', '', html_str)
-    html_str = re.sub(r'<link[^<>]*>[\s\S]*?', '', html_str)
-    html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)
-    html_str = re.sub(r'<img[^>]*>', '', html_str)
-    return html_str
-
-
-def extract_text(html_str: str):
-    soup = BeautifulSoup(html_str, "lxml")
-    return soup.get_text()
-
-
-def verify_text(val: str, length=50):
-    """检查数字、字母、中文的个数"""
-    if val is None:
-        return False
-    sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
-    for pattern in sub_pattern:
-        val = re.sub(pattern, '', val)
-    # 若文本长度小于指定文本长度(length),表示页面内容无详情内容
-    if len(val) < length:
-        '''无效文本'''
-        return False
-    '''有效文本'''
-    return True
-
 
 def sha1(text: str):
     """

+ 82 - 1
find_source/crawler/utils.py

@@ -1,10 +1,11 @@
 import re
 from urllib.parse import urlencode, urljoin
 
+from bs4 import BeautifulSoup
+from lxml.html import HtmlElement, fromstring, tostring
 from urllib3 import get_host
 
 from common.log import logger
-from common.tools import html2element
 
 
 def err_details(worker):
@@ -83,3 +84,83 @@ def get_url(url: str, parameters: dict):
     """
     _data = '?' + urlencode(parameters)
     return urljoin(url, _data)
+
+
def iter_node(element: HtmlElement, depth=1):
    """Walk *element* depth-first in pre-order.

    Yields ``(node, depth)`` pairs; the starting element itself is yielded
    first with ``depth`` (default 1), each child one level deeper.
    """
    yield element, depth
    for child in element:
        # lxml iteration can yield comments/PIs too; keep real elements only.
        if isinstance(child, HtmlElement):
            yield from iter_node(child, depth + 1)
+
+
def element2html(element: HtmlElement) -> str:
    """Serialize an lxml element tree back into an HTML string."""
    html_bytes = tostring(element, encoding="utf-8")
    return html_bytes.decode()
+
+
def html2element(html_str: str, base_url=None) -> HtmlElement:
    """Parse *html_str* into an lxml element after stripping noise.

    Invisible junk characters (BOM, NBSP, ideographic space, NUL),
    ``<br>`` tags, XML declarations and DOCTYPEs are removed before
    parsing, as they interfere with downstream extraction.
    """
    noise_patterns = (
        '\ufeff|\xa0|\u3000|\x00',  # invisible / whitespace junk
        '</?br.*?>',                # line-break tags
        r'<\?xml.*?>',              # XML declaration
        r'<[!]DOCTYPE.*?>',         # document type declaration
    )
    for noise in noise_patterns:
        html_str = re.sub(noise, '', html_str)
    return fromstring(html_str, base_url=base_url)
+
+
def valid_element(node: "HtmlElement", feature: str) -> bool:
    """Return ``True`` if the XPath *feature* matches anything under *node*.

    :param node: lxml element to query.
    :param feature: XPath expression expected to return a node-set (list).
    :return: ``True`` when at least one result is found, else ``False``.
    """
    # xpath() returns a list for node-set expressions; non-empty == match.
    # (Was an if/else returning literal True/False — collapsed to the
    # boolean expression itself, identical behavior.)
    return len(node.xpath(feature)) > 0
+
+
def remove_node(node: HtmlElement):
    """
    Detach *node* from its tree (in-place operation, nothing returned).
    A node with no parent (the root) is left untouched.
    :param node:
    :return:
    """
    parent = node.getparent()
    if parent is None:
        return
    parent.remove(node)
+
+
def drop_tag(node: HtmlElement):
    """
    Remove only the tag itself, merging its text and children into the
    parent. A node with no parent (the root) is left untouched.
    :param node:
    :return:
    """
    if node.getparent() is None:
        return
    node.drop_tag()
+
+
def clean_html(html_str: str) -> str:
    """Strip non-content markup from *html_str* and return the remainder.

    Removes HTML comments, the outer ``<html>`` wrapper, the whole
    ``<head>`` section, ``<script>``/``<style>`` blocks, ``<link>`` tags
    and ``<img>`` tags. Body markup other than these is preserved.

    :param html_str: raw HTML source.
    :return: the cleaned HTML string.
    """
    # Order matters: comments may themselves contain tags, so they go first.
    patterns = (
        r'<!--[\s\S]*?-->',                            # HTML comments
        r'<html>|<html [^>]*>|</html>',                # outer <html> wrapper
        r'<head>[\s\S]*?</head>',                      # entire head section
        r'<script[^<>]*>[\s\S]*?</script>|</script>',  # scripts + stray closers
        # NOTE: the original pattern had a trailing lazy `[\s\S]*?` which
        # always matched the empty string (dead regex) — dropped, no
        # behavioral change.
        r'<link[^<>]*>',                               # link tags
        r'<style[^<>]*>[\s\S]*?</style>',              # style blocks
        r'<img[^>]*>',                                 # images
    )
    for pattern in patterns:
        html_str = re.sub(pattern, '', html_str)
    return html_str
+
+
def extract_text(html_str: str):
    """Return the plain text of *html_str* with all markup stripped."""
    return BeautifulSoup(html_str, "lxml").get_text()
+
+
def verify_text(val: str, length: int = 50) -> bool:
    """Check whether *val* carries enough meaningful text.

    Strips HTML tags, then every character that is not a digit, an ASCII
    letter, or a CJK ideograph, and compares what remains against
    *length*.

    :param val: raw text, possibly containing HTML; ``None`` is invalid.
    :param length: minimum count of remaining characters for the text to
        be considered valid.
    :return: ``True`` for valid text, ``False`` otherwise.
    """
    if val is None:
        return False
    # First drop markup, then everything outside digits/letters/CJK.
    for pattern in (r'<[^>]+>', r'[^0-9a-zA-Z\u4e00-\u9fa5]+'):
        val = re.sub(pattern, '', val)
    # Fewer than `length` meaningful characters means the page has no
    # real detail content.  (The original used dead bare-string
    # statements as pseudo-comments; collapsed to a single boolean.)
    return len(val) >= length