commit 2096fc8cb5 (dongzhaorui, 3 years ago)

+ 5 - 0
find_source/common/tools.py

@@ -1,5 +1,6 @@
 import datetime
 import hashlib
+import json
 import time
 
 
@@ -69,3 +70,7 @@ def delay_by(delay=0, method='seconds', fmt="%Y-%m-%d %H:%M:%S"):
     else:
         _timedelta = datetime.timedelta(seconds=delay)
     return (_current_now + _timedelta).strftime(fmt)
+
+
+def detect_encoding(b: bytes) -> str:
+    # Thin wrapper over json.detect_encoding, an undocumented but long-stable
+    # stdlib helper that sniffs UTF-8/UTF-16/UTF-32 from the leading bytes.
+    return json.detect_encoding(b)
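
Note: json.detect_encoding keys off BOMs and the null-byte layout of the first
bytes (the RFC 8259 heuristic); since HTML also starts with ASCII, it works for
pages too. A minimal usage sketch for responses with a missing charset header
(requests and the URL are assumptions, not part of this commit):

    import requests

    from common.tools import detect_encoding

    resp = requests.get('http://example.com/notice.html')  # hypothetical page
    # decode with the sniffed codec instead of trusting the HTTP header
    html = resp.content.decode(detect_encoding(resp.content), errors='replace')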

+ 52 - 0
find_source/crawler/analysis/TimeExtractor.py

@@ -0,0 +1,52 @@
+import re
+
+from lxml.html import HtmlElement
+
+# Ordered from most to least specific: the first matching pattern wins, so the
+# bare month-day forms must come last. Raw strings avoid invalid-escape
+# warnings, and separators are [-/.] (a character class must not contain '|').
+DATETIME_PATTERN = [
+    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
+    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
+    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
+    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9])",
+    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-2]?[0-9]时[0-5]?[0-9]分)",
+    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
+    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
+    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])",
+    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?2[0-3]:[0-5]?[0-9])",
+    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2}\s*?[0-2]?[0-9]时[0-5]?[0-9]分)",
+    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
+    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
+    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
+    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
+    r"(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]时[0-5]?[0-9]分)",
+    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
+    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
+    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
+    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
+    r"(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]时[0-5]?[0-9]分)",
+    r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])",
+    r"(\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9]:[0-5]?[0-9])",
+    r"(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])",
+    r"(\d{1,2}月\d{1,2}日\s*?2[0-3]:[0-5]?[0-9])",
+    r"(\d{1,2}月\d{1,2}日\s*?[0-2]?[0-9]时[0-5]?[0-9]分)",
+    r"(\d{4}[-/.]\d{1,2}[-/.]\d{1,2})",
+    r"(\d{2}[-/.]\d{1,2}[-/.]\d{1,2})",
+    r"(\d{4}年\d{1,2}月\d{1,2}日)",
+    r"(\d{2}年\d{1,2}月\d{1,2}日)",
+    r"(\d{1,2}月\d{1,2}日)",
+    r"(\d{1,2}[-/.]\d{1,2})",
+]
+
+
+class TimeExtractor:
+    def __init__(self):
+        self.time_pattern = DATETIME_PATTERN
+
+    def extractor(self, element: HtmlElement) -> str:
+        # xpath('string(.)') already returns the concatenated text content.
+        text = element.xpath('string(.)')
+        for dt in self.time_pattern:
+            dt_obj = re.search(dt, text)
+            if dt_obj:
+                return dt_obj.group(1)
+        return ''
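
A quick sketch of the extractor in use (the HTML snippet is invented):

    from lxml.html import fromstring

    from crawler.analysis import TimeExtractor

    html = '<div><h1>某项目招标公告</h1><p>发布时间:2021-03-05 09:30</p></div>'
    element = fromstring(html)
    print(TimeExtractor().extractor(element))  # -> '2021-03-05 09:30'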

+ 3 - 1
find_source/crawler/analysis/__init__.py

@@ -1,9 +1,11 @@
 from .DomAnalysis import DomAnalysis
+from .TimeExtractor import TimeExtractor
 
 __all__ = [
     'parse_urls',
     'parser_items',
-    'Parser'
+    'Parser',
+    'TimeExtractor'
 ]
 
 

+ 57 - 0
find_source/crawler/defaults.py

@@ -0,0 +1,57 @@
+TAGS_CAN_BE_REMOVE_IF_EMPTY = ['section', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']
+
+USELESS_TAG = [
+    'style',
+    'script',
+    'link',
+    'video',
+    'iframe',
+    'source',
+    'picture',
+    'blockquote',
+    'input',
+    'footer',
+    'img',
+]
+USELESS_ATTR = {
+    'share',
+    'contribution',
+    'copyright',
+    'copy-right',
+    'disclaimer',
+    'recommend',
+    'related',
+    'footer',
+    'foot',
+    'comment',
+    'header',
+    'social',
+    'submeta',
+    'report-infor',
+    'tfooter',
+    'logo',
+    'bottom',
+    'nav',
+    'top',
+    'position',
+    'location',
+    'page',
+    'navigation',
+}
+
+
+# Keywords that mark tender/procurement announcements (used by is_title).
+KEYWORDS = {
+    '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
+    '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
+    '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
+    '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
+    '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
+    '终止', '系统'
+}
+
+# Link texts that identify page chrome: footers, category navigation,
+# pagination controls and login widgets.
+FOOTER_TEXTS = {'网安备', '关于我们', '地图', '建议意见', '法律声明', '信箱'}
+CATEGORY_TEXTS = {'政策', '办事指南', '首页', '党'}
+PAGE_TEXTS = {'尾页', '下页', '下一页'}
+LOGIN_TEXTS = {'忘记密码', '登录', '注册'}
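
A sketch of how these sets could be combined to flag navigation chrome when
scoring candidate links (is_noise_link is a hypothetical helper, not part of
this commit):

    from crawler.defaults import (
        FOOTER_TEXTS, CATEGORY_TEXTS, PAGE_TEXTS, LOGIN_TEXTS,
    )

    NOISE_TEXTS = FOOTER_TEXTS | CATEGORY_TEXTS | PAGE_TEXTS | LOGIN_TEXTS

    def is_noise_link(anchor_text: str) -> bool:
        # Flag anchor text that contains any chrome keyword.
        return any(t in anchor_text for t in NOISE_TEXTS)

    print(is_noise_link('下一页'))  # True: pagination control
    print(is_noise_link('某设备采购项目招标公告'))  # False: looks like content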

+ 112 - 42
find_source/crawler/utils.py

@@ -1,11 +1,18 @@
 import re
+from html import unescape
 from urllib.parse import urlencode, urljoin
 
 from bs4 import BeautifulSoup
-from lxml.html import HtmlElement, fromstring, tostring
+from lxml.html import etree, HtmlElement, fromstring, tostring
 from urllib3 import get_host
 
 from common.log import logger
+from crawler.defaults import (
+    USELESS_TAG,
+    USELESS_ATTR,
+    TAGS_CAN_BE_REMOVE_IF_EMPTY,
+    KEYWORDS,
+)
 
 
 def err_details(worker):
@@ -39,8 +46,8 @@ def extract_domain(url):
     return f"{host}" if port is None else f"{host}:{port}"
 
 
-def extract_page_title(html):
-    element = html2element(html)
+def extract_page_title(source):
+    element = html2element(source)
     nodes = element.xpath('/html/head/title/text()|//title/text()')
     if len(nodes) > 1:
         return "".join(";".join(nodes).split())
@@ -86,31 +93,59 @@ def get_url(url: str, parameters: dict):
     return urljoin(url, _data)
 
 
-def iter_node(element: HtmlElement, depth=1):
-    yield element, depth
-    depth += 1
-    for sub_element in element:
-        if isinstance(sub_element, HtmlElement):
-            yield from iter_node(sub_element, depth)
+def clean_html(source: str):
+    """Strip comments, head, scripts, styles, links and images from raw HTML."""
+    html_str = re.sub(r'<!--[\s\S]*?-->', '', source)
+    html_str = re.sub(r'<html>|<html [^>]*>|</html>', '', html_str)
+    html_str = re.sub(r'<head>[\s\S]*?</head>', '', html_str)
+    html_str = re.sub(r'<script[^<>]*>[\s\S]*?</script>|</script>', '', html_str)
+    html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)
+    html_str = re.sub(r'<link[^<>]*>', '', html_str)
+    html_str = re.sub(r'<img[^>]*>', '', html_str)
+    return html_str
+
+
+def extract_text(source: str):
+    soup = BeautifulSoup(source, "lxml")
+    return soup.get_text()
+
+
+def verify_text(val: str, length=50):
+    """Check that the text keeps enough digits, letters and Chinese characters."""
+    if val is None:
+        return False
+    # Strip residual tags, then everything that is not a digit, letter or CJK char.
+    sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
+    for pattern in sub_pattern:
+        val = re.sub(pattern, '', val)
+    # Shorter than `length` means the page carries no real detail content.
+    return len(val) >= length
 
 
 def element2html(element: HtmlElement) -> str:
-    return tostring(element, encoding="utf-8").decode()
+    return unescape(tostring(element, encoding="utf-8").decode())
 
 
-def html2element(html_str: str, base_url=None) -> HtmlElement:
-    html_str = re.sub('\ufeff|\xa0|\u3000|\x00', '', html_str)
+def html2element(source: str, base_url=None) -> HtmlElement:
+    html_str = re.sub('\ufeff|\xa0|\u3000|\x00', '', source)
+    html_str = re.sub(r'<!--[\s\S]*?-->', '', html_str)  # strip comments
+    html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)  # strip styles
+    html_str = re.sub(r'<script[^<>]*>[\s\S]*?</script>', '', html_str)  # strip scripts
     html_str = re.sub('</?br.*?>', '', html_str)
     html_str = re.sub(r'<\?xml.*?>', '', html_str)
     html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
     return fromstring(html_str, base_url=base_url)
 
 
-def valid_element(node: HtmlElement, feature: str):
-    if len(node.xpath(feature)) > 0:
-        return True
-    else:
-        return False
+def iter_node(element: HtmlElement, depth=1):
+    yield element, depth
+    depth += 1
+    for sub_element in element:
+        if isinstance(sub_element, HtmlElement):
+            yield from iter_node(sub_element, depth)
 
 
 def remove_node(node: HtmlElement):
@@ -121,7 +156,8 @@ def remove_node(node: HtmlElement):
     """
     parent = node.getparent()
     if parent is not None:
-        parent.remove(node)
+        node.drop_tree()
 
 
 def drop_tag(node: HtmlElement):
@@ -135,32 +171,66 @@ def drop_tag(node: HtmlElement):
         node.drop_tag()
 
 
-def clean_html(html_str: str):
-    html_str = re.sub(r'<!--[\s\S]*?-->', '', html_str)
-    html_str = re.sub(r'<html>|<html [^>]*>|</html>', '', html_str)
-    html_str = re.sub(r'<head>[\s\S]*?</head>', '', html_str)
-    html_str = re.sub(r'<script[^<>]*>[\s\S]*?</script>|</script>', '', html_str)
-    html_str = re.sub(r'<link[^<>]*>[\s\S]*?', '', html_str)
-    html_str = re.sub(r'<style[^<>]*>[\s\S]*?</style>', '', html_str)
-    html_str = re.sub(r'<img[^>]*>', '', html_str)
-    return html_str
+def is_empty_element(node: HtmlElement):
+    # len(node) counts child elements; lxml's getchildren() is deprecated.
+    return len(node) == 0 and not node.text
 
 
-def extract_text(html_str: str):
-    soup = BeautifulSoup(html_str, "lxml")
-    return soup.get_text()
+def normalize_node(element: HtmlElement):
+    etree.strip_elements(element, *USELESS_TAG, with_tail=False)
+    # Pre-processing pass: merge empty nodes and drop noise nodes.
+    # Note: deleting and updating nodes in the same loop is fragile, likely
+    # because mutating the tree shifts siblings under the live iterator.
+    for node, _ in iter_node(element):
+        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
+            remove_node(node)
 
+        if node.tag.lower() == 'p':
+            # Inline <span>/<strong> wrappers inside paragraphs add no structure.
+            etree.strip_tags(node, 'span', 'strong')
 
-def verify_text(val: str, length=50):
+        # A <div> or <span> with no child elements only carries text,
+        # so it can be converted to a <p>.
+        if node.tag.lower() in ('div', 'span') and len(node) == 0:
+            node.tag = 'p'
+
+        # remove empty p tag
+        if node.tag.lower() == 'p' and not node.xpath('.//img'):
+            if not (node.text and node.text.strip()):
+                drop_tag(node)
+
+        # Delete inline styles
+        style = node.get('style')
+        if style:
+            del node.attrib['style']
+
+    # Drop nodes whose id/class exactly matches a noise attribute. At most one
+    # node is removed per call: removing more while the generator is still
+    # walking the tree could skip siblings (see the note above).
+    for node, _ in iter_node(element):
+        attr = (node.get('id') or node.get('class'))
+        if attr and attr.lower() in USELESS_ATTR:
+            remove_node(node)
+            break
+
+    # # Drop noise nodes by fuzzy attribute matching (disabled):
+    # for node, _ in iter_node(element):
+    #     attrib = (node.get('id') or node.get('class'))
+    #     if attrib:
+    #         for attr in USELESS_ATTR:
+    #             if re.match(attr, attrib.lower()) is not None:
+    #                 remove_node(node)
+    #                 break
+
+
+def pre_parse(element: HtmlElement) -> HtmlElement:
+    normalize_node(element)
+    return element
+
+
+def is_title(val: str):
-    """检查数字、字母、中文的个数"""
+    """Check whether the text mentions any tender/procurement keyword."""
-    if val is None:
-        return False
-    sub_pattern = ['<[^>]+>', '[^0-9a-zA-Z\u4e00-\u9fa5]+']
-    for pattern in sub_pattern:
-        val = re.sub(pattern, '', val)
-    # 若文本长度小于指定文本长度(length),表示页面内容无详情内容
-    if len(val) < length:
-        '''无效文本'''
-        return False
-    '''有效文本'''
-    return True
+    for keyword in KEYWORDS:
+        if re.search(keyword, val):
+            return True
+    return False
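
Taken together, a minimal sketch of the pipeline this commit assembles,
assuming find_source/ is on sys.path (the HTML is a placeholder):

    from crawler.analysis import TimeExtractor
    from crawler.utils import (
        extract_text, html2element, is_title, pre_parse, verify_text,
    )

    source = (
        '<html><body><div class="content">'
        '<h3>某设备采购项目招标公告</h3><p>2021-03-05 09:30</p>'
        '</div></body></html>'
    )
    element = pre_parse(html2element(source))
    print(TimeExtractor().extractor(element))   # -> '2021-03-05 09:30'
    print(is_title('某设备采购项目招标公告'))     # True: contains '采购'/'招标'
    print(verify_text(extract_text(source)))    # False: fewer than 50 chars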