
Add a web page text search method

dongzhaorui committed 5fbf36cfb8 · 3 years ago

2 changed files with 59 additions and 18 deletions:
  1. find_source/crawler/defaults.py (+15 -1)
  2. find_source/crawler/utils.py (+44 -17)

find_source/crawler/defaults.py (+15 -1)

@@ -66,7 +66,21 @@ VALID_WORDS = {
     '邀请', '通知公告', '备案', '询价', '谈判', '中小企业', '分散采购', '中(终)止', '购买', '竟价',
     '竞争性谈判', '定点', '耕地', '拍卖公告', '物资', '省', '市', '县/区',
 }
-
+# Check words for page text content
+PAGE_TEXT_CHECK_WORDS = {
+    '招标', '流标', '评标', '询价', '中标候选人', '抽签', '谈判', '中选', '意见征询',
+    '更正公告', '废标', '补遗', '议价', '邀请', '资格预审', '竞标', '变更', '遴选',
+    '磋商', '项目', '评审', '询比', '开标', '澄清', '比选', '中止', '采购', '竟价',
+    '招投标', '拟建', '成交', '中标', '竞争性谈判', '工程', '验收公告', '更正',
+    '单一来源', '变更公告', '合同', '违规', '评判', '监理', '竞价', '答疑',
+    '终止', '系统'
+}
+# Filter words for page text content
+PAGE_TEXT_FILTER_WORDS = {
+    '基金', '保险', '通知', '面试', '进入', '律师事务所', '征求', '课题申报', '。',
+    '影视项目', '习近平', '主席', '领导人', '建党', '组织', '首个', '正式启动', '必填',
+    '代表队', '!', '表彰'
+}
 
 FOOTER_TEXTS = {}
 PAGE_TEXTS = {'尾页', '下页', '下一页'}
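
These two sets feed the new check_page_by_words helper added to utils.py below: a filter-word hit rejects a text before any check word is consulted. A minimal standalone sketch of that precedence (is_relevant is a hypothetical name; the committed implementation is check_page_by_words):

    import re

    from crawler.defaults import PAGE_TEXT_CHECK_WORDS, PAGE_TEXT_FILTER_WORDS

    def is_relevant(text: str) -> bool:
        # Hypothetical condensed form of check_page_by_words (added in utils.py below).
        if not 7 < len(text) < 100:
            return False
        if any(re.search(w, text) for w in PAGE_TEXT_FILTER_WORDS):
            return False  # a single filter-word hit vetoes the text
        return any(re.search(w, text) for w in PAGE_TEXT_CHECK_WORDS)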

find_source/crawler/utils.py (+44 -17)

@@ -2,8 +2,8 @@ import re
 from html import unescape
 from urllib.parse import urlencode, urljoin
 
+import tldextract
 from bs4 import BeautifulSoup
-from lxml.etree import ParseError
 from lxml.html import etree, HtmlElement, fromstring, tostring
 from urllib3 import get_host
 
@@ -13,7 +13,9 @@ from crawler.defaults import (
     USELESS_ATTR,
     TAGS_CAN_BE_REMOVE_IF_EMPTY,
     VALID_WORDS,
-    VOID_WORDS
+    VOID_WORDS,
+    PAGE_TEXT_CHECK_WORDS,
+    PAGE_TEXT_FILTER_WORDS
 )
 
 
@@ -24,6 +26,12 @@ def err_details(worker):
     return worker
 
 
+def split_domain(val: str):
+    if re.match(r'\d+', val) is None:
+        return re.split(r'[.:]', val)
+    return [val]
+
+
 def extract_host(url):
     """
 
@@ -33,19 +41,23 @@ def extract_host(url):
     return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"
 
 
-def split_domain(val: str):
-    if re.match(r'\d+', val) is None:
-        return re.split(r'[\\.:]', val)
-    return [val]
-
-
 def extract_domain(url):
     """
+    Extract the registered (first-level) domain, i.e. the domain and suffix joined with a dot (if the given host is an IPv4 address, the IPv4 address is returned).
 
-    # >>> base_url = extract_domain('http://192.168.3.207:8080/')
+    # >>> extract_domain('http://192.168.3.207:8080/')
+    '192.168.3.207'
+    # >>> extract_domain('http://forums.bbc.co.uk')
+    'bbc.co.uk'
     """
-    _, host, port = get_host(url)
-    return f"{host}" if port is None else f"{host}:{port}"
+    ext = tldextract.extract(url)
+    return ext.registered_domain or ext.ipv4
+
+
+def extract_fqdn(url):
+    """返回一个完全限定的域名"""
+    ext = tldextract.extract(url)
+    return ext.fqdn or ext.ipv4
 
 
 def extract_page_title(source):
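
For reference, the tldextract behavior the two helpers above rely on (a sketch, not part of the commit; results assume tldextract resolving against its bundled public-suffix list):

    import tldextract

    ext = tldextract.extract('http://forums.bbc.co.uk')
    ext.registered_domain   # 'bbc.co.uk' (domain and suffix joined with a dot)
    ext.fqdn                # 'forums.bbc.co.uk'
    # IPv4 hosts have no registered domain or fqdn; the ipv4 property applies instead:
    tldextract.extract('http://192.168.3.207:8080/').ipv4  # '192.168.3.207'
    # Empty or unparsable input leaves ext.domain empty, which is falsy;
    # that is what the is_domain() rewrite further below tests.
    tldextract.extract('').domain   # ''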
@@ -53,7 +65,7 @@ def extract_page_title(source):
     try:
         element = html2element(source)
         node = element.xpath('/html/head/title/text()|//title/text()')
-    except ParseError:
+    except etree.ParserError:
         pass
     if len(node) > 1:
         return "".join(";".join(node).split())
@@ -73,11 +85,10 @@ def is_url(url):
 
 
 def is_domain(domain):
-    _regex = re.compile(
-        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
-        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
-        r'(?::\d+)?', re.IGNORECASE)
-    return re.match(_regex, domain) is not None
+    ext = tldextract.extract(domain)
+    if not ext.domain:
+        return False
+    return True
 
 
 def label_split(val):
@@ -142,6 +153,9 @@ def html2element(source: str, base_url=None) -> HtmlElement:
     html_str = re.sub('</?br.*?>', '', html_str)
     html_str = re.sub(r'<\?xml.*?>', '', html_str)
     html_str = re.sub(r'<[!]DOCTYPE.*?>', '', html_str)
+    if len(html_str) == 0:
+        # Guard against an empty string after cleaning: fromstring() raises when instantiating an HtmlElement from nothing
+        html_str = '''<html lang="en"></html>'''
     return fromstring(html_str, base_url=base_url)
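
Without this guard, a page whose markup is entirely consumed by the cleaning regexes above reaches fromstring() as an empty string and raises lxml's ParserError ("Document is empty"); with it, such input degrades to an empty placeholder element. A quick check, assuming the committed html2element:

    from crawler.utils import html2element

    element = html2element('<?xml version="1.0"?>')  # stripped to '' by the regexes above
    print(element.tag)  # 'html': the placeholder element instead of a ParserError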
 
 
@@ -239,3 +253,16 @@ def check_text_by_words(val: str):
         if search is not None:
             return True
     return False
+
+
+def check_page_by_words(val: str):
+    if 7 < len(val) < 100:
+        for word in PAGE_TEXT_FILTER_WORDS:
+            search = re.search(word, val)
+            if search is not None:
+                return False
+        for keyword in PAGE_TEXT_CHECK_WORDS:
+            search = re.search(keyword, val)
+            if search is not None:
+                return True
+    return False
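
A usage sketch of the new helper: only texts of 8 to 99 characters that contain a check word and no filter word pass.

    from crawler.utils import check_page_by_words

    check_page_by_words('某某单位办公设备采购项目中标公告')  # True: hits check words '采购' and '中标'
    check_page_by_words('关于表彰先进集体的通知')            # False: contains filter word '表彰'
    check_page_by_words('招标')                              # False: too short (len(val) <= 7)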