dongzhaorui 3 years ago
parent
commit
9c8e88f949

+ 7 - 6
find_source/build_spider.py

@@ -1,19 +1,20 @@
 from crawler import BreadthCrawler
-from crawler.search import BingSearchEngine
+from crawler.engines import BingSearchEngine
 
 
 def main():
     engines_lst = [BingSearchEngine()]
     BreadthCrawler(
-        allow_load_filter=True,
-        engines=engines_lst,
+        init_validator=True,
         url_weight=20,
         org_weight=5,
         keyword_weight=15,
-        max_search_page=30,
+        engines=engines_lst,
+        max_query_page=30,
+        loop_sync_interval=1200,
+        loop_query_interval=30,
+        loop_excavate_interval=10,
         excavate_workers=1,
-        loop_search_interval=30,
-        loop_excavate_interval=10
     ).start()
 
 

+ 1 - 1
find_source/common/databases.py

@@ -56,7 +56,7 @@ def update_one(collection: Collection, item):
 
 def insert_one(collection: Collection, item):
     collection.insert_one(item)
-    logger.info(f'[入库成功 - {collection.name}]{item["_id"]}')
+    logger.info(f'[{collection.name} - 入库成功] ObjectId("{item["_id"]}")')
 
 
 def insert_many(collection: Collection, items):
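
For reference, a quick sketch of why `item["_id"]` is available to the new log line above: pymongo's `insert_one` fills a generated `ObjectId` into the passed dict when no `_id` is present (hypothetical connection details and throwaway database/collection names):

```python
from pymongo import MongoClient

client = MongoClient('localhost', 27017)           # assumed local MongoDB
collection = client['demo_db']['demo_collection']   # throwaway names

item = {'domain': 'example.com'}
result = collection.insert_one(item)

# insert_one adds the generated _id to the original dict in place,
# so the f-string in insert_one() above can read it right after the call.
print(item['_id'] == result.inserted_id)  # True
print(f'[{collection.name} - 入库成功] ObjectId("{item["_id"]}")')
```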

+ 19 - 10
find_source/crawler/Task.py

@@ -24,11 +24,10 @@ class Task(UserDict):
             groups='',
             origin='',
             weight=1,
-            sensitive=False,
-            duplication=False,
-            requirement=False,
-            create_at=None,
-            update_at=None,
+            status_code=None,
+            err_reason='',
+            create_at=int2long(int(time.time())),
+            update_at=int2long(int(time.time())),
     ):
         super(Task, self).__init__(
             name=name,
@@ -40,9 +39,19 @@ class Task(UserDict):
             groups=groups,
             origin=origin,
             weight=weight,
-            sensitive=sensitive,
-            duplication=duplication,
-            requirement=requirement,
-            create_at=(create_at or int2long(int(time.time()))),
-            update_at=(update_at or int2long(int(time.time()))),
+            status_code=status_code,
+            err_reason=err_reason,
+            create_at=create_at,
+            update_at=update_at,
         )
+
+    def _update_at(self):
+        super(Task, self).__setitem__('update_at', int2long(int(time.time())))
+
+    def __setitem__(self, key, value):
+        self._update_at()
+        super(Task, self).__setitem__(str(key), value)
+
+    def __getitem__(self, item):
+        self._update_at()
+        return super(Task, self).__getitem__(item)
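
A trimmed-down sketch (not the repo's Task class; `int2long` and the remaining fields are omitted) of how the new defaults and the `__setitem__`/`__getitem__` overrides behave: note that Python evaluates the `create_at`/`update_at` defaults once, when the `def` line runs, so per-instance timestamps still require passing explicit values.

```python
import time
from collections import UserDict


class TaskSketch(UserDict):
    # Defaults are computed once, at definition time (same as the signature above).
    def __init__(self, create_at=int(time.time()), update_at=int(time.time())):
        super().__init__(create_at=create_at, update_at=update_at)

    def _touch(self):
        super().__setitem__('update_at', int(time.time()))

    def __setitem__(self, key, value):
        self._touch()                        # every write refreshes update_at
        super().__setitem__(str(key), value)

    def __getitem__(self, item):
        self._touch()                        # every read refreshes it as well
        return super().__getitem__(item)


t1 = TaskSketch()
time.sleep(2)
t2 = TaskSketch()
print(t1['create_at'] == t2['create_at'])  # True: both reuse the definition-time default
t1['weight'] = 20
print(t1['update_at'] >= t1['create_at'])  # True: the write above bumped update_at
```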

+ 0 - 0
find_source/crawler/search/engine.py → find_source/crawler/engines.py


+ 0 - 59
find_source/crawler/retrieve/verify.py

@@ -1,59 +0,0 @@
-import threading
-import time
-
-from common.log import logger
-from crawler.bloom_filter.RedisBloomFilter import RedisFilter
-from settings import (
-    MGO_REMOVAL_DUPLICATE,
-    REQUIREMENT_PHRASE
-)
-
-
-def _requirement_phrase(val: str):
-    """关键词"""
-    for word in REQUIREMENT_PHRASE:
-        if val.find(word) != -1:
-            return True
-    return False
-
-
-class Validator:
-
-    def __init__(self):
-        self._rbf = RedisFilter(redis_key='RemovalDuplicate_')
-        self._rbf.start(1000000000, 0.00001)
-        self._requirement_phrase = _requirement_phrase
-        self._loop_Interval = 7200
-
-    def _sync_data_rubbish(self):
-        while True:
-            count = 0
-            cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
-            try:
-                for item in cursor.sort([('_id', -1)]):
-                    domain = item['domain']
-                    if not isinstance(domain, str):
-                        MGO_REMOVAL_DUPLICATE.delete_one({'_id': item['_id']})
-                        continue
-                    if not self._rbf.is_exists(domain):
-                        self._rbf.add(domain)
-                        count += 1
-            finally:
-                logger.info(f'[过滤器]数据加载:{len(self._rbf)}条,新增:{count}条')
-                time.sleep(self._loop_Interval)
-
-    def load_filter(self):
-        logger.info(f'[过滤器]初始化加载')
-        threading.Thread(
-            target=self._sync_data_rubbish,
-            name='RemovalDuplicate_'
-        ).start()
-
-    def add_url(self, url: str):
-        self._rbf.add(url)
-
-    def requirement_word(self, val):
-        return self._requirement_phrase(val)
-
-    def url(self, url: str):
-        return self._rbf.is_exists(url)

+ 2 - 2
find_source/crawler/schedule.py

@@ -37,10 +37,10 @@ class Scheduler:
 
     def _get_task(self, classify: str):
         if classify.lower() == 'query':
-            return self.mrq.pop_task([REDIS_QUERY])
+            return self.mrq.pop_task([REDIS_QUERY], priority=True)
 
         elif classify.lower() == 'excavate':
-            return self.mrq.pop_task([REDIS_EXCAVATE])
+            return self.mrq.pop_task([REDIS_EXCAVATE], priority=True)
 
     def get_excavate_task(self):
         return self._get_task('excavate')
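
`pop_task(..., priority=True)` comes from the project's own Redis queue wrapper (`self.mrq`), so its exact semantics are not visible in this diff. Purely as an illustration of priority-ordered popping, a hypothetical sketch using a plain redis-py sorted set (not the repo's implementation; key name is made up):

```python
import redis

r = redis.Redis()                   # assumes a reachable local Redis
REDIS_QUERY = 'find_source_query'   # hypothetical key name


def push_task(key: str, task: str, weight: int):
    r.zadd(key, {task: weight})     # higher weight = higher priority


def pop_task(keys, priority=False):
    for key in keys:
        # With priority=True, take the highest-scored task first.
        popped = r.zpopmax(key) if priority else r.zpopmin(key)
        if popped:
            member, _score = popped[0]
            return member.decode()
    return None


push_task(REDIS_QUERY, 'org:example-bureau', weight=20)
push_task(REDIS_QUERY, 'keyword:tender notice', weight=5)
print(pop_task([REDIS_QUERY], priority=True))  # 'org:example-bureau' (highest weight first)
```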

+ 8 - 2
find_source/crawler/utils.py

@@ -23,6 +23,12 @@ def extract_base_url(url):
     return f"{_s}://{_h}/" if _p is None else f"{_s}://{_h}:{_p}/"
 
 
+def parser_domain(val: str):
+    if re.match(r'\d+', val) is None:
+        return re.split(r'[\\.:]', val)
+    return [val]
+
+
 def extract_domain(url):
     """
 
@@ -36,8 +42,8 @@ def extract_page_title(html):
     element = html2element(html)
     nodes = element.xpath('/html/head/title/text()')
     if len(nodes) > 1:
-        return "".format(nodes[-1]).strip()
-    return "".join(nodes).strip()
+        return "".join("".format(nodes[-1]).split())
+    return "".join("".join(nodes).split())
 
 
 def is_url(url):
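
A few throwaway examples of the behaviour added above (`parser_domain` is copied verbatim; the last two lines mimic the new whitespace handling in `extract_page_title`):

```python
import re


def parser_domain(val: str):
    # Split a hostname-like string on '.', ':' (or a stray backslash);
    # values that start with a digit (e.g. bare IPs) are returned whole.
    if re.match(r'\d+', val) is None:
        return re.split(r'[\\.:]', val)
    return [val]


print(parser_domain('www.example.com:8080'))  # ['www', 'example', 'com', '8080']
print(parser_domain('127.0.0.1'))             # ['127.0.0.1']

# extract_page_title now strips *all* whitespace, not just the ends:
title = '\n  Example   Title \t'
print(''.join(title.split()))                 # 'ExampleTitle'
```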

+ 31 - 0
find_source/crawler/validate.py

@@ -0,0 +1,31 @@
+from crawler.bloom_filter.RedisBloomFilter import RedisFilter
+from settings import REQUIREMENT_PHRASE
+
+
+class Validator:
+
+    def __init__(self, redis_key='Validator_'):
+        self._validator_name = redis_key
+        self._rbf = RedisFilter(redis_key=self._validator_name)
+        self._rbf.start(1000000000, 0.00001)
+
+    @staticmethod
+    def _requirement_phrase(val: str):
+        """关键词"""
+        for word in REQUIREMENT_PHRASE:
+            if val.find(word) != -1:
+                return True
+        return False
+
+    def add_data(self, val: str):
+        return self._rbf.add(val)
+
+    def data(self, val: str):
+        return self._rbf.is_exists(val)
+
+    def phrase(self, val: str):
+        return self._requirement_phrase(val)
+
+    @property
+    def length(self):
+        return len(self._rbf)
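
Rough usage sketch for the new `Validator` (assumes the module is importable as `crawler.validate`, a reachable Redis behind `RedisFilter`, and a populated `REQUIREMENT_PHRASE` in settings):

```python
from crawler.validate import Validator

validator = Validator(redis_key='Validator_demo_')  # hypothetical key prefix

url = 'http://example.com/'
if not validator.data(url):     # not seen by the Bloom filter yet?
    validator.add_data(url)     # remember it

print(validator.data(url))                   # True once added
print(validator.phrase('page title here'))   # True only if a REQUIREMENT_PHRASE word occurs
print(validator.length)                      # size reported by the underlying RedisFilter
```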

+ 6 - 4
find_source/settings.py

@@ -5,10 +5,12 @@ from common.databases import mongo_table, redis_client
 MGO_DATABASE = 'shujuziyuan'
 '''garbage/dedup table'''
 MGO_REMOVAL_DUPLICATE = mongo_table(db=MGO_DATABASE, name='removal_duplicate')
-'''newly discovered domains'''
-MGO_DOMAIN = mongo_table(db=MGO_DATABASE, name='new_domains')
-'''organizations|keywords to retrieve'''
-MGO_SEARCH = mongo_table(db=MGO_DATABASE, name='retrieve_search')
+'''results from data excavation'''
+MGO_DOMAIN = mongo_table(db=MGO_DATABASE, name='data_excavate')
+'''organizations|keywords found by data query'''
+MGO_QUERY = mongo_table(db=MGO_DATABASE, name='data_query')
+'''data collection records'''
+MGO_RECORDS = mongo_table(db=MGO_DATABASE, name='excavate_records')
 '''[organizations|units]'''
 MGO_ORGS = mongo_table(db=MGO_DATABASE, name='retrieve_orgs')
 '''keywords'''
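
`mongo_table` itself lives in `common/databases.py` (partially shown above); for readers without the repo, a minimal stand-in with the same call shape (the assumption being that it simply returns a pymongo `Collection`):

```python
from pymongo import MongoClient
from pymongo.collection import Collection

_client = MongoClient('localhost', 27017)  # hypothetical connection settings


def mongo_table(db: str, name: str) -> Collection:
    return _client[db][name]


# Mirrors the renamed tables above:
MGO_DOMAIN = mongo_table(db='shujuziyuan', name='data_excavate')
MGO_QUERY = mongo_table(db='shujuziyuan', name='data_query')
MGO_RECORDS = mongo_table(db='shujuziyuan', name='excavate_records')
```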