dongzhaorui 3 years ago
parent
commit
371bf5653d
1 changed file with 42 additions and 18 deletions
  1. 42 18
      find_source/crawler/services/basics.py

+ 42 - 18
find_source/crawler/services/basics.py

@@ -71,7 +71,7 @@ class BasicSearch:
     def loops_interval(interval):
         t_name = threading.currentThread().getName()
         next_run_time = delay_by((interval or 300))
-        logger.debug(f'程序运行结束:<{t_name}>,下次运行时间:<{next_run_time}>')
+        logger.debug(f'运行结束:<{t_name}>,下次运行时间:{next_run_time}')
         time.sleep(interval)
 
     @staticmethod
@@ -92,6 +92,20 @@ class BasicSearch:
         }
         return item
 
+    @staticmethod
+    def make_domain_item(task: Task):
+        item = {
+            'name': task['name'],
+            'url': task['url'],
+            'domain': task['domain'],
+            'depth': task['depth'],
+            'origin': task['origin'],
+            'groups': task['groups'],
+            'create_at': task['create_at'],
+            'update_at': task['update_at'],
+        }
+        return item
+
     @staticmethod
     def make_duplicate_removal(task: Task):
         item = {
@@ -102,41 +116,51 @@ class BasicSearch:
         return item
 
     def _push_data(self, purpose: str, task: Task, collection):
-        if purpose == 'save':
-            insert_one(collection, self.make_retrieve_item(task))
+        t_name = threading.currentThread().getName()
+        if purpose == 'query':
+            item = self.make_retrieve_item(task)
+            insert_one(collection, item)
+            logger.info(f'<{t_name}> - 上传查询结果 - {item["_id"]}')
+        elif purpose == 'domain':
+            item = self.make_domain_item(task)
+            insert_one(collection, item)
+            logger.info(f'<{t_name}> - 上传挖掘结果 - {item["_id"]}')
         elif purpose == 'remove':
-            insert_one(collection, self.make_duplicate_removal(task))
+            item = self.make_duplicate_removal(task)
+            insert_one(collection, item)
+            logger.info(f'<{t_name}> - 上传去重特征 - {item["_id"]}')
         else:
             insert_one(collection, task)
+            logger.info(f'<{t_name}> - 上传记录数据 - {task["_id"]}')
 
     def push_remove(self, task: Task):
-        """数据去重的垃圾表"""
-        logger.info(f"[上传去重特征]【{task['name']} - {task['url']}】")
+        """数据去重表"""
         if not self.validator.data(task['url']):
             self._push_data('remove', task, MGO_REMOVAL_DUPLICATE)
             self.validator.add_data(task['url'])
+            return True
+        return False
 
     def push_domain(self, task: Task):
-        """挖掘网站的查询结果"""
-        logger.info(f"[数据挖掘 - 推送]【{task['name']} - {task['domain']}】")
+        """数据挖掘结果,推送保存"""
         if not self.collector.data(task['domain']):
-            self._push_data('save', task, MGO_DOMAIN)
+            self._push_data('domain', task, MGO_DOMAIN)
             self.collector.add_data(task['domain'])
+            return True
+        return False
 
     def push_query(self, task: Task):
-        """搜索组织单位查询结果"""
-        logger.info(f"[查询结果 - 推送]【{task['name']} - {task['url']}】")
-        self._push_data('save', task, MGO_QUERY)
+        """搜索组织单位查询结果,推送保存"""
+        self._push_data('query', task, MGO_QUERY)
 
     def push_records(self, task: Task):
         """挖掘数据的记录"""
-        if task['name'] > 20:
+        if len(task['name']) > 20:
             task['name'] = '{:.20s}'.format(task['name'])
-        logger.info(f"[数据记录 - 推送]【{task['name']} - {task['url']}】")
         self._push_data('records', task, MGO_RECORDS)
 
     def orgs_table(self) -> List[Mapping]:
-        """组织|单位"""
+        """组织|单位"""
         search_orgs = []
         cursor = MGO_ORGS.find(self.query, projection=self.projection)
         for item in cursor.sort(self.sort):
@@ -144,7 +168,7 @@ class BasicSearch:
         return search_orgs
 
     def keywords_table(self):
-        """关键词"""
+        """关键词"""
         search_keywords = []
         cursor = MGO_KEYWORDS.find(projection=self.projection)
         for item in cursor.sort(self.sort):
@@ -152,7 +176,7 @@ class BasicSearch:
         return search_keywords
 
     def seed_urls_table(self) -> List[Mapping]:
-        """种子urls"""
+        """种子列表"""
         search_urls = []
         cursor = MGO_URLS.find(self.query, projection=self.projection)
         for item in cursor.sort(self.sort):
@@ -160,7 +184,7 @@ class BasicSearch:
         return search_urls
 
     def competing_goods_table(self):
-        """竞品urls"""
+        """竞品列表"""
         competing_goods = []
         cursor = MGO_COMPETING_GOODS.find(self.query, projection=self.projection)
         for item in cursor.sort(self.sort):