dongzhaorui hai 3 anos
pai
achega
3a25ce9ee9
Modificouse 1 ficheiro con 9 adicións e 12 borrados
  1. 9 12
      find_source/crawler/services/basics.py

+ 9 - 12
find_source/crawler/services/basics.py

@@ -2,7 +2,7 @@ import threading
 import time
 from typing import List, Mapping
 
-from common.databases import insert_one
+from common.databases import insert_one, update_one_by_domain
 from common.log import logger
 from common.tools import delay_by
 from constants import (
@@ -34,9 +34,6 @@ class BasicSearch:
 
     def __init__(
             self,
-            keyword_weight=9,
-            url_weight=8,
-            org_weight=7,
             scheduler=None,
             validator=None,
             downloader=None,
@@ -54,10 +51,9 @@ class BasicSearch:
         self.projection = {'name': 1}
         self.sort = [('_id', -1)]
         # 权重
-        self.org_weight = org_weight
-        self.url_weight = url_weight
-        self.keyword_weight = keyword_weight
-        self.retrieve_weight = 0
+        self.org_weight = (kwargs.pop('org_weight', None) or 7)
+        self.url_weight = (kwargs.pop('url_weight', None) or 8)
+        self.keyword_weight = (kwargs.pop('keyword_weight', None) or 9)
         # 分类
         self.visit_classify = VISIT_CLASSIFY
         self.query_classify = QUERY_CLASSIFY
@@ -68,10 +64,11 @@ class BasicSearch:
         self.competing_groups = COMPETING_GOODS
 
     @staticmethod
-    def loops_interval(interval):
+    def loops_interval(interval, enable_debug_log=False):
         t_name = threading.currentThread().getName()
         next_run_time = delay_by((interval or 300))
-        logger.debug(f'运行结束:<{t_name}>,下次运行时间:{next_run_time}')
+        if enable_debug_log:
+            logger.debug(f'运行结束:<{t_name}>,下次运行时间:{next_run_time}')
         time.sleep(interval)
 
     @staticmethod
@@ -127,8 +124,8 @@ class BasicSearch:
             logger.info(f'<{t_name}> - 上传挖掘结果 - {item["_id"]}')
         elif purpose == 'remove':
             item = self.make_duplicate_removal(task)
-            insert_one(collection, item)
-            logger.info(f'<{t_name}> - 上传去重特征 - {item["_id"]}')
+            update_one_by_domain(collection, item)
+            logger.info(f'<{t_name}> - 上传去重特征 - {item["domain"]}')
         else:
             insert_one(collection, task)
             logger.info(f'<{t_name}> - 上传记录数据 - {task["_id"]}')