Эх сурвалжийг харах

优化 - 同步luaconfig数据直接推送收录器

dongzhaorui 3 жил өмнө
parent
commit
762391fd34

+ 16 - 18
find_source/crawler/services/sync_data.py

@@ -13,10 +13,10 @@ from settings import (
 
 class SyncData(BasicSearch):
 
-    def __init__(self, init_validator=False, loop_sync_interval=600, **kwargs):
+    def __init__(self, init_validator=False, loop_interval=600, **kwargs):
         super(SyncData, self).__init__(**kwargs)
         self._init_validator = init_validator
-        self._interval = loop_sync_interval
+        self._interval = loop_interval
         self._init()
 
     def _init(self):
@@ -37,7 +37,7 @@ class SyncData(BasicSearch):
                 classify=self.query_classify,
                 weight=self.keyword_weight
             ))
-        self.scheduler.add_query(lst, level=self.keyword_weight)
+        self.scheduler.add_query(self.keyword_groups, lst, level=self.keyword_weight)
         logger.info(f'[同步数据]更新{len(words)}条关键词')
 
     def sync_data_orgs(self):
@@ -62,7 +62,7 @@ class SyncData(BasicSearch):
                 classify=self.query_classify,
                 weight=self.org_weight
             ))
-        self.scheduler.add_query(lst, level=self.org_weight)
+        self.scheduler.add_query(self.org_groups, lst, level=self.org_weight)
         # 已添加的组织单位名称进行标记,之后不在推送到任务队列
         for item in items:
             MGO_ORGS.update_one(
@@ -129,18 +129,16 @@ class SyncData(BasicSearch):
             )
         logger.info(f'[同步数据]更新{len(items)}条竞品挖掘url')
 
-    def sync_lua_commons(self):
-        """同步lua采集爬虫中网址与网址名称"""
-        logger.info(f'[同步数据]加载lua_commons数据')
-        items = self.lua_common_domains()
-        for item in items:
-            MGO_REMOVAL_DUPLICATE.insert_one(item)
-        logger.info(f'[同步数据]更新{len(items)}个网站域名数据')
+    def sync_collector(self):
+        """同步lua已收录网址,推送url收录器"""
+        logger.info(f'[同步数据]初始化加载收录器')
+        total = self.lua_common_domains()
+        logger.info(f'[同步数据]新收录{total}个网站域名')
 
-    def sync_loading_validator(self):
+    def sync_validator(self):
         """将垃圾表内容加载到过滤器"""
         if self._init_validator:
-            logger.info(f'[同步数据]过滤器加载去重网址特征')
+            logger.info(f'[同步数据]初始化加载过滤器')
             count = 0
             cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
             for item in cursor.sort(self.sort):
@@ -154,19 +152,19 @@ class SyncData(BasicSearch):
                 if not self.validator.data(domain):
                     self.validator.add_data(domain)
                     count += 1
-            logger.info(f'[同步数据]新{count}条去重网址特征')
+            logger.info(f'[同步数据]新{count}条去重网址特征')
 
     def sync_data(self):
         """同步数据"""
         logger.info(f'[同步数据]初始化加载')
         while True:
             try:
-                self.sync_loading_validator()
-                self.sync_lua_commons()
-                self.sync_data_keywords()
-                self.sync_data_orgs()
+                self.sync_collector()
+                self.sync_validator()
                 self.sync_data_competing_goods()
+                self.sync_data_keywords()
                 self.sync_data_urls()
+                self.sync_data_orgs()
             except Exception as e:
                 logger.exception(e)
             self.loops_interval(self._interval)