@@ -13,10 +13,10 @@ from settings import (

 class SyncData(BasicSearch):

-    def __init__(self, init_validator=False, loop_sync_interval=600, **kwargs):
+    def __init__(self, init_validator=False, loop_interval=600, **kwargs):
         super(SyncData, self).__init__(**kwargs)
         self._init_validator = init_validator
-        self._interval = loop_sync_interval
+        self._interval = loop_interval
         self._init()

     def _init(self):
@@ -37,7 +37,7 @@ class SyncData(BasicSearch):
                 classify=self.query_classify,
                 weight=self.keyword_weight
             ))
-        self.scheduler.add_query(lst, level=self.keyword_weight)
+        self.scheduler.add_query(self.keyword_groups, lst, level=self.keyword_weight)
         logger.info(f'[同步数据]更新{len(words)}条关键词')

     def sync_data_orgs(self):
@@ -62,7 +62,7 @@ class SyncData(BasicSearch):
                 classify=self.query_classify,
                 weight=self.org_weight
             ))
-        self.scheduler.add_query(lst, level=self.org_weight)
+        self.scheduler.add_query(self.org_groups, lst, level=self.org_weight)
         # 已添加的组织单位名称进行标记,之后不在推送到任务队列
         for item in items:
             MGO_ORGS.update_one(
@@ -129,18 +129,16 @@ class SyncData(BasicSearch):
             )
         logger.info(f'[同步数据]更新{len(items)}条竞品挖掘url')

-    def sync_lua_commons(self):
-        """同步lua采集爬虫中网址与网址名称"""
-        logger.info(f'[同步数据]加载lua_commons数据')
-        items = self.lua_common_domains()
-        for item in items:
-            MGO_REMOVAL_DUPLICATE.insert_one(item)
-        logger.info(f'[同步数据]更新{len(items)}个网站域名数据')
+    def sync_collector(self):
+        """同步lua已收录网址,推送url收录器"""
+        logger.info(f'[同步数据]初始化加载收录器')
+        total = self.lua_common_domains()
+        logger.info(f'[同步数据]新收录{total}个网站域名')

-    def sync_loading_validator(self):
+    def sync_validator(self):
         """将垃圾表内容加载到过滤器"""
         if self._init_validator:
-            logger.info(f'[同步数据]过滤器加载去重网址特征')
+            logger.info(f'[同步数据]初始化加载过滤器')
             count = 0
             cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
             for item in cursor.sort(self.sort):
@@ -154,19 +152,19 @@ class SyncData(BasicSearch):
|
|
|
if not self.validator.data(domain):
|
|
|
self.validator.add_data(domain)
|
|
|
count += 1
|
|
|
- logger.info(f'[同步数据]更新{count}条去重网址特征')
|
|
|
+ logger.info(f'[同步数据]新增{count}条去重网址特征')
|
|
|
|
|
|
def sync_data(self):
|
|
|
"""同步数据"""
|
|
|
logger.info(f'[同步数据]初始化加载')
|
|
|
while True:
|
|
|
try:
|
|
|
- self.sync_loading_validator()
|
|
|
- self.sync_lua_commons()
|
|
|
- self.sync_data_keywords()
|
|
|
- self.sync_data_orgs()
|
|
|
+ self.sync_collector()
|
|
|
+ self.sync_validator()
|
|
|
self.sync_data_competing_goods()
|
|
|
+ self.sync_data_keywords()
|
|
|
self.sync_data_urls()
|
|
|
+ self.sync_data_orgs()
|
|
|
except Exception as e:
|
|
|
logger.exception(e)
|
|
|
self.loops_interval(self._interval)
|