|
@@ -32,7 +32,7 @@ class SyncData(BasicSearch):
|
|
|
|
|
|
def sync_data_keywords(self):
|
|
|
"""同步关键词数据"""
|
|
|
- logger.info(f'[同步数据]加载关键词')
|
|
|
+ logger.info(f'[数据同步 - 关键词]开始加载')
|
|
|
words = self.keywords_table()
|
|
|
# 处理关键词格式并推送到任务队列
|
|
|
words = [str(word).replace(' ', '').strip() for word in words]
|
|
@@ -46,18 +46,18 @@ class SyncData(BasicSearch):
|
|
|
weight=self.keyword_weight
|
|
|
))
|
|
|
self.scheduler.add_query(self.keyword_groups, lst, level=self.keyword_weight)
|
|
|
- logger.info(f'[同步数据]更新{len(words)}条关键词')
|
|
|
+ logger.info(f'[数据同步 - 关键词]{len(words)}条')
|
|
|
|
|
|
def sync_data_orgs(self):
|
|
|
"""同步组织单位数据"""
|
|
|
- logger.info(f'[同步数据]加载单位组织数据')
|
|
|
+ logger.info(f'[数据同步 - 组织列表]开始加载')
|
|
|
items = self.orgs_table()
|
|
|
# 处理单位组织名称并推送到任务队列
|
|
|
orgs = []
|
|
|
for item in items:
|
|
|
name = item.get('name')
|
|
|
if name in ['', None]:
|
|
|
- logger.warning(f'[异常的单位组织]{item}')
|
|
|
+ logger.error(f'[数据同步 - 组织列表]组织名称错误: {item}')
|
|
|
continue
|
|
|
word = str(name).replace(' ', '').strip()
|
|
|
orgs.append(word)
|
|
@@ -77,11 +77,11 @@ class SyncData(BasicSearch):
|
|
|
{'_id': item['_id']},
|
|
|
{'$set': {'enable_added': True}}
|
|
|
)
|
|
|
- logger.info(f'[同步数据]更新{len(items)}个单位组织')
|
|
|
+ logger.info(f'[数据同步 - 组织列表]{len(items)}个')
|
|
|
|
|
|
def sync_data_urls(self):
|
|
|
"""同步网址数据"""
|
|
|
- logger.info(f'[同步数据]加载种子url列表')
|
|
|
+ logger.info(f'[数据同步 - 种子列表]开始加载')
|
|
|
items = self.seed_urls_table()
|
|
|
lst = []
|
|
|
for item in items:
|
|
@@ -105,11 +105,11 @@ class SyncData(BasicSearch):
|
|
|
{'_id': item['_id']},
|
|
|
{'$set': {'enable_added': True}}
|
|
|
)
|
|
|
- logger.info(f'[同步数据]更新{len(items)}条网址数据')
|
|
|
+ logger.info(f'[数据同步 - 种子列表]{len(items)}条')
|
|
|
|
|
|
def sync_data_competing_goods(self):
|
|
|
"""同步竞品urls"""
|
|
|
- logger.info(f'[同步数据]加载竞品url列表')
|
|
|
+ logger.info(f'[数据同步 - 竞品列表]开始加载')
|
|
|
items = self.competing_goods_table()
|
|
|
# 处理竞品urls并推送到任务队列
|
|
|
lst = []
|
|
@@ -135,12 +135,12 @@ class SyncData(BasicSearch):
|
|
|
{'_id': item['_id']},
|
|
|
{'$set': {'enable_added': True}}
|
|
|
)
|
|
|
- logger.info(f'[同步数据]更新{len(items)}条竞品挖掘url')
|
|
|
+ logger.info(f'[数据同步 - 竞品列表]{len(items)}条')
|
|
|
|
|
|
def sync_collector(self):
|
|
|
"""同步lua已收录网址,推送url收录器"""
|
|
|
if self._init_collector:
|
|
|
- logger.info(f'[同步数据]初始化加载收录器')
|
|
|
+ logger.info(f'[数据同步 - 收录器]初始化加载')
|
|
|
count = 0
|
|
|
projection = {'param_common': 1}
|
|
|
cursor = MGO_LUA_SPIDERS.find(projection=projection)
|
|
@@ -155,12 +155,12 @@ class SyncData(BasicSearch):
|
|
|
if not self.collector.data(domain):
|
|
|
self.collector.add_data(domain)
|
|
|
count += 1
|
|
|
- logger.info(f'[同步数据]新收录{count}个网站域名')
|
|
|
+ logger.info(f'[数据同步 - 收录器]加载{count}个网站域名')
|
|
|
|
|
|
def sync_validator(self):
|
|
|
"""将垃圾表内容加载到过滤器"""
|
|
|
if self._init_validator:
|
|
|
- logger.info(f'[同步数据]初始化加载过滤器')
|
|
|
+ logger.info(f'[数据同步 - 过滤器]初始化加载')
|
|
|
count = 0
|
|
|
cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
|
|
|
for item in cursor.sort(self.sort):
|
|
@@ -174,11 +174,11 @@ class SyncData(BasicSearch):
|
|
|
if not self.validator.data(domain):
|
|
|
self.validator.add_data(domain)
|
|
|
count += 1
|
|
|
- logger.info(f'[同步数据]新增{count}条去重网址特征')
|
|
|
+ logger.info(f'[数据同步 - 过滤器]加载{count}条去重特征')
|
|
|
|
|
|
def sync_data(self):
|
|
|
- """同步数据"""
|
|
|
- logger.info(f'[同步数据]初始化加载')
|
|
|
+ """数据同步"""
|
|
|
+ logger.info(f'[数据同步]初始化加载')
|
|
|
while True:
|
|
|
try:
|
|
|
self.sync_collector()
|