|
@@ -28,7 +28,7 @@ class SyncData(BasicService):
|
|
|
|
|
|
def sync_keywords(self):
|
|
|
"""同步搜索词数据"""
|
|
|
- logger.info(f'[数据同步]开始加载 - 搜索词表')
|
|
|
+ logger.info(f'[加载数据]搜索词表')
|
|
|
words = self.keywords_table()
|
|
|
# 处理关键词格式并推送到任务队列
|
|
|
words = [str(word).replace(' ', '').strip() for word in words]
|
|
@@ -42,18 +42,18 @@ class SyncData(BasicService):
|
|
|
weight=self.keyword_weight
|
|
|
))
|
|
|
self.scheduler.add_query(self.keyword_groups, lst, level=self.keyword_weight)
|
|
|
- logger.info(f'[数据同步]任务队列读取{len(words)}条搜索词')
|
|
|
+ logger.info(f'[搜索词表]任务队列加载{len(words)}条搜索词')
|
|
|
|
|
|
def sync_orgs(self):
|
|
|
"""同步组织单位数据"""
|
|
|
- logger.info(f'[数据同步]开始加载 - 单位组织表')
|
|
|
+ logger.info(f'[加载数据]单位组织表')
|
|
|
items = self.orgs_table()
|
|
|
# 处理单位组织名称并推送到任务队列
|
|
|
orgs = []
|
|
|
for item in items:
|
|
|
name = item.get('name')
|
|
|
if name in ['', None]:
|
|
|
- logger.error(f'[数据同步 - 组织列表]组织名称错误: {item}')
|
|
|
+ logger.error(f'[单位组织表]组织名称错误: {item}')
|
|
|
continue
|
|
|
word = str(name).replace(' ', '').strip()
|
|
|
orgs.append(word)
|
|
@@ -73,11 +73,11 @@ class SyncData(BasicService):
|
|
|
{'_id': item['_id']},
|
|
|
{'$set': {'enable_added': True}}
|
|
|
)
|
|
|
- logger.info(f'[数据同步]任务队列读取{len(items)}家单位组织')
|
|
|
+ logger.info(f'[单位组织表]任务队列加载{len(items)}家单位组织')
|
|
|
|
|
|
def sync_seed_urls(self):
|
|
|
"""同步网址数据"""
|
|
|
- logger.info(f'[数据同步]开始加载 - 种子网址表')
|
|
|
+ logger.info(f'[加载数据]种子网址表')
|
|
|
items = self.seed_urls_table()
|
|
|
lst = []
|
|
|
for item in items:
|
|
@@ -97,11 +97,11 @@ class SyncData(BasicService):
|
|
|
{'_id': item['_id']},
|
|
|
{'$set': {'enable_added': True}}
|
|
|
)
|
|
|
- logger.info(f'[数据同步]任务队列读取{len(items)}条种子网址')
|
|
|
+ logger.info(f'[种子网址表]任务队列加载{len(items)}条种子网址')
|
|
|
|
|
|
def sync_competing_goods(self):
|
|
|
"""同步竞品urls"""
|
|
|
- logger.info(f'[数据同步]开始加载 - 竞品网址表')
|
|
|
+ logger.info(f'[加载数据]竞品网址表')
|
|
|
items = self.competing_goods_table()
|
|
|
# 处理竞品urls并推送到任务队列
|
|
|
lst = []
|
|
@@ -127,12 +127,12 @@ class SyncData(BasicService):
|
|
|
{'_id': item['_id']},
|
|
|
{'$set': {'enable_added': True}}
|
|
|
)
|
|
|
- logger.info(f'[数据同步]任务队列读取{len(items)}条竞品网址')
|
|
|
+ logger.info(f'[竞品网址表]任务队列加载{len(items)}条网址')
|
|
|
|
|
|
- def sync_collector(self):
|
|
|
+ def data_collector(self):
|
|
|
"""收录器,存放新发现和已拥有的网址域名"""
|
|
|
if self._init_collector:
|
|
|
- logger.info(f'[数据同步]开始加载 - 收录器')
|
|
|
+ logger.info(f'[加载数据]收录器')
|
|
|
count = 0
|
|
|
q = {"param_common.11": {'$exists': True}}
|
|
|
projection = {'param_common': 1}
|
|
@@ -142,15 +142,16 @@ class SyncData(BasicService):
|
|
|
if not is_url(url):
|
|
|
continue
|
|
|
domain = extract_domain(url)
|
|
|
+ logger.debug(f'[收录器]{domain}')
|
|
|
if not self.collector.data(domain):
|
|
|
self.collector.add_data(domain)
|
|
|
count += 1
|
|
|
- logger.info(f'[数据同步]收录器读取{count}个网址域名')
|
|
|
+ logger.info(f'[收录器]读取{count}条网址域名')
|
|
|
|
|
|
- def sync_validator(self):
|
|
|
+ def data_validator(self):
|
|
|
"""垃圾池:存放寻源过程中垃圾网址和没有相关信息的网址"""
|
|
|
if self._init_validator:
|
|
|
- logger.info(f'[数据同步]开始加载 - 过滤器')
|
|
|
+ logger.info(f'[加载数据]过滤器')
|
|
|
count = 0
|
|
|
cursor = MGO_DATA_GARBAGE.find(projection={'domain': 1})
|
|
|
for item in cursor.sort(self.sort):
|
|
@@ -161,20 +162,21 @@ class SyncData(BasicService):
|
|
|
continue
|
|
|
except IndexError:
|
|
|
continue
|
|
|
+ logger.debug(f'[过滤器]{domain}')
|
|
|
if not self.validator.data(domain):
|
|
|
self.validator.add_data(domain)
|
|
|
count += 1
|
|
|
- logger.info(f'[数据同步]过滤器读取{count}条去重特征')
|
|
|
+ logger.info(f'[过滤器]读取{count}条去重特征')
|
|
|
|
|
|
def start(self):
|
|
|
- """数据同步"""
|
|
|
+ """程序入口"""
|
|
|
|
|
|
def _validate():
|
|
|
- """验证模块"""
|
|
|
+ """数据过滤"""
|
|
|
while True:
|
|
|
try:
|
|
|
- self.sync_collector()
|
|
|
- self.sync_validator()
|
|
|
+ self.data_collector()
|
|
|
+ self.data_validator()
|
|
|
if not self._allow_load_data:
|
|
|
self._allow_load_data = True
|
|
|
except Exception as e:
|