|
@@ -7,7 +7,7 @@ from settings import (
|
|
|
MGO_URLS,
|
|
|
MGO_ORGS,
|
|
|
MGO_COMPETING_GOODS,
|
|
|
- MGO_DATA_GARBAGE,
|
|
|
+ MGO_GARBAGE,
|
|
|
MGO_LUA_SPIDERS
|
|
|
)
|
|
|
|
|
@@ -28,7 +28,6 @@ class SyncData(BasicService):
|
|
|
|
|
|
def sync_keywords(self):
|
|
|
"""同步搜索词数据"""
|
|
|
- logger.info(f'[加载数据]搜索词表')
|
|
|
words = self.keywords_table()
|
|
|
# 处理关键词格式并推送到任务队列
|
|
|
words = [str(word).replace(' ', '').strip() for word in words]
|
|
@@ -46,7 +45,6 @@ class SyncData(BasicService):
|
|
|
|
|
|
def sync_orgs(self):
|
|
|
"""同步组织单位数据"""
|
|
|
- logger.info(f'[加载数据]单位组织表')
|
|
|
items = self.orgs_table()
|
|
|
# 处理单位组织名称并推送到任务队列
|
|
|
orgs = []
|
|
@@ -73,11 +71,10 @@ class SyncData(BasicService):
|
|
|
{'_id': item['_id']},
|
|
|
{'$set': {'enable_added': True}}
|
|
|
)
|
|
|
- logger.info(f'[单位组织表]任务队列加载{len(items)}家单位组织')
|
|
|
+ logger.info(f'[单位网址查询]任务队列加载{len(items)}家单位组织')
|
|
|
|
|
|
def sync_seed_urls(self):
|
|
|
"""同步网址数据"""
|
|
|
- logger.info(f'[加载数据]种子网址表')
|
|
|
items = self.seed_urls_table()
|
|
|
lst = []
|
|
|
for item in items:
|
|
@@ -97,11 +94,10 @@ class SyncData(BasicService):
|
|
|
{'_id': item['_id']},
|
|
|
{'$set': {'enable_added': True}}
|
|
|
)
|
|
|
- logger.info(f'[种子网址表]任务队列加载{len(items)}条种子网址')
|
|
|
+ logger.info(f'[种子寻源]任务队列加载{len(items)}条种子网址')
|
|
|
|
|
|
def sync_competing_goods(self):
|
|
|
"""同步竞品urls"""
|
|
|
- logger.info(f'[加载数据]竞品网址表')
|
|
|
items = self.competing_goods_table()
|
|
|
# 处理竞品urls并推送到任务队列
|
|
|
lst = []
|
|
@@ -127,13 +123,12 @@ class SyncData(BasicService):
|
|
|
{'_id': item['_id']},
|
|
|
{'$set': {'enable_added': True}}
|
|
|
)
|
|
|
- logger.info(f'[竞品网址表]任务队列加载{len(items)}条网址')
|
|
|
+ logger.info(f'[竞品寻源]任务队列加载{len(items)}条网址')
|
|
|
|
|
|
def data_collector(self):
|
|
|
- """收录器,存放新发现和已拥有的网址域名"""
|
|
|
+ """收录器 - 存放新发现和已拥有的网址域名"""
|
|
|
if self._init_collector:
|
|
|
- logger.info(f'[加载数据]收录器')
|
|
|
- count = 0
|
|
|
+ domains = []
|
|
|
q = {"param_common.11": {'$exists': True}}
|
|
|
projection = {'param_common': 1}
|
|
|
cursor = MGO_LUA_SPIDERS.find(q, projection=projection)
|
|
@@ -142,37 +137,47 @@ class SyncData(BasicService):
|
|
|
if not is_url(url):
|
|
|
continue
|
|
|
domain = extract_domain(url)
|
|
|
- logger.debug(f'[收录器]{domain}')
|
|
|
+ logger.debug(f'[收录器]拉取收录域名特征:{domain}')
|
|
|
+ if domain not in domains:
|
|
|
+ domains.append(domain)
|
|
|
+
|
|
|
+ count = 0
|
|
|
+ for domain in domains:
|
|
|
+ logger.debug(f'[收录器]更新收录域名特征:{domain}')
|
|
|
if not self.collector.data(domain):
|
|
|
+ logger.debug(f'[收录器]添加收录域名特征:{domain}')
|
|
|
self.collector.add_data(domain)
|
|
|
count += 1
|
|
|
- logger.info(f'[收录器]读取{count}条网址域名')
|
|
|
+ logger.info(f'[收录器]加载收录网址特征{count}条')
|
|
|
|
|
|
def data_validator(self):
|
|
|
- """垃圾池:存放寻源过程中垃圾网址和没有相关信息的网址"""
|
|
|
+ """过滤器 - 存放寻源过程中垃圾网址和没有招投标相关信息的网站"""
|
|
|
if self._init_validator:
|
|
|
- logger.info(f'[加载数据]过滤器')
|
|
|
count = 0
|
|
|
- cursor = MGO_DATA_GARBAGE.find(projection={'domain': 1})
|
|
|
+ q = {
|
|
|
+ "source": {"$exists": False}, # 来源
|
|
|
+ "domain": {"$type": "string"}
|
|
|
+ }
|
|
|
+ cursor = MGO_GARBAGE.find(q, projection={'domain': 1})
|
|
|
for item in cursor.sort(self.sort):
|
|
|
- try:
|
|
|
- domain = item['domain']
|
|
|
- if not isinstance(domain, str):
|
|
|
- MGO_DATA_GARBAGE.delete_one({'_id': item['_id']})
|
|
|
- continue
|
|
|
- except IndexError:
|
|
|
- continue
|
|
|
- logger.debug(f'[过滤器]{domain}')
|
|
|
+ domain = item['domain']
|
|
|
+ logger.debug(f'[过滤器]拉取过滤网址特征:{domain}')
|
|
|
if not self.validator.data(domain):
|
|
|
+ logger.debug(f'[过滤器]更新过滤网址特征:{domain}')
|
|
|
self.validator.add_data(domain)
|
|
|
+ MGO_GARBAGE.update_one(
|
|
|
+ {'_id': item['_id']},
|
|
|
+ {'$set': {"source": "other"}}
|
|
|
+ )
|
|
|
count += 1
|
|
|
- logger.info(f'[过滤器]读取{count}条去重特征')
|
|
|
+ logger.info(f'[过滤器]加载去重特征{count}条')
|
|
|
|
|
|
def start(self):
|
|
|
"""程序入口"""
|
|
|
|
|
|
def _validate():
|
|
|
"""数据过滤"""
|
|
|
+ logger.info('[自动寻源]加载任务过滤模块')
|
|
|
while True:
|
|
|
try:
|
|
|
self.data_collector()
|
|
@@ -185,6 +190,7 @@ class SyncData(BasicService):
|
|
|
|
|
|
def _keywords():
|
|
|
"""搜索词"""
|
|
|
+ logger.info('[自动寻源]加载搜索词模块')
|
|
|
while True:
|
|
|
if self._allow_load_data:
|
|
|
try:
|
|
@@ -197,6 +203,7 @@ class SyncData(BasicService):
|
|
|
|
|
|
def _competing_goods():
|
|
|
"""竞品列表"""
|
|
|
+ logger.info('[自动寻源]加载竞品寻源模块')
|
|
|
while True:
|
|
|
if self._allow_load_data:
|
|
|
try:
|
|
@@ -209,6 +216,7 @@ class SyncData(BasicService):
|
|
|
|
|
|
def _seed_urls():
|
|
|
"""种子url"""
|
|
|
+ logger.info('[自动寻源]加载种子寻源模块')
|
|
|
while True:
|
|
|
if self._allow_load_data:
|
|
|
try:
|
|
@@ -221,6 +229,7 @@ class SyncData(BasicService):
|
|
|
|
|
|
def _orgs():
|
|
|
"""单位组织"""
|
|
|
+ logger.info('[自动寻源]加载单位网址查询模块')
|
|
|
while True:
|
|
|
if self._allow_load_data:
|
|
|
try:
|