|
@@ -2,20 +2,28 @@ import threading
|
|
|
|
|
|
from common.log import logger
|
|
|
from crawler.services.basics import BasicSearch
|
|
|
-from crawler.utils import is_url
|
|
|
+from crawler.utils import is_url, extract_domain
|
|
|
from settings import (
|
|
|
MGO_URLS,
|
|
|
MGO_ORGS,
|
|
|
MGO_COMPETING_GOODS,
|
|
|
- MGO_REMOVAL_DUPLICATE
|
|
|
+ MGO_REMOVAL_DUPLICATE,
|
|
|
+ MGO_LUA_SPIDERS
|
|
|
)
|
|
|
|
|
|
|
|
|
class SyncData(BasicSearch):
|
|
|
|
|
|
- def __init__(self, init_validator=False, loop_interval=600, **kwargs):
|
|
|
+ def __init__(
|
|
|
+ self,
|
|
|
+ init_validator=False,
|
|
|
+ init_collector=False,
|
|
|
+ loop_interval=600,
|
|
|
+ **kwargs
|
|
|
+ ):
|
|
|
super(SyncData, self).__init__(**kwargs)
|
|
|
self._init_validator = init_validator
|
|
|
+ self._init_collector = init_collector
|
|
|
self._interval = loop_interval
|
|
|
self._init()
|
|
|
|
|
@@ -25,7 +33,7 @@ class SyncData(BasicSearch):
|
|
|
def sync_data_keywords(self):
|
|
|
"""同步关键词数据"""
|
|
|
logger.info(f'[同步数据]加载关键词')
|
|
|
- words = self.seed_keywords()
|
|
|
+ words = self.keywords_table()
|
|
|
# 处理关键词格式并推送到任务队列
|
|
|
words = [str(word).replace(' ', '').strip() for word in words]
|
|
|
lst = []
|
|
@@ -43,7 +51,7 @@ class SyncData(BasicSearch):
|
|
|
def sync_data_orgs(self):
|
|
|
"""同步组织单位数据"""
|
|
|
logger.info(f'[同步数据]加载单位组织数据')
|
|
|
- items = self.seed_orgs()
|
|
|
+ items = self.orgs_table()
|
|
|
# 处理单位组织名称并推送到任务队列
|
|
|
orgs = []
|
|
|
for item in items:
|
|
@@ -74,7 +82,7 @@ class SyncData(BasicSearch):
|
|
|
def sync_data_urls(self):
|
|
|
"""同步网址数据"""
|
|
|
logger.info(f'[同步数据]加载种子url列表')
|
|
|
- items = self.seed_urls()
|
|
|
+ items = self.seed_urls_table()
|
|
|
lst = []
|
|
|
for item in items:
|
|
|
if not is_url(item['name']):
|
|
@@ -102,7 +110,7 @@ class SyncData(BasicSearch):
|
|
|
def sync_data_competing_goods(self):
|
|
|
"""同步竞品urls"""
|
|
|
logger.info(f'[同步数据]加载竞品url列表')
|
|
|
- items = self.seed_competing_goods()
|
|
|
+ items = self.competing_goods_table()
|
|
|
# 处理竞品urls并推送到任务队列
|
|
|
lst = []
|
|
|
for item in items:
|
|
@@ -131,9 +139,23 @@ class SyncData(BasicSearch):
|
|
|
|
|
|
def sync_collector(self):
|
|
|
"""同步lua已收录网址,推送url收录器"""
|
|
|
- logger.info(f'[同步数据]初始化加载收录器')
|
|
|
- total = self.lua_common_domains()
|
|
|
- logger.info(f'[同步数据]新收录{total}个网站域名')
|
|
|
+ if self._init_collector:
|
|
|
+ logger.info(f'[同步数据]初始化加载收录器')
|
|
|
+ count = 0
|
|
|
+ projection = {'param_common': 1}
|
|
|
+ cursor = MGO_LUA_SPIDERS.find(projection=projection)
|
|
|
+ for item in cursor.sort(self.sort):
|
|
|
+ try:
|
|
|
+ url = item['param_common'][11]
|
|
|
+ if not is_url(url):
|
|
|
+ continue
|
|
|
+ domain = extract_domain(url)
|
|
|
+ except IndexError:
|
|
|
+ continue
|
|
|
+ if not self.collector.data(domain):
|
|
|
+ self.collector.add_data(domain)
|
|
|
+ count += 1
|
|
|
+ logger.info(f'[同步数据]新收录{count}个网站域名')
|
|
|
|
|
|
def sync_validator(self):
|
|
|
"""将垃圾表内容加载到过滤器"""
|