@@ -1,7 +1,7 @@
 import threading
 
 from common.log import logger
-from crawler.services.basics import BasicSearch
+from crawler.services.basics import BasicService
 from crawler.utils import is_url, extract_domain
 from settings import (
     MGO_URLS,
@@ -12,7 +12,8 @@ from settings import (
 )
 
 
-class SyncData(BasicSearch):
+class SyncData(BasicService):
+    """数据同步服务"""
 
     def __init__(
             self,
@@ -29,10 +30,6 @@ class SyncData(BasicSearch):
         self._init_validator = init_validator
         self._init_collector = init_collector
         self._allow_load_data = False
-        self._init()
-
-    def _init(self):
-        threading.Thread(target=self.sync_data, name='SyncData').start()
 
     def sync_keywords(self):
         """同步搜索词数据"""
@@ -92,10 +89,10 @@ class SyncData(BasicSearch):
             if not is_url(item['name']):
                 items.remove(item)
                 continue
-            exists_url = self.validator.data(item['name'])
-            if exists_url:
-                items.remove(item)
-                continue
+            # exists_url = self.validator.data(item['name'])
+            # if exists_url:
+            #     items.remove(item)
+            #     continue
             lst.append(self.make_task(
                 url=item['name'],
                 origin=item['name'],
@@ -180,7 +177,7 @@ class SyncData(BasicSearch):
             count += 1
         logger.info(f'[数据同步]过滤器读取{count}条去重特征')
 
-    def sync_data(self):
+    def start(self):
         """数据同步"""
 
         def _validate():