|
@@ -1,36 +1,37 @@
|
|
|
import threading
|
|
|
|
|
|
+from crawler.engines import BingSearchEngine, QccSearchEngine
|
|
|
from crawler.services import (
|
|
|
SyncData,
|
|
|
- QueryKeyWord,
|
|
|
- QueryOrganization,
|
|
|
+ DataQuery,
|
|
|
DataExcavate
|
|
|
)
|
|
|
|
|
|
|
|
|
class BreadthCrawler:
|
|
|
+ _weight_items = {
|
|
|
+ 'url_weight': 20,
|
|
|
+ 'keyword_weight': 15,
|
|
|
+ 'org_weight': 5,
|
|
|
+ }
|
|
|
|
|
|
def __init__(
|
|
|
self,
|
|
|
- allow_sync_data: bool,
|
|
|
- allow_query: bool,
|
|
|
- allow_excavate: bool,
|
|
|
+ enable_sync_data: bool,
|
|
|
+ enable_query: bool,
|
|
|
+ enable_excavate: bool,
|
|
|
**kwargs,
|
|
|
):
|
|
|
- self.allow_sync_data = allow_sync_data
|
|
|
- self.allow_query = allow_query
|
|
|
- self.allow_excavate = allow_excavate
|
|
|
-
|
|
|
- self.enable_keyword_query = False
|
|
|
- self.enable_org_query = False
|
|
|
- self.enable_excavate = True
|
|
|
-
|
|
|
+ self.enable_query = enable_query
|
|
|
+ self.enable_excavate = enable_excavate
|
|
|
+ self.enable_sync_data = enable_sync_data
|
|
|
+ kwargs.update(self._weight_items)
|
|
|
+ self.app = {}
|
|
|
self._init(**kwargs)
|
|
|
|
|
|
def _init(self, **kwargs):
|
|
|
- if self.allow_sync_data:
|
|
|
- '''同步与定时推送数据服务'''
|
|
|
- SyncData(
|
|
|
+ if self.enable_sync_data:
|
|
|
+ _sync_data = SyncData(
|
|
|
init_validator=True,
|
|
|
init_collector=True,
|
|
|
validate_interval=1200,
|
|
@@ -40,48 +41,34 @@ class BreadthCrawler:
|
|
|
orgs_interval=5 * 3600,
|
|
|
**kwargs
|
|
|
)
|
|
|
- if self.allow_query:
|
|
|
- '''查询服务 - 搜索词'''
|
|
|
- self._query_kw = QueryKeyWord(
|
|
|
- engine=kwargs.pop('keyword_query_engine', None),
|
|
|
- query_workers=kwargs.pop('keyword_query_workers', None),
|
|
|
+ self.app['MainSyncData'] = _sync_data
|
|
|
+
|
|
|
+ if self.enable_query:
|
|
|
+ _query_keyword = DataQuery(
|
|
|
+ engine=BingSearchEngine(),
|
|
|
+ query_workers=kwargs.pop('query_workers', None),
|
|
|
max_pages=30,
|
|
|
query_interval=300,
|
|
|
**kwargs
|
|
|
)
|
|
|
- self.enable_keyword_query = True
|
|
|
- '''查询服务 - 组织单位'''
|
|
|
- self._query_org = QueryOrganization(
|
|
|
- engine=kwargs.pop('org_query_engine', None),
|
|
|
- query_workers=kwargs.pop('org_query_workers ', None),
|
|
|
+ _query_organization = DataQuery(
|
|
|
+ engine=QccSearchEngine(),
|
|
|
+ query_workers=kwargs.pop('query_workers', None),
|
|
|
query_interval=1800,
|
|
|
**kwargs
|
|
|
)
|
|
|
- self.enable_org_query = True
|
|
|
- if self.allow_excavate:
|
|
|
- '''数据挖掘'''
|
|
|
- self._excavator = DataExcavate(
|
|
|
+ self.app['MainQueryKeyWord'] = _query_keyword
|
|
|
+ self.app['MainQueryOrganization'] = _query_organization
|
|
|
+
|
|
|
+ if self.enable_excavate:
|
|
|
+ _excavator = DataExcavate(
|
|
|
workers=kwargs.pop('excavate_workers', None),
|
|
|
excavate_depth=kwargs.pop('excavate_depth', 3),
|
|
|
excavate_interval=10,
|
|
|
**kwargs
|
|
|
)
|
|
|
- self.enable_excavate = True
|
|
|
+ self.app['MainDataExcavate'] = _excavator
|
|
|
|
|
|
def start(self):
|
|
|
- if self.allow_query:
|
|
|
- if self.enable_keyword_query:
|
|
|
- threading.Thread(
|
|
|
- target=self._query_kw.start,
|
|
|
- name='MainKeywordQuery'
|
|
|
- ).start()
|
|
|
- if self.enable_org_query:
|
|
|
- threading.Thread(
|
|
|
- target=self._query_org.start,
|
|
|
- name='MainOrganizationQuery'
|
|
|
- ).start()
|
|
|
- if self.enable_excavate:
|
|
|
- threading.Thread(
|
|
|
- target=self._excavator.start,
|
|
|
- name='MainDataExcavate'
|
|
|
- ).start()
|
|
|
+ for name, app in self.app.items():
|
|
|
+ threading.Thread(target=app.start, name=name).start()
|