|
@@ -12,53 +12,76 @@ class BreadthCrawler:
|
|
|
|
|
|
def __init__(
    self,
    allow_sync_data: bool,
    allow_query: bool,
    allow_excavate: bool,
    **kwargs,
):
    """Record which services are permitted, then build them via ``_init``.

    Args:
        allow_sync_data: Stored on the instance; ``_init`` constructs the
            ``SyncData`` service only when this is true.
        allow_query: Stored on the instance; gates construction and start
            of the keyword and organization query services.
        allow_excavate: Stored on the instance; gates construction of the
            ``DataExcavate`` service.
        **kwargs: Forwarded unchanged to ``_init``.
    """
    self.allow_sync_data = allow_sync_data
    self.allow_query = allow_query
    self.allow_excavate = allow_excavate

    # Every ``enable_*`` flag starts out False and is raised by ``_init``
    # only after the matching service object exists. ``enable_excavate``
    # used to start as True, which made ``start()`` read the missing
    # ``self._excavator`` attribute whenever ``allow_excavate`` was false.
    self.enable_keyword_query = False
    self.enable_org_query = False
    self.enable_excavate = False

    self._init(**kwargs)
|
|
|
+
|
|
|
def _init(self, **kwargs):
    """Construct every service permitted by the ``allow_*`` flags.

    Recognised options (removed from ``kwargs`` before anything is built):
        keyword_query_engine, keyword_query_workers: passed to
            ``QueryKeyWord`` as ``engine`` / ``query_workers``.
        org_query_engine, org_query_workers: passed to
            ``QueryOrganization`` as ``engine`` / ``query_workers``.
        excavate_workers, excavate_depth: passed to ``DataExcavate`` as
            ``workers`` / ``excavate_depth`` (depth defaults to 3).

    Whatever remains in ``kwargs`` is forwarded to each service that gets
    constructed. A remaining key that collides with one of the fixed
    settings below (for example ``max_pages``) makes the call raise
    ``TypeError`` because the keyword would be supplied twice.
    """
    # Pop all service-specific options up front so that none of them leaks
    # into another service's constructor through ``**kwargs``.
    keyword_engine = kwargs.pop('keyword_query_engine', None)
    keyword_workers = kwargs.pop('keyword_query_workers', None)
    org_engine = kwargs.pop('org_query_engine', None)
    # The key previously carried a trailing space ('org_query_workers '),
    # so the caller's value was never picked up.
    org_workers = kwargs.pop('org_query_workers', None)
    excavate_workers = kwargs.pop('excavate_workers', None)
    excavate_depth = kwargs.pop('excavate_depth', 3)

    if self.allow_sync_data:
        # Data synchronisation and scheduled push service.
        # NOTE(review): the instance is not kept; presumably constructing
        # it is enough to set the service going - confirm in SyncData.
        SyncData(
            init_validator=True,
            init_collector=True,
            validate_interval=1200,
            keywords_interval=4 * 3600,
            competing_goods_interval=600,
            seed_urls_interval=10,
            orgs_interval=5 * 3600,
            **kwargs
        )

    if self.allow_query:
        # Query service - search keywords.
        self._query_kw = QueryKeyWord(
            engine=keyword_engine,
            query_workers=keyword_workers,
            max_pages=30,
            query_interval=300,
            **kwargs
        )
        self.enable_keyword_query = True

        # Query service - organizations.
        self._query_org = QueryOrganization(
            engine=org_engine,
            query_workers=org_workers,
            query_interval=1800,
            **kwargs
        )
        self.enable_org_query = True

    if self.allow_excavate:
        # Data excavation.
        self._excavator = DataExcavate(
            workers=excavate_workers,
            excavate_depth=excavate_depth,
            excavate_interval=10,
            **kwargs
        )
        self.enable_excavate = True
|
|
|
|
|
|
def start(self):
    """Launch one named thread for each service that is allowed and enabled.

    The keyword and organization query services are started only when
    ``allow_query`` is true and their ``enable_*`` flag is set. The
    excavator is started only when both ``allow_excavate`` and
    ``enable_excavate`` are true. Threads are started and not joined.
    """
    if self.allow_query:
        if self.enable_keyword_query:
            threading.Thread(
                target=self._query_kw.start,
                name='MainKeywordQuery'
            ).start()
        if self.enable_org_query:
            threading.Thread(
                target=self._query_org.start,
                name='MainOrganizationQuery'
            ).start()

    # Gate on ``allow_excavate`` as well, mirroring the query branch:
    # ``self._excavator`` only exists when excavation was allowed, so the
    # enable flag alone must not be enough to dereference it.
    if self.allow_excavate and self.enable_excavate:
        threading.Thread(
            target=self._excavator.start,
            name='MainDataExcavate'
        ).start()
|