import threading

from crawler.services import (
    SyncData,
    QueryKeyWord,
    QueryOrganization,
    DataExcavate,
)


class BreadthCrawler:
    """Facade that wires together the crawler's background services.

    Three independent capabilities are toggled by the constructor flags:

    * ``allow_sync_data`` — start the data-sync / scheduled-push service
      (``SyncData``), which runs on construction.
    * ``allow_query``     — build the keyword-query (``QueryKeyWord``) and
      organization-query (``QueryOrganization``) services.
    * ``allow_excavate``  — build the data-mining service (``DataExcavate``).

    ``start()`` launches one daemonless thread per enabled query/excavate
    service. Remaining ``**kwargs`` are forwarded to every service that is
    constructed.
    """

    def __init__(
        self,
        allow_sync_data: bool,
        allow_query: bool,
        allow_excavate: bool,
        **kwargs,
    ):
        self.allow_sync_data = allow_sync_data
        self.allow_query = allow_query
        self.allow_excavate = allow_excavate
        # Enable flags start False; each is flipped to True only after the
        # corresponding service object actually exists.  (Previously
        # enable_excavate defaulted to True, so start() raised
        # AttributeError on self._excavator when allow_excavate was False.)
        self.enable_keyword_query = False
        self.enable_org_query = False
        self.enable_excavate = False
        self._init(**kwargs)

    def _init(self, **kwargs):
        """Construct the enabled services, consuming service-specific kwargs.

        Service-specific options are removed from ``kwargs`` via ``pop`` so
        the remainder can be forwarded to every constructor.
        """
        if self.allow_sync_data:
            # Data synchronization and scheduled-push service.
            # NOTE(review): this receives the *un-popped* kwargs, so
            # query/excavate-specific keys also reach SyncData — presumably
            # it tolerates extra keyword arguments; verify.
            SyncData(
                init_validator=True,
                init_collector=True,
                validate_interval=1200,
                keywords_interval=4 * 3600,
                competing_goods_interval=600,
                seed_urls_interval=10,
                orgs_interval=5 * 3600,
                **kwargs
            )

        if self.allow_query:
            # Query service - search keywords.
            self._query_kw = QueryKeyWord(
                engine=kwargs.pop('keyword_query_engine', None),
                query_workers=kwargs.pop('keyword_query_workers', None),
                max_pages=30,
                query_interval=300,
                **kwargs
            )
            self.enable_keyword_query = True

            # Query service - organizations.
            # Fixed: key was 'org_query_workers ' (trailing space), so the
            # real kwarg was never popped and leaked into **kwargs.
            self._query_org = QueryOrganization(
                engine=kwargs.pop('org_query_engine', None),
                query_workers=kwargs.pop('org_query_workers', None),
                query_interval=1800,
                **kwargs
            )
            self.enable_org_query = True

        if self.allow_excavate:
            # Data-mining service.
            self._excavator = DataExcavate(
                workers=kwargs.pop('excavate_workers', None),
                excavate_depth=kwargs.pop('excavate_depth', 3),
                excavate_interval=10,
                **kwargs
            )
            self.enable_excavate = True

    def start(self):
        """Launch one worker thread per enabled query/excavate service."""
        if self.allow_query:
            if self.enable_keyword_query:
                threading.Thread(
                    target=self._query_kw.start,
                    name='MainKeywordQuery'
                ).start()

            if self.enable_org_query:
                threading.Thread(
                    target=self._query_org.start,
                    name='MainOrganizationQuery'
                ).start()

        if self.enable_excavate:
            threading.Thread(
                target=self._excavator.start,
                name='MainDataExcavate'
            ).start()