import threading

from .engines import BingSearchEngine, QccSearchEngine
from .services import (
    SyncData,
    DataQuery,
    DataExcavate
)


class BreadthCrawler:
    """Build the enabled crawler services and start each one in a thread.

    Depending on the ``enable_*`` flags, the constructor instantiates
    ``SyncData``, a Bing-backed ``DataQuery``, a Qcc-backed ``DataQuery``
    and/or ``DataExcavate``, and registers them by name. ``start()`` then
    launches ``start`` of every registered service in its own thread.
    """

    # Weight settings merged into the keyword arguments forwarded to the
    # services (see ``__init__``).
    _weight_items = {
        'url_weight': 20,
        'keyword_weight': 15,
        'org_weight': 5,
    }

    def __init__(
            self,
            enable_sync_data: bool,
            enable_search: bool,
            enable_query: bool,
            enable_excavate: bool,
            **kwargs,
    ) -> None:
        """Record which services are enabled and construct them.

        Args:
            enable_sync_data: Create the ``SyncData`` service.
            enable_search: Create the ``DataQuery`` service that uses
                ``BingSearchEngine``.
            enable_query: Create the ``DataQuery`` service that uses
                ``QccSearchEngine``.
            enable_excavate: Create the ``DataExcavate`` service.
            **kwargs: Extra options forwarded to the services. The keys
                ``query_workers``, ``excavate_workers`` and
                ``excavate_depth`` are read explicitly by ``_init``.
        """
        self.enable_sync_data = enable_sync_data
        self.enable_search = enable_search
        self.enable_query = enable_query
        self.enable_excavate = enable_excavate
        # NOTE(review): ``update`` overwrites any caller-supplied
        # 'url_weight' / 'keyword_weight' / 'org_weight' with the class
        # values. Confirm whether these are meant to be fixed values or
        # overridable defaults.
        kwargs.update(self._weight_items)
        self._app = {}
        self._init(**kwargs)

    def _init(self, **kwargs) -> None:
        """Instantiate every enabled service and register it in ``_app``.

        ``query_workers`` is shared by both ``DataQuery`` services, so it is
        taken out of ``kwargs`` exactly once; popping it separately for each
        service would leave the second one with ``None`` whenever both are
        enabled.
        """
        if self.enable_sync_data:
            _sync_data = SyncData(
                init_validator=True,
                init_collector=True,
                validate_interval=300,
                keywords_interval=4 * 3600,
                competing_goods_interval=600,
                seed_urls_interval=10,
                orgs_interval=5 * 3600,
                **kwargs
            )
            self._app['SyncData'] = _sync_data

        # Pop only when a query service is enabled, so the kwargs forwarded
        # to the other services stay the same as before.
        query_workers = None
        if self.enable_search or self.enable_query:
            query_workers = kwargs.pop('query_workers', None)

        if self.enable_search:
            _bing_search = DataQuery(
                engine=BingSearchEngine(),
                query_workers=query_workers,
                max_pages=30,
                query_interval=3600,
                **kwargs
            )
            self._app['BingSearch'] = _bing_search

        if self.enable_query:
            _qcc_query = DataQuery(
                engine=QccSearchEngine(),
                query_workers=query_workers,
                query_interval=1800,
                **kwargs
            )
            self._app['QccQuery'] = _qcc_query

        if self.enable_excavate:
            # The pops are evaluated before ``**kwargs`` is unpacked, so the
            # explicit keywords are never passed twice.
            _excavator = DataExcavate(
                workers=kwargs.pop('excavate_workers', None),
                excavate_depth=kwargs.pop('excavate_depth', 3),
                excavate_interval=10,
                **kwargs
            )
            self._app['DataExcavate'] = _excavator

    def start(self) -> None:
        """Start each registered service's ``start`` in a named thread."""
        for name, app in self._app.items():
            threading.Thread(target=app.start, name=name).start()