import threading

from crawler.engines import BingSearchEngine, QccSearchEngine
from crawler.services import (
    SyncData,
    DataQuery,
    DataExcavate
)


class BreadthCrawler:
    """Top-level orchestrator for the breadth-first crawl pipeline.

    Builds up to four services — data sync, keyword query, organization
    query, and page excavation — according to the enable flags, then runs
    each one in its own named thread via :meth:`start`.
    """

    # Scoring weights merged into every service's keyword arguments.
    _weight_items = {
        'url_weight': 20,
        'keyword_weight': 15,
        'org_weight': 5,
    }

    def __init__(
        self,
        enable_sync_data: bool,
        enable_query: bool,
        enable_excavate: bool,
        **kwargs,
    ):
        """Configure which services run and forward shared settings.

        Args:
            enable_sync_data: build the ``SyncData`` service.
            enable_query: build the Bing keyword and QCC organization
                query services.
            enable_excavate: build the ``DataExcavate`` service.
            **kwargs: shared service options; recognized keys include
                ``query_workers``, ``excavate_workers`` and
                ``excavate_depth``. The class weight items are merged in
                (overriding any caller-supplied keys of the same name).
        """
        self.enable_query = enable_query
        self.enable_excavate = enable_excavate
        self.enable_sync_data = enable_sync_data
        kwargs.update(self._weight_items)
        # Registry of constructed services, keyed by thread name.
        self.app = {}
        self._init(**kwargs)

    def _init(self, **kwargs):
        """Instantiate every enabled service and register it in ``self.app``.

        NOTE(review): the worker-related keys are consumed (popped) only by
        the branches that use them, so e.g. when ``enable_query`` is False a
        caller-supplied ``query_workers`` still reaches ``DataExcavate`` via
        ``**kwargs`` — presumably tolerated by the services' own ``**kwargs``;
        confirm against their signatures.
        """
        if self.enable_sync_data:
            _sync_data = SyncData(
                init_validator=True,
                init_collector=True,
                validate_interval=1200,
                keywords_interval=4 * 3600,
                competing_goods_interval=600,
                seed_urls_interval=10,
                orgs_interval=5 * 3600,
                **kwargs
            )
            self.app['MainSyncData'] = _sync_data

        if self.enable_query:
            # Pop exactly once and share the value. The original code popped
            # 'query_workers' twice, so the second DataQuery (organization
            # query) always received None regardless of what the caller passed.
            query_workers = kwargs.pop('query_workers', None)
            _query_keyword = DataQuery(
                engine=BingSearchEngine(),
                query_workers=query_workers,
                max_pages=30,
                query_interval=300,
                **kwargs
            )
            _query_organization = DataQuery(
                engine=QccSearchEngine(),
                query_workers=query_workers,
                query_interval=1800,
                **kwargs
            )
            self.app['MainQueryKeyWord'] = _query_keyword
            self.app['MainQueryOrganization'] = _query_organization

        if self.enable_excavate:
            _excavator = DataExcavate(
                workers=kwargs.pop('excavate_workers', None),
                excavate_depth=kwargs.pop('excavate_depth', 3),
                excavate_interval=10,
                **kwargs
            )
            self.app['MainDataExcavate'] = _excavator

    def start(self):
        """Launch each registered service's ``start`` in its own thread.

        Threads are non-daemon and not retained, so they keep the process
        alive and cannot be joined from here — matching the original design.
        """
        for name, app in self.app.items():
            threading.Thread(target=app.start, name=name).start()