1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374 |
- import threading
- from crawler.engines import BingSearchEngine, QccSearchEngine
- from crawler.services import (
- SyncData,
- DataQuery,
- DataExcavate
- )
class BreadthCrawler:
    """Top-level orchestrator for the breadth-first crawl pipeline.

    Depending on the enable flags, builds up to four service objects
    (data sync, keyword query, organization query, excavation), stores
    them in ``self.app`` keyed by a human-readable thread name, and
    starts each one on its own daemonless thread via :meth:`start`.
    """

    # Scoring weights merged into the kwargs forwarded to every service.
    _weight_items = {
        'url_weight': 20,
        'keyword_weight': 15,
        'org_weight': 5,
    }

    def __init__(
        self,
        enable_sync_data: bool,
        enable_query: bool,
        enable_excavate: bool,
        **kwargs,
    ):
        """Configure which services run and build them.

        :param enable_sync_data: create the ``SyncData`` service.
        :param enable_query: create both ``DataQuery`` services
            (Bing keyword search and Qcc organization search).
        :param enable_excavate: create the ``DataExcavate`` service.
        :param kwargs: extra options forwarded to every service;
            recognized keys include ``query_workers``,
            ``excavate_workers`` and ``excavate_depth``.
        """
        self.enable_query = enable_query
        self.enable_excavate = enable_excavate
        self.enable_sync_data = enable_sync_data
        # Weight items override any caller-supplied values of the same keys.
        kwargs.update(self._weight_items)
        # Maps thread name -> service instance; populated by _init().
        self.app: dict = {}
        self._init(**kwargs)

    def _init(self, **kwargs):
        """Instantiate the enabled services and register them in ``self.app``."""
        if self.enable_sync_data:
            _sync_data = SyncData(
                init_validator=True,
                init_collector=True,
                validate_interval=1200,
                keywords_interval=4 * 3600,
                competing_goods_interval=600,
                seed_urls_interval=10,
                orgs_interval=5 * 3600,
                **kwargs
            )
            self.app['MainSyncData'] = _sync_data
        if self.enable_query:
            # Pop once and reuse: the original code popped 'query_workers'
            # twice, so the second DataQuery (organization search) always
            # received None instead of the configured worker count.
            # Popping also keeps the key out of **kwargs below, avoiding a
            # duplicate-keyword TypeError.
            query_workers = kwargs.pop('query_workers', None)
            _query_keyword = DataQuery(
                engine=BingSearchEngine(),
                query_workers=query_workers,
                max_pages=30,
                query_interval=300,
                **kwargs
            )
            _query_organization = DataQuery(
                engine=QccSearchEngine(),
                query_workers=query_workers,
                query_interval=1800,
                **kwargs
            )
            self.app['MainQueryKeyWord'] = _query_keyword
            self.app['MainQueryOrganization'] = _query_organization
        if self.enable_excavate:
            _excavator = DataExcavate(
                workers=kwargs.pop('excavate_workers', None),
                excavate_depth=kwargs.pop('excavate_depth', 3),
                excavate_interval=10,
                **kwargs
            )
            self.app['MainDataExcavate'] = _excavator

    def start(self):
        """Launch every registered service on its own named thread.

        NOTE(review): threads are started but not joined or retained;
        presumably the services run forever — confirm shutdown semantics
        with the service implementations.
        """
        for name, app in self.app.items():
            threading.Thread(target=app.start, name=name).start()
|