123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- import threading
- from .engines import BingSearchEngine, QccSearchEngine
- from .services import (
- SyncData,
- DataQuery,
- DataExcavate
- )
class BreadthCrawler:
    """Coordinates the breadth-first crawling sub-apps.

    Depending on the ``enable_*`` flags, instantiates up to four sub-apps
    (data sync, Bing search, Qcc query, page excavation) and runs each
    one in its own thread via :meth:`start`.
    """

    # Scoring weights merged into the kwargs forwarded to every sub-app.
    _weight_items = {
        'url_weight': 20,
        'keyword_weight': 15,
        'org_weight': 5,
    }

    def __init__(
            self,
            enable_sync_data: bool,
            enable_search: bool,
            enable_query: bool,
            enable_excavate: bool,
            **kwargs,
    ):
        """Store the feature flags and build the enabled sub-apps.

        :param enable_sync_data: create the ``SyncData`` app.
        :param enable_search: create the Bing-backed ``DataQuery`` app.
        :param enable_query: create the Qcc-backed ``DataQuery`` app.
        :param enable_excavate: create the ``DataExcavate`` app.
        :param kwargs: extra options forwarded to every sub-app; may
            include ``query_workers``, ``excavate_workers`` and
            ``excavate_depth`` (consumed by the matching apps).
        """
        self.enable_sync_data = enable_sync_data
        self.enable_search = enable_search
        self.enable_query = enable_query
        self.enable_excavate = enable_excavate
        kwargs.update(self._weight_items)
        self._app = {}
        self._init(**kwargs)

    def _init(self, **kwargs):
        """Instantiate the enabled sub-apps into ``self._app``.

        BUGFIX: the original popped ``query_workers`` separately inside
        the Bing and Qcc branches, so with both flags enabled the Bing
        branch consumed the key and the Qcc app silently received
        ``None``. The excavate options were similarly order-dependent.
        Shared options are now extracted exactly once, up front, so every
        branch sees the same values regardless of which flags are set
        (and none of these keys leaks into other apps via ``**kwargs``).
        """
        query_workers = kwargs.pop('query_workers', None)
        excavate_workers = kwargs.pop('excavate_workers', None)
        excavate_depth = kwargs.pop('excavate_depth', 3)

        if self.enable_sync_data:
            self._app['SyncData'] = SyncData(
                init_validator=True,
                init_collector=True,
                validate_interval=300,
                keywords_interval=4 * 3600,
                competing_goods_interval=600,
                seed_urls_interval=10,
                orgs_interval=5 * 3600,
                **kwargs,
            )
        if self.enable_search:
            self._app['BingSearch'] = DataQuery(
                engine=BingSearchEngine(),
                query_workers=query_workers,
                max_pages=30,
                query_interval=3600,
                **kwargs,
            )
        if self.enable_query:
            self._app['QccQuery'] = DataQuery(
                engine=QccSearchEngine(),
                query_workers=query_workers,
                query_interval=1800,
                **kwargs,
            )
        if self.enable_excavate:
            self._app['DataExcavate'] = DataExcavate(
                workers=excavate_workers,
                excavate_depth=excavate_depth,
                excavate_interval=10,
                **kwargs,
            )

    def start(self):
        """Launch each enabled sub-app in its own named thread."""
        for name, app in self._app.items():
            threading.Thread(target=app.start, name=name).start()
|