123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 |
- import threading
- from crawler.services import (
- SyncData,
- QueryKeyWord,
- QueryOrganization,
- DataExcavate
- )
class BreadthCrawler:
    """Wire up and launch the crawler's background services.

    Depending on the three ``allow_*`` switches, the constructor builds a
    ``SyncData`` service, the two query services (``QueryKeyWord`` and
    ``QueryOrganization``) and a ``DataExcavate`` service.  ``start()`` then
    runs every service that was actually built in its own thread.

    Args:
        allow_sync_data: Build the ``SyncData`` service.
        allow_query: Build the keyword and organization query services.
        allow_excavate: Build the ``DataExcavate`` service.
        **kwargs: Extra options forwarded to the services.  The following
            keys are consumed here and passed on under a different name:
            ``keyword_query_engine``, ``keyword_query_workers``,
            ``org_query_engine``, ``org_query_workers``,
            ``excavate_workers`` and ``excavate_depth`` (default ``3``).
            Whatever is left is forwarded unchanged as ``**kwargs``.
    """

    def __init__(
            self,
            allow_sync_data: bool,
            allow_query: bool,
            allow_excavate: bool,
            **kwargs,
    ) -> None:
        self.allow_sync_data = allow_sync_data
        self.allow_query = allow_query
        self.allow_excavate = allow_excavate
        # Each flag is only switched on by _init() once the matching service
        # object exists, so start() never touches a missing attribute.
        self.enable_keyword_query = False
        self.enable_org_query = False
        self.enable_excavate = False
        self._init(**kwargs)

    def _init(self, **kwargs) -> None:
        """Instantiate every service permitted by the ``allow_*`` switches.

        Service-specific options are popped from ``kwargs`` while each call's
        arguments are evaluated, i.e. before the trailing ``**kwargs`` is
        expanded, so a popped key is not forwarded a second time.
        """
        if self.allow_sync_data:
            # Data synchronisation and scheduled push service.
            # NOTE(review): the instance is not kept; presumably SyncData
            # schedules its own work on construction - confirm.
            SyncData(
                init_validator=True,
                init_collector=True,
                validate_interval=1200,
                keywords_interval=4 * 3600,
                competing_goods_interval=600,
                seed_urls_interval=10,
                orgs_interval=5 * 3600,
                **kwargs
            )

        if self.allow_query:
            # Query service - search keywords.
            self._query_kw = QueryKeyWord(
                engine=kwargs.pop('keyword_query_engine', None),
                query_workers=kwargs.pop('keyword_query_workers', None),
                max_pages=30,
                query_interval=300,
                **kwargs
            )
            self.enable_keyword_query = True

            # Query service - organizations.
            self._query_org = QueryOrganization(
                engine=kwargs.pop('org_query_engine', None),
                # The key must match exactly; a stray trailing space here
                # would silently drop the caller's worker count.
                query_workers=kwargs.pop('org_query_workers', None),
                query_interval=1800,
                **kwargs
            )
            self.enable_org_query = True

        if self.allow_excavate:
            # Data excavation (mining) service.
            self._excavator = DataExcavate(
                workers=kwargs.pop('excavate_workers', None),
                excavate_depth=kwargs.pop('excavate_depth', 3),
                excavate_interval=10,
                **kwargs
            )
            self.enable_excavate = True

    def start(self) -> None:
        """Start every service that was built, each in its own thread."""
        if self.allow_query:
            if self.enable_keyword_query:
                threading.Thread(
                    target=self._query_kw.start,
                    name='MainKeywordQuery'
                ).start()

            if self.enable_org_query:
                threading.Thread(
                    target=self._query_org.start,
                    name='MainOrganizationQuery'
                ).start()

        if self.enable_excavate:
            threading.Thread(
                target=self._excavator.start,
                name='MainDataExcavate'
            ).start()
|