import threading

from crawler.services import (
    SyncData,
    QueryKeyWord,
    QueryOrganization,
    DataExcavate
)
class BreadthCrawler:
    """Orchestrator for the breadth-first crawl pipeline.

    Construction wires up the background services (data sync, optional
    keyword search, optional organization lookup, and data excavation);
    ``start()`` launches each enabled service in its own thread.
    """

    def __init__(
            self,
            query_kw_engine=None,
            query_org_engine=None,
            **kwargs
    ):
        """Set up the crawler's services.

        Args:
            query_kw_engine: search engine for keyword queries; when
                ``None`` the keyword-query service is not created.
            query_org_engine: search engine for organization queries;
                when ``None`` the organization-query service is not
                created.
            **kwargs: tuning knobs forwarded to the individual services
                (worker counts, loop intervals, ``max_query_page``,
                ``init_validator``/``init_collector``).
        """
        self.enable_query_kw = False
        self.enable_query_org = False
        # Data sync / scheduled push service. Constructed purely for its
        # side effects; no handle is kept on the instance.
        SyncData(
            init_validator=kwargs.get('init_validator', False),
            init_collector=kwargs.get('init_collector', True),
            loop_interval=kwargs.get('loop_sync_interval', 1200),
        )
        # Keyword-search service — only when an engine was supplied.
        if query_kw_engine is not None:
            self._query_kw = QueryKeyWord(
                engine=query_kw_engine,
                query_workers=kwargs.get('query_kw_workers', 1),
                loop_query_interval=kwargs.get('loop_query_kw_interval', 60),
                max_query_page=kwargs.get('max_query_page', 3),
            )
            self.enable_query_kw = True
        # Organization-lookup service — only when an engine was supplied.
        if query_org_engine is not None:
            self._query_org = QueryOrganization(
                engine=query_org_engine,
                query_workers=kwargs.get('query_org_workers', 1),
                loop_query_interval=kwargs.get('loop_query_org_interval', 60),
            )
            self.enable_query_org = True
        # Data-excavation service — always created.
        self._excavator = DataExcavate(
            workers=kwargs.get('excavate_workers', 1),
            loop_interval=kwargs.get('loop_excavate_interval', 20),
        )

    def start(self):
        """Launch every enabled service in its own named thread.

        The excavator always runs; the two query services run only when
        their engines were supplied at construction time. Thread handles
        are not retained, matching the fire-and-forget design.
        """
        if self.enable_query_kw:
            threading.Thread(
                target=self._query_kw.start,
                name='MainQueryKeyWord'
            ).start()
        if self.enable_query_org:
            threading.Thread(
                target=self._query_org.start,
                name='MainQueryOrganization'
            ).start()
        threading.Thread(
            target=self._excavator.start,
            name='MainDataExcavate'
        ).start()