"""crawler package entry point (``__init__.py``)."""
  1. import threading
  2. from crawler.services import (
  3. SyncData,
  4. QueryKeyWord,
  5. QueryOrganization,
  6. DataExcavate
  7. )
  8. class BreadthCrawler:
  9. def __init__(
  10. self,
  11. query_kw_engine=None,
  12. query_org_engine=None,
  13. **kwargs
  14. ):
  15. self.enable_query_kw = False
  16. self.enable_query_org = False
  17. '''同步与定时推送数据服务'''
  18. SyncData(
  19. init_validator=kwargs.get('init_validator', False),
  20. init_collector=kwargs.get('init_collector', True),
  21. loop_interval=kwargs.get('loop_sync_interval', 1200)
  22. )
  23. '''查询搜索词'''
  24. if query_kw_engine is not None:
  25. self._query_kw = QueryKeyWord(
  26. engine=query_kw_engine,
  27. query_workers=kwargs.get('query_kw_workers', 1),
  28. loop_query_interval=kwargs.get('loop_query_kw_interval', 60),
  29. max_query_page=kwargs.get('max_query_page', 3)
  30. )
  31. self.enable_query_kw = True
  32. '''查询组织单位'''
  33. if query_org_engine is not None:
  34. self._query_org = QueryOrganization(
  35. engine=query_org_engine,
  36. query_workers=kwargs.get('query_org_workers', 1),
  37. loop_query_interval=kwargs.get('loop_query_org_interval', 60),
  38. )
  39. self.enable_query_org = True
  40. '''数据挖掘'''
  41. self._excavator = DataExcavate(
  42. workers=kwargs.get('excavate_workers', 1),
  43. loop_interval=kwargs.get('loop_excavate_interval', 20)
  44. )
  45. def start(self):
  46. if self.enable_query_kw:
  47. threading.Thread(
  48. target=self._query_kw.start,
  49. name='MainQueryKeyWord'
  50. ).start()
  51. if self.enable_query_org:
  52. threading.Thread(
  53. target=self._query_org.start,
  54. name='MainQueryOrganization'
  55. ).start()
  56. threading.Thread(
  57. target=self._excavator.start,
  58. name='MainDataExcavate'
  59. ).start()