__init__.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. import threading
  2. from crawler.services import (
  3. SyncData,
  4. QueryKeyWord,
  5. QueryOrganization,
  6. DataExcavate
  7. )
  8. class BreadthCrawler:
  9. def __init__(
  10. self,
  11. allow_sync_data: bool,
  12. allow_query: bool,
  13. allow_excavate: bool,
  14. **kwargs,
  15. ):
  16. self.allow_sync_data = allow_sync_data
  17. self.allow_query = allow_query
  18. self.allow_excavate = allow_excavate
  19. self.enable_keyword_query = False
  20. self.enable_org_query = False
  21. self.enable_excavate = True
  22. self._init(**kwargs)
  23. def _init(self, **kwargs):
  24. if self.allow_sync_data:
  25. '''同步与定时推送数据服务'''
  26. SyncData(
  27. init_validator=True,
  28. init_collector=True,
  29. validate_interval=1200,
  30. keywords_interval=4 * 3600,
  31. competing_goods_interval=600,
  32. seed_urls_interval=10,
  33. orgs_interval=5 * 3600,
  34. **kwargs
  35. )
  36. if self.allow_query:
  37. '''查询服务 - 搜索词'''
  38. self._query_kw = QueryKeyWord(
  39. engine=kwargs.pop('keyword_query_engine', None),
  40. query_workers=kwargs.pop('keyword_query_workers', None),
  41. max_pages=30,
  42. query_interval=300,
  43. **kwargs
  44. )
  45. self.enable_keyword_query = True
  46. '''查询服务 - 组织单位'''
  47. self._query_org = QueryOrganization(
  48. engine=kwargs.pop('org_query_engine', None),
  49. query_workers=kwargs.pop('org_query_workers ', None),
  50. query_interval=1800,
  51. **kwargs
  52. )
  53. self.enable_org_query = True
  54. if self.allow_excavate:
  55. '''数据挖掘'''
  56. self._excavator = DataExcavate(
  57. workers=kwargs.pop('excavate_workers', None),
  58. excavate_depth=kwargs.pop('excavate_depth', 3),
  59. excavate_interval=10,
  60. **kwargs
  61. )
  62. self.enable_excavate = True
  63. def start(self):
  64. if self.allow_query:
  65. if self.enable_keyword_query:
  66. threading.Thread(
  67. target=self._query_kw.start,
  68. name='MainKeywordQuery'
  69. ).start()
  70. if self.enable_org_query:
  71. threading.Thread(
  72. target=self._query_org.start,
  73. name='MainOrganizationQuery'
  74. ).start()
  75. if self.enable_excavate:
  76. threading.Thread(
  77. target=self._excavator.start,
  78. name='MainDataExcavate'
  79. ).start()