__init__.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. import threading
  2. from .engines import BingSearchEngine, QccSearchEngine
  3. from .services import (
  4. SyncData,
  5. DataQuery,
  6. DataExcavate
  7. )
  8. class BreadthCrawler:
  9. _weight_items = {
  10. 'url_weight': 20,
  11. 'keyword_weight': 15,
  12. 'org_weight': 5,
  13. }
  14. def __init__(
  15. self,
  16. enable_sync_data: bool,
  17. enable_search: bool,
  18. enable_query: bool,
  19. enable_excavate: bool,
  20. **kwargs,
  21. ):
  22. self.enable_sync_data = enable_sync_data
  23. self.enable_search = enable_search
  24. self.enable_query = enable_query
  25. self.enable_excavate = enable_excavate
  26. kwargs.update(self._weight_items)
  27. self._app = {}
  28. self._init(**kwargs)
  29. def _init(self, **kwargs):
  30. if self.enable_sync_data:
  31. _sync_data = SyncData(
  32. init_validator=True,
  33. init_collector=True,
  34. validate_interval=300,
  35. keywords_interval=4 * 3600,
  36. competing_goods_interval=600,
  37. seed_urls_interval=10,
  38. orgs_interval=5 * 3600,
  39. **kwargs
  40. )
  41. self._app['SyncData'] = _sync_data
  42. if self.enable_search:
  43. _bing_search = DataQuery(
  44. engine=BingSearchEngine(),
  45. query_workers=kwargs.pop('query_workers', None),
  46. max_pages=30,
  47. query_interval=3600,
  48. **kwargs
  49. )
  50. self._app['BingSearch'] = _bing_search
  51. if self.enable_query:
  52. _qcc_query = DataQuery(
  53. engine=QccSearchEngine(),
  54. query_workers=kwargs.pop('query_workers', None),
  55. query_interval=1800,
  56. **kwargs
  57. )
  58. self._app['QccQuery'] = _qcc_query
  59. if self.enable_excavate:
  60. _excavator = DataExcavate(
  61. workers=kwargs.pop('excavate_workers', None),
  62. excavate_depth=kwargs.pop('excavate_depth', 3),
  63. excavate_interval=10,
  64. **kwargs
  65. )
  66. self._app['DataExcavate'] = _excavator
  67. def start(self):
  68. for name, app in self._app.items():
  69. threading.Thread(target=app.start, name=name).start()