__init__.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. import threading
  2. from crawler.engines import BingSearchEngine, QccSearchEngine
  3. from crawler.services import (
  4. SyncData,
  5. DataQuery,
  6. DataExcavate
  7. )
  8. class BreadthCrawler:
  9. _weight_items = {
  10. 'url_weight': 20,
  11. 'keyword_weight': 15,
  12. 'org_weight': 5,
  13. }
  14. def __init__(
  15. self,
  16. enable_sync_data: bool,
  17. enable_query: bool,
  18. enable_excavate: bool,
  19. **kwargs,
  20. ):
  21. self.enable_query = enable_query
  22. self.enable_excavate = enable_excavate
  23. self.enable_sync_data = enable_sync_data
  24. kwargs.update(self._weight_items)
  25. self.app = {}
  26. self._init(**kwargs)
  27. def _init(self, **kwargs):
  28. if self.enable_sync_data:
  29. _sync_data = SyncData(
  30. init_validator=True,
  31. init_collector=True,
  32. validate_interval=1200,
  33. keywords_interval=4 * 3600,
  34. competing_goods_interval=600,
  35. seed_urls_interval=10,
  36. orgs_interval=5 * 3600,
  37. **kwargs
  38. )
  39. self.app['MainSyncData'] = _sync_data
  40. if self.enable_query:
  41. _query_keyword = DataQuery(
  42. engine=BingSearchEngine(),
  43. query_workers=kwargs.pop('query_workers', None),
  44. max_pages=30,
  45. query_interval=300,
  46. **kwargs
  47. )
  48. _query_organization = DataQuery(
  49. engine=QccSearchEngine(),
  50. query_workers=kwargs.pop('query_workers', None),
  51. query_interval=1800,
  52. **kwargs
  53. )
  54. self.app['MainQueryKeyWord'] = _query_keyword
  55. self.app['MainQueryOrganization'] = _query_organization
  56. if self.enable_excavate:
  57. _excavator = DataExcavate(
  58. workers=kwargs.pop('excavate_workers', None),
  59. excavate_depth=kwargs.pop('excavate_depth', 3),
  60. excavate_interval=10,
  61. **kwargs
  62. )
  63. self.app['MainDataExcavate'] = _excavator
  64. def start(self):
  65. for name, app in self.app.items():
  66. threading.Thread(target=app.start, name=name).start()