dongzhaorui преди 3 години
родител
ревизия
ff784f50c2
променени са 1 файла, в които са добавени 64 реда и са изтрити 41 реда
  1. 64 41
      find_source/crawler/__init__.py

+ 64 - 41
find_source/crawler/__init__.py

@@ -12,53 +12,76 @@ class BreadthCrawler:
 
     def __init__(
             self,
-            query_kw_engine=None,
-            query_org_engine=None,
-            **kwargs
+            allow_sync_data: bool,
+            allow_query: bool,
+            allow_excavate: bool,
+            **kwargs,
     ):
-        self.enable_query_kw = False
-        self.enable_query_org = False
-        '''同步与定时推送数据服务'''
-        SyncData(
-            init_validator=kwargs.pop('init_validator', False),
-            init_collector=kwargs.pop('init_collector', True),
-            loop_interval=kwargs.pop('loop_sync_interval', 1200)
-        )
-        '''查询搜索词'''
-        if query_kw_engine is not None:
+        self.allow_sync_data = allow_sync_data
+        self.allow_query = allow_query
+        self.allow_excavate = allow_excavate
+
+        self.enable_keyword_query = False
+        self.enable_org_query = False
+        self.enable_excavate = True
+
+        self._init(**kwargs)
+
+    def _init(self, **kwargs):
+        if self.allow_sync_data:
+            '''同步与定时推送数据服务'''
+            SyncData(
+                init_validator=True,
+                init_collector=True,
+                validate_interval=1200,
+                keywords_interval=4 * 3600,
+                competing_goods_interval=600,
+                seed_urls_interval=10,
+                orgs_interval=5 * 3600,
+                **kwargs
+            )
+        if self.allow_query:
+            '''查询服务 - 搜索词'''
             self._query_kw = QueryKeyWord(
-                engine=query_kw_engine,
-                query_workers=kwargs.pop('query_kw_workers', 1),
-                loop_query_interval=kwargs.pop('loop_query_kw_interval', 60),
-                max_query_page=kwargs.pop('max_query_page', 3)
+                engine=kwargs.pop('keyword_query_engine', None),
+                query_workers=kwargs.pop('keyword_query_workers', None),
+                max_pages=30,
+                query_interval=300,
+                **kwargs
             )
-            self.enable_query_kw = True
-        '''查询组织单位'''
-        if query_org_engine is not None:
+            self.enable_keyword_query = True
+            '''查询服务 - 组织单位'''
             self._query_org = QueryOrganization(
-                engine=query_org_engine,
-                query_workers=kwargs.pop('query_org_workers', 1),
-                loop_query_interval=kwargs.pop('loop_query_org_interval', 60),
+                engine=kwargs.pop('org_query_engine', None),
+                query_workers=kwargs.pop('org_query_workers ', None),
+                query_interval=1800,
+                **kwargs
             )
-            self.enable_query_org = True
-        '''数据挖掘'''
-        self._excavator = DataExcavate(
-            workers=kwargs.pop('excavate_workers', 1),
-            loop_interval=kwargs.pop('loop_excavate_interval', 20)
-        )
+            self.enable_org_query = True
+        if self.allow_excavate:
+            '''数据挖掘'''
+            self._excavator = DataExcavate(
+                workers=kwargs.pop('excavate_workers', None),
+                excavate_depth=kwargs.pop('excavate_depth', 3),
+                excavate_interval=10,
+                **kwargs
+            )
+            self.enable_excavate = True
 
     def start(self):
-        if self.enable_query_kw:
-            threading.Thread(
-                target=self._query_kw.start,
-                name='MainQueryKeyWord'
-            ).start()
-        if self.enable_query_org:
+        if self.allow_query:
+            if self.enable_keyword_query:
+                threading.Thread(
+                    target=self._query_kw.start,
+                    name='MainKeywordQuery'
+                ).start()
+            if self.enable_org_query:
+                threading.Thread(
+                    target=self._query_org.start,
+                    name='MainOrganizationQuery'
+                ).start()
+        if self.enable_excavate:
             threading.Thread(
-                target=self._query_org.start,
-                name='MainQueryOrganization'
+                target=self._excavator.start,
+                name='MainDataExcavate'
             ).start()
-        threading.Thread(
-            target=self._excavator.start,
-            name='MainDataExcavate'
-        ).start()