dongzhaorui 3 年之前
父节点
当前提交
aa8118002c
共有 1 个文件被更改,包括 34 次插入47 次删除
  1. 34 47
      find_source/crawler/__init__.py

+ 34 - 47
find_source/crawler/__init__.py

@@ -1,36 +1,37 @@
 import threading
 
+from crawler.engines import BingSearchEngine, QccSearchEngine
 from crawler.services import (
     SyncData,
-    QueryKeyWord,
-    QueryOrganization,
+    DataQuery,
     DataExcavate
 )
 
 
 class BreadthCrawler:
+    _weight_items = {
+        'url_weight': 20,
+        'keyword_weight': 15,
+        'org_weight': 5,
+    }
 
     def __init__(
             self,
-            allow_sync_data: bool,
-            allow_query: bool,
-            allow_excavate: bool,
+            enable_sync_data: bool,
+            enable_query: bool,
+            enable_excavate: bool,
             **kwargs,
     ):
-        self.allow_sync_data = allow_sync_data
-        self.allow_query = allow_query
-        self.allow_excavate = allow_excavate
-
-        self.enable_keyword_query = False
-        self.enable_org_query = False
-        self.enable_excavate = True
-
+        self.enable_query = enable_query
+        self.enable_excavate = enable_excavate
+        self.enable_sync_data = enable_sync_data
+        kwargs.update(self._weight_items)
+        self.app = {}
         self._init(**kwargs)
 
     def _init(self, **kwargs):
-        if self.allow_sync_data:
-            '''同步与定时推送数据服务'''
-            SyncData(
+        if self.enable_sync_data:
+            _sync_data = SyncData(
                 init_validator=True,
                 init_collector=True,
                 validate_interval=1200,
@@ -40,48 +41,34 @@ class BreadthCrawler:
                 orgs_interval=5 * 3600,
                 **kwargs
             )
-        if self.allow_query:
-            '''查询服务 - 搜索词'''
-            self._query_kw = QueryKeyWord(
-                engine=kwargs.pop('keyword_query_engine', None),
-                query_workers=kwargs.pop('keyword_query_workers', None),
+            self.app['MainSyncData'] = _sync_data
+
+        if self.enable_query:
+            _query_keyword = DataQuery(
+                engine=BingSearchEngine(),
+                query_workers=kwargs.pop('query_workers', None),
                 max_pages=30,
                 query_interval=300,
                 **kwargs
             )
-            self.enable_keyword_query = True
-            '''查询服务 - 组织单位'''
-            self._query_org = QueryOrganization(
-                engine=kwargs.pop('org_query_engine', None),
-                query_workers=kwargs.pop('org_query_workers ', None),
+            _query_organization = DataQuery(
+                engine=QccSearchEngine(),
+                query_workers=kwargs.pop('query_workers', None),
                 query_interval=1800,
                 **kwargs
             )
-            self.enable_org_query = True
-        if self.allow_excavate:
-            '''数据挖掘'''
-            self._excavator = DataExcavate(
+            self.app['MainQueryKeyWord'] = _query_keyword
+            self.app['MainQueryOrganization'] = _query_organization
+
+        if self.enable_excavate:
+            _excavator = DataExcavate(
                 workers=kwargs.pop('excavate_workers', None),
                 excavate_depth=kwargs.pop('excavate_depth', 3),
                 excavate_interval=10,
                 **kwargs
             )
-            self.enable_excavate = True
+            self.app['MainDataExcavate'] = _excavator
 
     def start(self):
-        if self.allow_query:
-            if self.enable_keyword_query:
-                threading.Thread(
-                    target=self._query_kw.start,
-                    name='MainKeywordQuery'
-                ).start()
-            if self.enable_org_query:
-                threading.Thread(
-                    target=self._query_org.start,
-                    name='MainOrganizationQuery'
-                ).start()
-        if self.enable_excavate:
-            threading.Thread(
-                target=self._excavator.start,
-                name='MainDataExcavate'
-            ).start()
+        for name, app in self.app.items():
+            threading.Thread(target=app.start, name=name).start()