Przeglądaj źródła

添加 init_collector 控制变量

dongzhaorui 3 lata temu
rodzic
commit
c821257f2b

+ 1 - 0
find_source/build_spider.py

@@ -5,6 +5,7 @@ from crawler.engines import BingSearchEngine, QccSearchEngine
 def main():
     BreadthCrawler(
         init_validator=True,
+        init_collector=True,
         url_weight=20,
         org_weight=5,
         keyword_weight=15,

+ 9 - 29
find_source/crawler/services/basics.py

@@ -2,13 +2,9 @@ import threading
 import time
 from typing import List, Mapping
 
-from common.databases import insert_one, int2long
+from common.databases import insert_one
 from common.log import logger
 from common.tools import delay_by
-from crawler.Task import Task
-from crawler.analysis import Parser
-from crawler.download import Downloader
-from crawler.schedule import Scheduler
 from constants import (
     ORGANIZATION,
     KEYWORD,
@@ -17,10 +13,12 @@ from constants import (
     VISIT_CLASSIFY,
     QUERY_CLASSIFY
 )
-from crawler.utils import extract_domain, is_url
+from crawler.Task import Task
+from crawler.analysis import Parser
+from crawler.download import Downloader
+from crawler.schedule import Scheduler
 from crawler.validate import Validator
 from settings import (
-    MGO_LUA_SPIDERS,
     MGO_URLS,
     MGO_ORGS,
     MGO_KEYWORDS,
@@ -137,7 +135,7 @@ class BasicSearch:
         logger.info(f"[推送数据记录]【{task['name']} - {task['url']}】")
         self._push_data('records', task, MGO_RECORDS)
 
-    def seed_orgs(self) -> List[Mapping]:
+    def orgs_table(self) -> List[Mapping]:
         """组织|单位"""
         search_orgs = []
         cursor = MGO_ORGS.find(self.query, projection=self.projection)
@@ -145,7 +143,7 @@ class BasicSearch:
             search_orgs.append(item)
         return search_orgs
 
-    def seed_keywords(self):
+    def keywords_table(self):
         """关键词"""
         search_keywords = []
         cursor = MGO_KEYWORDS.find(projection=self.projection)
@@ -153,7 +151,7 @@ class BasicSearch:
             search_keywords.append(item['name'])
         return search_keywords
 
-    def seed_urls(self) -> List[Mapping]:
+    def seed_urls_table(self) -> List[Mapping]:
         """种子urls"""
         search_urls = []
         cursor = MGO_URLS.find(self.query, projection=self.projection)
@@ -161,28 +159,10 @@ class BasicSearch:
             search_urls.append(item)
         return search_urls
 
-    def seed_competing_goods(self):
+    def competing_goods_table(self):
         """竞品urls"""
         competing_goods = []
         cursor = MGO_COMPETING_GOODS.find(self.query, projection=self.projection)
         for item in cursor.sort(self.sort):
             competing_goods.append(item)
         return competing_goods
-
-    def lua_common_domains(self):
-        """从lua采集爬虫配置表获取网址域名,推送到数据源收录器"""
-        _c = 0
-        projection = {'param_common': 1}
-        cursor = MGO_LUA_SPIDERS.find(projection=projection)
-        for item in cursor.sort(self.sort):
-            try:
-                url = item['param_common'][11]
-                if not is_url(url):
-                    continue
-                domain = extract_domain(url)
-            except IndexError:
-                continue
-            if not self.collector.data(domain):
-                self.collector.add_data(domain)
-                _c += 1
-        return _c