|
@@ -2,13 +2,9 @@ import threading
|
|
|
import time
|
|
|
from typing import List, Mapping
|
|
|
|
|
|
-from common.databases import insert_one, int2long
|
|
|
+from common.databases import insert_one
|
|
|
from common.log import logger
|
|
|
from common.tools import delay_by
|
|
|
-from crawler.Task import Task
|
|
|
-from crawler.analysis import Parser
|
|
|
-from crawler.download import Downloader
|
|
|
-from crawler.schedule import Scheduler
|
|
|
from constants import (
|
|
|
ORGANIZATION,
|
|
|
KEYWORD,
|
|
@@ -17,10 +13,12 @@ from constants import (
|
|
|
VISIT_CLASSIFY,
|
|
|
QUERY_CLASSIFY
|
|
|
)
|
|
|
-from crawler.utils import extract_domain, is_url
|
|
|
+from crawler.Task import Task
|
|
|
+from crawler.analysis import Parser
|
|
|
+from crawler.download import Downloader
|
|
|
+from crawler.schedule import Scheduler
|
|
|
from crawler.validate import Validator
|
|
|
from settings import (
|
|
|
- MGO_LUA_SPIDERS,
|
|
|
MGO_URLS,
|
|
|
MGO_ORGS,
|
|
|
MGO_KEYWORDS,
|
|
@@ -137,7 +135,7 @@ class BasicSearch:
|
|
|
logger.info(f"[推送数据记录]【{task['name']} - {task['url']}】")
|
|
|
self._push_data('records', task, MGO_RECORDS)
|
|
|
|
|
|
- def seed_orgs(self) -> List[Mapping]:
|
|
|
+ def orgs_table(self) -> List[Mapping]:
|
|
|
"""组织|单位"""
|
|
|
search_orgs = []
|
|
|
cursor = MGO_ORGS.find(self.query, projection=self.projection)
|
|
@@ -145,7 +143,7 @@ class BasicSearch:
|
|
|
search_orgs.append(item)
|
|
|
return search_orgs
|
|
|
|
|
|
- def seed_keywords(self):
|
|
|
+ def keywords_table(self):
|
|
|
"""关键词"""
|
|
|
search_keywords = []
|
|
|
cursor = MGO_KEYWORDS.find(projection=self.projection)
|
|
@@ -153,7 +151,7 @@ class BasicSearch:
|
|
|
search_keywords.append(item['name'])
|
|
|
return search_keywords
|
|
|
|
|
|
- def seed_urls(self) -> List[Mapping]:
|
|
|
+ def seed_urls_table(self) -> List[Mapping]:
|
|
|
"""种子urls"""
|
|
|
search_urls = []
|
|
|
cursor = MGO_URLS.find(self.query, projection=self.projection)
|
|
@@ -161,28 +159,10 @@ class BasicSearch:
|
|
|
search_urls.append(item)
|
|
|
return search_urls
|
|
|
|
|
|
- def seed_competing_goods(self):
|
|
|
+ def competing_goods_table(self):
|
|
|
"""竞品urls"""
|
|
|
competing_goods = []
|
|
|
cursor = MGO_COMPETING_GOODS.find(self.query, projection=self.projection)
|
|
|
for item in cursor.sort(self.sort):
|
|
|
competing_goods.append(item)
|
|
|
return competing_goods
|
|
|
-
|
|
|
- def lua_common_domains(self):
|
|
|
- """从lua采集爬虫配置表获取网址域名,推送到数据源收录器"""
|
|
|
- _c = 0
|
|
|
- projection = {'param_common': 1}
|
|
|
- cursor = MGO_LUA_SPIDERS.find(projection=projection)
|
|
|
- for item in cursor.sort(self.sort):
|
|
|
- try:
|
|
|
- url = item['param_common'][11]
|
|
|
- if not is_url(url):
|
|
|
- continue
|
|
|
- domain = extract_domain(url)
|
|
|
- except IndexError:
|
|
|
- continue
|
|
|
- if not self.collector.data(domain):
|
|
|
- self.collector.add_data(domain)
|
|
|
- _c += 1
|
|
|
- return _c
|