@@ -61,27 +61,30 @@ class BreadthCrawler:
print("Shutting down the source-discovery crawler")
break

- # TODO How should group/level management be reflected in the crawl?
-
task_key, task = tasks
+ groups = task['groups']
domain = extract_domain(task['url'])
- visit_domain = self._validator.url(domain)
- if not visit_domain:
+ allow_visit_domain = self._validator.url(domain)
+ if not allow_visit_domain:
continue
- logger.info(f'About to visit: {domain}')
+
+ logger.info(f'request web site -> {task["url"]}')
response = self._downloader.get(task['url'])
+ print(response, len(response.text))
if response.status_code != 200 or response.text in ['']:
continue
- page_source = response.text
+
task['domain'] = domain
base_url = extract_base_url(task['url'])
task['base_url'] = base_url
+ page_source = response.text
title = extract_page_title(page_source)
+ print(title)
task['name'] = title
try:
self.verify(task)
urls = self._parser(page_source, base_url)
- new_tasks = [Task(url=url) for url in urls]
+ new_tasks = [Task(url=url, groups=groups) for url in urls]
self._scheduler.insert_tasks(new_tasks)
except HostsRetrieveError:
pass
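The hunk above calls three helpers (extract_domain, extract_base_url, extract_page_title) whose definitions are outside this diff. A minimal sketch of what they plausibly do, using only the standard library; the bodies below are illustrative assumptions, not the project's actual implementations:

from urllib.parse import urlparse
import re

def extract_domain(url):
    # Assumed behaviour: return just the host part of a URL.
    return urlparse(url).netloc

def extract_base_url(url):
    # Assumed behaviour: reduce a URL to scheme + host,
    # e.g. https://example.com/a/b?c=1 -> https://example.com
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}"

def extract_page_title(page_source):
    # Assumed behaviour: return the text of the first <title> tag, or '' if absent.
    match = re.search(r"<title[^>]*>(.*?)</title>", page_source, re.I | re.S)
    return match.group(1).strip() if match else ''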
@@ -89,7 +92,7 @@ class BreadthCrawler:
def set_search_engine(self, engine=None):
if isinstance(engine, JySearchEngine):
self._engines.append(engine)
- logger.info(f'[Search engine - {engine.__class__.__name__}] added successfully')
+ logger.info(f'[Search engine - {engine.__class__.__name__}]added successfully')
return self

def set_search_engines(self, engines):
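set_search_engine silently ignores anything that is not a JySearchEngine and returns self, so engines can be registered in a chain. A self-contained sketch of that register-and-chain pattern; the classes below are stand-ins written for illustration, not the project's real ones:

class JySearchEngine:
    # Stand-in base class for the sketch.
    def search(self, word):
        raise NotImplementedError

class DemoEngine(JySearchEngine):
    def search(self, word):
        return [f'https://example.com/search?q={word}']

class MiniCrawler:
    def __init__(self):
        self._engines = []

    def set_search_engine(self, engine=None):
        # Same gate as above: only JySearchEngine instances are accepted.
        if isinstance(engine, JySearchEngine):
            self._engines.append(engine)
        return self

crawler = MiniCrawler().set_search_engine(DemoEngine()).set_search_engine(object())
print(len(crawler._engines))  # 1 -- the non-engine argument was ignored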
@@ -99,9 +102,10 @@ class BreadthCrawler:
def search_words(self, engine, words):
for word in words:
- logger.info(f"[Search engine - {engine.__class__.__name__}] search: {word}")
+ word = str(word).replace(' ', '').strip()
+ logger.info(f"[{engine.__class__.__name__} - search] {word}")
urls = engine.search(word)
- lst = [Task(url=url) for url in urls]
+ lst = [Task(url=url, groups='keywords') for url in urls]
self._scheduler.insert_tasks(lst)

def enable_search_engines(self):
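Both this hunk and the crawl loop in the first hunk now pass groups into Task, and the loop also reads and writes tasks by key (task['url'], task['groups'], task['domain'] = ...). Task's real definition lives elsewhere in the codebase; a minimal sketch of the shape these call sites assume, given only to make the diff easier to follow:

class Task(dict):
    # Sketch only: a dict-like task record built from keyword arguments,
    # matching Task(url=..., groups=...) and the task['...'] accesses above.
    def __init__(self, url, groups=None, **extra):
        super().__init__(url=url, groups=groups, **extra)

Under that assumption, tasks created from search-engine results carry groups='keywords', while URLs discovered on a page inherit the groups value of the task that produced them.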
@@ -112,7 +116,7 @@ class BreadthCrawler:
futures = []
for engine in self._engines:
- logger.info(f"[Search engine - {engine.__class__.__name__}] started successfully")
+ logger.info(f"[Search engine - {engine.__class__.__name__}]started successfully")
f = executor.submit(self.search_words, engine, search_words)
f.add_done_callback(err_details)
futures.append(f)
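err_details is attached as the done-callback of every submitted future, but it is defined outside this diff. A hedged sketch of what such a callback usually does: concurrent.futures keeps a worker's exception on the future, so the callback can log it instead of letting it vanish inside the pool (the project's real err_details may differ):

import logging

logger = logging.getLogger(__name__)

def err_details(future):
    # Sketch: surface any exception raised inside search_words.
    if future.cancelled():
        return
    exc = future.exception()
    if exc is not None:
        logger.error('search engine task failed: %r', exc)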