@@ -1,18 +1,19 @@
 import re
-import threading
 from concurrent.futures import ThreadPoolExecutor, wait
-from typing import List

 from common.log import logger
 from crawler.Task import Task
+from crawler.analysis import TimeExtractor
 from crawler.services.basics import BasicService
 from crawler.utils import (
     extract_host,
     extract_page_title,
     extract_domain,
-    split_domain,
     err_details,
     is_url,
+    html2element,
+    iter_node,
+    check_page_by_words
 )

 TLDS = ['com', 'cn', 'net', 'org']
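+# NOTE: interceptor() below matches URLs against URL_SUFFIX_PATTERN, which is
+# not visible in this hunk; it is assumed to be defined (or imported) elsewhere
+# in this module and to match attachment-style suffixes (e.g. .pdf, .zip).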
@@ -34,105 +35,137 @@ class DataExcavate(BasicService):
         if task.get('depth') is None:
             task.update({'depth': self._default_depth})

-    def check_depth(self, task: Task, sub_tasks: List[Task]):
-        _results = []
-        curr_depth = task['depth']
-        if curr_depth < self._max_depth:
-            for t in sub_tasks:
-                t.setdefault('depth', curr_depth + 1)
-                _results.append(t)
-        return _results
-
-    def is_rubbish(self, url: str):
+    def interceptor(self, task: Task):
         """
-        URL filter
+        Interceptor: weeds out tasks that should be discarded

-        :param url: the URL
-        :return: False = not present, True = present
+        :param task: the request task
+        :return: True to discard the task, False to keep it
         """
+        url = task['url']
         if not is_url(url):
+            logger.debug(f'<{self.thread_name}> - junk URL - {url}')
             return True

-        if self.validator.data(url):
-            return True
-
+        curr_depth = task['depth']
         domain = extract_domain(url)
-        if self.validator.data(domain):
+        is_attachment_url = re.match(URL_SUFFIX_PATTERN, url) is not None
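+        # The list below is built eagerly, so every check runs even when an
+        # earlier one has already failed the task; a generator expression
+        # would let any() short-circuit if these lookups ever become costly.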
+        if any([
+            curr_depth > self._max_depth,     # task depth exceeds the limit
+            is_attachment_url,                # URL is an attachment download link
+            self.validator.data(url),         # garbage pool - duplicate request URL
+            self.validator.data(domain),      # garbage pool - filtered domain
+            self.collector.data(domain),      # collection pool - domain already collected
+        ]):
+            logger.debug(f'<{self.thread_name}> - invalid task - {curr_depth} - {url}')
             return True

-        if domain.startswith('www.'):
-            domain = domain.replace('www.', '')
+        return False

-        '''domain processing'''
-        domain_lst = split_domain(domain)
-        domain_lst = [d for d in domain_lst if d not in TLDS]
-        text = ".".join(domain_lst)
-        if self.validator.data(text):
-            return True
+    def filter_data(self, lst):
+        """Filter values through the garbage validator"""
+        results = []
+        for val in lst:
+            if not self.validator.data(val):
+                results.append(val)
+        return results
+
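+    # Hypothetical example: if the validator already contains
+    # 'http://spam.example', then
+    #     self.filter_data(['http://spam.example', 'http://new.example'])
+    # returns ['http://new.example'].
+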
+    def same_origin_strategy(self, source, task: Task):
+        """Same-origin strategy"""
+        # Scan for leaf nodes that carry date/time text
+        hit_date_total = 0
+        date_nodes = []
+        element = html2element(source)
+        for node, _ in iter_node(element):
+            pt = TimeExtractor().extractor(node)
+            if pt and not node.getchildren() and node not in date_nodes:
+                date_nodes.append(node)
+                hit_date_total += 1
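+        # Note: a fresh TimeExtractor is built for every node; hoisting a
+        # single instance out of the loop would avoid the repeated allocation
+        # if this scan ever shows up in profiling.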
+        # Scan the full text for search terms
+        hit_text_total = 0
+        all_text = ["".join(text.split()) for text in element.itertext()]
+        all_text = [text for text in all_text if len(text) > 0]
+        for text in all_text:
+            if check_page_by_words(text):
+                hit_text_total += 1
+        # Source-discovery verdict: keep the domain when the page shows a
+        # plausible number of dated entries (4-49) alongside several keyword
+        # hits, or when keyword hits alone are strong; otherwise discard it
+        if all([3 < hit_date_total < 50, hit_text_total > 3]):
+            self.push_domain(task)
+        elif hit_text_total > 5:
+            self.push_domain(task)
+        else:
+            self.push_remove(task)

-        '''check the strings contained in the domain'''
-        for val in domain_lst:
-            if self.validator.data(val):
-                return True
-        return False
+        # Extract same-origin links from the current page (same-origin mode:
+        # mode=1); descend one level without skipping any, and raise the
+        # weight by 1 so these tasks are processed first
+        lst = []
+        history = []
+        sub_depth = task['depth'] + 1
+        sub_weight = task['weight'] + 1
+        items = self.parser.turls(source, task['url'], mode=1)
+        for item in items:
+            name, url = item['title'], item['href']
+            if not self.validator.data(url) and url not in history:
+                lst.append(self.make_task(
+                    url=url,
+                    name=name,
+                    depth=sub_depth,
+                    origin=task['origin'],
+                    groups=task['groups'],
+                    classify=self.visit_classify,
+                    weight=sub_weight
+                ))
+                history.append(url)
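+        # The history list above makes `url not in history` a linear scan; a
+        # set would keep the dedup O(1) on link-heavy pages.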
+        self.scheduler.add_excavate(lst, level=sub_weight)

-    def process(self, t_name: str, task: Task):
-        logger.info(f'<{t_name}> - request - {task["url"]}')
-        response = self.downloader.get(task['url'], timeout=5)
+    def non_origin_strategy(self, source, task: Task):
+        """Non-same-origin strategy"""
+        lst = []
+        history = []
+        # Extract non-same-origin links from the current page (non-same-origin
+        # mode: mode=2) and add them to the excavate queue; once one site has
+        # been fully dug, processing moves on to a new site
+        non_origin_urls = self.parser.urls(source, task['url'], mode=2)
+        for url in non_origin_urls:
+            if not self.validator.data(url) and url not in history:
+                lst.append(self.make_task(
+                    url=url,
+                    origin=task['origin'],
+                    groups=task['groups'],
+                    classify=self.visit_classify,
+                    weight=self.url_weight
+                ))
+                history.append(url)
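+        # These tasks carry no 'depth'; _init_depth() fills in the default
+        # when the task is later pulled from the excavate queue.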
+        self.scheduler.add_excavate(lst, level=self.url_weight)
+
+    def fetch_page(self, task: Task):
+        url = task['url']
+        response = self.downloader.get(url, timeout=5)
         status_code = response.status_code
         page_source = response.text
         reason = response.reason
-        task['status_code'] = status_code
         if status_code != 200 or page_source in ['', None]:
             task['err_reason'] = reason
-            logger.error(f'<{t_name}> - {reason} - {status_code} - {task["url"]}')
+            msg = f'<{self.thread_name}> - {url} - {status_code} - {reason}'
+            logger.error(msg)
+            return status_code, None
+        return status_code, page_source
+
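+    # fetch_page() returns (status_code, page_source); page_source is None for
+    # any non-200 response or empty body, which process() treats as failure.
+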
+    def process(self, task: Task):
+        logger.info(f'<{self.thread_name}> - request - {task["url"]}')
+        status_code, page_source = self.fetch_page(task)
+        task['status_code'] = status_code
+        if page_source is None:
+            # TODO: should domains that fail to load be added to the filter?
+            self.push_remove(task)
             return False
-
         task['domain'] = extract_domain(task['url'])
-        task['name'] = extract_page_title(page_source)
         task['base_url'] = extract_host(task['url'])
-
-        lst = []
-        _history = []
-        _c = 0  # filter-word counter
-        sub_depth = task['depth'] + 1
-        sub_weight = task['weight'] + 1
-        items = self.parser.non_origin(page_source, task['url'])
-        for item in items:
-            name, url = item['title'], item['href']
-            if self.validator.words(name):
-                if url not in _history:
-                    lst.append(self.make_task(
-                        url=url,
-                        name=name,
-                        depth=sub_depth,
-                        origin=task['origin'],
-                        groups=task['groups'],
-                        classify=self.visit_classify,
-                        weight=sub_weight
-                    ))
-                    _history.append(url)
-                    _c += 1
-
-        if _c > 1:
-            save = self.push_domain(task)
-            msg = f'<{t_name}> - collected - {task["url"]}'
-            if not save:
-                msg = f'<{t_name}> - already collected - {task["url"]}'
-        else:
-            remove = self.push_remove(task)
-            msg = f'<{t_name}> - filtered and discarded - {task["url"]}'
-            if not remove:
-                msg = f'<{t_name}> - already collected - {task["url"]}'
-
-        logger.debug(msg)
-        '''deeper levels are collected first'''
-        self.scheduler.add_excavate(lst, level=sub_weight)
+        task['name'] = extract_page_title(page_source)
+        self.same_origin_strategy(page_source, task)
+        self.non_origin_strategy(page_source, task)
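+        # same_origin_strategy() both queues same-origin links and settles the
+        # collect/discard verdict for the domain; the non-origin pass above
+        # only queues links that lead to new sites.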
         return True

     def excavate(self):
-        t_name = threading.currentThread().getName()
-        logger.info(f'thread started - <{t_name}>')
+        logger.info(f'thread started - <{self.thread_name}>')
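+        # self.thread_name is assumed to be provided by BasicService and to
+        # reflect the thread_name_prefix configured in start().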
         while True:
             tasks = self.scheduler.get_excavate_task()
             if len(tasks) == 0:
@@ -140,31 +173,17 @@ class DataExcavate(BasicService):
                 continue

             task_key, task = tasks
-            # initialize site depth
-            self._init_depth(task)
-            if self.is_rubbish(task['url']):
-                logger.debug(f'<{t_name}> - junk data - {task["url"]}')
-                continue
-
-            # depth control
-            if task['depth'] > self._max_depth:
-                logger.debug(f'<{t_name}> - depth limit exceeded - {task["url"]}')
-                # self.push_records(task)
-                continue
-
-            dont_visit = re.match(URL_SUFFIX_PATTERN, task['url']) is not None
-            if not dont_visit:
+            self._init_depth(task)  # initialize site depth
+            if not self.interceptor(task):
                 try:
-                    success = self.process(t_name, task)
-                    if not success:
-                        self.validator.add_data(task['url'])
+                    self.process(task)
                 except Exception as e:
                     logger.exception(e)
             # '''excavation records'''
             # self.push_records(task)

     def start(self):
-        with ThreadPoolExecutor(self._workers, 'DataExcavate') as executor:
+        with ThreadPoolExecutor(self._workers, 'dataExcavate') as executor:
             futures = []
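+            # one long-lived excavate() loop per worker; the `wait` imported
+            # above presumably blocks on these futures later in this method.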
             for _ in range(1, self._workers + 1):
                 f = executor.submit(self.excavate)