|
@@ -429,19 +429,18 @@ class DataExcavate(BasicSearch):
|
|
|
if self.validator.url(task['url']):
|
|
|
return
|
|
|
|
|
|
- logger.info(f'request host -> {task["url"]}')
|
|
|
+ logger.info(f'[数据挖掘]开始请求 -> {task["url"]}')
|
|
|
response = self.downloader.get(task['url'])
|
|
|
if response.status_code != 200 or response.text in ['', None]:
|
|
|
+ logger.error(f'[数据挖掘]访问异常 -> {task["url"]}')
|
|
|
return
|
|
|
|
|
|
task['domain'] = extract_domain(task['url'])
|
|
|
page_source = response.text
|
|
|
- title = extract_page_title(page_source)
|
|
|
- task['name'] = title
|
|
|
- base_url = extract_base_url(task['url'])
|
|
|
- task['base_url'] = base_url
|
|
|
+ task['name'] = extract_page_title(page_source)
|
|
|
+ task['base_url'] = extract_base_url(task['url'])
|
|
|
|
|
|
- items = self.parser.site_items(page_source, base_url)
|
|
|
+ items = self.parser.site_items(page_source, task['base_url'])
|
|
|
lst = []
|
|
|
_c = 0 # 页面包含的关键词计数器
|
|
|
for item in items:
|
|
@@ -460,10 +459,6 @@ class DataExcavate(BasicSearch):
|
|
|
if _c > 1:
|
|
|
self.save(task)
|
|
|
self.scheduler.add_excavate(lst, level=task['weight'])
|
|
|
- '''domain - 添加过滤器'''
|
|
|
- self.validator.add_url(task['domain'])
|
|
|
- '''url - 添加过滤器'''
|
|
|
- self.validator.add_url(task['url'])
|
|
|
|
|
|
def excavate(self):
|
|
|
t_name = threading.currentThread().getName()
|
|
@@ -477,6 +472,10 @@ class DataExcavate(BasicSearch):
|
|
|
task_key, task = tasks
|
|
|
task['update_at'] = int2long(int(time.time()))
|
|
|
self.retrieve_site(task)
|
|
|
+ '''domain - 添加过滤器'''
|
|
|
+ self.validator.add_url(task['domain'])
|
|
|
+ '''url - 添加过滤器'''
|
|
|
+ self.validator.add_url(task['url'])
|
|
|
|
|
|
def start(self):
|
|
|
logger.info(f'[数据挖掘]初始化加载')
|