|
@@ -8,9 +8,12 @@ from crawler.utils import (
|
|
|
extract_base_url,
|
|
|
extract_page_title,
|
|
|
extract_domain,
|
|
|
+ parser_domain,
|
|
|
err_details,
|
|
|
)
|
|
|
|
|
|
# Generic top-level-domain labels. These are stripped from the parsed
# domain-label list in is_rubbish() before validator checks, because a
# bare TLD like 'com' carries no site-specific information and would
# otherwise match far too broadly.
TLDS = ['com', 'cn']
|
|
|
|
|
|
class DataExcavate(BasicSearch):
|
|
|
|
|
@@ -19,6 +22,24 @@ class DataExcavate(BasicSearch):
|
|
|
self._interval = loop_interval
|
|
|
self._workers = workers
|
|
|
|
|
|
+ def is_rubbish(self, url: str):
|
|
|
+ if self.validator.data(url):
|
|
|
+ return True
|
|
|
+
|
|
|
+ domain = extract_domain(url)
|
|
|
+ if self.validator.data(domain):
|
|
|
+ return True
|
|
|
+
|
|
|
+ if domain.startswith('www.'):
|
|
|
+ domain = domain.replace('www.', '')
|
|
|
+
|
|
|
+ domain_lst = parser_domain(domain)
|
|
|
+ domain_lst = [d for d in domain_lst if d not in TLDS]
|
|
|
+ for val in domain_lst:
|
|
|
+ if self.validator.data(val):
|
|
|
+ return True
|
|
|
+ return False
|
|
|
+
|
|
|
def retrieve_site(self, task: Task):
|
|
|
t_name = threading.currentThread().getName()
|
|
|
logger.info(f'[{t_name}]开始请求 - {task["url"]}')
|
|
@@ -67,7 +88,8 @@ class DataExcavate(BasicSearch):
|
|
|
continue
|
|
|
|
|
|
task_key, task = tasks
|
|
|
- if self.validator.data(task['url']):
|
|
|
+ if self.is_rubbish(task['url']):
|
|
|
+ logger.info(f'[{t_name}]过滤网址 - {task["url"]}')
|
|
|
continue
|
|
|
'''挖掘站点'''
|
|
|
success = self.retrieve_site(task)
|