|
@@ -0,0 +1,72 @@
|
|
|
+import threading
|
|
|
+import time
|
|
|
+
|
|
|
+from common.log import logger
|
|
|
+from crawler.bloom_filter.RedisBloomFilter import RedisFilter
|
|
|
+from settings import (
|
|
|
+ MGO_REPETITION,
|
|
|
+ REQUIREMENT_PHRASE,
|
|
|
+ SENSITIVE_WORDS
|
|
|
+)
|
|
|
+
|
|
|
+
|
|
|
def _requirement_phrase(title: str) -> bool:
    """Return True if *title* contains at least one required keyword.

    Scans the configured ``REQUIREMENT_PHRASE`` word list; a title passes
    when any configured keyword occurs as a substring.
    """
    # `word in title` is the idiomatic substring test (same semantics as
    # `title.find(word) != -1`); any() short-circuits on the first hit,
    # exactly like the original early `return True`.
    return any(word in title for word in REQUIREMENT_PHRASE)
|
|
|
+
|
|
|
+
|
|
|
def _sensitive_word(title: str) -> bool:
    """Return True if *title* contains NO sensitive word (i.e. it is clean).

    NOTE(review): despite the name, the return value is inverted relative
    to what the name suggests — True means the title PASSED the check
    (no word from ``SENSITIVE_WORDS`` occurs in it). Callers must test
    accordingly.
    """
    # Equivalent to the original loop: returning False on the first
    # sensitive hit, True only when no configured word is a substring.
    return not any(word in title for word in SENSITIVE_WORDS)
|
|
|
+
|
|
|
+
|
|
|
class Validator:
    """Title/url validator backed by a Redis Bloom filter.

    A background thread periodically loads known urls from MongoDB
    (``MGO_REPETITION``) into the Bloom filter; the instance then offers
    title screening (sensitive words / required keywords) and url
    de-duplication checks.
    """

    def __init__(self):
        # Bloom filter sized for 1e9 elements at a 1e-5 false-positive rate.
        self.rbf = RedisFilter(redis_key='duplicated_')
        self.rbf.start(1000000000, 0.00001)
        # Seconds between successive reloads of the filter from MongoDB.
        # NOTE(review): name kept as-is (not snake_case) for backward compat.
        self.loop_Interval = 7200

    def _load_filter_feature(self):
        """Endless refresh loop: pull urls from MongoDB into the Bloom filter."""
        while True:
            count = 0
            try:
                # Newest documents first; only the `url` field is fetched.
                for item in MGO_REPETITION.find(projection={'url': 1}).sort([('_id', -1)]):
                    url = item['url']
                    if not isinstance(url, str):
                        # Malformed document — purge it so it never recurs.
                        MGO_REPETITION.delete_one({'_id': item['_id']})
                        continue
                    if not self.rbf.is_exists(url):
                        self.rbf.add(url)
                        count += 1
            except Exception as e:
                # Fix: the original had try/finally with no except, so any
                # Mongo/Redis error propagated and killed the refresh thread
                # (after one final 7200 s sleep in `finally`). Log and keep
                # the loop alive instead.
                logger.error(f'[站点判重]数据加载异常:{e}')
            finally:
                logger.info(f'[站点判重]数据加载:{len(self.rbf)}条,新增:{count}条')
                time.sleep(self.loop_Interval)

    def load_filter(self):
        """Launch the background thread that keeps the Bloom filter loaded."""
        logger.info('[站点判重]初始化加载')
        # daemon=True: the refresh loop never terminates, so a non-daemon
        # thread would block interpreter shutdown forever.
        threading.Thread(
            target=self._load_filter_feature,
            name='DuplicateRemoval_',
            daemon=True
        ).start()

    def add_filter_feature(self, feature: str):
        """Record *feature* (a url) in the Bloom filter."""
        self.rbf.add(feature)

    def title(self, title, task):
        """Screen *title*; flag the rejection reason on *task*.

        Returns False (setting ``task['sensitive']`` or
        ``task['requirement']``) when the title contains a sensitive word
        or lacks every required keyword; True when it passes both checks.
        """
        # Fix: the original tested `if _sensitive_word(title)`, but
        # _sensitive_word returns True for a CLEAN title — so clean titles
        # were flagged as sensitive and rejected, while titles actually
        # containing sensitive words slipped through to the keyword check.
        # Invert the test so the flag matches its meaning.
        if not _sensitive_word(title):
            task['sensitive'] = True
            return False
        if not _requirement_phrase(title):
            task['requirement'] = True
            return False
        return True

    def url(self, base_url):
        """Return True if *base_url* was already seen (Bloom-filter hit)."""
        return self.rbf.is_exists(base_url)
|