verify.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. import threading
  2. import time
  3. from common.log import logger
  4. from crawler.bloom_filter.RedisBloomFilter import RedisFilter
  5. from settings import (
  6. MGO_REPETITION,
  7. REQUIREMENT_PHRASE,
  8. SENSITIVE_WORDS
  9. )
  10. def _requirement_phrase(title: str):
  11. """关键词"""
  12. for word in REQUIREMENT_PHRASE:
  13. if title.find(word) != -1:
  14. return True
  15. return False
  16. def _sensitive_word(title: str):
  17. """敏感词"""
  18. for word in SENSITIVE_WORDS:
  19. if title.find(word) != -1:
  20. return True
  21. return False
  22. class Validator:
  23. def __init__(self):
  24. self.rbf = RedisFilter(redis_key='duplicated_')
  25. self.rbf.start(1000000000, 0.00001)
  26. self.loop_Interval = 7200
  27. self._sensitive_word = _sensitive_word
  28. self._requirement_phrase = _requirement_phrase
  29. def _load_filter_feature(self):
  30. while True:
  31. count = 0
  32. try:
  33. for item in MGO_REPETITION.find(projection={'url': 1}).sort([('_id', -1)]):
  34. url = item['url']
  35. if not isinstance(url, str):
  36. MGO_REPETITION.delete_one({'_id': item['_id']})
  37. continue
  38. if not self.rbf.is_exists(url):
  39. self.rbf.add(url)
  40. count += 1
  41. finally:
  42. logger.info(f'[过滤器]数据加载:{len(self.rbf)}条,新增:{count}条')
  43. time.sleep(self.loop_Interval)
  44. def load_filter(self):
  45. logger.info(f'[过滤器]初始化加载')
  46. threading.Thread(
  47. target=self._load_filter_feature,
  48. name='DuplicateRemoval_'
  49. ).start()
  50. def add_filter_feature(self, feature: str):
  51. self.rbf.add(feature)
  52. def words(self, title, task):
  53. if self._sensitive_word(title):
  54. task['sensitive'] = True
  55. return False
  56. elif not self._requirement_phrase(title):
  57. task['requirement'] = True
  58. return False
  59. return True
  60. def url(self, base_url):
  61. return not self.rbf.is_exists(base_url)