verify.py

import threading
import time

from common.log import logger
from crawler.bloom_filter.RedisBloomFilter import RedisFilter
from settings import (
    MGO_REPETITION,
    REQUIREMENT_PHRASE,
    SENSITIVE_WORDS
)


def _requirement_phrase(title: str) -> bool:
    """Return True if the title contains at least one required keyword."""
    return any(word in title for word in REQUIREMENT_PHRASE)


def _sensitive_word(title: str) -> bool:
    """Return True if the title contains a sensitive word."""
    return any(word in title for word in SENSITIVE_WORDS)


class Validator:

    def __init__(self):
        # Redis-backed Bloom filter used for URL deduplication.
        self.rbf = RedisFilter(redis_key='duplicated_')
        self.rbf.start(1000000000, 0.00001)
        # Reload known URLs from MongoDB every 2 hours.
        self.loop_interval = 7200

    def _load_filter_feature(self):
        while True:
            count = 0
            try:
                for item in MGO_REPETITION.find(projection={'url': 1}).sort([('_id', -1)]):
                    url = item['url']
                    if not isinstance(url, str):
                        # Drop malformed records so they are not re-read on the next pass.
                        MGO_REPETITION.delete_one({'_id': item['_id']})
                        continue
                    if not self.rbf.is_exists(url):
                        self.rbf.add(url)
                        count += 1
            except Exception as e:
                # Keep the loader thread alive if MongoDB is temporarily unreachable.
                logger.error(f'[site dedup] load failed: {e}')
            finally:
                logger.info(f'[site dedup] data loaded: {len(self.rbf)} items, {count} new')
                time.sleep(self.loop_interval)

    def load_filter(self):
        logger.info('[site dedup] initial load')
        threading.Thread(
            target=self._load_filter_feature,
            name='DuplicateRemoval_',
            daemon=True  # do not block interpreter shutdown
        ).start()

    def add_filter_feature(self, feature: str):
        self.rbf.add(feature)

    def title(self, title, task):
        """Return True if the title passes both checks; otherwise flag the task."""
        if _sensitive_word(title):
            task['sensitive'] = True
            return False
        elif not _requirement_phrase(title):
            task['requirement'] = True
            return False
        return True

    def url(self, base_url):
        """Return True if base_url has already been seen."""
        return self.rbf.is_exists(base_url)
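

# Minimal usage sketch, not part of the original file: it assumes the Redis
# and MongoDB connections behind RedisFilter / MGO_REPETITION are configured
# in settings, and the title and URL below are hypothetical.
if __name__ == '__main__':
    validator = Validator()
    validator.load_filter()  # start the background MongoDB -> Bloom filter loader

    task = {}
    if validator.title('some article title', task):
        if not validator.url('https://example.com/article'):
            # URL not seen before: process it, then record it as seen.
            validator.add_filter_feature('https://example.com/article')
    else:
        # title() sets 'sensitive' or 'requirement' on the task it rejects.
        logger.info(f'rejected, task flags: {task}')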