verify.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. import threading
  2. import time
  3. from common.log import logger
  4. from crawler.bloom_filter.RedisBloomFilter import RedisFilter
  5. from settings import (
  6. MGO_REMOVAL_DUPLICATE,
  7. REQUIREMENT_PHRASE,
  8. SENSITIVE_WORDS
  9. )
  10. def _requirement_phrase(title: str):
  11. """关键词"""
  12. for word in REQUIREMENT_PHRASE:
  13. if title.find(word) != -1:
  14. return True
  15. return False
  16. def _sensitive_word(title: str):
  17. """敏感词"""
  18. for word in SENSITIVE_WORDS:
  19. if title.find(word) != -1:
  20. return True
  21. return False
  22. class Validator:
  23. def __init__(self):
  24. self._rbf = RedisFilter(redis_key='RemovalDuplicate_')
  25. self._rbf.start(1000000000, 0.00001)
  26. self._sensitive_word = _sensitive_word
  27. self._requirement_phrase = _requirement_phrase
  28. self._loop_Interval = 7200
  29. def _sync_data_rubbish(self):
  30. while True:
  31. count = 0
  32. cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
  33. try:
  34. for item in cursor.sort([('_id', -1)]):
  35. domain = item['domain']
  36. if not isinstance(domain, str):
  37. MGO_REMOVAL_DUPLICATE.delete_one({'_id': item['_id']})
  38. continue
  39. if not self._rbf.is_exists(domain):
  40. self._rbf.add(domain)
  41. count += 1
  42. finally:
  43. logger.info(f'[过滤器]数据加载:{len(self._rbf)}条,新增:{count}条')
  44. time.sleep(self._loop_Interval)
  45. def load_filter(self):
  46. logger.info(f'[过滤器]初始化加载')
  47. threading.Thread(
  48. target=self._sync_data_rubbish,
  49. name='RemovalDuplicate_'
  50. ).start()
  51. def add_url(self, url: str):
  52. self._rbf.add(url)
  53. def words(self, title, task):
  54. if self._sensitive_word(title):
  55. task['sensitive'] = True
  56. return False
  57. elif not self._requirement_phrase(title):
  58. task['requirement'] = True
  59. return False
  60. return True
  61. def url(self, url: str):
  62. return self._rbf.is_exists(url)