verify.py 1.7 KB

import threading
import time

from common.log import logger
from crawler.bloom_filter.RedisBloomFilter import RedisFilter
from settings import (
    MGO_REMOVAL_DUPLICATE,
    REQUIREMENT_PHRASE
)

def _requirement_phrase(val: str):
    """Return True if the text contains any of the configured keywords."""
    for word in REQUIREMENT_PHRASE:
        if word in val:
            return True
    return False

class Validator:

    def __init__(self):
        # Redis-backed Bloom filter used for URL/domain deduplication.
        self._rbf = RedisFilter(redis_key='RemovalDuplicate_')
        self._rbf.start(1000000000, 0.00001)  # expected capacity, error rate
        self._requirement_phrase = _requirement_phrase
        self._loop_interval = 7200  # seconds between sync passes

    def _sync_data_rubbish(self):
        while True:
            count = 0
            cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
            try:
                for item in cursor.sort([('_id', -1)]):
                    domain = item.get('domain')
                    if not isinstance(domain, str):
                        # Drop malformed records instead of feeding them to the filter.
                        MGO_REMOVAL_DUPLICATE.delete_one({'_id': item['_id']})
                        continue
                    if not self._rbf.is_exists(domain):
                        self._rbf.add(domain)
                        count += 1
            finally:
                logger.info(f'[Filter] loaded: {len(self._rbf)} records, new: {count}')
                time.sleep(self._loop_interval)

    def load_filter(self):
        logger.info('[Filter] initial load')
        # Run the sync loop in the background; daemon=True keeps the
        # endless loop from blocking interpreter shutdown.
        threading.Thread(
            target=self._sync_data_rubbish,
            name='RemovalDuplicate_',
            daemon=True
        ).start()

    def add_url(self, url: str):
        self._rbf.add(url)

    def requirement_word(self, val):
        return self._requirement_phrase(val)

    def url(self, url: str):
        # True if the URL has already been seen.
        return self._rbf.is_exists(url)
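
A minimal usage sketch, assuming the file is importable as verify, that settings supplies MGO_REMOVAL_DUPLICATE (a pymongo collection) and REQUIREMENT_PHRASE (an iterable of keyword strings), and that RedisFilter exposes add/is_exists/__len__ against a reachable Redis as used above:

# Illustrative only; assumes the settings and Redis setup described above.
from verify import Validator

validator = Validator()
validator.load_filter()  # start the background sync thread

url = 'http://www.example.com/notice/123.html'
if not validator.url(url):    # URL not seen yet
    validator.add_url(url)    # record it so later checks return True

# True if the text contains any keyword from REQUIREMENT_PHRASE
print(validator.requirement_word('page title text'))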