verify.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. import threading
  2. import time
  3. from common.log import logger
  4. from crawler.bloom_filter.RedisBloomFilter import RedisFilter
  5. from settings import (
  6. MGO_REMOVAL_DUPLICATE,
  7. REQUIREMENT_PHRASE,
  8. SENSITIVE_WORDS
  9. )
  10. def _requirement_phrase(val: str):
  11. """关键词"""
  12. for word in REQUIREMENT_PHRASE:
  13. if val.find(word) != -1:
  14. return True
  15. return False
  16. def _sensitive_word(val: str):
  17. """垃圾词"""
  18. for word in SENSITIVE_WORDS:
  19. if val.find(word) != -1:
  20. return True
  21. return False
  22. class Validator:
  23. def __init__(self):
  24. self._rbf = RedisFilter(redis_key='RemovalDuplicate_')
  25. self._rbf.start(1000000000, 0.00001)
  26. self._sensitive_word = _sensitive_word
  27. self._requirement_phrase = _requirement_phrase
  28. self._loop_Interval = 7200
  29. def _sync_data_rubbish(self):
  30. while True:
  31. count = 0
  32. cursor = MGO_REMOVAL_DUPLICATE.find(projection={'domain': 1})
  33. try:
  34. for item in cursor.sort([('_id', -1)]):
  35. domain = item['domain']
  36. if not isinstance(domain, str):
  37. MGO_REMOVAL_DUPLICATE.delete_one({'_id': item['_id']})
  38. continue
  39. if not self._rbf.is_exists(domain):
  40. self._rbf.add(domain)
  41. count += 1
  42. finally:
  43. logger.info(f'[过滤器]数据加载:{len(self._rbf)}条,新增:{count}条')
  44. time.sleep(self._loop_Interval)
  45. def load_filter(self):
  46. logger.info(f'[过滤器]初始化加载')
  47. threading.Thread(
  48. target=self._sync_data_rubbish,
  49. name='RemovalDuplicate_'
  50. ).start()
  51. def add_url(self, url: str):
  52. self._rbf.add(url)
  53. def sensitive_word(self, val):
  54. if val is None or len(val) < 5:
  55. return True
  56. if self._sensitive_word(val):
  57. return True
  58. return False
  59. def requirement_word(self, val):
  60. return self._requirement_phrase(val)
  61. def url(self, url: str):
  62. return self._rbf.is_exists(url)