#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @File : ac_sensitive.py
# @Software: PyCharm
# Find sensitive words using the DFA algorithm
  6. class Node(object):
  7. def __init__(self):
  8. self.next = {}
  9. self.fail = None
  10. self.isWord = False
  11. self.word = ""
  12. class ACAutomation(object):
  13. def __init__(self):
  14. self.root = Node()
  15. # 添加敏感词函数
  16. def add_word(self, word):
  17. temp_root = self.root
  18. for char in word:
  19. if char not in temp_root.next:
  20. temp_root.next[char] = Node()
  21. temp_root = temp_root.next[char]
  22. temp_root.isWord = True
  23. temp_root.word = word
  24. # 失败指针函数
  25. def make_fail(self):
  26. temp_queue = []
  27. temp_queue.append(self.root)
  28. while len(temp_queue) != 0:
  29. temp = temp_queue.pop(0)
  30. p = None
  31. for key, value in temp.next.item():
  32. if temp == self.root:
  33. temp.next[key].fail = self.root
  34. else:
  35. p = temp.fail
  36. while p is not None:
  37. if key in p.next:
  38. temp.next[key].fail = p.fail
  39. break
  40. p = p.fail
  41. if p is None:
  42. temp.next[key].fail = self.root
  43. temp_queue.append(temp.next[key])
  44. # 查找敏感词函数
  45. def search(self, content):
  46. p = self.root
  47. result = []
  48. currentposition = 0
  49. while currentposition < len(content):
  50. word = content[currentposition]
  51. while word in p.next == False and p != self.root:
  52. p = p.fail
  53. if word in p.next:
  54. p = p.next[word]
  55. else:
  56. p = self.root
  57. if p.isWord:
  58. result.append(p.word)
  59. p = self.root
  60. currentposition += 1
  61. return result
  62. # 加载敏感词库函数
  63. def parse(self, words_path: str):
  64. with open(words_path, encoding='utf-8') as f:
  65. for keyword in f:
  66. self.add_word(str(keyword).strip())
  67. # 敏感词替换函数
  68. def words_replace(self, text):
  69. """
  70. :param ah: AC自动机
  71. :param text: 文本
  72. :return: 过滤敏感词之后的文本
  73. """
  74. import time
  75. start=time.time()
  76. result = list(set(self.search(text)))
  77. print(result)
  78. for x in result:
  79. m = text.replace(x, '*' * len(x))
  80. text = m
  81. print(time.time()-start)
  82. return text
  83. if __name__ == "__main__":
  84. gfw = ACAutomation()
  85. path = "../data/sensitive_words.txt"
  86. gfw.parse(path)
  87. text = "苹果新品发布会"
  88. result = gfw.words_replace(text)
  89. print(result)