# coding:utf-8
"""
Keyword extraction modelled on the AC (Aho-Corasick) automaton idea; the
advantage is speed.

NOTE: no failure links are built in this file. Matching works by keeping a
collection of "active" trie pointers, one per start offset that is still a
prefix of some registered keyword, and advancing all of them on every input
character.
"""


class Node(object):
    """A single trie node."""

    def __init__(self):
        self.next = {}       # child nodes, keyed by the next character
        self.fail = None     # placeholder for a failure link; never assigned in this file
        self.isWord = False  # True when a registered keyword ends at this node
        self.word = ""       # the keyword ending here (only meaningful if isWord)


class AcAutomation(object):
    """Trie-based multi-keyword matcher that reports overlapping matches."""

    def __init__(self):
        self.root = Node()
        self.catch = set()  # every word passed to add_word()

    # Register a keyword (sensitive word)
    def add_word(self, word):
        """
        Insert ``word`` into the trie and record it in ``self.catch``.

        :param word: keyword to register
        :return: None
        """
        self.catch.add(word)
        node = self.root
        for char in word:
            if char not in node.next:
                node.next[char] = Node()
            node = node.next[char]
        node.isWord = True
        node.word = word

    @staticmethod
    def __check_word(cache_result: list, char: str):
        """
        Advance every active trie pointer by one character.

        :param cache_result: list of currently active nodes
        :param char: the character being consumed
        :return: ``(surviving_nodes, words)`` - the nodes reached by following
            ``char``, and the keywords that end at those nodes
        """
        update_catch = []  # pointers that survived this character
        words = []         # keywords completed by this character
        for cache in cache_result:
            next_p = cache.next.get(char)
            if next_p is None:
                continue  # this start offset is no longer a prefix of any keyword
            if next_p.isWord:
                words.append(next_p.word)
            update_catch.append(next_p)
        return update_catch, words

    @staticmethod
    def __check_word_position(cache_result: dict, char: str, position: int):
        """
        Advance every active trie pointer by one character, tracking positions.

        :param cache_result: dict whose values are ``(start, node)`` pairs;
            the keys are only running indexes and are not inspected
        :param char: the character being consumed
        :param position: index of ``char`` in the text being searched
        :return: ``(surviving, words)`` - ``surviving`` is a dict re-keyed
            ``0..k-1`` holding the ``(start, node)`` pairs that followed
            ``char``; ``words`` is a list of ``(word, start, position)``
            tuples for keywords ending at ``position``
        """
        update_catch = {}  # pointers that survived this character
        words = []         # keywords completed by this character
        for start, cache in cache_result.values():
            next_p = cache.next.get(char)
            if next_p is None:
                continue
            if next_p.isWord:
                words.append((next_p.word, start, position))
            update_catch[len(update_catch)] = (start, next_p)
        return update_catch, words

    # Keyword lookup
    def search(self, content):
        """
        Find every registered keyword occurring in ``content``.

        Overlapping matches are all reported. Matches are ordered by the
        position of their last character; matches ending at the same position
        are ordered by ascending start position.

        :param content: text to scan
        :return: list of matched keywords (one entry per occurrence)
        """
        cache_result = []  # active trie pointers
        words_result = []  # matched keywords
        for char in content:
            # A new match may begin at this character, so the root joins the
            # active pointers (last, so older starts are reported first).
            cache_result.append(self.root)
            cache_result, words = self.__check_word(cache_result, char)
            words_result.extend(words)
        return words_result

    # Keyword lookup that also records positions
    def search_and_position(self, content):
        """
        Find every registered keyword in ``content`` together with its span.

        :param content: text to scan
        :return: list of ``(word, start, end)`` tuples, where ``start`` and
            ``end`` are the indexes of the first and last character of the
            match (``end`` is inclusive). Ordering is the same as ``search``.
        """
        cache_result = {}  # running index -> (start, node)
        words_result = []  # matched (word, start, end) tuples
        for position, char in enumerate(content):
            # The helper re-keys its result 0..k-1, so len() is always a free
            # key. Routing the root through the helper gives single-character
            # keywords the same (word, start, end) shape as every other match.
            cache_result[len(cache_result)] = (position, self.root)
            cache_result, words = self.__check_word_position(cache_result, char, position)
            words_result.extend(words)
        return words_result


if __name__ == '__main__':
    ah = AcAutomation()
    for w in ["国", "国产", "产化", "国产化", "路由器123", "国产化路由器", "国产化+服务器", "国产化+防火墙", "国产化+无线"]:
        ah.add_word(w)
    result = ah.search_and_position("国产化路由器123")
    print(result)