123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120 |
- # coding:utf-8
- """
- Keyword extraction program built on the AC (Aho-Corasick) automaton mechanism; its main advantage is speed.
- """
class Node:
    """One node of the keyword trie."""

    def __init__(self):
        # Outgoing edges: next character -> child node.
        self.next = dict()
        # Failure-link slot; starts out unset.
        self.fail = None
        # isWord flags that a keyword ends at this node; word holds that
        # keyword (empty string while no keyword ends here).
        self.isWord, self.word = False, ""
class AcAutomation(object):
    """Multi-keyword matcher backed by a character trie.

    Keywords are inserted with :meth:`add_word`. The search methods read the
    text once, left to right, while tracking every trie node that is still a
    live partial match, so overlapping and nested keywords are all reported.

    This class does not build or follow failure links; it keeps a list of
    active partial matches instead.
    """

    def __init__(self):
        self.root = Node()  # trie root; partial matches always start here
        self.catch = set()  # every keyword that was passed to add_word()

    # Register a keyword.
    def add_word(self, word):
        """Insert ``word`` into the trie and record it in ``catch``.

        :param word: keyword to register (iterated character by character)
        :return: None
        """
        node = self.root
        self.catch.add(word)
        for char in word:
            if char not in node.next:
                node.next[char] = Node()
            node = node.next[char]
        # Mark the node reached by the last character as a keyword end.
        node.isWord = True
        node.word = word

    @staticmethod
    def __check_word(cache_result: list, char: str):
        """Advance every live partial match by one character.

        :param cache_result: trie nodes reached by the partial matches that
            were alive before ``char`` was read
        :param char: the character just read from the text
        :return: ``(survivors, words)`` -- the nodes reached after consuming
            ``char`` (matches that cannot continue are dropped) and the
            keywords that end exactly at ``char``
        """
        update_catch = []  # nodes of the matches that survive this character
        words = []  # keywords completed by this character
        for cache in cache_result:
            next_p = cache.next.get(char)
            if next_p is None:
                continue  # this partial match dies here
            if next_p.isWord:
                words.append(next_p.word)
            update_catch.append(next_p)
        return update_catch, words

    @staticmethod
    def __check_word_position(cache_result: dict, char: str, position: int):
        """Advance every live partial match by one character, keeping offsets.

        :param cache_result: dict whose values are ``(start, node)`` pairs,
            one per partial match alive before ``char`` was read; the keys
            are ignored
        :param char: the character just read from the text
        :param position: index of ``char`` in the text
        :return: ``(survivors, words)`` -- a dict keyed ``0..n-1`` holding the
            ``(start, node)`` pairs that survive, and a list of
            ``(word, start, position)`` tuples for keywords ending at ``char``
        """
        update_catch = {}  # surviving matches, re-keyed densely from 0
        words = []  # keywords completed by this character
        for start, cache in cache_result.values():
            next_p = cache.next.get(char)
            if next_p is None:
                continue  # this partial match dies here
            if next_p.isWord:
                words.append((next_p.word, start, position))
            update_catch[len(update_catch)] = (start, next_p)
        return update_catch, words

    # Find keywords.
    def search(self, content):
        """Return every keyword occurrence found in ``content``.

        :param content: text to scan
        :return: list of matched keywords, ordered by the position where each
            match ends; a keyword is listed once per occurrence
        """
        root = self.root
        cache_result = []  # nodes of the partial matches still alive
        words_result = []  # keywords found so far
        for char in content:
            # First extend the matches that started at earlier positions ...
            cache_result, words = self.__check_word(cache_result, char)
            words_result.extend(words)
            # ... then try to start a new match at the current character.
            first = root.next.get(char)
            if first is not None:
                if first.isWord:
                    words_result.append(first.word)
                cache_result.append(first)
        return words_result

    # Find keywords and record where they occur.
    def search_and_position(self, content):
        """Return every keyword occurrence in ``content`` with its offsets.

        :param content: text to scan
        :return: list of ``(word, start, end)`` tuples ordered by ``end``;
            ``start`` and ``end`` are inclusive indexes into ``content``, so a
            single-character keyword yields ``start == end``
        """
        root = self.root
        cache_result = {}  # index -> (start, node) for each live partial match
        words_result = []  # (word, start, end) tuples found so far
        for position, char in enumerate(content):
            # First extend the matches that started at earlier positions ...
            cache_result, words = self.__check_word_position(
                cache_result, char, position
            )
            words_result.extend(words)
            # ... then try to start a new match at the current character.
            first = root.next.get(char)
            if first is not None:
                if first.isWord:
                    # Same tuple layout as multi-character hits.
                    words_result.append((first.word, position, position))
                # Keys are dense (0..n-1), so len() is the next free key.
                cache_result[len(cache_result)] = (position, first)
        return words_result
if __name__ == '__main__':
    # Demo: register a handful of sample keywords, then print every match
    # (with position information) found in one sample string.
    demo_keywords = [
        "国", "国产", "产化", "国产化", "路由器123", "国产化路由器",
        "国产化+服务器", "国产化+防火墙", "国产化+无线",
    ]
    matcher = AcAutomation()
    for keyword in demo_keywords:
        matcher.add_word(keyword)
    print(matcher.search_and_position("国产化路由器123"))
|