123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241 |
- package match
- import (
- "context"
- "fmt"
- "github.com/gogf/gf/v2/frame/g"
- )
// TrieNode is one node of a prefix tree (trie) whose edges are labelled with
// runes. The root node represents the empty prefix; every other node
// represents the prefix spelled by the edge labels on the path from the root.
type TrieNode struct {
	// Children maps the next rune of a word to the node for the extended prefix.
	Children map[rune]*TrieNode
	// IsEnd is true when the prefix ending at this node is a complete stored word.
	IsEnd bool
}
- // 插入单词到前缀树中
- func (tn *TrieNode) Insert(words ...string) {
- for _, word := range words {
- current := tn
- for _, char := range word {
- if current.Children == nil {
- current.Children = make(map[rune]*TrieNode)
- }
- node, exists := current.Children[char]
- if !exists {
- node = &TrieNode{}
- current.Children[char] = node
- }
- current = node
- }
- current.IsEnd = true
- }
- }
- // Remove 删除单词
- func (tn *TrieNode) Remove(words ...string) {
- if tn == nil {
- g.Log().Infof(context.Background(), "TrieNode Remove is nil")
- }
- for _, word := range words {
- current := tn
- parent := tn
- if tn == nil {
- return
- }
- var lastChar int32
- for _, char := range word {
- if current == nil || current.Children == nil {
- return
- }
- node, _ := current.Children[char]
- parent = current
- current = node
- lastChar = char
- }
- if current != nil && current.IsEnd {
- current.IsEnd = false
- }
- if current != nil && current.Children != nil && len(current.Children) == 0 {
- delete(parent.Children, lastChar)
- }
- }
- }
- //func (trie *TrieNode) Print() {
- // g.Dump(trie)
- //}
- // 加载数据到前缀树中
- func (trie *TrieNode) LoadDataToTrie(data map[string]int) {
- for key, value := range data {
- runes := []rune(key)
- current := trie
- for i, char := range runes {
- isEnd := 0
- if i == len(runes)-1 {
- isEnd = value
- }
- if current.Children == nil {
- current.Children = make(map[rune]*TrieNode)
- }
- node, exists := current.Children[char]
- if !exists {
- node = &TrieNode{}
- current.Children[char] = node
- }
- current = node
- current.IsEnd = isEnd == 1
- }
- }
- }
- // 在前缀树中查找单词
- func (trie *TrieNode) FindWordInTrie(word string) bool {
- current := trie
- for _, char := range word {
- node, exists := current.Children[char]
- if !exists {
- return false
- }
- current = node
- }
- return current.IsEnd
- }
- // 在文本中查找单词
- func (trie *TrieNode) FindWords(text string) []string {
- words := map[string]struct{}{}
- var rData []string
- runes := []rune(text)
- for i := 0; i < len(runes); i++ {
- current := trie
- for j := i; j < len(runes); j++ {
- char := runes[j]
- node, exists := current.Children[char]
- if !exists {
- break
- }
- current = node
- if current.IsEnd {
- words[string(runes[i:j+1])] = struct{}{}
- }
- }
- }
- for word, _ := range words {
- rData = append(rData, word)
- }
- return rData
- }
- func (trie *TrieNode) FindOneMaxStr(text string) string {
- stringArr := trie.FindWords(text)
- if len(stringArr) == 0 {
- return "" // 处理空数组的情况
- }
- longest := stringArr[0]
- for _, s := range stringArr {
- if len(s) > len(longest) {
- longest = s
- }
- }
- return longest
- }
- // 在文本中查找单词
- func (trie *TrieNode) FindWordsInText(title, detail string) map[string]string {
- words := map[string]string{}
- var nomatchDetail string
- runes := []rune(fmt.Sprintf("%s %s", detail, title))
- detailArr := []rune(detail)
- detailLen := len(detailArr)
- for i := 0; i < len(runes); i++ {
- current := trie
- for j := i; j < len(runes); j++ {
- char := runes[j]
- node, exists := current.Children[char]
- if !exists {
- break
- }
- current = node
- if current.IsEnd {
- if _, ok := words[string(runes[i:j+1])]; !ok {
- if j < detailLen { //匹配正文
- //fmt.Println(string(runes[i : j+1]))
- words[string(runes[i:j+1])] = getDetailMatchStr(detailArr, i, j)
- } else { //匹配到标题
- if nomatchDetail == "" && detailLen > 0 { //匹配到标题后,正文取开头
- nomatchDetail = getDetailMatchStr(detailArr, detailLen-1, detailLen)
- }
- words[string(runes[i:j+1])] = nomatchDetail
- }
- }
- }
- }
- }
- return words
- }
// splitRune is the set of separator runes (spaces and punctuation marks) that
// getDetailMatchStr looks for when choosing where a snippet starts and ends.
var splitRune = map[int32]bool{}

// init populates splitRune from the list of separator strings below.
func init() {
	separators := []string{" ", ",", ".", ",", "。", " ", ":", ":", "、", "-"}
	splitRune = make(map[int32]bool, len(separators))
	for idx := range separators {
		// Only the first rune of each entry is registered.
		lead := []rune(separators[idx])[0]
		splitRune[lead] = true
	}
}
- // getDetailMatchStr 获取正文中匹配到的短句
- func getDetailMatchStr(desc []rune, left, right int) string {
- s, e, l := 0, 0, len(desc)
- sT := left - 40
- if sT < 0 {
- sT = 0
- }
- eT := sT + 80
- if eT > l {
- eT = l
- }
- for i := 1; i < 20; i++ {
- if s != 0 && e != 0 {
- return string(desc[s:e])
- }
- if s == 0 {
- //fmt.Printf("[sT]%s-%d %s-%d\n", string(desc[sT-i]), desc[sT-i], string(desc[sT+i]), desc[sT+i])
- if sT+i < left && splitRune[desc[sT+i]] {
- s = sT + i + 1
- }
- if sT-i > 0 && splitRune[desc[sT-i]] {
- s = sT - i + 1
- }
- }
- if e == 0 {
- //fmt.Printf("[eT]%s-%d %s-%d\n", string(desc[eT-i]), desc[eT-i], string(desc[eT+i]), desc[eT+i])
- if eT+i < l && splitRune[desc[eT+i]] {
- e = eT + i
- }
- if eT-i > right && splitRune[desc[eT-i]] {
- e = eT - i
- }
- }
- }
- if s == 0 {
- s = sT
- }
- if e == 0 {
- e = eT
- }
- return string(desc[s:e])
- }
|