|
@@ -0,0 +1,226 @@
|
|
|
+package match
|
|
|
+
|
|
|
+import (
|
|
|
+ "context"
|
|
|
+ "fmt"
|
|
|
+ "github.com/gogf/gf/v2/frame/g"
|
|
|
+)
|
|
|
+
|
|
|
// TrieNode is one node of a rune-keyed prefix tree (trie). The zero value is
// an empty node; Insert and LoadDataToTrie allocate Children lazily, so a
// zero-value node can be used directly as the root.
type TrieNode struct {
	// Children maps the next rune of a word to the node reached by it.
	// It stays nil until the first child is added.
	Children map[rune]*TrieNode
	// IsEnd is true when the runes on the path from the root to this node
	// spell a complete stored word.
	IsEnd bool
}
|
|
|
+
|
|
|
+// 插入单词到前缀树中
|
|
|
+func (tn *TrieNode) Insert(words ...string) {
|
|
|
+ for _, word := range words {
|
|
|
+ current := tn
|
|
|
+ for _, char := range word {
|
|
|
+ if current.Children == nil {
|
|
|
+ current.Children = make(map[rune]*TrieNode)
|
|
|
+ }
|
|
|
+ node, exists := current.Children[char]
|
|
|
+ if !exists {
|
|
|
+ node = &TrieNode{}
|
|
|
+ current.Children[char] = node
|
|
|
+ }
|
|
|
+ current = node
|
|
|
+ }
|
|
|
+ current.IsEnd = true
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+// Remove 删除单词
|
|
|
+func (tn *TrieNode) Remove(words ...string) {
|
|
|
+ if tn == nil {
|
|
|
+ g.Log().Infof(context.Background(), "TrieNode Remove is nil")
|
|
|
+ }
|
|
|
+ for _, word := range words {
|
|
|
+ current := tn
|
|
|
+ parent := tn
|
|
|
+ if tn == nil {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ var lastChar int32
|
|
|
+ for _, char := range word {
|
|
|
+ if current == nil || current.Children == nil {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ node, _ := current.Children[char]
|
|
|
+ parent = current
|
|
|
+ current = node
|
|
|
+ lastChar = char
|
|
|
+ }
|
|
|
+
|
|
|
+ if current != nil && current.IsEnd {
|
|
|
+ current.IsEnd = false
|
|
|
+ }
|
|
|
+ if current != nil && current.Children != nil && len(current.Children) == 0 {
|
|
|
+ delete(parent.Children, lastChar)
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+//func (trie *TrieNode) Print() {
|
|
|
+// g.Dump(trie)
|
|
|
+//}
|
|
|
+
|
|
|
+// 加载数据到前缀树中
|
|
|
+func (trie *TrieNode) LoadDataToTrie(data map[string]int) {
|
|
|
+ for key, value := range data {
|
|
|
+ runes := []rune(key)
|
|
|
+ current := trie
|
|
|
+ for i, char := range runes {
|
|
|
+ isEnd := 0
|
|
|
+ if i == len(runes)-1 {
|
|
|
+ isEnd = value
|
|
|
+ }
|
|
|
+ if current.Children == nil {
|
|
|
+ current.Children = make(map[rune]*TrieNode)
|
|
|
+ }
|
|
|
+ node, exists := current.Children[char]
|
|
|
+ if !exists {
|
|
|
+ node = &TrieNode{}
|
|
|
+ current.Children[char] = node
|
|
|
+ }
|
|
|
+ current = node
|
|
|
+ current.IsEnd = isEnd == 1
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+// 在前缀树中查找单词
|
|
|
+func (trie *TrieNode) FindWordInTrie(word string) bool {
|
|
|
+ current := trie
|
|
|
+ for _, char := range word {
|
|
|
+ node, exists := current.Children[char]
|
|
|
+ if !exists {
|
|
|
+ return false
|
|
|
+ }
|
|
|
+ current = node
|
|
|
+ }
|
|
|
+ return current.IsEnd
|
|
|
+}
|
|
|
+
|
|
|
+// 在文本中查找单词
|
|
|
+func (trie *TrieNode) FindWords(text string) []string {
|
|
|
+ words := map[string]struct{}{}
|
|
|
+ var rData []string
|
|
|
+
|
|
|
+ runes := []rune(text)
|
|
|
+ for i := 0; i < len(runes); i++ {
|
|
|
+ current := trie
|
|
|
+ for j := i; j < len(runes); j++ {
|
|
|
+ char := runes[j]
|
|
|
+ node, exists := current.Children[char]
|
|
|
+ if !exists {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ current = node
|
|
|
+ if current.IsEnd {
|
|
|
+ words[string(runes[i:j+1])] = struct{}{}
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for word, _ := range words {
|
|
|
+ rData = append(rData, word)
|
|
|
+ }
|
|
|
+ return rData
|
|
|
+}
|
|
|
+
|
|
|
+// 在文本中查找单词
|
|
|
+func (trie *TrieNode) FindWordsInText(title, detail string) map[string]string {
|
|
|
+ words := map[string]string{}
|
|
|
+ var nomatchDetail string
|
|
|
+ runes := []rune(fmt.Sprintf("%s %s", detail, title))
|
|
|
+ detailArr := []rune(detail)
|
|
|
+ detailLen := len(detailArr)
|
|
|
+ for i := 0; i < len(runes); i++ {
|
|
|
+ current := trie
|
|
|
+ for j := i; j < len(runes); j++ {
|
|
|
+ char := runes[j]
|
|
|
+ node, exists := current.Children[char]
|
|
|
+ if !exists {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ current = node
|
|
|
+ if current.IsEnd {
|
|
|
+ if _, ok := words[string(runes[i:j+1])]; !ok {
|
|
|
+ if j < detailLen { //匹配正文
|
|
|
+ //fmt.Println(string(runes[i : j+1]))
|
|
|
+ words[string(runes[i:j+1])] = getDetailMatchStr(detailArr, i, j)
|
|
|
+ } else { //匹配到标题
|
|
|
+ if nomatchDetail == "" && detailLen > 0 { //匹配到标题后,正文取开头
|
|
|
+ nomatchDetail = getDetailMatchStr(detailArr, detailLen-1, detailLen)
|
|
|
+ }
|
|
|
+ words[string(runes[i:j+1])] = nomatchDetail
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return words
|
|
|
+}
|
|
|
+
|
|
|
// splitRune is the set of delimiter runes (spaces and punctuation) at which
// getDetailMatchStr prefers to cut the snippet it extracts around a keyword
// (nominally 40 runes before and 40 after it).
//
// The set is built from the first rune of each listed string; repeated
// entries are harmless.
// NOTE(review): some entries look duplicated — they may have been full-width
// variants originally; verify against the upstream file.
var splitRune = func() map[int32]bool {
	set := make(map[int32]bool)
	for _, s := range []string{" ", ",", ".", ",", "。", " ", ":", ":", "、", "-"} {
		set[[]rune(s)[0]] = true
	}
	return set
}()
|
|
|
+
|
|
|
+// getDetailMatchStr 获取正文中匹配到的短句
|
|
|
+func getDetailMatchStr(desc []rune, left, right int) string {
|
|
|
+ s, e, l := 0, 0, len(desc)
|
|
|
+
|
|
|
+ sT := left - 40
|
|
|
+ if sT < 0 {
|
|
|
+ sT = 0
|
|
|
+ }
|
|
|
+
|
|
|
+ eT := sT + 80
|
|
|
+ if eT > l {
|
|
|
+ eT = l
|
|
|
+ }
|
|
|
+
|
|
|
+ for i := 1; i < 20; i++ {
|
|
|
+ if s != 0 && e != 0 {
|
|
|
+ return string(desc[s:e])
|
|
|
+ }
|
|
|
+
|
|
|
+ if s == 0 {
|
|
|
+ //fmt.Printf("[sT]%s-%d %s-%d\n", string(desc[sT-i]), desc[sT-i], string(desc[sT+i]), desc[sT+i])
|
|
|
+ if sT+i < left && splitRune[desc[sT+i]] {
|
|
|
+ s = sT + i + 1
|
|
|
+ }
|
|
|
+ if sT-i > 0 && splitRune[desc[sT-i]] {
|
|
|
+ s = sT - i + 1
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if e == 0 {
|
|
|
+ //fmt.Printf("[eT]%s-%d %s-%d\n", string(desc[eT-i]), desc[eT-i], string(desc[eT+i]), desc[eT+i])
|
|
|
+ if eT+i < l && splitRune[desc[eT+i]] {
|
|
|
+ e = eT + i
|
|
|
+ }
|
|
|
+ if eT-i > right && splitRune[desc[eT-i]] {
|
|
|
+ e = eT - i
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if s == 0 {
|
|
|
+ s = sT
|
|
|
+ }
|
|
|
+ if e == 0 {
|
|
|
+ e = eT
|
|
|
+ }
|
|
|
+
|
|
|
+ return string(desc[s:e])
|
|
|
+}
|