Browse Source

wip:用户标签

wangkaiyue 10 months ago
parent
commit
6465563e39
3 changed files with 246 additions and 0 deletions
  1. 4 0
      userSign/config.yaml
  2. 16 0
      userSign/main.go
  3. 226 0
      userSign/match/match.go

+ 4 - 0
userSign/config.yaml

@@ -0,0 +1,4 @@
+# NOTE(review): the link below embeds the database user and password in
+# plaintext inside a committed file. Consider supplying the credentials from
+# the environment or a secret store, and rotating this password.
+database:
+  default:
+    link: "clickhouse:jydev:Jsh2scksi7&hs@tcp(127.0.0.1:29000)/pub_tags?dial_timeout=2000ms&max_execution_time=60"
+    # NOTE(review): presumably turns on verbose database debug output —
+    # confirm this is only intended for development.
+    debug: true

File diff suppressed because it is too large
+ 16 - 0
userSign/main.go


+ 226 - 0
userSign/match/match.go

@@ -0,0 +1,226 @@
+package match
+
+import (
+	"context"
+	"fmt"
+	"github.com/gogf/gf/v2/frame/g"
+)
+
+// TrieNode is a single node of a prefix tree (trie) keyed by rune.
+// The zero value is usable as an empty root: Children is created lazily by
+// the insertion methods.
+type TrieNode struct {
+	Children map[rune]*TrieNode // child nodes, keyed by the next rune
+	IsEnd    bool               // true when the path to this node spells a complete word
+}
+
+// Insert adds each of the given words to the trie rooted at tn.
+//
+// Nodes are created on demand, one per rune, and the node reached by a word's
+// final rune is flagged with IsEnd. An empty word flags tn itself.
+func (tn *TrieNode) Insert(words ...string) {
+	for idx := range words {
+		cursor := tn
+		for _, r := range words[idx] {
+			// Children maps are allocated lazily, only when a node gets
+			// its first child.
+			if cursor.Children == nil {
+				cursor.Children = map[rune]*TrieNode{}
+			}
+			if _, ok := cursor.Children[r]; !ok {
+				cursor.Children[r] = new(TrieNode)
+			}
+			cursor = cursor.Children[r]
+		}
+		cursor.IsEnd = true
+	}
+}
+
+// Remove deletes the given words from the trie.
+//
+// For every word that is stored (the node reached by its last rune has IsEnd
+// set) the end marker is cleared, and nodes that are left with neither
+// children nor an end marker are pruned from the bottom up, so no dead
+// branches remain. Words that are not stored are skipped; an unknown word no
+// longer aborts the removal of the words that follow it.
+//
+// Calling Remove on a nil receiver logs a message and returns.
+func (tn *TrieNode) Remove(words ...string) {
+	if tn == nil {
+		g.Log().Infof(context.Background(), "TrieNode Remove is nil")
+		return
+	}
+
+	// edge records one step of the downward walk so that the branch can be
+	// pruned afterwards.
+	type edge struct {
+		parent *TrieNode
+		char   rune
+	}
+
+	for _, word := range words {
+		var path []edge
+		current := tn
+		for _, char := range word {
+			// Indexing a nil Children map is safe and simply yields nil.
+			next := current.Children[char]
+			if next == nil {
+				current = nil
+				break
+			}
+			path = append(path, edge{parent: current, char: char})
+			current = next
+		}
+		if current == nil || !current.IsEnd {
+			continue // the word is not stored; nothing to remove
+		}
+		current.IsEnd = false
+
+		// Walk back towards the root, dropping nodes that no longer carry
+		// any word, and stop at the first node that is still needed.
+		for i := len(path) - 1; i >= 0; i-- {
+			if current.IsEnd || len(current.Children) > 0 {
+				break
+			}
+			delete(path[i].parent.Children, path[i].char)
+			current = path[i].parent
+		}
+	}
+}
+
+//func (trie *TrieNode) Print() {
+//	g.Dump(trie)
+//}
+
+// LoadDataToTrie bulk-loads the keys of data into the trie.
+//
+// Each key is walked rune by rune, creating missing nodes along the way. The
+// node reached by the key's last rune gets IsEnd set to (value == 1). Nodes
+// passed on the way are never modified, so loading "abc" cannot clear the end
+// marker of an already stored "ab" regardless of map iteration order.
+// Empty keys are ignored.
+func (trie *TrieNode) LoadDataToTrie(data map[string]int) {
+	for key, value := range data {
+		if key == "" {
+			continue // an empty key addresses no node
+		}
+		current := trie
+		for _, char := range key {
+			if current.Children == nil {
+				current.Children = make(map[rune]*TrieNode)
+			}
+			node, exists := current.Children[char]
+			if !exists {
+				node = &TrieNode{}
+				current.Children[char] = node
+			}
+			current = node
+		}
+		// Only the terminal node reflects the key's value.
+		current.IsEnd = value == 1
+	}
+}
+
+// FindWordInTrie reports whether word is stored in the trie as a complete
+// entry: every rune must have a matching child node and the node reached by
+// the last rune must have IsEnd set.
+func (trie *TrieNode) FindWordInTrie(word string) bool {
+	cursor := trie
+	for _, r := range word {
+		next, ok := cursor.Children[r]
+		if ok {
+			cursor = next
+			continue
+		}
+		return false
+	}
+	return cursor.IsEnd
+}
+
+// FindWords scans text and returns every distinct stored word that occurs in
+// it, trying each rune position as a possible start.
+//
+// The result contains each word once; its order is unspecified because the
+// words are collected through a map. It is nil when nothing matches.
+func (trie *TrieNode) FindWords(text string) []string {
+	chars := []rune(text)
+	seen := make(map[string]struct{})
+
+	for start := range chars {
+		cursor := trie
+		for end := start; end < len(chars); end++ {
+			next, ok := cursor.Children[chars[end]]
+			if !ok {
+				break
+			}
+			if cursor = next; cursor.IsEnd {
+				seen[string(chars[start:end+1])] = struct{}{}
+			}
+		}
+	}
+
+	var found []string
+	for w := range seen {
+		found = append(found, w)
+	}
+	return found
+}
+
+// 在文本中查找单词
+func (trie *TrieNode) FindWordsInText(title, detail string) map[string]string {
+	words := map[string]string{}
+	var nomatchDetail string
+	runes := []rune(fmt.Sprintf("%s %s", detail, title))
+	detailArr := []rune(detail)
+	detailLen := len(detailArr)
+	for i := 0; i < len(runes); i++ {
+		current := trie
+		for j := i; j < len(runes); j++ {
+			char := runes[j]
+			node, exists := current.Children[char]
+			if !exists {
+				break
+			}
+			current = node
+			if current.IsEnd {
+				if _, ok := words[string(runes[i:j+1])]; !ok {
+					if j < detailLen { //匹配正文
+						//fmt.Println(string(runes[i : j+1]))
+						words[string(runes[i:j+1])] = getDetailMatchStr(detailArr, i, j)
+					} else { //匹配到标题
+						if nomatchDetail == "" && detailLen > 0 { //匹配到标题后,正文取开头
+							nomatchDetail = getDetailMatchStr(detailArr, detailLen-1, detailLen)
+						}
+						words[string(runes[i:j+1])] = nomatchDetail
+					}
+				}
+			}
+		}
+	}
+
+	return words
+}
+
+// splitRune is the set of delimiter runes (spaces and punctuation) that
+// getDetailMatchStr looks for when it cuts a snippet of roughly 40 runes
+// before and 40 runes after a keyword.
+var splitRune = map[int32]bool{}
+
+// init fills splitRune from the first rune of each delimiter string.
+func init() {
+	delimiters := []string{" ", ",", ".", ",", "。", " ", ":", ":", "、", "-"}
+	splitRune = make(map[int32]bool, len(delimiters))
+	for idx := range delimiters {
+		first := []rune(delimiters[idx])[0]
+		splitRune[first] = true
+	}
+}
+
+// getDetailMatchStr returns a short excerpt of desc around a match.
+//
+// left and right are the rune indexes of the match's first and last rune.
+// The excerpt starts as an 80-rune window beginning 40 runes before left
+// (clamped to the bounds of desc). Each window edge is then moved to a
+// delimiter from splitRune found within 19 runes of it, if there is one:
+//   - the start is placed just after a delimiter located either forward of
+//     the window start (but before left) or backward of it;
+//   - the end is placed on a delimiter located either forward of the window
+//     end or backward of it (but after right).
+//
+// Distances are tried from nearest to farthest; when both directions hit at
+// the same distance the backward one wins. An edge with no nearby delimiter
+// keeps its window position.
+func getDetailMatchStr(desc []rune, left, right int) string {
+	total := len(desc)
+
+	winStart := left - 40
+	if winStart < 0 {
+		winStart = 0
+	}
+	winEnd := winStart + 80
+	if winEnd > total {
+		winEnd = total
+	}
+
+	// Zero doubles as "not found yet" for both edges.
+	begin, end := 0, 0
+	for off := 1; off < 20 && (begin == 0 || end == 0); off++ {
+		if begin == 0 {
+			if fwd := winStart + off; fwd < left && splitRune[desc[fwd]] {
+				begin = fwd + 1
+			}
+			if back := winStart - off; back > 0 && splitRune[desc[back]] {
+				begin = back + 1
+			}
+		}
+
+		if end == 0 {
+			if fwd := winEnd + off; fwd < total && splitRune[desc[fwd]] {
+				end = fwd
+			}
+			if back := winEnd - off; back > right && splitRune[desc[back]] {
+				end = back
+			}
+		}
+	}
+
+	if begin == 0 {
+		begin = winStart
+	}
+	if end == 0 {
+		end = winEnd
+	}
+	return string(desc[begin:end])
+}

Some files were not shown because too many files changed in this diff