match.go 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. package match
  2. import (
  3. "context"
  4. "fmt"
  5. "github.com/gogf/gf/v2/frame/g"
  6. )
  7. // 定义前缀树节点
  8. type TrieNode struct {
  9. Children map[rune]*TrieNode // 子节点
  10. IsEnd bool // 是否为单词结尾
  11. }
  12. // 插入单词到前缀树中
  13. func (tn *TrieNode) Insert(words ...string) {
  14. for _, word := range words {
  15. current := tn
  16. for _, char := range word {
  17. if current.Children == nil {
  18. current.Children = make(map[rune]*TrieNode)
  19. }
  20. node, exists := current.Children[char]
  21. if !exists {
  22. node = &TrieNode{}
  23. current.Children[char] = node
  24. }
  25. current = node
  26. }
  27. current.IsEnd = true
  28. }
  29. }
  30. // Remove 删除单词
  31. func (tn *TrieNode) Remove(words ...string) {
  32. if tn == nil {
  33. g.Log().Infof(context.Background(), "TrieNode Remove is nil")
  34. }
  35. for _, word := range words {
  36. current := tn
  37. parent := tn
  38. if tn == nil {
  39. return
  40. }
  41. var lastChar int32
  42. for _, char := range word {
  43. if current == nil || current.Children == nil {
  44. return
  45. }
  46. node, _ := current.Children[char]
  47. parent = current
  48. current = node
  49. lastChar = char
  50. }
  51. if current != nil && current.IsEnd {
  52. current.IsEnd = false
  53. }
  54. if current != nil && current.Children != nil && len(current.Children) == 0 {
  55. delete(parent.Children, lastChar)
  56. }
  57. }
  58. }
  59. //func (trie *TrieNode) Print() {
  60. // g.Dump(trie)
  61. //}
  62. // 加载数据到前缀树中
  63. func (trie *TrieNode) LoadDataToTrie(data map[string]int) {
  64. for key, value := range data {
  65. runes := []rune(key)
  66. current := trie
  67. for i, char := range runes {
  68. isEnd := 0
  69. if i == len(runes)-1 {
  70. isEnd = value
  71. }
  72. if current.Children == nil {
  73. current.Children = make(map[rune]*TrieNode)
  74. }
  75. node, exists := current.Children[char]
  76. if !exists {
  77. node = &TrieNode{}
  78. current.Children[char] = node
  79. }
  80. current = node
  81. current.IsEnd = isEnd == 1
  82. }
  83. }
  84. }
  85. // 在前缀树中查找单词
  86. func (trie *TrieNode) FindWordInTrie(word string) bool {
  87. current := trie
  88. for _, char := range word {
  89. node, exists := current.Children[char]
  90. if !exists {
  91. return false
  92. }
  93. current = node
  94. }
  95. return current.IsEnd
  96. }
  97. // 在文本中查找单词
  98. func (trie *TrieNode) FindWords(text string) []string {
  99. words := map[string]struct{}{}
  100. var rData []string
  101. runes := []rune(text)
  102. for i := 0; i < len(runes); i++ {
  103. current := trie
  104. for j := i; j < len(runes); j++ {
  105. char := runes[j]
  106. node, exists := current.Children[char]
  107. if !exists {
  108. break
  109. }
  110. current = node
  111. if current.IsEnd {
  112. words[string(runes[i:j+1])] = struct{}{}
  113. }
  114. }
  115. }
  116. for word, _ := range words {
  117. rData = append(rData, word)
  118. }
  119. return rData
  120. }
  121. func (trie *TrieNode) FindOneMaxStr(text string) string {
  122. stringArr := trie.FindWords(text)
  123. if len(stringArr) == 0 {
  124. return "" // 处理空数组的情况
  125. }
  126. longest := stringArr[0]
  127. for _, s := range stringArr {
  128. if len(s) > len(longest) {
  129. longest = s
  130. }
  131. }
  132. return longest
  133. }
  134. // 在文本中查找单词
  135. func (trie *TrieNode) FindWordsInText(title, detail string) map[string]string {
  136. words := map[string]string{}
  137. var nomatchDetail string
  138. runes := []rune(fmt.Sprintf("%s %s", detail, title))
  139. detailArr := []rune(detail)
  140. detailLen := len(detailArr)
  141. for i := 0; i < len(runes); i++ {
  142. current := trie
  143. for j := i; j < len(runes); j++ {
  144. char := runes[j]
  145. node, exists := current.Children[char]
  146. if !exists {
  147. break
  148. }
  149. current = node
  150. if current.IsEnd {
  151. if _, ok := words[string(runes[i:j+1])]; !ok {
  152. if j < detailLen { //匹配正文
  153. //fmt.Println(string(runes[i : j+1]))
  154. words[string(runes[i:j+1])] = getDetailMatchStr(detailArr, i, j)
  155. } else { //匹配到标题
  156. if nomatchDetail == "" && detailLen > 0 { //匹配到标题后,正文取开头
  157. nomatchDetail = getDetailMatchStr(detailArr, detailLen-1, detailLen)
  158. }
  159. words[string(runes[i:j+1])] = nomatchDetail
  160. }
  161. }
  162. }
  163. }
  164. }
  165. return words
  166. }
  167. // 取关键词前40后40个字
  168. var splitRune = map[int32]bool{}
  169. func init() {
  170. splitRune = map[int32]bool{}
  171. for _, s := range []string{" ", ",", ".", ",", "。", " ", ":", ":", "、", "-"} {
  172. splitRune[[]rune(s)[0]] = true
  173. }
  174. }
  175. // getDetailMatchStr 获取正文中匹配到的短句
  176. func getDetailMatchStr(desc []rune, left, right int) string {
  177. s, e, l := 0, 0, len(desc)
  178. sT := left - 40
  179. if sT < 0 {
  180. sT = 0
  181. }
  182. eT := sT + 80
  183. if eT > l {
  184. eT = l
  185. }
  186. for i := 1; i < 20; i++ {
  187. if s != 0 && e != 0 {
  188. return string(desc[s:e])
  189. }
  190. if s == 0 {
  191. //fmt.Printf("[sT]%s-%d %s-%d\n", string(desc[sT-i]), desc[sT-i], string(desc[sT+i]), desc[sT+i])
  192. if sT+i < left && splitRune[desc[sT+i]] {
  193. s = sT + i + 1
  194. }
  195. if sT-i > 0 && splitRune[desc[sT-i]] {
  196. s = sT - i + 1
  197. }
  198. }
  199. if e == 0 {
  200. //fmt.Printf("[eT]%s-%d %s-%d\n", string(desc[eT-i]), desc[eT-i], string(desc[eT+i]), desc[eT+i])
  201. if eT+i < l && splitRune[desc[eT+i]] {
  202. e = eT + i
  203. }
  204. if eT-i > right && splitRune[desc[eT-i]] {
  205. e = eT - i
  206. }
  207. }
  208. }
  209. if s == 0 {
  210. s = sT
  211. }
  212. if e == 0 {
  213. e = eT
  214. }
  215. return string(desc[s:e])
  216. }