ahocorasick.go 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. package megaloscope
  2. /**
  3. AC算法实现敏感词识别
  4. */
  5. import (
  6. "container/list"
  7. )
  8. type trieNode struct {
  9. count int
  10. fail *trieNode
  11. child map[rune]*trieNode
  12. index int
  13. }
  14. func newTrieNode() *trieNode {
  15. return &trieNode{
  16. count: 0,
  17. fail: nil,
  18. child: make(map[rune]*trieNode),
  19. index: -1,
  20. }
  21. }
  22. type Matcher struct {
  23. root *trieNode
  24. size int
  25. }
  26. type Term struct {
  27. Index int
  28. EndPosition int
  29. }
  30. func NewMatcher() *Matcher {
  31. return &Matcher{
  32. root: newTrieNode(),
  33. size: 0,
  34. }
  35. }
  36. func BuildNewMatcher(dictionary []string) *Matcher {
  37. m := &Matcher{
  38. root: newTrieNode(),
  39. size: 0,
  40. }
  41. m.Build(dictionary)
  42. return m
  43. }
  44. // initialize the ahocorasick
  45. func (m *Matcher) Build(dictionary []string) {
  46. for i := range dictionary {
  47. m.insert(dictionary[i])
  48. }
  49. m.build()
  50. }
  51. // string match search
  52. // return all strings matched as indexes into the original dictionary and their positions on matched string
  53. func (m *Matcher) Match(s string) []*Term {
  54. curNode := m.root
  55. var p *trieNode = nil
  56. mark := make([]bool, m.size)
  57. ret := make([]*Term, 0)
  58. for index, rune := range s {
  59. for curNode.child[rune] == nil && curNode != m.root {
  60. curNode = curNode.fail
  61. }
  62. curNode = curNode.child[rune]
  63. if curNode == nil {
  64. curNode = m.root
  65. }
  66. p = curNode
  67. for p != m.root && p.count > 0 && !mark[p.index] {
  68. mark[p.index] = true
  69. for i := 0; i < p.count; i++ {
  70. ret = append(ret, &Term{Index: p.index, EndPosition: index})
  71. }
  72. p = p.fail
  73. }
  74. }
  75. return ret
  76. }
  77. // just return the number of len(Match(s))
  78. func (m *Matcher) GetMatchResultSize(s string) int {
  79. curNode := m.root
  80. var p *trieNode = nil
  81. mark := make([]bool, m.size)
  82. num := 0
  83. for _, v := range s {
  84. for curNode.child[v] == nil && curNode != m.root {
  85. curNode = curNode.fail
  86. }
  87. curNode = curNode.child[v]
  88. if curNode == nil {
  89. curNode = m.root
  90. }
  91. p = curNode
  92. for p != m.root && p.count > 0 && !mark[p.index] {
  93. mark[p.index] = true
  94. num += p.count
  95. p = p.fail
  96. }
  97. }
  98. return num
  99. }
  100. func (m *Matcher) build() {
  101. ll := list.New()
  102. ll.PushBack(m.root)
  103. for ll.Len() > 0 {
  104. temp := ll.Remove(ll.Front()).(*trieNode)
  105. var p *trieNode = nil
  106. for i, v := range temp.child {
  107. if temp == m.root {
  108. v.fail = m.root
  109. } else {
  110. p = temp.fail
  111. for p != nil {
  112. if p.child[i] != nil {
  113. v.fail = p.child[i]
  114. break
  115. }
  116. p = p.fail
  117. }
  118. if p == nil {
  119. v.fail = m.root
  120. }
  121. }
  122. ll.PushBack(v)
  123. }
  124. }
  125. }
  126. func (m *Matcher) insert(s string) {
  127. curNode := m.root
  128. for _, v := range s {
  129. if curNode.child[v] == nil {
  130. curNode.child[v] = newTrieNode()
  131. }
  132. curNode = curNode.child[v]
  133. }
  134. curNode.count++
  135. curNode.index = m.size
  136. m.size++
  137. }