megaloscope.go 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. package megaloscope
  2. import (
  3. "bufio"
  4. "os"
  5. "strings"
  6. "github.com/Chain-Zhang/pinyin"
  7. )
  8. /**
  9. 词组
  10. */
  11. type (
  12. //词组
  13. WordSlice []string
  14. //规则
  15. Rule struct {
  16. Raw string //规则原始定义
  17. Words WordSlice //中文词组合
  18. ExcludeWords WordSlice //排除词
  19. WordsPY WordSlice //词拼音
  20. ExcludeWordsPY WordSlice //排除词拼音
  21. }
  22. //敏感词检测
  23. Megaloscope struct {
  24. AllWords WordSlice //所有词
  25. AllWordsPY WordSlice //所有词的拼音
  26. AllRules map[int]*Rule //所有词组
  27. WordsMatcher *Matcher
  28. WordsPYMatcher *Matcher
  29. }
  30. )
  31. //
  32. func NewMegaloscope(filepath string) *Megaloscope {
  33. m := &Megaloscope{AllRules: make(map[int]*Rule)}
  34. m.loadRules(filepath)
  35. return m
  36. }
  37. //博采+网站^打击|禁止|杜绝
  38. //赌博^打击|禁止|杜绝|取缔
  39. //规则解析
  40. func (m *Megaloscope) parseRule(line string) *Rule {
  41. r := &Rule{Raw: line}
  42. if strings.Contains(line, "^") {
  43. tmp := strings.Split(line, "^")
  44. r.Words = strings.Split(tmp[0], "+")
  45. r.ExcludeWords = strings.Split(tmp[1], "|")
  46. } else {
  47. r.Words = strings.Split(line, "+")
  48. }
  49. return r
  50. }
  51. //加载规则
  52. func (m *Megaloscope) loadRules(filepath string) error {
  53. fi, err := os.Open(filepath)
  54. if err != nil {
  55. return err
  56. }
  57. defer fi.Close()
  58. reader := bufio.NewReader(fi)
  59. allWords := make(WordSlice, 0)
  60. for i := 0; ; i++ {
  61. bs, _, err := reader.ReadLine()
  62. if err != nil {
  63. break
  64. }
  65. rule := m.parseRule(string(bs))
  66. m.AllRules[i] = rule
  67. allWords = append(allWords, rule.Words...)
  68. allWords = append(allWords, rule.ExcludeWords...)
  69. }
  70. //所有词去重
  71. tmp := map[string]bool{}
  72. for _, w := range allWords {
  73. tmp[w] = true
  74. }
  75. //计算词的拼音
  76. allWordPY := make(map[string]string)
  77. words := make([]string, len(tmp))
  78. wordsPY := make([]string, len(tmp))
  79. index := 0
  80. for w := range tmp {
  81. py, _ := pinyin.New(w).Split("").Mode(pinyin.InitialsInCapitals).Convert()
  82. allWordPY[w] = py
  83. words[index] = w
  84. wordsPY[index] = py
  85. index += 1
  86. }
  87. m.AllWords = words
  88. m.AllWordsPY = wordsPY
  89. //完善规则中的拼音
  90. for _, v := range m.AllRules {
  91. v.WordsPY = make(WordSlice, len(v.Words))
  92. for i, w := range v.Words {
  93. v.WordsPY[i] = allWordPY[w]
  94. }
  95. if len(v.ExcludeWords) > 0 {
  96. v.ExcludeWordsPY = make(WordSlice, len(v.ExcludeWords))
  97. for i, w := range v.ExcludeWords {
  98. v.ExcludeWordsPY[i] = allWordPY[w]
  99. }
  100. }
  101. }
  102. m.WordsPYMatcher = BuildNewMatcher(m.AllWordsPY)
  103. m.WordsMatcher = BuildNewMatcher(m.AllWords)
  104. return nil
  105. }