discern.go 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. package megaloscope
  2. import (
  3. "regexp"
  4. "strings"
  5. "sync"
  6. "github.com/Chain-Zhang/pinyin"
  7. )
  8. type (
  9. MatchResult struct {
  10. Line string //句子
  11. MatchRule string //匹配规则
  12. }
  13. MatchResults []*MatchResult
  14. )
  15. var split_reg, _ = regexp.Compile("[,.;!?\n。,;?!]")
  16. //Discern 多线程扫描检查一篇文章,是否存在敏感词
  17. func (m *Megaloscope) Discern(src string, threads int) (ret MatchResults) {
  18. ret = make(MatchResults, 0)
  19. sentences := split_reg.Split(src, -1)
  20. mapRet := map[int]MatchResults{}
  21. lock := new(sync.RWMutex)
  22. wg := new(sync.WaitGroup)
  23. threadsLock := make(chan bool, threads)
  24. var fn = func(lineNo int, line string) {
  25. defer func() {
  26. <-threadsLock
  27. wg.Done()
  28. }()
  29. var r MatchResults
  30. if r = m.checkWords(line); len(r) == 0 {
  31. r = m.checkWordsPY(line)
  32. }
  33. if len(r) > 0 {
  34. lock.Lock()
  35. mapRet[lineNo] = r
  36. lock.Unlock()
  37. }
  38. }
  39. for i, s := range sentences {
  40. if line := strings.Trim(s, ""); len(line) > 0 {
  41. threadsLock <- true
  42. wg.Add(1)
  43. go fn(i, line)
  44. }
  45. }
  46. wg.Wait()
  47. //结果汇总(合并)
  48. for _, v := range mapRet {
  49. ret = append(ret, v...)
  50. }
  51. return
  52. }
  53. //checkWords 检查中文词组
  54. func (m *Megaloscope) checkWords(line string) (ret MatchResults) {
  55. ret = make(MatchResults, 0)
  56. ms := m.WordsMatcher.Match(line)
  57. if len(ms) > 0 { //识别到敏感词
  58. words := make(WordSlice, len(ms))
  59. for i, t := range ms {
  60. words[i] = m.AllWords[t.Index]
  61. }
  62. //规则检查
  63. for _, v := range m.AllRules {
  64. if isSubSlice(words, v.Words) {
  65. //检查排除规则
  66. if len(v.ExcludeWords) == 0 || !isHasOneSlice(words, v.ExcludeWords) {
  67. ret = append(ret, &MatchResult{
  68. Line: line,
  69. MatchRule: v.Raw,
  70. })
  71. }
  72. }
  73. }
  74. }
  75. return
  76. }
  77. //checkWordsPY 检查中文词组拼音,解决多音字问题
  78. func (m *Megaloscope) checkWordsPY(line string) (ret MatchResults) {
  79. ret = make(MatchResults, 0)
  80. linePY, _ := pinyin.New(line).Split("").Mode(pinyin.InitialsInCapitals).Convert()
  81. ms := m.WordsPYMatcher.Match(linePY)
  82. if len(ms) > 0 { //检查是否能对应到词组
  83. words := make(WordSlice, len(ms))
  84. for i, t := range ms {
  85. words[i] = m.AllWordsPY[t.Index]
  86. }
  87. //规则检查
  88. for _, v := range m.AllRules {
  89. if len(v.Words) == 1 {
  90. // 跳过一个字的规则,避免误判 例:屄/标
  91. continue
  92. }
  93. if isSubSlice(words, v.WordsPY) {
  94. //检查排除规则
  95. if len(v.ExcludeWordsPY) == 0 || !isHasOneSlice(words, v.ExcludeWordsPY) {
  96. ret = append(ret, &MatchResult{
  97. Line: line,
  98. MatchRule: v.Raw,
  99. })
  100. }
  101. }
  102. }
  103. }
  104. return
  105. }
  106. //isSubSlice 比较2个Slice看是否包含
  107. func isSubSlice(src, target WordSlice) bool {
  108. tmp := map[string]bool{}
  109. for _, w := range src {
  110. tmp[w] = true
  111. }
  112. //
  113. for _, w := range target {
  114. if _, ok := tmp[w]; !ok {
  115. return false
  116. }
  117. }
  118. return true
  119. }
  120. //isHasOneSlice 是否存在1个
  121. func isHasOneSlice(src, target WordSlice) bool {
  122. tmp := map[string]bool{}
  123. for _, w := range src {
  124. tmp[w] = true
  125. }
  126. //
  127. for _, w := range target {
  128. if _, ok := tmp[w]; ok {
  129. return true
  130. }
  131. }
  132. return false
  133. }