123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142 |
- package megaloscope
- import (
- "regexp"
- "strings"
- "sync"
- "github.com/Chain-Zhang/pinyin"
- )
// MatchResult describes one rule hit: the sentence that was scanned and the
// rule that matched it.
type MatchResult struct {
	Line      string // the sentence in which the match was found
	MatchRule string // the rule that matched
}

// MatchResults is a list of MatchResult pointers.
type MatchResults []*MatchResult
// split_reg matches the punctuation marks (and newline) used to cut an
// article into sentences. MustCompile is used so that an invalid pattern
// fails loudly at start-up instead of leaving a nil *regexp.Regexp behind.
// NOTE(review): the class lists ASCII , ; ? ! twice; the second group was
// presumably meant to be the full-width forms — confirm against the
// upstream file before changing the pattern.
var split_reg = regexp.MustCompile("[,.;!?\n。,;?!]")
- //Discern 多线程扫描检查一篇文章,是否存在敏感词
- func (m *Megaloscope) Discern(src string, threads int) (ret MatchResults) {
- ret = make(MatchResults, 0)
- sentences := split_reg.Split(src, -1)
- mapRet := map[int]MatchResults{}
- lock := new(sync.RWMutex)
- wg := new(sync.WaitGroup)
- threadsLock := make(chan bool, threads)
- var fn = func(lineNo int, line string) {
- defer func() {
- <-threadsLock
- wg.Done()
- }()
- var r MatchResults
- if r = m.checkWords(line); len(r) == 0 {
- r = m.checkWordsPY(line)
- }
- if len(r) > 0 {
- lock.Lock()
- mapRet[lineNo] = r
- lock.Unlock()
- }
- }
- for i, s := range sentences {
- if line := strings.Trim(s, ""); len(line) > 0 {
- threadsLock <- true
- wg.Add(1)
- go fn(i, line)
- }
- }
- wg.Wait()
- //结果汇总(合并)
- for _, v := range mapRet {
- ret = append(ret, v...)
- }
- return
- }
- //checkWords 检查中文词组
- func (m *Megaloscope) checkWords(line string) (ret MatchResults) {
- ret = make(MatchResults, 0)
- ms := m.WordsMatcher.Match(line)
- if len(ms) > 0 { //识别到敏感词
- words := make(WordSlice, len(ms))
- for i, t := range ms {
- words[i] = m.AllWords[t.Index]
- }
- //规则检查
- for _, v := range m.AllRules {
- if isSubSlice(words, v.Words) {
- //检查排除规则
- if len(v.ExcludeWords) == 0 || !isHasOneSlice(words, v.ExcludeWords) {
- ret = append(ret, &MatchResult{
- Line: line,
- MatchRule: v.Raw,
- })
- }
- }
- }
- }
- return
- }
- //checkWordsPY 检查中文词组拼音,解决多音字问题
- func (m *Megaloscope) checkWordsPY(line string) (ret MatchResults) {
- ret = make(MatchResults, 0)
- linePY, _ := pinyin.New(line).Split("").Mode(pinyin.InitialsInCapitals).Convert()
- ms := m.WordsPYMatcher.Match(linePY)
- if len(ms) > 0 { //检查是否能对应到词组
- words := make(WordSlice, len(ms))
- for i, t := range ms {
- words[i] = m.AllWordsPY[t.Index]
- }
- //规则检查
- for _, v := range m.AllRules {
- if len(v.Words) == 1 {
- // 跳过一个字的规则,避免误判 例:屄/标
- continue
- }
- if isSubSlice(words, v.WordsPY) {
- //检查排除规则
- if len(v.ExcludeWordsPY) == 0 || !isHasOneSlice(words, v.ExcludeWordsPY) {
- ret = append(ret, &MatchResult{
- Line: line,
- MatchRule: v.Raw,
- })
- }
- }
- }
- }
- return
- }
- //isSubSlice 比较2个Slice看是否包含
- func isSubSlice(src, target WordSlice) bool {
- tmp := map[string]bool{}
- for _, w := range src {
- tmp[w] = true
- }
- //
- for _, w := range target {
- if _, ok := tmp[w]; !ok {
- return false
- }
- }
- return true
- }
- //isHasOneSlice 是否存在1个
- func isHasOneSlice(src, target WordSlice) bool {
- tmp := map[string]bool{}
- for _, w := range src {
- tmp[w] = true
- }
- //
- for _, w := range target {
- if _, ok := tmp[w]; ok {
- return true
- }
- }
- return false
- }
|