package megaloscope

import (
	"regexp"
	"strings"
	"sync"

	"github.com/Chain-Zhang/pinyin"
)

type (
	// MatchResult describes one rule hit on one sentence.
	MatchResult struct {
		Line      string // the sentence in which the rule matched
		MatchRule string // the raw text of the rule that matched
	}

	// MatchResults is a list of rule hits.
	MatchResults []*MatchResult
)

// split_reg splits text into sentences on punctuation marks and newlines.
// The identifier is left unchanged because other files in the package may
// refer to it.
// NOTE(review): the characters after "。" repeat ASCII punctuation that is
// already listed; full-width forms may have been intended — confirm.
var split_reg = regexp.MustCompile("[,.;!?\n。,;?!]")

// Discern splits src into sentences and checks each one for sensitive words,
// running at most threads checks concurrently.
//
// Every sentence is first checked against the literal word rules; the pinyin
// based check is only tried when the literal check finds nothing. Sentences
// are trimmed of surrounding whitespace and blank ones are skipped. The hits
// are returned in sentence order, and the result is never nil. A threads value
// below 1 is treated as 1.
func (m *Megaloscope) Discern(src string, threads int) (ret MatchResults) {
	// A zero-capacity semaphore would block on the first send, and a negative
	// capacity makes make() panic, so always allow at least one worker.
	if threads < 1 {
		threads = 1
	}

	sentences := split_reg.Split(src, -1)

	// One slot per sentence: each goroutine writes only its own slot, so no
	// lock is needed and the merged output keeps the sentence order.
	perSentence := make([]MatchResults, len(sentences))
	sem := make(chan struct{}, threads)
	var wg sync.WaitGroup

	for i, s := range sentences {
		line := strings.TrimSpace(s)
		if line == "" {
			continue
		}

		sem <- struct{}{} // blocks while threads checks are already running
		wg.Add(1)
		go func(idx int, line string) {
			defer func() {
				<-sem
				wg.Done()
			}()

			hits := m.checkWords(line)
			if len(hits) == 0 {
				hits = m.checkWordsPY(line)
			}
			perSentence[idx] = hits
		}(i, line)
	}
	wg.Wait()

	// Merge the per-sentence hits.
	ret = make(MatchResults, 0)
	for _, hits := range perSentence {
		ret = append(ret, hits...)
	}
	return ret
}

// checkWords checks a sentence against the literal word rules.
//
// The words found in line by m.WordsMatcher are looked up in m.AllWords. A
// rule matches when all of its Words were found and none of its ExcludeWords
// were found. The result is never nil.
func (m *Megaloscope) checkWords(line string) (ret MatchResults) {
	ret = make(MatchResults, 0)

	ms := m.WordsMatcher.Match(line)
	if len(ms) == 0 {
		return ret
	}

	// Map the matcher hits back to the words they stand for.
	words := make(WordSlice, len(ms))
	for i, t := range ms {
		words[i] = m.AllWords[t.Index]
	}

	// Evaluate every rule against the set of words that were found.
	for _, v := range m.AllRules {
		if !isSubSlice(words, v.Words) {
			continue
		}
		// An exclusion word in the sentence cancels the rule.
		if len(v.ExcludeWords) > 0 && isHasOneSlice(words, v.ExcludeWords) {
			continue
		}
		ret = append(ret, &MatchResult{
			Line:      line,
			MatchRule: v.Raw,
		})
	}
	return ret
}

// checkWordsPY checks a sentence against the rules using its pinyin form,
// which addresses characters that have more than one reading.
//
// The sentence is converted to pinyin, matched with m.WordsPYMatcher and the
// hits are looked up in m.AllWordsPY. A rule matches when all of its WordsPY
// were found and none of its ExcludeWordsPY were found. Rules with exactly
// one word are skipped. The result is never nil.
func (m *Megaloscope) checkWordsPY(line string) (ret MatchResults) {
	ret = make(MatchResults, 0)

	linePY, err := pinyin.New(line).Split("").Mode(pinyin.InitialsInCapitals).Convert()
	if err != nil {
		// Without a pinyin form there is nothing to match against. This
		// method has no error return, so a failed conversion yields no hits.
		return ret
	}

	ms := m.WordsPYMatcher.Match(linePY)
	if len(ms) == 0 {
		return ret
	}

	// Map the matcher hits back to the pinyin words they stand for.
	words := make(WordSlice, len(ms))
	for i, t := range ms {
		words[i] = m.AllWordsPY[t.Index]
	}

	// Evaluate every rule against the set of pinyin words that were found.
	for _, v := range m.AllRules {
		// Skip single-word rules to avoid false positives, e.g. 屄 vs 标.
		if len(v.Words) == 1 {
			continue
		}
		if !isSubSlice(words, v.WordsPY) {
			continue
		}
		// An exclusion word in the sentence cancels the rule.
		if len(v.ExcludeWordsPY) > 0 && isHasOneSlice(words, v.ExcludeWordsPY) {
			continue
		}
		ret = append(ret, &MatchResult{
			Line:      line,
			MatchRule: v.Raw,
		})
	}
	return ret
}

// isSubSlice reports whether every word in target also occurs in src.
// An empty target is always contained.
func isSubSlice(src, target WordSlice) bool {
	seen := make(map[string]struct{}, len(src))
	for _, w := range src {
		seen[w] = struct{}{}
	}

	for _, w := range target {
		if _, ok := seen[w]; !ok {
			return false
		}
	}
	return true
}

// isHasOneSlice reports whether at least one word in target occurs in src.
// An empty target never matches.
func isHasOneSlice(src, target WordSlice) bool {
	seen := make(map[string]struct{}, len(src))
	for _, w := range src {
		seen[w] = struct{}{}
	}

	for _, w := range target {
		if _, ok := seen[w]; ok {
			return true
		}
	}
	return false
}