// data_processing/DataService

							package megaloscope

import (
	"regexp"
	"strings"
	"sync"

	"github.com/Chain-Zhang/pinyin"
)

// MatchResult records one rule hit: the sentence that was checked and the
// rule it matched.
type MatchResult struct {
	Line      string // the sentence
	MatchRule string // the matching rule
}

// MatchResults is a list of pointers to MatchResult values.
type MatchResults []*MatchResult

// split_reg matches the sentence delimiters used to break a text into
// sentences: ASCII and full-width punctuation plus the newline character.
// The pattern is a constant, so MustCompile is used rather than discarding
// the error returned by Compile.
var split_reg = regexp.MustCompile("[,.;!?\n。，；？！]")

// Discern scans an article for sensitive words, checking its sentences
// concurrently.
//
// src is split into sentences with split_reg. Every sentence that is not
// blank after trimming surrounding whitespace is checked with checkWords
// first and, only when that finds nothing, with checkWordsPY. threads bounds
// how many sentences are checked at the same time; values below 1 are treated
// as 1. The returned results follow the order of the sentences in src and are
// never nil.
func (m *Megaloscope) Discern(src string, threads int) (ret MatchResults) {
	if threads < 1 {
		// A zero-capacity semaphore would block the first send forever and a
		// negative capacity makes make panic, so fall back to one worker.
		threads = 1
	}
	sentences := split_reg.Split(src, -1)
	// One slot per sentence: each goroutine writes only its own index, so no
	// lock is required and the merge below has a deterministic order.
	perSentence := make([]MatchResults, len(sentences))
	sem := make(chan struct{}, threads)
	var wg sync.WaitGroup
	check := func(idx int, line string) {
		defer func() {
			<-sem
			wg.Done()
		}()
		found := m.checkWords(line)
		if len(found) == 0 {
			found = m.checkWordsPY(line)
		}
		perSentence[idx] = found
	}
	for i, s := range sentences {
		// Trim whitespace so that blank fragments are skipped instead of scanned.
		line := strings.TrimSpace(s)
		if line == "" {
			continue
		}
		sem <- struct{}{}
		wg.Add(1)
		go check(i, line)
	}
	wg.Wait()

	// Merge the per-sentence results in sentence order.
	ret = make(MatchResults, 0)
	for _, found := range perSentence {
		ret = append(ret, found...)
	}
	return ret
}

// checkWords checks line for Chinese sensitive words.
//
// The words hit by m.WordsMatcher are looked up in m.AllWords; a rule from
// m.AllRules matches when all of its Words are among the hits and none of its
// ExcludeWords are. One MatchResult is returned per matching rule; the result
// is empty (never nil) when nothing matches.
func (m *Megaloscope) checkWords(line string) (ret MatchResults) {
	ret = MatchResults{}
	hits := m.WordsMatcher.Match(line)
	if len(hits) == 0 {
		// No sensitive word recognised.
		return ret
	}
	found := make(WordSlice, 0, len(hits))
	for _, hit := range hits {
		found = append(found, m.AllWords[hit.Index])
	}
	// Rule check.
	for _, rule := range m.AllRules {
		if !isSubSlice(found, rule.Words) {
			continue
		}
		// Exclusion check: a rule is dropped when any excluded word was hit.
		if len(rule.ExcludeWords) != 0 && isHasOneSlice(found, rule.ExcludeWords) {
			continue
		}
		ret = append(ret, &MatchResult{Line: line, MatchRule: rule.Raw})
	}
	return ret
}

// checkWordsPY checks the pinyin form of line for sensitive words, which
// addresses the polyphonic-character problem.
//
// line is converted to pinyin and matched with m.WordsPYMatcher; the hits are
// looked up in m.AllWordsPY. A rule from m.AllRules matches when all of its
// WordsPY are among the hits and none of its ExcludeWordsPY are. One
// MatchResult is returned per matching rule; the result is empty (never nil)
// when nothing matches.
func (m *Megaloscope) checkWordsPY(line string) (ret MatchResults) {
	ret = MatchResults{}
	// The error returned by Convert is ignored; linePY is matched as returned.
	linePY, _ := pinyin.New(line).Split("").Mode(pinyin.InitialsInCapitals).Convert()
	hits := m.WordsPYMatcher.Match(linePY)
	if len(hits) == 0 {
		// Nothing in the pinyin text maps to a known word.
		return ret
	}
	found := make(WordSlice, 0, len(hits))
	for _, hit := range hits {
		found = append(found, m.AllWordsPY[hit.Index])
	}
	// Rule check.
	for _, rule := range m.AllRules {
		switch {
		case len(rule.Words) == 1:
			// Skip single-word rules to avoid false positives. Example: 屄/标
		case !isSubSlice(found, rule.WordsPY):
			// Not every pinyin word of the rule was hit.
		case len(rule.ExcludeWordsPY) != 0 && isHasOneSlice(found, rule.ExcludeWordsPY):
			// An excluded pinyin word was hit.
		default:
			ret = append(ret, &MatchResult{Line: line, MatchRule: rule.Raw})
		}
	}
	return ret
}

// isSubSlice reports whether every word in target also occurs in src.
// An empty target is always contained.
func isSubSlice(src, target WordSlice) bool {
	present := make(map[string]struct{}, len(src))
	for i := range src {
		present[src[i]] = struct{}{}
	}
	for i := range target {
		if _, found := present[target[i]]; !found {
			return false
		}
	}
	return true
}

// isHasOneSlice reports whether at least one word in target also occurs in src.
func isHasOneSlice(src, target WordSlice) bool {
	seen := make(map[string]struct{}, len(src))
	for _, word := range src {
		seen[word] = struct{}{}
	}
	hit := false
	// Stop at the first word of target that is present in src.
	for idx := 0; idx < len(target) && !hit; idx++ {
		_, hit = seen[target[idx]]
	}
	return hit
}