123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- package megaloscope
- import (
- "bufio"
- "os"
- "strings"
- "github.com/Chain-Zhang/pinyin"
- )
- /**
- 词组
- */
- type (
- //词组
- WordSlice []string
- //规则
- Rule struct {
- Raw string //规则原始定义
- Words WordSlice //中文词组合
- ExcludeWords WordSlice //排除词
- WordsPY WordSlice //词拼音
- ExcludeWordsPY WordSlice //排除词拼音
- }
- //敏感词检测
- Megaloscope struct {
- AllWords WordSlice //所有词
- AllWordsPY WordSlice //所有词的拼音
- AllRules map[int]*Rule //所有词组
- WordsMatcher *Matcher
- WordsPYMatcher *Matcher
- }
- )
- //
- func NewMegaloscope(filepath string) *Megaloscope {
- m := &Megaloscope{AllRules: make(map[int]*Rule)}
- m.loadRules(filepath)
- return m
- }
- //博采+网站^打击|禁止|杜绝
- //赌博^打击|禁止|杜绝|取缔
- //规则解析
- func (m *Megaloscope) parseRule(line string) *Rule {
- r := &Rule{Raw: line}
- if strings.Contains(line, "^") {
- tmp := strings.Split(line, "^")
- r.Words = strings.Split(tmp[0], "+")
- r.ExcludeWords = strings.Split(tmp[1], "|")
- } else {
- r.Words = strings.Split(line, "+")
- }
- return r
- }
- //加载规则
- func (m *Megaloscope) loadRules(filepath string) error {
- fi, err := os.Open(filepath)
- if err != nil {
- return err
- }
- defer fi.Close()
- reader := bufio.NewReader(fi)
- allWords := make(WordSlice, 0)
- for i := 0; ; i++ {
- bs, _, err := reader.ReadLine()
- if err != nil {
- break
- }
- rule := m.parseRule(string(bs))
- m.AllRules[i] = rule
- allWords = append(allWords, rule.Words...)
- allWords = append(allWords, rule.ExcludeWords...)
- }
- //所有词去重
- tmp := map[string]bool{}
- for _, w := range allWords {
- tmp[w] = true
- }
- //计算词的拼音
- allWordPY := make(map[string]string)
- words := make([]string, len(tmp))
- wordsPY := make([]string, len(tmp))
- index := 0
- for w := range tmp {
- py, _ := pinyin.New(w).Split("").Mode(pinyin.InitialsInCapitals).Convert()
- allWordPY[w] = py
- words[index] = w
- wordsPY[index] = py
- index += 1
- }
- m.AllWords = words
- m.AllWordsPY = wordsPY
- //完善规则中的拼音
- for _, v := range m.AllRules {
- v.WordsPY = make(WordSlice, len(v.Words))
- for i, w := range v.Words {
- v.WordsPY[i] = allWordPY[w]
- }
- if len(v.ExcludeWords) > 0 {
- v.ExcludeWordsPY = make(WordSlice, len(v.ExcludeWords))
- for i, w := range v.ExcludeWords {
- v.ExcludeWordsPY[i] = allWordPY[w]
- }
- }
- }
- m.WordsPYMatcher = BuildNewMatcher(m.AllWordsPY)
- m.WordsMatcher = BuildNewMatcher(m.AllWords)
- return nil
- }
|