package main import ( util "jygit.jydev.jianyu360.cn/data_processing/common_utils" "log" "math" "regexp" "strconv" "strings" ) var REG *regexp.Regexp type RuleDFA struct { Match []DFA //包含的敏感词 MatchNum []int //包含敏感词匹配个数 MisMatch DFA //不包含的敏感词 MisMatchNum int //不包含敏感词匹配个数 } type DFA struct { Link map[string]interface{} } // DealRules 处理识别规则 func DealRules(rules []string) (i_rule []interface{}) { for _, r := range rules { if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则 rs := []rune(r) ru := string(rs[1 : len(rs)-1]) rureg, err := regexp.Compile(ru) if err != nil { log.Println("error---rule:", r) continue } i_rule = append(i_rule, []interface{}{rureg}...) } else { //规则,加入到敏感词匹配 matchnum := 0 mismatchnum := 0 isnum1 := false isnum2 := false numArr := make([]int, 0) ruleDFA := &RuleDFA{ Match: []DFA{}, MisMatch: DFA{}, } tmpArr := strings.Split(r, "^") matchTmp := tmpArr[0] ruleTextArr := REG.FindAllString(matchTmp, -1) for _, match := range ruleTextArr { matchnum, isnum1 = GetNum(match) numArr = append(numArr, matchnum) matchArr := GetRule(match, isnum1) tmpDFA := DFA{ Link: make(map[string]interface{}), } tmpDFA.AddWord(matchArr...) ruleDFA.Match = append(ruleDFA.Match, tmpDFA) } if len(tmpArr) == 2 { mismatch := tmpArr[1] mismatchnum, isnum2 = GetNum(mismatch) mismatchArr := GetRule(mismatch, isnum2) ruleDFA.MisMatch.AddWord(mismatchArr...) } ruleDFA.MatchNum = numArr ruleDFA.MisMatchNum = mismatchnum i_rule = append(i_rule, []interface{}{ruleDFA}...) } } return } func (d *DFA) AddWord(keys ...string) { d.AddWordAll(true, keys...) } func (d *DFA) AddWordAll(haskey bool, keys ...string) { if d.Link == nil { d.Link = make(map[string]interface{}) } for _, key := range keys { nowMap := &d.Link for i := 0; i < len(key); i++ { kc := key[i : i+1] if v, ok := (*nowMap)[kc]; ok { nowMap, _ = v.(*map[string]interface{}) } else { newMap := map[string]interface{}{} newMap["YN"] = "0" (*nowMap)[kc] = &newMap nowMap = &newMap } if i == len(key)-1 { (*nowMap)["YN"] = "1" if haskey { (*nowMap)["K"] = key } } } } } func (d *DFA) CheckSensitiveWord(src string, n int) (bool, []string) { res := make([]string, 0) tmpMap := make(map[string]int) for j := 0; j < len(src); j++ { nowMap := &d.Link for i := j; i < len(src); i++ { word := src[i : i+1] nowMap, _ = (*nowMap)[word].(*map[string]interface{}) if nowMap != nil { // 存在,则判断是否为最后一个 if "1" == util.ObjToString((*nowMap)["YN"]) { s := util.ObjToString((*nowMap)["K"]) tmpMap[s] = 1 //nowMap = &d.Link //匹配到之后继续匹配后边的内容 } } else { //nowMap = &d.Link break } } } if len(tmpMap) >= n { for k, _ := range tmpMap { res = append(res, k) } return true, res } return false, []string{} } // ObjArrToStringArr interface 数组转string 数组 func ObjArrToStringArr(old []interface{}) []string { defer func() { if r := recover(); r != nil { // 在此处添加错误处理逻辑,例如记录错误日志 } }() if old != nil { new := make([]string, 0) for _, v := range old { if strValue, ok := v.(string); ok { new = append(new, strValue) } else { // 在此处添加对非字符串类型值的处理逻辑,例如记录错误日志 } } return new } else { return nil } } // GetRule 获取规则 func GetRule(text string, isnum bool) (matchArr []string) { if isnum { //最后一个不是数字 if strings.HasPrefix(text, "(") && strings.HasSuffix(text, ")") { text = text[1 : len(text)-1] matchArr = strings.Split(text, "|") } } else if strings.HasPrefix(text, "(") && !isnum { text = text[1 : len(text)-2] matchArr = strings.Split(text, "|") } return matchArr } // GetNum 获取匹配或不匹配的个数 func GetNum(rule string) (int, bool) { num := 1 isnum := strings.HasSuffix(rule, ")") if !isnum { //是数字 s := []rune(rule) last := string(s[len(s)-1:]) num = IntAll(last) } return num, isnum } func IntAll(num interface{}) int { return IntAllDef(num, 0) } func IntAllDef(num interface{}, defaultNum int) int { if i, ok := num.(int); ok { return int(i) } else if i0, ok0 := num.(int32); ok0 { return int(i0) } else if i1, ok1 := num.(float64); ok1 { return int(i1) } else if i2, ok2 := num.(int64); ok2 { return int(i2) } else if i3, ok3 := num.(float32); ok3 { return int(i3) } else if i4, ok4 := num.(string); ok4 { in, _ := strconv.Atoi(i4) return int(in) } else if i5, ok5 := num.(int16); ok5 { return int(i5) } else if i6, ok6 := num.(int8); ok6 { return int(i6) } else { return defaultNum } } // TagDFAAnalyRules 单独的标签识别规则 func TagDFAAnalyRules(text string, rules []interface{}) (res []string) { defer util.Catch() for _, r := range rules { rDFA, b := r.(*RuleDFA) //util.Debug(j, "规则===", b, rDFA.Match, rDFA.MatchNum, rDFA.MisMatch, rDFA.MisMatchNum) if b { //规则DFA //util.Debug("res========", res, len(rDFA.MatchNum) == len(rDFA.Match), len(rDFA.MatchNum)) if len(rDFA.MatchNum) == len(rDFA.Match) { for i, matchnum := range rDFA.MatchNum { if matchnum >= 1 { btmp, restmp := rDFA.Match[i].CheckSensitiveWord(text, matchnum) if !btmp { //逗号隔开的每条规则不匹配,继续匹配下一条 //log.Println("继续匹配") break } res = append(res, restmp...) } } } } } return } // DFAAnalyRules DFA识别规则 func DFAAnalyRules(text string, rules []interface{}) (bool, []string) { var arr []string //log.Println("len===", len(rules)) for _, r := range rules { //log.Println("i--------------", i) ruleReg, ok := r.(*regexp.Regexp) if ok { //正则 //log.Println("正则===", ruleReg) textArr := ruleReg.FindAllString(text, -1) if len(textArr) > 0 { regStr := []string{ruleReg.String()} return true, regStr } } else { rDFA, b := r.(*RuleDFA) //log.Println(j, "规则===", b, rDFA.Match, rDFA.MatchNum, rDFA.MisMatch, rDFA.MisMatchNum) if b { //规则DFA //b1, b2 := false, false b1, b2 := false, true var res []string //log.Println("res========", res, len(rDFA.MatchNum) == len(rDFA.Match), len(rDFA.MatchNum)) if len(rDFA.MatchNum) == len(rDFA.Match) { for i, matchnum := range rDFA.MatchNum { if matchnum >= 1 { btmp, restmp := rDFA.Match[i].CheckSensitiveWord(text, matchnum) //log.Println("btmp====", btmp, restmp) if !btmp { //逗号隔开的每条规则不匹配,继续匹配下一条 //log.Println("继续匹配") b2 = false break } res = append(res, restmp...) } } } if !b2 { continue } //走到这一步证明需要匹配的词正确个数满足要求,下面判断不需要匹配的词的情况 mismatchnum := rDFA.MisMatchNum if mismatchnum >= 1 { //有排除词,排除词不应该出现在匹配的文本中 b1, _ = rDFA.MisMatch.CheckSensitiveWord(text, mismatchnum) } else { b1 = false } if !b1 { //不要匹配的词满足情况,跳出 return true, res } else { continue } } } } return false, arr } // MergeLabelData 处理标记权重 func MergeLabelData(labelDatas []LabelData) map[string][]LabelData { result := make(map[string][]LabelData) for _, data := range labelDatas { // 检查是否已存在相同 Sfield 的数据 if existingDatas, ok := result[data.Sfield]; ok { merged := false for i, existingData := range existingDatas { // 如果 Name 和 Sfield 都相同,合并 Weight if existingData.Name == data.Name && existingData.Sfield == data.Sfield { existingDatas[i].Weight = round(existingData.Weight+data.Weight, 2) merged = true break } } // 如果未合并,添加新数据 if !merged { result[data.Sfield] = append(result[data.Sfield], data) } } else { result[data.Sfield] = []LabelData{data} } } return result } // 对浮点数进行四舍五入保留指定位数小数 func round(num float64, decimalPlaces int) float64 { var multiplier float64 = 1 for i := 0; i < decimalPlaces; i++ { multiplier *= 10 } return math.Round(num*multiplier) / multiplier }