123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325 |
- package main
- import (
- util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "log"
- "math"
- "regexp"
- "strconv"
- "strings"
- )
// REG extracts the individual match groups from the part of a keyword rule that
// precedes "^" (see DealRules). It is declared without a value here, and
// DealRules calls a method on it, so it has to be assigned before DealRules
// processes any non-regex rule.
// NOTE(review): the assignment is not in this part of the file — confirm where it happens.
var REG *regexp.Regexp
- type RuleDFA struct {
- Match []DFA //包含的敏感词
- MatchNum []int //包含敏感词匹配个数
- MisMatch DFA //不包含的敏感词
- MisMatchNum int //不包含敏感词匹配个数
- }
// DFA is a keyword trie used for multi-keyword matching.
type DFA struct {
	// Link is the root node of the trie. AddWordAll fills it in, descending
	// one level per byte of each keyword.
	Link map[string]interface{}
}
- // DealRules 处理识别规则
- func DealRules(rules []string) (i_rule []interface{}) {
- for _, r := range rules {
- if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
- rs := []rune(r)
- ru := string(rs[1 : len(rs)-1])
- rureg, err := regexp.Compile(ru)
- if err != nil {
- log.Println("error---rule:", r)
- continue
- }
- i_rule = append(i_rule, []interface{}{rureg}...)
- } else { //规则,加入到敏感词匹配
- matchnum := 0
- mismatchnum := 0
- isnum1 := false
- isnum2 := false
- numArr := make([]int, 0)
- ruleDFA := &RuleDFA{
- Match: []DFA{},
- MisMatch: DFA{},
- }
- tmpArr := strings.Split(r, "^")
- matchTmp := tmpArr[0]
- ruleTextArr := REG.FindAllString(matchTmp, -1)
- for _, match := range ruleTextArr {
- matchnum, isnum1 = GetNum(match)
- numArr = append(numArr, matchnum)
- matchArr := GetRule(match, isnum1)
- tmpDFA := DFA{
- Link: make(map[string]interface{}),
- }
- tmpDFA.AddWord(matchArr...)
- ruleDFA.Match = append(ruleDFA.Match, tmpDFA)
- }
- if len(tmpArr) == 2 {
- mismatch := tmpArr[1]
- mismatchnum, isnum2 = GetNum(mismatch)
- mismatchArr := GetRule(mismatch, isnum2)
- ruleDFA.MisMatch.AddWord(mismatchArr...)
- }
- ruleDFA.MatchNum = numArr
- ruleDFA.MisMatchNum = mismatchnum
- i_rule = append(i_rule, []interface{}{ruleDFA}...)
- }
- }
- return
- }
- func (d *DFA) AddWord(keys ...string) {
- d.AddWordAll(true, keys...)
- }
- func (d *DFA) AddWordAll(haskey bool, keys ...string) {
- if d.Link == nil {
- d.Link = make(map[string]interface{})
- }
- for _, key := range keys {
- nowMap := &d.Link
- for i := 0; i < len(key); i++ {
- kc := key[i : i+1]
- if v, ok := (*nowMap)[kc]; ok {
- nowMap, _ = v.(*map[string]interface{})
- } else {
- newMap := map[string]interface{}{}
- newMap["YN"] = "0"
- (*nowMap)[kc] = &newMap
- nowMap = &newMap
- }
- if i == len(key)-1 {
- (*nowMap)["YN"] = "1"
- if haskey {
- (*nowMap)["K"] = key
- }
- }
- }
- }
- }
- func (d *DFA) CheckSensitiveWord(src string, n int) (bool, []string) {
- res := make([]string, 0)
- tmpMap := make(map[string]int)
- for j := 0; j < len(src); j++ {
- nowMap := &d.Link
- for i := j; i < len(src); i++ {
- word := src[i : i+1]
- nowMap, _ = (*nowMap)[word].(*map[string]interface{})
- if nowMap != nil { // 存在,则判断是否为最后一个
- if "1" == util.ObjToString((*nowMap)["YN"]) {
- s := util.ObjToString((*nowMap)["K"])
- tmpMap[s] = 1
- //nowMap = &d.Link //匹配到之后继续匹配后边的内容
- }
- } else {
- //nowMap = &d.Link
- break
- }
- }
- }
- if len(tmpMap) >= n {
- for k, _ := range tmpMap {
- res = append(res, k)
- }
- return true, res
- }
- return false, []string{}
- }
// ObjArrToStringArr copies the string elements of old into a new []string,
// keeping their order. Elements whose dynamic type is not string (including
// nil) are skipped. A nil input returns nil. Any non-nil input returns a
// non-nil slice, which may be empty.
func ObjArrToStringArr(old []interface{}) []string {
	if old == nil {
		return nil
	}
	strs := make([]string, 0, len(old))
	for _, v := range old {
		// The comma-ok assertion never panics, so no recover is needed.
		if s, ok := v.(string); ok {
			strs = append(strs, s)
		}
	}
	return strs
}
// GetRule extracts the "|"-separated keywords from a parenthesised rule group.
//
// isnum is the flag returned by GetNum. Despite its name, it is true when the
// group ends with ")" and carries no count suffix, as in "(a|b)". It is false
// when something follows the closing parenthesis, as in "(a|b)2".
//
// Text that does not start with "(", or that has no matching ")", returns nil.
// In the suffixed form the keywords end at the last ")", whatever the length
// of the suffix, so short or malformed input such as "(" or "(5" no longer
// causes an out-of-range slice.
func GetRule(text string, isnum bool) (matchArr []string) {
	if !strings.HasPrefix(text, "(") {
		return nil
	}
	end := -1
	if isnum {
		if strings.HasSuffix(text, ")") {
			end = len(text) - 1
		}
	} else {
		end = strings.LastIndex(text, ")")
	}
	if end < 1 { // no closing parenthesis after the opening one
		return nil
	}
	return strings.Split(text[1:end], "|")
}
// GetNum reads the required-hit count attached to a rule group.
//
// When rule ends with ")" there is no explicit count: the result is (1, true).
// Otherwise the last character of rule is parsed as a decimal digit and the
// result is (digit, false). A last character that is not a digit gives 0, and
// so does an empty rule, which previously caused an out-of-range panic.
//
// The boolean is therefore true when NO count suffix is present. GetRule
// expects exactly this flag.
// NOTE(review): only the final character is inspected, so counts above 9 are
// not representable.
func GetNum(rule string) (int, bool) {
	if strings.HasSuffix(rule, ")") {
		return 1, true
	}
	s := []rune(rule)
	if len(s) == 0 {
		return 0, false
	}
	// A non-digit simply yields 0; the parse error itself carries no extra
	// information for callers.
	num, _ := strconv.Atoi(string(s[len(s)-1]))
	return num, false
}
- func IntAll(num interface{}) int {
- return IntAllDef(num, 0)
- }
// IntAllDef converts num to an int.
//
// Supported dynamic types are int, int8, int16, int32, int64, float32, float64
// (converted with Go's float-to-int conversion) and string (parsed as a
// decimal integer). Any other type, including nil, returns defaultNum.
// A string that fails to parse returns 0, not defaultNum.
func IntAllDef(num interface{}, defaultNum int) int {
	switch v := num.(type) {
	case int:
		return v
	case int32:
		return int(v)
	case float64:
		return int(v)
	case int64:
		return int(v)
	case float32:
		return int(v)
	case string:
		parsed, _ := strconv.Atoi(v) // on error parsed is 0, which is what gets returned
		return parsed
	case int16:
		return int(v)
	case int8:
		return int(v)
	default:
		return defaultNum
	}
}
- // TagDFAAnalyRules 单独的标签识别规则
- func TagDFAAnalyRules(text string, rules []interface{}) (res []string) {
- defer util.Catch()
- for _, r := range rules {
- rDFA, b := r.(*RuleDFA)
- //util.Debug(j, "规则===", b, rDFA.Match, rDFA.MatchNum, rDFA.MisMatch, rDFA.MisMatchNum)
- if b { //规则DFA
- //util.Debug("res========", res, len(rDFA.MatchNum) == len(rDFA.Match), len(rDFA.MatchNum))
- if len(rDFA.MatchNum) == len(rDFA.Match) {
- for i, matchnum := range rDFA.MatchNum {
- if matchnum >= 1 {
- btmp, restmp := rDFA.Match[i].CheckSensitiveWord(text, matchnum)
- if !btmp { //逗号隔开的每条规则不匹配,继续匹配下一条
- //log.Println("继续匹配")
- break
- }
- res = append(res, restmp...)
- }
- }
- }
- }
- }
- return
- }
- // DFAAnalyRules DFA识别规则
- func DFAAnalyRules(text string, rules []interface{}) (bool, []string) {
- var arr []string
- //log.Println("len===", len(rules))
- for _, r := range rules {
- //log.Println("i--------------", i)
- ruleReg, ok := r.(*regexp.Regexp)
- if ok { //正则
- //log.Println("正则===", ruleReg)
- textArr := ruleReg.FindAllString(text, -1)
- if len(textArr) > 0 {
- regStr := []string{ruleReg.String()}
- return true, regStr
- }
- } else {
- rDFA, b := r.(*RuleDFA)
- //log.Println(j, "规则===", b, rDFA.Match, rDFA.MatchNum, rDFA.MisMatch, rDFA.MisMatchNum)
- if b { //规则DFA
- //b1, b2 := false, false
- b1, b2 := false, true
- var res []string
- //log.Println("res========", res, len(rDFA.MatchNum) == len(rDFA.Match), len(rDFA.MatchNum))
- if len(rDFA.MatchNum) == len(rDFA.Match) {
- for i, matchnum := range rDFA.MatchNum {
- if matchnum >= 1 {
- btmp, restmp := rDFA.Match[i].CheckSensitiveWord(text, matchnum)
- //log.Println("btmp====", btmp, restmp)
- if !btmp { //逗号隔开的每条规则不匹配,继续匹配下一条
- //log.Println("继续匹配")
- b2 = false
- break
- }
- res = append(res, restmp...)
- }
- }
- }
- if !b2 {
- continue
- }
- //走到这一步证明需要匹配的词正确个数满足要求,下面判断不需要匹配的词的情况
- mismatchnum := rDFA.MisMatchNum
- if mismatchnum >= 1 { //有排除词,排除词不应该出现在匹配的文本中
- b1, _ = rDFA.MisMatch.CheckSensitiveWord(text, mismatchnum)
- } else {
- b1 = false
- }
- if !b1 { //不要匹配的词满足情况,跳出
- return true, res
- } else {
- continue
- }
- }
- }
- }
- return false, arr
- }
- // MergeLabelData 处理标记权重
- func MergeLabelData(labelDatas []LabelData) map[string][]LabelData {
- result := make(map[string][]LabelData)
- for _, data := range labelDatas {
- // 检查是否已存在相同 Sfield 的数据
- if existingDatas, ok := result[data.Sfield]; ok {
- merged := false
- for i, existingData := range existingDatas {
- // 如果 Name 和 Sfield 都相同,合并 Weight
- if existingData.Name == data.Name && existingData.Sfield == data.Sfield {
- existingDatas[i].Weight = round(existingData.Weight+data.Weight, 2)
- merged = true
- break
- }
- }
- // 如果未合并,添加新数据
- if !merged {
- result[data.Sfield] = append(result[data.Sfield], data)
- }
- } else {
- result[data.Sfield] = []LabelData{data}
- }
- }
- return result
- }
// round rounds num to decimalPlaces digits after the decimal point, with
// halves rounded away from zero (math.Round). A decimalPlaces of zero or less
// rounds to a whole number.
func round(num float64, decimalPlaces int) float64 {
	scale := 1.0
	for remaining := decimalPlaces; remaining > 0; remaining-- {
		scale *= 10
	}
	return math.Round(num*scale) / scale
}
|