tools.go 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. package main
  2. import (
  3. util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  4. "log"
  5. "math"
  6. "regexp"
  7. "strconv"
  8. "strings"
  9. )
// REG is the pattern DealRules applies (via FindAllString) to the part of a
// rule in front of "^" in order to cut it into match groups.
// It is declared without an initializer here and DealRules dereferences it, so
// it must be assigned before DealRules runs.
// NOTE(review): presumably set during start-up in another file — confirm.
var REG *regexp.Regexp
// RuleDFA is one parsed keyword rule: a list of match groups plus a single
// exclusion group. Match and MatchNum are parallel slices (DealRules appends
// to both for every group).
type RuleDFA struct {
	Match       []DFA // tries holding the words that should be present, one trie per match group
	MatchNum    []int // per group: the count handed to CheckSensitiveWord as the required number of distinct words
	MisMatch    DFA   // trie holding the words that should not be present
	MisMatchNum int   // count handed to CheckSensitiveWord for the exclusion words
}
// DFA is a keyword trie stored as nested maps. AddWordAll fills it and
// CheckSensitiveWord walks it; every node holds its child nodes together with
// the bookkeeping entries written by AddWordAll.
type DFA struct {
	Link map[string]interface{} // root node; AddWordAll allocates it when it is nil
}
  20. // DealRules 处理识别规则
  21. func DealRules(rules []string) (i_rule []interface{}) {
  22. for _, r := range rules {
  23. if strings.HasPrefix(r, "'") && strings.HasSuffix(r, "'") { //正则
  24. rs := []rune(r)
  25. ru := string(rs[1 : len(rs)-1])
  26. rureg, err := regexp.Compile(ru)
  27. if err != nil {
  28. log.Println("error---rule:", r)
  29. continue
  30. }
  31. i_rule = append(i_rule, []interface{}{rureg}...)
  32. } else { //规则,加入到敏感词匹配
  33. matchnum := 0
  34. mismatchnum := 0
  35. isnum1 := false
  36. isnum2 := false
  37. numArr := make([]int, 0)
  38. ruleDFA := &RuleDFA{
  39. Match: []DFA{},
  40. MisMatch: DFA{},
  41. }
  42. tmpArr := strings.Split(r, "^")
  43. matchTmp := tmpArr[0]
  44. ruleTextArr := REG.FindAllString(matchTmp, -1)
  45. for _, match := range ruleTextArr {
  46. matchnum, isnum1 = GetNum(match)
  47. numArr = append(numArr, matchnum)
  48. matchArr := GetRule(match, isnum1)
  49. tmpDFA := DFA{
  50. Link: make(map[string]interface{}),
  51. }
  52. tmpDFA.AddWord(matchArr...)
  53. ruleDFA.Match = append(ruleDFA.Match, tmpDFA)
  54. }
  55. if len(tmpArr) == 2 {
  56. mismatch := tmpArr[1]
  57. mismatchnum, isnum2 = GetNum(mismatch)
  58. mismatchArr := GetRule(mismatch, isnum2)
  59. ruleDFA.MisMatch.AddWord(mismatchArr...)
  60. }
  61. ruleDFA.MatchNum = numArr
  62. ruleDFA.MisMatchNum = mismatchnum
  63. i_rule = append(i_rule, []interface{}{ruleDFA}...)
  64. }
  65. }
  66. return
  67. }
  68. func (d *DFA) AddWord(keys ...string) {
  69. d.AddWordAll(true, keys...)
  70. }
  71. func (d *DFA) AddWordAll(haskey bool, keys ...string) {
  72. if d.Link == nil {
  73. d.Link = make(map[string]interface{})
  74. }
  75. for _, key := range keys {
  76. nowMap := &d.Link
  77. for i := 0; i < len(key); i++ {
  78. kc := key[i : i+1]
  79. if v, ok := (*nowMap)[kc]; ok {
  80. nowMap, _ = v.(*map[string]interface{})
  81. } else {
  82. newMap := map[string]interface{}{}
  83. newMap["YN"] = "0"
  84. (*nowMap)[kc] = &newMap
  85. nowMap = &newMap
  86. }
  87. if i == len(key)-1 {
  88. (*nowMap)["YN"] = "1"
  89. if haskey {
  90. (*nowMap)["K"] = key
  91. }
  92. }
  93. }
  94. }
  95. }
  96. func (d *DFA) CheckSensitiveWord(src string, n int) (bool, []string) {
  97. res := make([]string, 0)
  98. tmpMap := make(map[string]int)
  99. for j := 0; j < len(src); j++ {
  100. nowMap := &d.Link
  101. for i := j; i < len(src); i++ {
  102. word := src[i : i+1]
  103. nowMap, _ = (*nowMap)[word].(*map[string]interface{})
  104. if nowMap != nil { // 存在,则判断是否为最后一个
  105. if "1" == util.ObjToString((*nowMap)["YN"]) {
  106. s := util.ObjToString((*nowMap)["K"])
  107. tmpMap[s] = 1
  108. //nowMap = &d.Link //匹配到之后继续匹配后边的内容
  109. }
  110. } else {
  111. //nowMap = &d.Link
  112. break
  113. }
  114. }
  115. }
  116. if len(tmpMap) >= n {
  117. for k, _ := range tmpMap {
  118. res = append(res, k)
  119. }
  120. return true, res
  121. }
  122. return false, []string{}
  123. }
  124. // ObjArrToStringArr interface 数组转string 数组
  125. func ObjArrToStringArr(old []interface{}) []string {
  126. defer func() {
  127. if r := recover(); r != nil {
  128. // 在此处添加错误处理逻辑,例如记录错误日志
  129. }
  130. }()
  131. if old != nil {
  132. new := make([]string, 0)
  133. for _, v := range old {
  134. if strValue, ok := v.(string); ok {
  135. new = append(new, strValue)
  136. } else {
  137. // 在此处添加对非字符串类型值的处理逻辑,例如记录错误日志
  138. }
  139. }
  140. return new
  141. } else {
  142. return nil
  143. }
  144. }
  145. // GetRule 获取规则
  146. func GetRule(text string, isnum bool) (matchArr []string) {
  147. if isnum { //最后一个不是数字
  148. if strings.HasPrefix(text, "(") && strings.HasSuffix(text, ")") {
  149. text = text[1 : len(text)-1]
  150. matchArr = strings.Split(text, "|")
  151. }
  152. } else if strings.HasPrefix(text, "(") && !isnum {
  153. text = text[1 : len(text)-2]
  154. matchArr = strings.Split(text, "|")
  155. }
  156. return matchArr
  157. }
  158. // GetNum 获取匹配或不匹配的个数
  159. func GetNum(rule string) (int, bool) {
  160. num := 1
  161. isnum := strings.HasSuffix(rule, ")")
  162. if !isnum { //是数字
  163. s := []rune(rule)
  164. last := string(s[len(s)-1:])
  165. num = IntAll(last)
  166. }
  167. return num, isnum
  168. }
  169. func IntAll(num interface{}) int {
  170. return IntAllDef(num, 0)
  171. }
  172. func IntAllDef(num interface{}, defaultNum int) int {
  173. if i, ok := num.(int); ok {
  174. return int(i)
  175. } else if i0, ok0 := num.(int32); ok0 {
  176. return int(i0)
  177. } else if i1, ok1 := num.(float64); ok1 {
  178. return int(i1)
  179. } else if i2, ok2 := num.(int64); ok2 {
  180. return int(i2)
  181. } else if i3, ok3 := num.(float32); ok3 {
  182. return int(i3)
  183. } else if i4, ok4 := num.(string); ok4 {
  184. in, _ := strconv.Atoi(i4)
  185. return int(in)
  186. } else if i5, ok5 := num.(int16); ok5 {
  187. return int(i5)
  188. } else if i6, ok6 := num.(int8); ok6 {
  189. return int(i6)
  190. } else {
  191. return defaultNum
  192. }
  193. }
  194. // TagDFAAnalyRules 单独的标签识别规则
  195. func TagDFAAnalyRules(text string, rules []interface{}) (res []string) {
  196. defer util.Catch()
  197. for _, r := range rules {
  198. rDFA, b := r.(*RuleDFA)
  199. //util.Debug(j, "规则===", b, rDFA.Match, rDFA.MatchNum, rDFA.MisMatch, rDFA.MisMatchNum)
  200. if b { //规则DFA
  201. //util.Debug("res========", res, len(rDFA.MatchNum) == len(rDFA.Match), len(rDFA.MatchNum))
  202. if len(rDFA.MatchNum) == len(rDFA.Match) {
  203. for i, matchnum := range rDFA.MatchNum {
  204. if matchnum >= 1 {
  205. btmp, restmp := rDFA.Match[i].CheckSensitiveWord(text, matchnum)
  206. if !btmp { //逗号隔开的每条规则不匹配,继续匹配下一条
  207. //log.Println("继续匹配")
  208. break
  209. }
  210. res = append(res, restmp...)
  211. }
  212. }
  213. }
  214. }
  215. }
  216. return
  217. }
  218. // DFAAnalyRules DFA识别规则
  219. func DFAAnalyRules(text string, rules []interface{}) (bool, []string) {
  220. var arr []string
  221. //log.Println("len===", len(rules))
  222. for _, r := range rules {
  223. //log.Println("i--------------", i)
  224. ruleReg, ok := r.(*regexp.Regexp)
  225. if ok { //正则
  226. //log.Println("正则===", ruleReg)
  227. textArr := ruleReg.FindAllString(text, -1)
  228. if len(textArr) > 0 {
  229. regStr := []string{ruleReg.String()}
  230. return true, regStr
  231. }
  232. } else {
  233. rDFA, b := r.(*RuleDFA)
  234. //log.Println(j, "规则===", b, rDFA.Match, rDFA.MatchNum, rDFA.MisMatch, rDFA.MisMatchNum)
  235. if b { //规则DFA
  236. //b1, b2 := false, false
  237. b1, b2 := false, true
  238. var res []string
  239. //log.Println("res========", res, len(rDFA.MatchNum) == len(rDFA.Match), len(rDFA.MatchNum))
  240. if len(rDFA.MatchNum) == len(rDFA.Match) {
  241. for i, matchnum := range rDFA.MatchNum {
  242. if matchnum >= 1 {
  243. btmp, restmp := rDFA.Match[i].CheckSensitiveWord(text, matchnum)
  244. //log.Println("btmp====", btmp, restmp)
  245. if !btmp { //逗号隔开的每条规则不匹配,继续匹配下一条
  246. //log.Println("继续匹配")
  247. b2 = false
  248. break
  249. }
  250. res = append(res, restmp...)
  251. }
  252. }
  253. }
  254. if !b2 {
  255. continue
  256. }
  257. //走到这一步证明需要匹配的词正确个数满足要求,下面判断不需要匹配的词的情况
  258. mismatchnum := rDFA.MisMatchNum
  259. if mismatchnum >= 1 { //有排除词,排除词不应该出现在匹配的文本中
  260. b1, _ = rDFA.MisMatch.CheckSensitiveWord(text, mismatchnum)
  261. } else {
  262. b1 = false
  263. }
  264. if !b1 { //不要匹配的词满足情况,跳出
  265. return true, res
  266. } else {
  267. continue
  268. }
  269. }
  270. }
  271. }
  272. return false, arr
  273. }
  274. // MergeLabelData 处理标记权重
  275. func MergeLabelData(labelDatas []LabelData) map[string][]LabelData {
  276. result := make(map[string][]LabelData)
  277. for _, data := range labelDatas {
  278. // 检查是否已存在相同 Sfield 的数据
  279. if existingDatas, ok := result[data.Sfield]; ok {
  280. merged := false
  281. for i, existingData := range existingDatas {
  282. // 如果 Name 和 Sfield 都相同,合并 Weight
  283. if existingData.Name == data.Name && existingData.Sfield == data.Sfield {
  284. existingDatas[i].Weight = round(existingData.Weight+data.Weight, 2)
  285. merged = true
  286. break
  287. }
  288. }
  289. // 如果未合并,添加新数据
  290. if !merged {
  291. result[data.Sfield] = append(result[data.Sfield], data)
  292. }
  293. } else {
  294. result[data.Sfield] = []LabelData{data}
  295. }
  296. }
  297. return result
  298. }
  299. // 对浮点数进行四舍五入保留指定位数小数
  300. func round(num float64, decimalPlaces int) float64 {
  301. var multiplier float64 = 1
  302. for i := 0; i < decimalPlaces; i++ {
  303. multiplier *= 10
  304. }
  305. return math.Round(num*multiplier) / multiplier
  306. }