file.go 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. package main
  2. import (
  3. "bufio"
  4. "fmt"
  5. "go.uber.org/zap"
  6. "io"
  7. util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
  9. "os"
  10. "regexp"
  11. "strconv"
  12. "strings"
  13. )
  14. var (
  15. CmmonDFA *DFA //常用字
  16. NotCommonDFA *DFA //不常用字
  17. TimesLimit int //常用字界限
  18. CmmLmt, NcmmLmt float64 //更新界限
  19. HanReg = regexp.MustCompile("[\u4e00-\u9fa5]+") //中文正则
  20. SpaceReg = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+") //空格正则
  21. SpecialReg = regexp.MustCompile("图片(\\d)+") //
  22. )
  23. func InitFileInfo() {
  24. TimesLimit = 500
  25. CmmLmt = 0.5
  26. NcmmLmt = 0.1
  27. CmmonDFA = &DFA{}
  28. NotCommonDFA = &DFA{}
  29. LoadDict("CommonDict.txt") //初始化常用字典
  30. }
  31. //DFA
  32. type DFA struct {
  33. Link map[string]interface{}
  34. }
  35. func (d *DFA) AddWord(keys ...string) {
  36. d.AddWordAll(true, keys...)
  37. }
  38. func (d *DFA) AddWordAll(haskey bool, keys ...string) {
  39. if d.Link == nil {
  40. d.Link = make(map[string]interface{})
  41. }
  42. for _, key := range keys {
  43. nowMap := &d.Link
  44. for i := 0; i < len(key); i++ {
  45. kc := key[i : i+1]
  46. if v, ok := (*nowMap)[kc]; ok {
  47. nowMap, _ = v.(*map[string]interface{})
  48. } else {
  49. newMap := map[string]interface{}{}
  50. newMap["YN"] = "0"
  51. (*nowMap)[kc] = &newMap
  52. nowMap = &newMap
  53. }
  54. if i == len(key)-1 {
  55. (*nowMap)["YN"] = "1"
  56. if haskey {
  57. (*nowMap)["K"] = key
  58. }
  59. }
  60. }
  61. }
  62. }
  63. func (d *DFA) CheckSensitiveWord(src string) []string {
  64. res := make([]string, 0)
  65. for j := 0; j < len(src); j++ {
  66. nowMap := &d.Link
  67. for i := j; i < len(src); i++ {
  68. word := src[i : i+1]
  69. nowMap, _ = (*nowMap)[word].(*map[string]interface{})
  70. if nowMap != nil { // 存在,则判断是否为最后一个
  71. if "1" == util.ObjToString((*nowMap)["YN"]) {
  72. s := util.ObjToString((*nowMap)["K"])
  73. res = append(res, s)
  74. }
  75. } else {
  76. break
  77. }
  78. }
  79. }
  80. return res
  81. }
  82. //加载统计的常用词
  83. func LoadDict(path string) {
  84. dictFile, err := os.Open(path)
  85. if err != nil {
  86. log.Error("Load Common.txt Error")
  87. os.Exit(-1)
  88. }
  89. defer dictFile.Close()
  90. reader := bufio.NewReader(dictFile)
  91. var (
  92. text string
  93. frequency int
  94. )
  95. // 逐行读入分词
  96. line := 0
  97. for {
  98. line++
  99. size, fsErr := fmt.Fscanln(reader, &text, &frequency) //读每行赋值
  100. if fsErr == io.EOF { //读取到结尾
  101. break
  102. }
  103. if size == 2 { //正确数据
  104. if frequency >= TimesLimit { //常用字
  105. CmmonDFA.AddWord(text)
  106. } else { //非常用字
  107. NotCommonDFA.AddWord(text)
  108. }
  109. } else {
  110. log.Error("Read Line Error:", zap.Int("line", line))
  111. }
  112. }
  113. }
  114. //解析附件
  115. func AnalysisFile(filetext string) bool {
  116. defer util.Catch()
  117. //过滤空格
  118. filetextTmp := SpaceReg.ReplaceAllString(filetext, "")
  119. if filetextTmp == "" { //附件为空
  120. return false
  121. }
  122. //特殊情况:图片0 图片1
  123. filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
  124. if filetextTmp == "" { //附件为空
  125. return false
  126. }
  127. //中文匹配
  128. HanArr := HanReg.FindAllString(filetextTmp, -1)
  129. hanText := strings.Join(HanArr, "")
  130. hanTextLen := len([]rune(hanText))
  131. //长度过滤
  132. if hanTextLen <= 100 {
  133. return false
  134. }
  135. //qu.Debug(hanTextLen, hanText)
  136. commonArr := CmmonDFA.CheckSensitiveWord(hanText)
  137. commonLen := len(commonArr)
  138. //qu.Debug(commonLen, commonArr)
  139. //commonText := strings.Join(commonArr, "")
  140. notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
  141. notCommonLen := len(notCommonArr)
  142. //qu.Debug(notCommonLen, notCommonArr)
  143. //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
  144. commonRatio := float64(commonLen) / float64(hanTextLen)
  145. commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
  146. //qu.Debug(commonRatio)
  147. notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
  148. notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
  149. if commonRatio >= CmmLmt && notCommonRatio < NcmmLmt {
  150. return true
  151. }
  152. return false
  153. }
  154. //测试方法
  155. func AnalysisFileTest(detail string) (bool, string, int, float64, float64) {
  156. //qu.Debug(detail)
  157. defer util.Catch()
  158. //过滤空格
  159. filetextTmp := SpaceReg.ReplaceAllString(detail, "")
  160. if filetextTmp == "" { //附件为空
  161. return false, "", 0, 0, 0
  162. }
  163. //特殊情况:图片0 图片1
  164. filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
  165. if filetextTmp == "" { //附件为空
  166. return false, "", 1, 0, 0
  167. }
  168. //中文匹配
  169. HanArr := HanReg.FindAllString(filetextTmp, -1)
  170. hanText := strings.Join(HanArr, "")
  171. hanTextLen := len([]rune(hanText))
  172. //长度过滤
  173. if hanTextLen <= 100 {
  174. return false, "", 2, 0, 0
  175. }
  176. //qu.Debug(textLen, text)
  177. commonArr := CmmonDFA.CheckSensitiveWord(hanText)
  178. commonLen := len(commonArr)
  179. //commonText := strings.Join(commonArr, "")
  180. notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
  181. notCommonLen := len(notCommonArr)
  182. //notCommonText := strings.Join(notCommonArr, "")
  183. //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
  184. commonRatio := float64(commonLen) / float64(hanTextLen)
  185. commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
  186. notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
  187. notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
  188. return true, filetextTmp, 10, commonRatio, notCommonRatio
  189. }