file.go 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. package main
  2. import (
  3. "bufio"
  4. "fmt"
  5. "io"
  6. "log"
  7. "os"
  8. "regexp"
  9. "strconv"
  10. "strings"
  11. util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  12. )
  13. var (
  14. CmmonDFA *DFA //常用字
  15. NotCommonDFA *DFA //不常用字
  16. TimesLimit int //常用字界限
  17. CmmLmt, NcmmLmt float64 //更新界限
  18. HanReg = regexp.MustCompile("[\u4e00-\u9fa5]+") //中文正则
  19. SpaceReg = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+") //空格正则
  20. SpecialReg = regexp.MustCompile("图片(\\d)+") //
  21. )
  22. func InitFileInfo() {
  23. TimesLimit = 500
  24. CmmLmt = 0.5
  25. NcmmLmt = 0.1
  26. CmmonDFA = &DFA{}
  27. NotCommonDFA = &DFA{}
  28. LoadDict("CommonDict.txt") //初始化常用字典
  29. }
  30. // DFA
  31. type DFA struct {
  32. Link map[string]interface{}
  33. }
  34. func (d *DFA) AddWord(keys ...string) {
  35. d.AddWordAll(true, keys...)
  36. }
  37. func (d *DFA) AddWordAll(haskey bool, keys ...string) {
  38. if d.Link == nil {
  39. d.Link = make(map[string]interface{})
  40. }
  41. for _, key := range keys {
  42. nowMap := &d.Link
  43. for i := 0; i < len(key); i++ {
  44. kc := key[i : i+1]
  45. if v, ok := (*nowMap)[kc]; ok {
  46. nowMap, _ = v.(*map[string]interface{})
  47. } else {
  48. newMap := map[string]interface{}{}
  49. newMap["YN"] = "0"
  50. (*nowMap)[kc] = &newMap
  51. nowMap = &newMap
  52. }
  53. if i == len(key)-1 {
  54. (*nowMap)["YN"] = "1"
  55. if haskey {
  56. (*nowMap)["K"] = key
  57. }
  58. }
  59. }
  60. }
  61. }
  62. func (d *DFA) CheckSensitiveWord(src string) []string {
  63. res := make([]string, 0)
  64. for j := 0; j < len(src); j++ {
  65. nowMap := &d.Link
  66. for i := j; i < len(src); i++ {
  67. word := src[i : i+1]
  68. nowMap, _ = (*nowMap)[word].(*map[string]interface{})
  69. if nowMap != nil { // 存在,则判断是否为最后一个
  70. if "1" == util.ObjToString((*nowMap)["YN"]) {
  71. s := util.ObjToString((*nowMap)["K"])
  72. res = append(res, s)
  73. }
  74. } else {
  75. break
  76. }
  77. }
  78. }
  79. return res
  80. }
  81. // 加载统计的常用词
  82. func LoadDict(path string) {
  83. dictFile, err := os.Open(path)
  84. if err != nil {
  85. log.Println("Load Common.txt Error")
  86. os.Exit(-1)
  87. }
  88. defer dictFile.Close()
  89. reader := bufio.NewReader(dictFile)
  90. var (
  91. text string
  92. frequency int
  93. )
  94. // 逐行读入分词
  95. line := 0
  96. for {
  97. line++
  98. size, fsErr := fmt.Fscanln(reader, &text, &frequency) //读每行赋值
  99. if fsErr == io.EOF { //读取到结尾
  100. break
  101. }
  102. if size == 2 { //正确数据
  103. if frequency >= TimesLimit { //常用字
  104. CmmonDFA.AddWord(text)
  105. } else { //非常用字
  106. NotCommonDFA.AddWord(text)
  107. }
  108. } else {
  109. log.Println("Read Line Error: line ", line)
  110. }
  111. }
  112. }
  113. // 解析附件
  114. func AnalysisFile(filetext string) bool {
  115. defer util.Catch()
  116. //过滤空格
  117. filetextTmp := SpaceReg.ReplaceAllString(filetext, "")
  118. if filetextTmp == "" { //附件为空
  119. return false
  120. }
  121. //特殊情况:图片0 图片1
  122. filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
  123. if filetextTmp == "" { //附件为空
  124. return false
  125. }
  126. //中文匹配
  127. HanArr := HanReg.FindAllString(filetextTmp, -1)
  128. hanText := strings.Join(HanArr, "")
  129. hanTextLen := len([]rune(hanText))
  130. //长度过滤
  131. if hanTextLen <= 100 {
  132. return false
  133. }
  134. //qu.Debug(hanTextLen, hanText)
  135. commonArr := CmmonDFA.CheckSensitiveWord(hanText)
  136. commonLen := len(commonArr)
  137. //qu.Debug(commonLen, commonArr)
  138. //commonText := strings.Join(commonArr, "")
  139. notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
  140. notCommonLen := len(notCommonArr)
  141. //qu.Debug(notCommonLen, notCommonArr)
  142. //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
  143. commonRatio := float64(commonLen) / float64(hanTextLen)
  144. commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
  145. //qu.Debug(commonRatio)
  146. notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
  147. notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
  148. if commonRatio >= CmmLmt && notCommonRatio < NcmmLmt {
  149. return true
  150. }
  151. return false
  152. }
  153. // 测试方法
  154. func AnalysisFileTest(detail string) (bool, string, int, float64, float64) {
  155. //qu.Debug(detail)
  156. defer util.Catch()
  157. //过滤空格
  158. filetextTmp := SpaceReg.ReplaceAllString(detail, "")
  159. if filetextTmp == "" { //附件为空
  160. return false, "", 0, 0, 0
  161. }
  162. //特殊情况:图片0 图片1
  163. filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
  164. if filetextTmp == "" { //附件为空
  165. return false, "", 1, 0, 0
  166. }
  167. //中文匹配
  168. HanArr := HanReg.FindAllString(filetextTmp, -1)
  169. hanText := strings.Join(HanArr, "")
  170. hanTextLen := len([]rune(hanText))
  171. //长度过滤
  172. if hanTextLen <= 100 {
  173. return false, "", 2, 0, 0
  174. }
  175. //qu.Debug(textLen, text)
  176. commonArr := CmmonDFA.CheckSensitiveWord(hanText)
  177. commonLen := len(commonArr)
  178. //commonText := strings.Join(commonArr, "")
  179. notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
  180. notCommonLen := len(notCommonArr)
  181. //notCommonText := strings.Join(notCommonArr, "")
  182. //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
  183. commonRatio := float64(commonLen) / float64(hanTextLen)
  184. commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
  185. notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
  186. notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
  187. return true, filetextTmp, 10, commonRatio, notCommonRatio
  188. }