file.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. package main
  2. import (
  3. "bufio"
  4. "fmt"
  5. "io"
  6. cu "jygit.jydev.jianyu360.cn/data_capture/myself_util/commonutil"
  7. "os"
  8. "regexp"
  9. "strconv"
  10. "strings"
  11. )
  12. var (
  13. CmmonDFA *DFA //常用字
  14. NotCommonDFA *DFA //不常用字
  15. TimesLimit int //常用字界限
  16. UpdateLimit float64 //更新界限
  17. OssSite map[string]float64 //解析附件站点集合
  18. HanReg = regexp.MustCompile("[\u4e00-\u9fa5]+") //中文正则
  19. SpaceReg = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+") //空格正则
  20. SpecialReg = regexp.MustCompile("图片(\\d)+") //
  21. SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)")
  22. )
  23. func InitFileInfo() {
  24. OssSite = map[string]float64{}
  25. TimesLimit = cu.IntAll(Config["timeslimit"])
  26. UpdateLimit = cu.Float64All(Config["updatelimit"])
  27. for site, b := range Config["osssite"].(map[string]interface{}) {
  28. OssSite[site] = cu.Float64All(b)
  29. }
  30. fmt.Println(TimesLimit, UpdateLimit, OssSite)
  31. CmmonDFA = &DFA{}
  32. NotCommonDFA = &DFA{}
  33. LoadDict("common.txt") //初始化常用字典
  34. }
  35. // DFA
  36. type DFA struct {
  37. Link map[string]interface{}
  38. }
  39. func (d *DFA) AddWord(keys ...string) {
  40. d.AddWordAll(true, keys...)
  41. }
  42. func (d *DFA) AddWordAll(haskey bool, keys ...string) {
  43. if d.Link == nil {
  44. d.Link = make(map[string]interface{})
  45. }
  46. for _, key := range keys {
  47. nowMap := &d.Link
  48. for i := 0; i < len(key); i++ {
  49. kc := key[i : i+1]
  50. if v, ok := (*nowMap)[kc]; ok {
  51. nowMap, _ = v.(*map[string]interface{})
  52. } else {
  53. newMap := map[string]interface{}{}
  54. newMap["YN"] = "0"
  55. (*nowMap)[kc] = &newMap
  56. nowMap = &newMap
  57. }
  58. if i == len(key)-1 {
  59. (*nowMap)["YN"] = "1"
  60. if haskey {
  61. (*nowMap)["K"] = key
  62. }
  63. }
  64. }
  65. }
  66. }
  67. func (d *DFA) CheckSensitiveWord(src string) []string {
  68. res := make([]string, 0)
  69. for j := 0; j < len(src); j++ {
  70. nowMap := &d.Link
  71. for i := j; i < len(src); i++ {
  72. word := src[i : i+1]
  73. nowMap, _ = (*nowMap)[word].(*map[string]interface{})
  74. if nowMap != nil { // 存在,则判断是否为最后一个
  75. if "1" == cu.ObjToString((*nowMap)["YN"]) {
  76. s := cu.ObjToString((*nowMap)["K"])
  77. res = append(res, s)
  78. }
  79. } else {
  80. break
  81. }
  82. }
  83. }
  84. return res
  85. }
  86. // 加载统计的常用词
  87. func LoadDict(path string) {
  88. dictFile, err := os.Open(path)
  89. if err != nil {
  90. fmt.Println("Load Common.txt Error")
  91. os.Exit(-1)
  92. }
  93. defer dictFile.Close()
  94. reader := bufio.NewReader(dictFile)
  95. var (
  96. text string
  97. frequency int
  98. )
  99. // 逐行读入分词
  100. line := 0
  101. for {
  102. line++
  103. size, fsErr := fmt.Fscanln(reader, &text, &frequency) //读每行赋值
  104. if fsErr == io.EOF { //读取到结尾
  105. break
  106. }
  107. if size == 2 { //正确数据
  108. if frequency >= TimesLimit { //常用字
  109. CmmonDFA.AddWord(text)
  110. } else { //非常用字
  111. //NotCommonDFA.AddWord(text)
  112. }
  113. } else {
  114. fmt.Println("Read Line Error:", line)
  115. }
  116. }
  117. }
  118. func AnalysisFile(replaceSite bool, limitRatio float64, tmp map[string]interface{}) (bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
  119. defer cu.Catch()
  120. filetext, byOcr := GetFileText(tmp) //解析附件
  121. if filetext == "" {
  122. return false, filetext
  123. }
  124. if !replaceSite { //不是指定站点解析的数据,若是ocr识别的不进行替换
  125. return !byOcr, filetext
  126. } else if replaceSite && !byOcr { //指定站点解析的数据,非ocr识别,认为附件可替换正文
  127. return true, filetext
  128. }
  129. //下面是指定站点附件识别后,按准确率判断是否替换detail
  130. //特殊情况:图片0 图片1
  131. filetextTmp := SpecialReg.ReplaceAllString(filetext, "")
  132. if filetextTmp == "" { //附件为空
  133. return false, filetext
  134. }
  135. //中文匹配
  136. HanArr := HanReg.FindAllString(filetextTmp, -1)
  137. hanText := strings.Join(HanArr, "")
  138. hanLen := len([]rune(hanText))
  139. //filetextTmp = sp.FilterDetail(filetextTmp) //只保留文本内容
  140. //filetextLen := len([]rune(filetextTmp))
  141. //长度过滤
  142. if hanLen <= 100 {
  143. return false, filetext
  144. }
  145. commonArr := CmmonDFA.CheckSensitiveWord(hanText)
  146. commonLen := len(commonArr)
  147. //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
  148. commonRatio := float64(commonLen) / float64(hanLen)
  149. commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
  150. if commonRatio >= limitRatio {
  151. return true, filetext
  152. }
  153. return false, filetext
  154. }
  155. // 解析附件
  156. func AnalysisFile_back(replaceSite bool, tmp map[string]interface{}) (bool, bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
  157. defer cu.Catch()
  158. filetext, _ := GetFileText(tmp) //解析附件
  159. //过滤空格
  160. filetextTmp := SpaceReg.ReplaceAllString(filetext, "")
  161. if filetextTmp == "" { //附件为空
  162. return false, false, filetext
  163. }
  164. //特殊情况:图片0 图片1
  165. filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
  166. if filetextTmp == "" { //附件为空
  167. return false, false, filetext
  168. }
  169. //中文匹配
  170. HanArr := HanReg.FindAllString(filetextTmp, -1)
  171. hanText := strings.Join(HanArr, "")
  172. hanTextLen := len([]rune(hanText))
  173. //长度过滤
  174. if hanTextLen <= 20 {
  175. return false, false, filetext
  176. } else if replaceSite && 20 < hanTextLen && hanTextLen <= 100 {
  177. return false, false, filetext
  178. }
  179. //fmt.Println(hanTextLen, hanText)
  180. commonArr := CmmonDFA.CheckSensitiveWord(hanText)
  181. commonLen := len(commonArr)
  182. //fmt.Println(commonLen, commonArr)
  183. //commonText := strings.Join(commonArr, "")
  184. //notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
  185. //notCommonLen := len(notCommonArr)
  186. //fmt.Println(notCommonLen, notCommonArr)
  187. //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
  188. commonRatio := float64(commonLen) / float64(hanTextLen)
  189. commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
  190. if commonRatio < 0.5 { //常用字占比低于x<50%
  191. return false, false, filetext
  192. } else if replaceSite {
  193. if commonRatio < UpdateLimit { //50%<x<UpdateLimit
  194. return false, true, filetext
  195. } else { //x>=UpdateLimit
  196. return true, true, filetext
  197. }
  198. }
  199. //fmt.Println(commonRatio)
  200. //notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
  201. //notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
  202. return false, true, filetext
  203. }
  204. // 测试方法
  205. func AnalysisFileTest(detail string) (bool, string, int, float64, float64) {
  206. //fmt.Println(detail)
  207. defer cu.Catch()
  208. //过滤空格
  209. filetextTmp := SpaceReg.ReplaceAllString(detail, "")
  210. if filetextTmp == "" { //附件为空
  211. return false, "", 0, 0, 0
  212. }
  213. //特殊情况:图片0 图片1
  214. filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
  215. if filetextTmp == "" { //附件为空
  216. return false, "", 1, 0, 0
  217. }
  218. //中文匹配
  219. HanArr := HanReg.FindAllString(filetextTmp, -1)
  220. hanText := strings.Join(HanArr, "")
  221. hanTextLen := len([]rune(hanText))
  222. //长度过滤
  223. if hanTextLen <= 100 {
  224. return false, "", 2, 0, 0
  225. }
  226. //fmt.Println(textLen, text)
  227. commonArr := CmmonDFA.CheckSensitiveWord(hanText)
  228. commonLen := len(commonArr)
  229. fmt.Println(commonLen, commonArr)
  230. //commonText := strings.Join(commonArr, "")
  231. notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
  232. notCommonLen := len(notCommonArr)
  233. fmt.Println(notCommonLen, notCommonArr)
  234. //notCommonText := strings.Join(notCommonArr, "")
  235. //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
  236. commonRatio := float64(commonLen) / float64(hanTextLen)
  237. commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
  238. notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
  239. notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
  240. return true, filetextTmp, 10, commonRatio, notCommonRatio
  241. }