file.go 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. package main
  2. import (
  3. "bufio"
  4. "fmt"
  5. "io"
  6. cu "jygit.jydev.jianyu360.cn/data_capture/myself_util/commonutil"
  7. su "jygit.jydev.jianyu360.cn/data_capture/myself_util/spiderutil"
  8. "os"
  9. "regexp"
  10. "strconv"
  11. "strings"
  12. )
  13. var (
  14. CmmonDFA *DFA //常用字
  15. NotCommonDFA *DFA //不常用字
  16. TimesLimit int //常用字界限
  17. UpdateLimit float64 //更新界限
  18. OssSite map[string]float64 //解析附件站点集合
  19. HanReg = regexp.MustCompile("[\u4e00-\u9fa5]+") //中文正则
  20. SpaceReg = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+") //空格正则
  21. SpecialReg = regexp.MustCompile("图片(\\d)+") //
  22. SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)")
  23. )
  24. func DealFile(tmp map[string]interface{}) {
  25. site := cu.ObjToString(tmp["site"]) //解析附件站点
  26. if limitRatio := OssSite[site]; limitRatio > 0 { //配置站点解析附件,根据准确率情况替换正文
  27. replace, filetext := AnalysisFile(true, limitRatio, tmp)
  28. if replace { //替换正文
  29. tmp["detail"] = filetext
  30. }
  31. } else { //其它网站附件信息,detail无效,只有一个附件且不是ocr识别的,替换正文
  32. //判断detail是否有效
  33. detail := cu.ObjToString(tmp["detail"])
  34. detail = su.FilterDetail(detail) //只保留文本内容
  35. if len([]rune(detail)) <= 5 || (len([]rune(detail)) <= 50 && SpecialTextReg.MatchString(detail)) {
  36. replace, filetext := AnalysisFile(false, 0, tmp)
  37. if replace { //替换正文
  38. tmp["detail"] = filetext
  39. }
  40. }
  41. }
  42. }
  43. func InitFileInfo() {
  44. OssSite = map[string]float64{}
  45. TimesLimit = cu.IntAll(Config["timeslimit"])
  46. UpdateLimit = cu.Float64All(Config["updatelimit"])
  47. for site, b := range Config["osssite"].(map[string]interface{}) {
  48. OssSite[site] = cu.Float64All(b)
  49. }
  50. fmt.Println(TimesLimit, UpdateLimit, OssSite)
  51. CmmonDFA = &DFA{}
  52. NotCommonDFA = &DFA{}
  53. LoadDict("common.txt") //初始化常用字典
  54. }
  55. // DFA
  56. type DFA struct {
  57. Link map[string]interface{}
  58. }
  59. func (d *DFA) AddWord(keys ...string) {
  60. d.AddWordAll(true, keys...)
  61. }
  62. func (d *DFA) AddWordAll(haskey bool, keys ...string) {
  63. if d.Link == nil {
  64. d.Link = make(map[string]interface{})
  65. }
  66. for _, key := range keys {
  67. nowMap := &d.Link
  68. for i := 0; i < len(key); i++ {
  69. kc := key[i : i+1]
  70. if v, ok := (*nowMap)[kc]; ok {
  71. nowMap, _ = v.(*map[string]interface{})
  72. } else {
  73. newMap := map[string]interface{}{}
  74. newMap["YN"] = "0"
  75. (*nowMap)[kc] = &newMap
  76. nowMap = &newMap
  77. }
  78. if i == len(key)-1 {
  79. (*nowMap)["YN"] = "1"
  80. if haskey {
  81. (*nowMap)["K"] = key
  82. }
  83. }
  84. }
  85. }
  86. }
  87. func (d *DFA) CheckSensitiveWord(src string) []string {
  88. res := make([]string, 0)
  89. for j := 0; j < len(src); j++ {
  90. nowMap := &d.Link
  91. for i := j; i < len(src); i++ {
  92. word := src[i : i+1]
  93. nowMap, _ = (*nowMap)[word].(*map[string]interface{})
  94. if nowMap != nil { // 存在,则判断是否为最后一个
  95. if "1" == cu.ObjToString((*nowMap)["YN"]) {
  96. s := cu.ObjToString((*nowMap)["K"])
  97. res = append(res, s)
  98. }
  99. } else {
  100. break
  101. }
  102. }
  103. }
  104. return res
  105. }
  106. // 加载统计的常用词
  107. func LoadDict(path string) {
  108. dictFile, err := os.Open(path)
  109. if err != nil {
  110. fmt.Println("Load Common.txt Error")
  111. os.Exit(-1)
  112. }
  113. defer dictFile.Close()
  114. reader := bufio.NewReader(dictFile)
  115. var (
  116. text string
  117. frequency int
  118. )
  119. // 逐行读入分词
  120. line := 0
  121. for {
  122. line++
  123. size, fsErr := fmt.Fscanln(reader, &text, &frequency) //读每行赋值
  124. if fsErr == io.EOF { //读取到结尾
  125. break
  126. }
  127. if size == 2 { //正确数据
  128. if frequency >= TimesLimit { //常用字
  129. CmmonDFA.AddWord(text)
  130. } else { //非常用字
  131. //NotCommonDFA.AddWord(text)
  132. }
  133. } else {
  134. fmt.Println("Read Line Error:", line)
  135. }
  136. }
  137. }
  138. func AnalysisFile(replaceSite bool, limitRatio float64, tmp map[string]interface{}) (bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
  139. defer cu.Catch()
  140. filetext, byOcr := GetFileText(tmp) //解析附件
  141. if filetext == "" {
  142. return false, filetext
  143. }
  144. if !replaceSite { //不是指定站点解析的数据,若是ocr识别的不进行替换
  145. return !byOcr, filetext
  146. } else if replaceSite && !byOcr { //指定站点解析的数据,非ocr识别,认为附件可替换正文
  147. return true, filetext
  148. }
  149. //下面是指定站点附件识别后,按准确率判断是否替换detail
  150. //特殊情况:图片0 图片1
  151. filetextTmp := SpecialReg.ReplaceAllString(filetext, "")
  152. if filetextTmp == "" { //附件为空
  153. return false, filetext
  154. }
  155. //中文匹配
  156. HanArr := HanReg.FindAllString(filetextTmp, -1)
  157. hanText := strings.Join(HanArr, "")
  158. hanLen := len([]rune(hanText))
  159. //filetextTmp = sp.FilterDetail(filetextTmp) //只保留文本内容
  160. //filetextLen := len([]rune(filetextTmp))
  161. //长度过滤
  162. if hanLen <= 100 {
  163. return false, filetext
  164. }
  165. commonArr := CmmonDFA.CheckSensitiveWord(hanText)
  166. commonLen := len(commonArr)
  167. //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
  168. commonRatio := float64(commonLen) / float64(hanLen)
  169. commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
  170. if commonRatio >= limitRatio {
  171. return true, filetext
  172. }
  173. return false, filetext
  174. }
  175. // 解析附件
  176. func AnalysisFile_back(replaceSite bool, tmp map[string]interface{}) (bool, bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
  177. defer cu.Catch()
  178. filetext, _ := GetFileText(tmp) //解析附件
  179. //过滤空格
  180. filetextTmp := SpaceReg.ReplaceAllString(filetext, "")
  181. if filetextTmp == "" { //附件为空
  182. return false, false, filetext
  183. }
  184. //特殊情况:图片0 图片1
  185. filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
  186. if filetextTmp == "" { //附件为空
  187. return false, false, filetext
  188. }
  189. //中文匹配
  190. HanArr := HanReg.FindAllString(filetextTmp, -1)
  191. hanText := strings.Join(HanArr, "")
  192. hanTextLen := len([]rune(hanText))
  193. //长度过滤
  194. if hanTextLen <= 20 {
  195. return false, false, filetext
  196. } else if replaceSite && 20 < hanTextLen && hanTextLen <= 100 {
  197. return false, false, filetext
  198. }
  199. //fmt.Println(hanTextLen, hanText)
  200. commonArr := CmmonDFA.CheckSensitiveWord(hanText)
  201. commonLen := len(commonArr)
  202. //fmt.Println(commonLen, commonArr)
  203. //commonText := strings.Join(commonArr, "")
  204. //notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
  205. //notCommonLen := len(notCommonArr)
  206. //fmt.Println(notCommonLen, notCommonArr)
  207. //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
  208. commonRatio := float64(commonLen) / float64(hanTextLen)
  209. commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
  210. if commonRatio < 0.5 { //常用字占比低于x<50%
  211. return false, false, filetext
  212. } else if replaceSite {
  213. if commonRatio < UpdateLimit { //50%<x<UpdateLimit
  214. return false, true, filetext
  215. } else { //x>=UpdateLimit
  216. return true, true, filetext
  217. }
  218. }
  219. //fmt.Println(commonRatio)
  220. //notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
  221. //notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
  222. return false, true, filetext
  223. }
  224. // 测试方法
  225. func AnalysisFileTest(detail string) (bool, string, int, float64, float64) {
  226. //fmt.Println(detail)
  227. defer cu.Catch()
  228. //过滤空格
  229. filetextTmp := SpaceReg.ReplaceAllString(detail, "")
  230. if filetextTmp == "" { //附件为空
  231. return false, "", 0, 0, 0
  232. }
  233. //特殊情况:图片0 图片1
  234. filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
  235. if filetextTmp == "" { //附件为空
  236. return false, "", 1, 0, 0
  237. }
  238. //中文匹配
  239. HanArr := HanReg.FindAllString(filetextTmp, -1)
  240. hanText := strings.Join(HanArr, "")
  241. hanTextLen := len([]rune(hanText))
  242. //长度过滤
  243. if hanTextLen <= 100 {
  244. return false, "", 2, 0, 0
  245. }
  246. //fmt.Println(textLen, text)
  247. commonArr := CmmonDFA.CheckSensitiveWord(hanText)
  248. commonLen := len(commonArr)
  249. fmt.Println(commonLen, commonArr)
  250. //commonText := strings.Join(commonArr, "")
  251. notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
  252. notCommonLen := len(notCommonArr)
  253. fmt.Println(notCommonLen, notCommonArr)
  254. //notCommonText := strings.Join(notCommonArr, "")
  255. //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
  256. commonRatio := float64(commonLen) / float64(hanTextLen)
  257. commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
  258. notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
  259. notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
  260. return true, filetextTmp, 10, commonRatio, notCommonRatio
  261. }