|
@@ -0,0 +1,253 @@
|
|
|
+package main
|
|
|
+
|
|
|
+import (
|
|
|
+ "bufio"
|
|
|
+ "fmt"
|
|
|
+ "io"
|
|
|
+ cu "jygit.jydev.jianyu360.cn/data_capture/myself_util/commonutil"
|
|
|
+ "os"
|
|
|
+ "regexp"
|
|
|
+ "strconv"
|
|
|
+ "strings"
|
|
|
+)
|
|
|
+
|
|
|
+var (
|
|
|
+ CmmonDFA *DFA //常用字
|
|
|
+ NotCommonDFA *DFA //不常用字
|
|
|
+ TimesLimit int //常用字界限
|
|
|
+ UpdateLimit float64 //更新界限
|
|
|
+ OssSite map[string]float64 //解析附件站点集合
|
|
|
+ HanReg = regexp.MustCompile("[\u4e00-\u9fa5]+") //中文正则
|
|
|
+ SpaceReg = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+") //空格正则
|
|
|
+ SpecialReg = regexp.MustCompile("图片(\\d)+") //
|
|
|
+ SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)")
|
|
|
+)
|
|
|
+
|
|
|
+func InitFileInfo() {
|
|
|
+ OssSite = map[string]float64{}
|
|
|
+ TimesLimit = cu.IntAll(Config["timeslimit"])
|
|
|
+ UpdateLimit = cu.Float64All(Config["updatelimit"])
|
|
|
+ for site, b := range Config["osssite"].(map[string]interface{}) {
|
|
|
+ OssSite[site] = cu.Float64All(b)
|
|
|
+ }
|
|
|
+ fmt.Println(TimesLimit, UpdateLimit, OssSite)
|
|
|
+ CmmonDFA = &DFA{}
|
|
|
+ NotCommonDFA = &DFA{}
|
|
|
+ LoadDict("common.txt") //初始化常用字典
|
|
|
+}
|
|
|
+
|
|
|
// Metadata keys stored inside every trie node next to its child links.
//
// Child links are keyed by a single byte of the word, so every metadata key
// must be longer than one byte; otherwise it collides with the child link of
// that byte (the former word key "K" collided with the letter 'K').
const (
	dfaEndKey  = "YN"  // "1" when a word ends at this node, otherwise "0"
	dfaWordKey = "KEY" // the complete word that ends at this node
)

// DFA is a byte-level trie used for multi-pattern matching.
//
// Link is the root node. Every node is a map whose single-byte keys point to
// child nodes (*map[string]interface{}) and whose dfaEndKey / dfaWordKey
// entries carry the node's metadata. Words are split by byte, not by rune;
// for valid UTF-8 this still only matches whole characters because a word
// always starts with a lead byte.
type DFA struct {
	Link map[string]interface{}
}

// AddWord inserts keys into the trie and records each word at its end node,
// so CheckSensitiveWord can report which word matched.
func (d *DFA) AddWord(keys ...string) {
	d.AddWordAll(true, keys...)
}

// AddWordAll inserts keys into the trie. When haskey is true the complete
// word is stored at its end node; when false only the end marker is set and
// matches are reported as empty strings. Empty keys are ignored.
func (d *DFA) AddWordAll(haskey bool, keys ...string) {
	if d.Link == nil {
		d.Link = make(map[string]interface{})
	}
	for _, key := range keys {
		if key == "" {
			continue
		}
		node := &d.Link
		for i := 0; i < len(key); i++ {
			b := key[i : i+1]
			child, ok := (*node)[b].(*map[string]interface{})
			if !ok || child == nil {
				fresh := map[string]interface{}{dfaEndKey: "0"}
				child = &fresh
				(*node)[b] = child
			}
			node = child
		}
		// node is now the node of the last byte of key.
		(*node)[dfaEndKey] = "1"
		if haskey {
			(*node)[dfaWordKey] = key
		}
	}
}

// CheckSensitiveWord scans src and returns every dictionary word found, in
// order of start offset and then length. Overlapping matches are all
// reported. Words added without haskey are reported as "". The result is
// never nil.
func (d *DFA) CheckSensitiveWord(src string) []string {
	res := make([]string, 0)
	for start := 0; start < len(src); start++ {
		node := &d.Link
		for i := start; i < len(src); i++ {
			next, _ := (*node)[src[i:i+1]].(*map[string]interface{})
			if next == nil { // no word continues with this byte
				break
			}
			node = next
			if flag, _ := (*node)[dfaEndKey].(string); flag == "1" {
				word, _ := (*node)[dfaWordKey].(string)
				res = append(res, word)
			}
		}
	}
	return res
}
|
|
|
+
|
|
|
+// 加载统计的常用词
|
|
|
+func LoadDict(path string) {
|
|
|
+ dictFile, err := os.Open(path)
|
|
|
+ if err != nil {
|
|
|
+ fmt.Println("Load Common.txt Error")
|
|
|
+ os.Exit(-1)
|
|
|
+ }
|
|
|
+ defer dictFile.Close()
|
|
|
+ reader := bufio.NewReader(dictFile)
|
|
|
+ var (
|
|
|
+ text string
|
|
|
+ frequency int
|
|
|
+ )
|
|
|
+
|
|
|
+ // 逐行读入分词
|
|
|
+ line := 0
|
|
|
+ for {
|
|
|
+ line++
|
|
|
+ size, fsErr := fmt.Fscanln(reader, &text, &frequency) //读每行赋值
|
|
|
+ if fsErr == io.EOF { //读取到结尾
|
|
|
+ break
|
|
|
+ }
|
|
|
+ if size == 2 { //正确数据
|
|
|
+ if frequency >= TimesLimit { //常用字
|
|
|
+ CmmonDFA.AddWord(text)
|
|
|
+ } else { //非常用字
|
|
|
+ //NotCommonDFA.AddWord(text)
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ fmt.Println("Read Line Error:", line)
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func AnalysisFile(replaceSite bool, limitRatio float64, tmp map[string]interface{}) (bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
|
|
|
+ defer cu.Catch()
|
|
|
+ filetext, byOcr := GetFileText(tmp) //解析附件
|
|
|
+ if filetext == "" {
|
|
|
+ return false, filetext
|
|
|
+ }
|
|
|
+ if !replaceSite { //不是指定站点解析的数据,若是ocr识别的不进行替换
|
|
|
+ return !byOcr, filetext
|
|
|
+ } else if replaceSite && !byOcr { //指定站点解析的数据,非ocr识别,认为附件可替换正文
|
|
|
+ return true, filetext
|
|
|
+ }
|
|
|
+ //下面是指定站点附件识别后,按准确率判断是否替换detail
|
|
|
+ //特殊情况:图片0 图片1
|
|
|
+ filetextTmp := SpecialReg.ReplaceAllString(filetext, "")
|
|
|
+ if filetextTmp == "" { //附件为空
|
|
|
+ return false, filetext
|
|
|
+ }
|
|
|
+ //中文匹配
|
|
|
+ HanArr := HanReg.FindAllString(filetextTmp, -1)
|
|
|
+ hanText := strings.Join(HanArr, "")
|
|
|
+ hanLen := len([]rune(hanText))
|
|
|
+ //filetextTmp = sp.FilterDetail(filetextTmp) //只保留文本内容
|
|
|
+ //filetextLen := len([]rune(filetextTmp))
|
|
|
+ //长度过滤
|
|
|
+ if hanLen <= 100 {
|
|
|
+ return false, filetext
|
|
|
+ }
|
|
|
+ commonArr := CmmonDFA.CheckSensitiveWord(hanText)
|
|
|
+ commonLen := len(commonArr)
|
|
|
+ //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
|
|
|
+ commonRatio := float64(commonLen) / float64(hanLen)
|
|
|
+ commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
|
|
|
+ if commonRatio >= limitRatio {
|
|
|
+ return true, filetext
|
|
|
+ }
|
|
|
+ return false, filetext
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
+// 解析附件
|
|
|
+func AnalysisFile_back(replaceSite bool, tmp map[string]interface{}) (bool, bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
|
|
|
+ defer cu.Catch()
|
|
|
+ filetext, _ := GetFileText(tmp) //解析附件
|
|
|
+ //过滤空格
|
|
|
+ filetextTmp := SpaceReg.ReplaceAllString(filetext, "")
|
|
|
+ if filetextTmp == "" { //附件为空
|
|
|
+ return false, false, filetext
|
|
|
+ }
|
|
|
+ //特殊情况:图片0 图片1
|
|
|
+ filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
|
|
|
+ if filetextTmp == "" { //附件为空
|
|
|
+ return false, false, filetext
|
|
|
+ }
|
|
|
+ //中文匹配
|
|
|
+ HanArr := HanReg.FindAllString(filetextTmp, -1)
|
|
|
+ hanText := strings.Join(HanArr, "")
|
|
|
+ hanTextLen := len([]rune(hanText))
|
|
|
+ //长度过滤
|
|
|
+ if hanTextLen <= 20 {
|
|
|
+ return false, false, filetext
|
|
|
+ } else if replaceSite && 20 < hanTextLen && hanTextLen <= 100 {
|
|
|
+ return false, false, filetext
|
|
|
+ }
|
|
|
+ //fmt.Println(hanTextLen, hanText)
|
|
|
+ commonArr := CmmonDFA.CheckSensitiveWord(hanText)
|
|
|
+ commonLen := len(commonArr)
|
|
|
+ //fmt.Println(commonLen, commonArr)
|
|
|
+ //commonText := strings.Join(commonArr, "")
|
|
|
+ //notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
|
|
|
+ //notCommonLen := len(notCommonArr)
|
|
|
+ //fmt.Println(notCommonLen, notCommonArr)
|
|
|
+ //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
|
|
|
+ commonRatio := float64(commonLen) / float64(hanTextLen)
|
|
|
+ commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
|
|
|
+ if commonRatio < 0.5 { //常用字占比低于x<50%
|
|
|
+ return false, false, filetext
|
|
|
+ } else if replaceSite {
|
|
|
+ if commonRatio < UpdateLimit { //50%<x<UpdateLimit
|
|
|
+ return false, true, filetext
|
|
|
+ } else { //x>=UpdateLimit
|
|
|
+ return true, true, filetext
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //fmt.Println(commonRatio)
|
|
|
+ //notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
|
|
|
+ //notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
|
|
|
+ return false, true, filetext
|
|
|
+}
|
|
|
+
|
|
|
+// 测试方法
|
|
|
+func AnalysisFileTest(detail string) (bool, string, int, float64, float64) {
|
|
|
+ //fmt.Println(detail)
|
|
|
+ defer cu.Catch()
|
|
|
+ //过滤空格
|
|
|
+ filetextTmp := SpaceReg.ReplaceAllString(detail, "")
|
|
|
+ if filetextTmp == "" { //附件为空
|
|
|
+ return false, "", 0, 0, 0
|
|
|
+ }
|
|
|
+ //特殊情况:图片0 图片1
|
|
|
+ filetextTmp = SpecialReg.ReplaceAllString(filetextTmp, "")
|
|
|
+ if filetextTmp == "" { //附件为空
|
|
|
+ return false, "", 1, 0, 0
|
|
|
+ }
|
|
|
+ //中文匹配
|
|
|
+ HanArr := HanReg.FindAllString(filetextTmp, -1)
|
|
|
+ hanText := strings.Join(HanArr, "")
|
|
|
+ hanTextLen := len([]rune(hanText))
|
|
|
+ //长度过滤
|
|
|
+ if hanTextLen <= 100 {
|
|
|
+ return false, "", 2, 0, 0
|
|
|
+ }
|
|
|
+ //fmt.Println(textLen, text)
|
|
|
+ commonArr := CmmonDFA.CheckSensitiveWord(hanText)
|
|
|
+ commonLen := len(commonArr)
|
|
|
+ fmt.Println(commonLen, commonArr)
|
|
|
+ //commonText := strings.Join(commonArr, "")
|
|
|
+ notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
|
|
|
+ notCommonLen := len(notCommonArr)
|
|
|
+ fmt.Println(notCommonLen, notCommonArr)
|
|
|
+ //notCommonText := strings.Join(notCommonArr, "")
|
|
|
+ //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
|
|
|
+ commonRatio := float64(commonLen) / float64(hanTextLen)
|
|
|
+ commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
|
|
|
+ notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
|
|
|
+ notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64)
|
|
|
+ return true, filetextTmp, 10, commonRatio, notCommonRatio
|
|
|
+}
|