package main

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"regexp"
	"strconv"
	"strings"

	cu "jygit.jydev.jianyu360.cn/data_capture/myself_util/commonutil"
	su "jygit.jydev.jianyu360.cn/data_capture/myself_util/spiderutil"
)

var (
	// CmmonDFA is the trie of common characters/words, filled by LoadDict.
	CmmonDFA *DFA
	// NotCommonDFA is the trie of uncommon characters/words. Nothing in this
	// file adds entries to it, so lookups against it return no matches.
	NotCommonDFA *DFA
	// TimesLimit is the minimum dictionary frequency for an entry to be
	// treated as common (read from Config["timeslimit"]).
	TimesLimit int
	// UpdateLimit is the update threshold read from Config["updatelimit"].
	UpdateLimit float64
	// OssSite maps a site name to the common-character ratio its attachment
	// text must reach before it replaces the article body.
	OssSite map[string]float64
	// HanReg matches runs of Chinese characters.
	HanReg = regexp.MustCompile("[\u4e00-\u9fa5]+")
	// SpaceReg matches whitespace, including U+3000, U+2003 and U+00A0.
	SpaceReg = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
	// SpecialReg matches image placeholders such as "图片0" or "图片1".
	SpecialReg = regexp.MustCompile("图片(\\d)+")
	// SpecialTextReg matches boilerplate that only points the reader at the
	// original page or at an attachment.
	SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)")
)

const (
	// dfaEndMarker flags a trie node that terminates a word ("1") or not ("0").
	dfaEndMarker = "YN"
	// dfaWordMarker stores the complete word on a terminating node. Child
	// nodes are keyed by one-byte strings, so this marker has to be longer
	// than one byte: a one-byte marker such as "K" collides with the child
	// key for the byte 'K' and corrupts the trie.
	dfaWordMarker = "KEY"
)

// DealFile decides whether the attachment text should replace tmp["detail"]
// and performs the replacement in place.
//
// For sites listed in OssSite the decision is delegated to AnalysisFile with
// the configured ratio. For every other site the attachment is only
// considered when the existing detail is effectively empty: at most 5
// characters, or at most 50 characters that match SpecialTextReg.
func DealFile(tmp map[string]interface{}) {
	site := cu.ObjToString(tmp["site"])

	// Configured site: replace the body depending on the attachment quality.
	if limitRatio := OssSite[site]; limitRatio > 0 {
		if replace, filetext := AnalysisFile(true, limitRatio, tmp); replace {
			tmp["detail"] = filetext
		}
		return
	}

	// Any other site: only fall back to the attachment when detail is invalid.
	detail := su.FilterDetail(cu.ObjToString(tmp["detail"])) // presumably keeps text content only
	detailLen := len([]rune(detail))
	invalid := detailLen <= 5 || (detailLen <= 50 && SpecialTextReg.MatchString(detail))
	if !invalid {
		return
	}
	if replace, filetext := AnalysisFile(false, 0, tmp); replace {
		tmp["detail"] = filetext
	}
}

// InitFileInfo loads the thresholds and the site list from Config, creates
// the two DFA tries and fills the common one from "common.txt".
func InitFileInfo() {
	OssSite = map[string]float64{}
	TimesLimit = cu.IntAll(Config["timeslimit"])
	UpdateLimit = cu.Float64All(Config["updatelimit"])
	// Comma-ok assertion: a missing or mistyped "osssite" entry leaves the
	// site list empty instead of panicking during start-up.
	if sites, ok := Config["osssite"].(map[string]interface{}); ok {
		for site, ratio := range sites {
			OssSite[site] = cu.Float64All(ratio)
		}
	}
	fmt.Println(TimesLimit, UpdateLimit, OssSite)
	CmmonDFA = &DFA{}
	NotCommonDFA = &DFA{}
	LoadDict("common.txt") // initialise the common-character dictionary
}

// DFA is a byte-keyed trie used for dictionary matching.
//
// Each node is a map whose one-byte keys point to child nodes
// (*map[string]interface{}); the multi-byte keys dfaEndMarker and
// dfaWordMarker carry the node's own state.
type DFA struct {
	Link map[string]interface{}
}

// AddWord inserts keys into the trie and stores each word on its final node.
func (d *DFA) AddWord(keys ...string) {
	d.AddWordAll(true, keys...)
}

// AddWordAll inserts keys into the trie. When haskey is true the complete
// word is stored on the terminating node so CheckSensitiveWord can return it.
// Empty keys are ignored.
func (d *DFA) AddWordAll(haskey bool, keys ...string) {
	if d.Link == nil {
		d.Link = make(map[string]interface{})
	}
	for _, key := range keys {
		node := &d.Link
		for i := 0; i < len(key); i++ {
			kc := key[i : i+1]
			child, ok := (*node)[kc].(*map[string]interface{})
			if !ok || child == nil {
				// Missing (or unusable) child: create a fresh non-terminal node
				// rather than continuing with a nil pointer.
				fresh := map[string]interface{}{dfaEndMarker: "0"}
				child = &fresh
				(*node)[kc] = child
			}
			node = child
		}
		if len(key) == 0 {
			continue
		}
		(*node)[dfaEndMarker] = "1"
		if haskey {
			(*node)[dfaWordMarker] = key
		}
	}
}

// CheckSensitiveWord scans src from every byte offset and returns one entry
// per dictionary word found, overlapping matches included. Words inserted
// with haskey=false are reported as empty strings. The result is never nil.
func (d *DFA) CheckSensitiveWord(src string) []string {
	res := make([]string, 0)
	for j := 0; j < len(src); j++ {
		node := &d.Link
		for i := j; i < len(src); i++ {
			next, _ := (*node)[src[i:i+1]].(*map[string]interface{})
			if next == nil {
				break // no dictionary word continues with this byte
			}
			node = next
			if cu.ObjToString((*node)[dfaEndMarker]) == "1" {
				res = append(res, cu.ObjToString((*node)[dfaWordMarker]))
			}
		}
	}
	return res
}

// LoadDict reads the frequency dictionary at path and adds every entry whose
// frequency is at least TimesLimit to CmmonDFA.
//
// Each line must hold exactly two whitespace-separated fields: the text and
// its integer frequency. Malformed lines are reported with their line number
// and skipped. The process exits when the file cannot be opened.
func LoadDict(path string) {
	dictFile, err := os.Open(path)
	if err != nil {
		fmt.Println("Load Common.txt Error")
		os.Exit(-1)
	}
	defer dictFile.Close()

	reader := bufio.NewReader(dictFile)
	for line := 1; ; line++ {
		// Reading whole lines keeps the line counter accurate and guarantees
		// that a malformed line is consumed in a single step.
		raw, readErr := reader.ReadString('\n')
		if raw != "" {
			fields := strings.Fields(raw)
			var (
				frequency int64
				convErr   error
			)
			if len(fields) == 2 {
				// Base 0 so prefixed literals keep parsing as they did with fmt scanning.
				frequency, convErr = strconv.ParseInt(fields[1], 0, 0)
			}
			switch {
			case len(fields) != 2 || convErr != nil:
				fmt.Println("Read Line Error:", line)
			case int(frequency) >= TimesLimit:
				CmmonDFA.AddWord(fields[0]) // common entry
			}
		}
		if readErr != nil {
			// io.EOF is the normal end; any other error also stops the loop so
			// a failing reader cannot make it spin forever.
			if readErr != io.EOF {
				fmt.Println("Read Line Error:", line, readErr)
			}
			return
		}
	}
}

// AnalysisFile extracts the attachment text of tmp and reports whether it
// should replace the article body.
//
// It returns (replace, filetext): replace tells the caller to overwrite the
// body and filetext is the extracted attachment text.
//
//   - Empty attachment text is never used.
//   - replaceSite == false: the text is used unless it came from OCR.
//   - replaceSite == true and not OCR: the text is always used.
//   - replaceSite == true and OCR: the text is used only when, after dropping
//     image placeholders, it holds more than 100 Chinese characters and the
//     share of common characters (rounded to two decimals) is at least
//     limitRatio.
func AnalysisFile(replaceSite bool, limitRatio float64, tmp map[string]interface{}) (bool, string) {
	defer cu.Catch() // presumably recovers from panics raised while parsing

	filetext, byOcr := GetFileText(tmp)
	if filetext == "" {
		return false, filetext
	}
	if !replaceSite {
		return !byOcr, filetext
	}
	if !byOcr {
		return true, filetext
	}

	// Configured site with OCR text: judge by the common-character ratio.
	stripped := SpecialReg.ReplaceAllString(filetext, "")
	if stripped == "" {
		return false, filetext
	}
	hanText := strings.Join(HanReg.FindAllString(stripped, -1), "")
	hanLen := len([]rune(hanText))
	if hanLen <= 100 {
		return false, filetext
	}

	// The dictionary is incomplete, so the ratio is an approximation.
	commonLen := len(CmmonDFA.CheckSensitiveWord(hanText))
	commonRatio := float64(commonLen) / float64(hanLen)
	commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64) // Sprintf output always parses
	return commonRatio >= limitRatio, filetext
}

// AnalysisFile_back is an alternative attachment check that also reports
// whether the attachment text looks usable.
//
// It returns (replace, normal, filetext). The text is rejected (false, false)
// when it is empty after removing whitespace and image placeholders, when it
// has at most 20 Chinese characters (at most 100 when replaceSite is true),
// or when the common-character ratio is below 0.5. Otherwise normal is true
// and replace is true only when replaceSite is set and the ratio is below
// UpdateLimit.
func AnalysisFile_back(replaceSite bool, tmp map[string]interface{}) (bool, bool, string) {
	defer cu.Catch() // presumably recovers from panics raised while parsing

	filetext, _ := GetFileText(tmp)

	stripped := SpaceReg.ReplaceAllString(filetext, "")
	if stripped == "" {
		return false, false, filetext
	}
	stripped = SpecialReg.ReplaceAllString(stripped, "")
	if stripped == "" {
		return false, false, filetext
	}

	hanText := strings.Join(HanReg.FindAllString(stripped, -1), "")
	hanTextLen := len([]rune(hanText))
	if hanTextLen <= 20 {
		return false, false, filetext
	}
	if replaceSite && hanTextLen <= 100 {
		return false, false, filetext
	}

	// The dictionary is incomplete, so the ratio is an approximation.
	commonLen := len(CmmonDFA.CheckSensitiveWord(hanText))
	commonRatio := float64(commonLen) / float64(hanTextLen)
	commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64) // Sprintf output always parses

	if commonRatio < 0.5 {
		return false, false, filetext
	}
	if replaceSite && commonRatio < UpdateLimit {
		return true, true, filetext
	}
	return false, true, filetext
}

// AnalysisFileTest is a diagnostic helper that runs the ratio computation on
// an arbitrary text and prints the matched entries.
//
// It returns (ok, cleanedText, stage, commonRatio, notCommonRatio). stage is
// 0 when detail is empty after whitespace removal, 1 when it is empty after
// placeholder removal, 2 when it has at most 100 Chinese characters, and 10
// when both ratios were computed.
func AnalysisFileTest(detail string) (bool, string, int, float64, float64) {
	defer cu.Catch() // presumably recovers from panics raised while matching

	stripped := SpaceReg.ReplaceAllString(detail, "")
	if stripped == "" {
		return false, "", 0, 0, 0
	}
	stripped = SpecialReg.ReplaceAllString(stripped, "")
	if stripped == "" {
		return false, "", 1, 0, 0
	}

	hanText := strings.Join(HanReg.FindAllString(stripped, -1), "")
	hanTextLen := len([]rune(hanText))
	if hanTextLen <= 100 {
		return false, "", 2, 0, 0
	}

	commonArr := CmmonDFA.CheckSensitiveWord(hanText)
	commonLen := len(commonArr)
	fmt.Println(commonLen, commonArr)

	notCommonArr := NotCommonDFA.CheckSensitiveWord(hanText)
	notCommonLen := len(notCommonArr)
	fmt.Println(notCommonLen, notCommonArr)

	// Neither dictionary is complete, so the two ratios need not sum to 1.
	commonRatio := float64(commonLen) / float64(hanTextLen)
	commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64) // Sprintf output always parses
	notCommonRatio := float64(notCommonLen) / float64(hanTextLen)
	notCommonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", notCommonRatio), 64) // Sprintf output always parses
	return true, stripped, 10, commonRatio, notCommonRatio
}