|
@@ -12,22 +12,23 @@ import (
|
|
|
)
|
|
|
|
|
|
// Package-level settings and compiled patterns used by attachment analysis.
// Several of these (OssSite, TimesLimit, UpdateLimit, CmmonDFA) are assigned in InitFileInfo.
var (
|
|
|
- CmmonDFA *DFA // common characters
|
|
|
- NotCommonDFA *DFA // uncommon characters
|
|
|
- TimesLimit int // threshold for common characters
|
|
|
- UpdateLimit float64 // threshold for updating
|
|
|
- OssSite map[string]bool // set of sites whose attachments are parsed
|
|
|
- HanReg = regexp.MustCompile("[\u4e00-\u9fa5]+") // regexp for runs of Chinese characters
|
|
|
- SpaceReg = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+") // regexp for runs of whitespace
|
|
|
- SpecialReg = regexp.MustCompile("图片(\\d)+") // matches "图片" followed by one or more digits (e.g. 图片0, 图片1)
|
|
|
+ CmmonDFA *DFA // common characters
|
|
|
+ NotCommonDFA *DFA // uncommon characters
|
|
|
+ TimesLimit int // threshold for common characters
|
|
|
+ UpdateLimit float64 // threshold for updating
|
|
|
+ OssSite map[string]float64 // sites whose attachments are parsed, each mapped to a float64 read from Config["osssite"]
|
|
|
+ HanReg = regexp.MustCompile("[\u4e00-\u9fa5]+") // regexp for runs of Chinese characters
|
|
|
+ SpaceReg = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+") // regexp for runs of whitespace
|
|
|
+ SpecialReg = regexp.MustCompile("图片(\\d)+") // matches "图片" followed by one or more digits (e.g. 图片0, 图片1)
|
|
|
+ SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)") // matches boilerplate phrases that refer to the original page or to an attachment
|
|
|
)
|
|
|
|
|
|
func InitFileInfo() {
|
|
|
- OssSite = map[string]bool{}
|
|
|
+ OssSite = map[string]float64{}
|
|
|
TimesLimit = qu.IntAll(Config["timeslimit"])
|
|
|
UpdateLimit = qu.Float64All(Config["updatelimit"])
|
|
|
for site, b := range Config["osssite"].(map[string]interface{}) {
|
|
|
- OssSite[site] = b.(bool)
|
|
|
+ OssSite[site] = qu.Float64All(b)
|
|
|
}
|
|
|
qu.Debug(TimesLimit, UpdateLimit, OssSite)
|
|
|
CmmonDFA = &DFA{}
|
|
@@ -123,10 +124,49 @@ func LoadDict(path string) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+func AnalysisFile(replaceSite bool, limitRatio float64, tmp map[string]interface{}) (bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
|
|
|
+ defer qu.Catch()
|
|
|
+ filetext, byOcr := GetFileText(tmp) //解析附件
|
|
|
+ if filetext == "" {
|
|
|
+ return false, filetext
|
|
|
+ }
|
|
|
+ if !replaceSite { //不是指定站点解析的数据,若是ocr识别的不进行替换
|
|
|
+ return !byOcr, filetext
|
|
|
+ } else if replaceSite && !byOcr { //指定站点解析的数据,非ocr识别,认为附件可替换正文
|
|
|
+ return true, filetext
|
|
|
+ }
|
|
|
+ //下面是指定站点附件识别后,按准确率判断是否替换detail
|
|
|
+ //特殊情况:图片0 图片1
|
|
|
+ filetextTmp := SpecialReg.ReplaceAllString(filetext, "")
|
|
|
+ if filetextTmp == "" { //附件为空
|
|
|
+ return false, filetext
|
|
|
+ }
|
|
|
+ //中文匹配
|
|
|
+ HanArr := HanReg.FindAllString(filetextTmp, -1)
|
|
|
+ hanText := strings.Join(HanArr, "")
|
|
|
+ hanLen := len([]rune(hanText))
|
|
|
+ //filetextTmp = sp.FilterDetail(filetextTmp) //只保留文本内容
|
|
|
+ //filetextLen := len([]rune(filetextTmp))
|
|
|
+ //长度过滤
|
|
|
+ if hanLen <= 100 {
|
|
|
+ return false, filetext
|
|
|
+ }
|
|
|
+ commonArr := CmmonDFA.CheckSensitiveWord(hanText)
|
|
|
+ commonLen := len(commonArr)
|
|
|
+ //解析常用字和非常用字占比(由于常用字或非常用字集不全,会导致比例相加不为100%)
|
|
|
+ commonRatio := float64(commonLen) / float64(hanLen)
|
|
|
+ commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64)
|
|
|
+ if commonRatio >= limitRatio {
|
|
|
+ return true, filetext
|
|
|
+ }
|
|
|
+ return false, filetext
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
//解析附件
|
|
|
-func AnalysisFile(replaceSite bool, tmp map[string]interface{}) (bool, bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
|
|
|
+func AnalysisFile_back(replaceSite bool, tmp map[string]interface{}) (bool, bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
|
|
|
defer qu.Catch()
|
|
|
- filetext := GetFileText(tmp) //解析附件
|
|
|
+ filetext, _ := GetFileText(tmp) //解析附件
|
|
|
//过滤空格
|
|
|
filetextTmp := SpaceReg.ReplaceAllString(filetext, "")
|
|
|
if filetextTmp == "" { //附件为空
|