Răsfoiți Sursa

附件迁移正文替换功能修改

maxiaoshan 1 an în urmă
părinte
comite
0f89bfc94a

+ 2 - 3
src/saveServer/src/config.json

@@ -14,8 +14,7 @@
 	"api": "http://172.17.145.179:19281/_send/_mail",
 	"to": "maxiaoshan@topnet.net.cn,zhangjinkun@topnet.net.cn",
 	"osssite": {
-		"中国招标投标公共服务平台": true
+		"中国招标投标公共服务平台": 0.99
 	},
-	"timeslimit": 500,
-    "updatelimit": 0.99
+	"timeslimit": 500
 }

+ 52 - 12
src/saveServer/src/file.go

@@ -12,22 +12,23 @@ import (
 )
 
 var (
-	CmmonDFA     *DFA                                             //常用字
-	NotCommonDFA *DFA                                             //不常用字
-	TimesLimit   int                                              //常用字界限
-	UpdateLimit  float64                                          //更新界限
-	OssSite      map[string]bool                                  //解析附件站点集合
-	HanReg       = regexp.MustCompile("[\u4e00-\u9fa5]+")         //中文正则
-	SpaceReg     = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+") //空格正则
-	SpecialReg   = regexp.MustCompile("图片(\\d)+")                 //
+	CmmonDFA       *DFA                                             //常用字
+	NotCommonDFA   *DFA                                             //不常用字
+	TimesLimit     int                                              //常用字界限
+	UpdateLimit    float64                                          //更新界限
+	OssSite        map[string]float64                               //解析附件站点集合
+	HanReg         = regexp.MustCompile("[\u4e00-\u9fa5]+")         //中文正则
+	SpaceReg       = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+") //空格正则
+	SpecialReg     = regexp.MustCompile("图片(\\d)+")                 //
+	SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)")
 )
 
 func InitFileInfo() {
-	OssSite = map[string]bool{}
+	OssSite = map[string]float64{}
 	TimesLimit = qu.IntAll(Config["timeslimit"])
 	UpdateLimit = qu.Float64All(Config["updatelimit"])
 	for site, b := range Config["osssite"].(map[string]interface{}) {
-		OssSite[site] = b.(bool)
+		OssSite[site] = qu.Float64All(b)
 	}
 	qu.Debug(TimesLimit, UpdateLimit, OssSite)
 	CmmonDFA = &DFA{}
@@ -123,10 +124,49 @@ func LoadDict(path string) {
 	}
 }
 
+func AnalysisFile(replaceSite bool, limitRatio float64, tmp map[string]interface{}) (bool, string) { // Returns (whether detail should be replaced by the attachment text, the attachment text). replaceSite: the record's site is one configured for attachment parsing; limitRatio: minimum common-character ratio required before OCR text may replace detail.
+	defer qu.Catch()
+	filetext, byOcr := GetFileText(tmp) // Load the attachment text; byOcr reports whether it is treated as OCR-recognized
+	if filetext == "" {
+		return false, filetext
+	}
+	if !replaceSite { // Not a configured site: replace only when the text was not produced by OCR
+		return !byOcr, filetext
+	} else if replaceSite && !byOcr { // Configured site, text not produced by OCR: the attachment may replace detail
+		return true, filetext
+	}
+	// From here on: configured site with OCR-recognized text; decide by the accuracy ratio whether to replace detail
+	// Special case: placeholder tokens such as "图片0", "图片1" are stripped before measuring
+	filetextTmp := SpecialReg.ReplaceAllString(filetext, "")
+	if filetextTmp == "" { // Nothing left once the placeholders are removed: treat the attachment as empty
+		return false, filetext
+	}
+	// Collect every run of Chinese characters and measure their total length in runes
+	HanArr := HanReg.FindAllString(filetextTmp, -1)
+	hanText := strings.Join(HanArr, "")
+	hanLen := len([]rune(hanText))
+	//filetextTmp = sp.FilterDetail(filetextTmp) // keep only the text content
+	//filetextLen := len([]rune(filetextTmp))
+	// Length filter: 100 or fewer Chinese characters is never enough to replace detail
+	if hanLen <= 100 {
+		return false, filetext
+	}
+	commonArr := CmmonDFA.CheckSensitiveWord(hanText) // NOTE(review): presumably returns the common-character matches found in hanText; confirm against the DFA implementation
+	commonLen := len(commonArr)
+	// Ratio of common characters among the Chinese characters (the common / uncommon character sets are incomplete, so the two ratios need not sum to 100%)
+	commonRatio := float64(commonLen) / float64(hanLen)
+	commonRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", commonRatio), 64) // Round to two decimals before comparing with limitRatio
+	if commonRatio >= limitRatio {
+		return true, filetext
+	}
+	return false, filetext
+
+}
+
 //解析附件
-func AnalysisFile(replaceSite bool, tmp map[string]interface{}) (bool, bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
+func AnalysisFile_back(replaceSite bool, tmp map[string]interface{}) (bool, bool, string) { //第一个bool:是否替换正文;第二个bool:附件是否正常
 	defer qu.Catch()
-	filetext := GetFileText(tmp) //解析附件
+	filetext, _ := GetFileText(tmp) //解析附件
 	//过滤空格
 	filetextTmp := SpaceReg.ReplaceAllString(filetext, "")
 	if filetextTmp == "" { //附件为空

+ 28 - 10
src/saveServer/src/main.go

@@ -7,6 +7,7 @@ import (
 	"mongodb"
 	"net/http"
 	qu "qfw/util"
+	sp "spiderutil"
 	"sync"
 	"time"
 
@@ -227,19 +228,36 @@ func FileDataMove(query map[string]interface{}) {
 			//更新
 			set := map[string]interface{}{"moveok": true}
 			set["biddingid"] = mongodb.BsonIdToSId(newId)
-
-			site := qu.ObjToString(tmp["site"]) //解析附件站点
-			IsReplaceDetailSite := OssSite[site]
-			if IsReplaceDetailSite {
-				replace, fileOk, filetext := AnalysisFile(IsReplaceDetailSite, tmp) //解析附件是否替换到detail
-				if replace {
-					tmp["detail"] = filetext //替换正文
+			site := qu.ObjToString(tmp["site"])              //解析附件站点
+			if limitRatio := OssSite[site]; limitRatio > 0 { //配置站点解析附件,根据准确率情况替换正文
+				replace, filetext := AnalysisFile(true, limitRatio, tmp)
+				if replace { //替换正文
+					tmp["detail"] = filetext
+					set["filetext"] = true
 				}
-				if !fileOk { //附件异常
-					set["filerr"] = true
-					//set["filetext"] = filetext//文本过大,导致更新mongo失败
+			} else { //其它网站附件信息,detail无效,只有一个附件且不是ocr识别的,替换正文
+				//判断detail是否有效
+				detail := qu.ObjToString(tmp["detail"])
+				detail = sp.FilterDetail(detail) //只保留文本内容
+				if len([]rune(detail)) <= 5 || (len([]rune(detail)) <= 50 && SpecialTextReg.MatchString(detail)) {
+					replace, filetext := AnalysisFile(false, 0, tmp)
+					if replace { //替换正文
+						tmp["detail"] = filetext
+						set["filetext"] = true
+					}
 				}
 			}
+			//IsReplaceDetailSite := OssSite[site]
+			//if IsReplaceDetailSite {
+			//	replace, fileOk, filetext := AnalysisFile_back(IsReplaceDetailSite, tmp) //解析附件是否替换到detail
+			//	if replace {
+			//		tmp["detail"] = filetext //替换正文
+			//	}
+			//	if !fileOk { //附件异常
+			//		set["filerr"] = true
+			//		//set["filetext"] = filetext//文本过大,导致更新mongo失败
+			//	}
+			//}
 			update = append(update, map[string]interface{}{
 				"$set": set,
 			})

+ 17 - 8
src/saveServer/src/ossclient.go

@@ -50,15 +50,24 @@ func OssGetObject(objectName string) string {
 	return string(data)
 }
 
-func GetFileText(tmp map[string]interface{}) (filetext string) {
+func GetFileText(tmp map[string]interface{}) (filetext string, byOcr bool) {
+	byOcr = true
 	if attchMap, ok := tmp["attach_text"].(map[string]interface{}); attchMap != nil && ok {
-		for _, tmpData1 := range attchMap {
-			if tmpData2, ok := tmpData1.(map[string]interface{}); tmpData2 != nil && ok {
-				for _, result := range tmpData2 {
-					if resultMap, ok := result.(map[string]interface{}); resultMap != nil && ok {
-						if attach_url := util.ObjToString(resultMap["attach_url"]); attach_url != "" {
-							bs := OssGetObject(attach_url) //oss读数据
-							filetext += bs + "\n"
+		if len(attchMap) == 1 {
+			for _, tmpData1 := range attchMap {
+				if tmpData2, ok := tmpData1.(map[string]interface{}); tmpData2 != nil && ok {
+					if len(tmpData2) == 1 {
+						for _, result := range tmpData2 {
+							if resultMap, ok := result.(map[string]interface{}); resultMap != nil && ok {
+								if attach_url := util.ObjToString(resultMap["attach_url"]); attach_url != "" {
+									filetext = OssGetObject(attach_url) //oss读数据
+									//filetext += bs + "\n"
+									//附件文本是否是通过ocr识别的
+									if resultMap["ocr"] != nil && util.IntAll(resultMap["ocr"]) == 0 {
+										byOcr = false
+									}
+								}
+							}
 						}
 					}
 				}