Przeglądaj źródła

正文判重:含有效附件的数据不进行正文判重

mxs 1 rok temu
rodzic
commit
3b4b887bf5
1 zmieniony plik z 39 dodaniami i 31 usunięciami
  1. 39 31
      src/saveServer/processdata.go

+ 39 - 31
src/saveServer/processdata.go

@@ -95,11 +95,38 @@ func NewSaveBidding(tmp map[string]interface{}) (b bool, res int, mgoid, mgocoll
 		errorData(LEVEL_ERROR, true, "publishtime", "Field Value Is Null", href, title, &warn, tmp)
 		tmp["publishtime"] = nowTime
 	}
+	//附件校验
+	downloadFileOk := false      //标记此条数据是否有上传oss成功的附件
+	fileUploadErrIsSave := false //避免一条数据出现多个异常附件,保存多条错误信息
+	fileSizeUrlErrIsSave := false
+	if projectinfo, ok := tmp["projectinfo"].(map[string]interface{}); ok {
+		if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok {
+			for _, data := range attachments {
+				if d, ok := data.(map[string]interface{}); ok {
+					org_url := qutil.ObjToString(d["org_url"])
+					fid := qutil.ObjToString(d["fid"])
+					size := qutil.ObjToString(d["size"])
+					if !fileUploadErrIsSave && org_url != "" && fid == "" { //附件下载上传问题
+						errorData(LEVEL_WARN, true, "projectinfo", "Attachment Upload Failed", href, title, &warn, tmp)
+						fileUploadErrIsSave = true
+					}
+					if !fileSizeUrlErrIsSave && (strings.HasSuffix(org_url, "javascript:void(0)") || size == "3.7 KB") {
+						errorData(LEVEL_WARN, true, "projectinfo", "File Size Or Url Error", href, title, &warn, tmp)
+						fileSizeUrlErrIsSave = true
+					}
+					if fid != "" { //附件上传成功
+						downloadFileOk = true
+					}
+				}
+			}
+		}
+	}
+
 	detail := qutil.ObjToString(tmp["detail"])
 	uuid := sp.Sha(detail) //拟建数据2022-04-26也改为detail判重
 	//数据判重
 	if tmp["repeat"] == nil { //判断repeat,为了异常数据重推时不进行redis判重
-		b, res = dataRepeat(tmp, href, hashHref, uuid)
+		b, res = dataRepeat(tmp, href, hashHref, uuid, downloadFileOk)
 		if b && res == 4 {
 			return
 		}
@@ -129,34 +156,8 @@ func NewSaveBidding(tmp map[string]interface{}) (b bool, res int, mgoid, mgocoll
 		delete(tmp, "s_title") //删除s_title
 	}
 	iscompete, _ := tmp["iscompete"].(bool) //判断是否是2021-11-01之后的新爬虫,新爬虫iscompete不能为空
-	//2.4附件校验
-	downloadFileOk := false      //标记此条数据是否有上传oss成功的附件
-	fileUploadErrIsSave := false //避免一条数据出现多个异常附件,保存多条错误信息
-	fileSizeUrlErrIsSave := false
-	if projectinfo, ok := tmp["projectinfo"].(map[string]interface{}); ok {
-		if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok {
-			for _, data := range attachments {
-				if d, ok := data.(map[string]interface{}); ok {
-					org_url := qutil.ObjToString(d["org_url"])
-					fid := qutil.ObjToString(d["fid"])
-					size := qutil.ObjToString(d["size"])
-					if !fileUploadErrIsSave && org_url != "" && fid == "" { //附件下载上传问题
-						errorData(LEVEL_WARN, true, "projectinfo", "Attachment Upload Failed", href, title, &warn, tmp)
-						fileUploadErrIsSave = true
-					}
-					if !fileSizeUrlErrIsSave && (strings.HasSuffix(org_url, "javascript:void(0)") || size == "3.7 KB") {
-						errorData(LEVEL_WARN, true, "projectinfo", "File Size Or Url Error", href, title, &warn, tmp)
-						fileSizeUrlErrIsSave = true
-					}
-					if fid != "" { //附件上传成功
-						downloadFileOk = true
-					}
-				}
-			}
-		}
-	}
 
-	//2.5校验mustfield(title,href,detail,projectname)
+	//2.4校验mustfield(title,href,detail,projectname)
 	code := qutil.ObjToString(tmp["spidercode"])
 	nocheck := strings.HasSuffix(code, "_nocheck")
 	for _, f := range mustfield {
@@ -271,7 +272,7 @@ func NewSaveBidding(tmp map[string]interface{}) (b bool, res int, mgoid, mgocoll
 		}
 	}
 
-	//2.6部分字段校验
+	//2.5部分字段校验
 	for _, cf := range checkfield {
 		if qutil.ObjToString(tmp[cf]) == "" {
 			errorData(LEVEL_ERROR, true, cf, "Field Value Is Null", href, title, &warn, tmp)
@@ -348,7 +349,7 @@ func NewSaveBidding(tmp map[string]interface{}) (b bool, res int, mgoid, mgocoll
 }
 
 // 数据判重
-func dataRepeat(tmp map[string]interface{}, href, hashHref, uuid string) (b bool, res int) {
+func dataRepeat(tmp map[string]interface{}, href, hashHref, uuid string, downloadFileOk bool) (b bool, res int) {
 	detail := qutil.ObjToString(tmp["detail"])
 	filterDetail := sp.FilterDetail(detail) //只保留文本内容
 	//新版数据判重
@@ -402,7 +403,14 @@ func dataRepeat(tmp map[string]interface{}, href, hashHref, uuid string) (b bool
 		tmp["hashref"] = hashHref
 		SaveMgoCache <- tmp                                         //记录重复数据
 		if repeatby == "bloom_detail" || repeatby == "sha_detail" { //被正文判重,保留数据,打上标记
-			tmp["extracttype"] = -1
+			/*
+				日期:2024-04-26
+				逻辑:含有效附件的数据不进行正文判重
+				提出者:张金坤
+			*/
+			if !downloadFileOk { // no successfully uploaded attachment: mark as detail-duplicate (extracttype = -1); data with a valid attachment is left unmarked
+				tmp["extracttype"] = -1
+			}
 			return false, 4
 		}
 		return true, 4