Browse Source

_nocheck后缀爬虫数据特殊过滤处理

mxs 1 year ago
parent
commit
6466e3e536
1 changed file with 8 additions and 6 deletions
  1. 8 6
      src/saveServer/processdata.go

+ 8 - 6
src/saveServer/processdata.go

@@ -156,6 +156,8 @@ func NewSaveBidding(tmp map[string]interface{}) (b bool, res int, mgoid, mgocoll
 	}
 
 	//2.5校验mustfield(title,href,detail,projectname)
+	code := qutil.ObjToString(tmp["spidercode"])
+	nocheck := strings.HasSuffix(code, "_nocheck")
 	for _, f := range mustfield {
 		if f == "projectname" && infoformat != 2 { //拟建数据中特有字段projectname校验
 			continue
@@ -179,7 +181,7 @@ func NewSaveBidding(tmp map[string]interface{}) (b bool, res int, mgoid, mgocoll
 		} else if randomLen >= 5 && (f == "title" || f == "projectname") { //文本长度大算个数
 			randomErr = true
 		}
-		if randomErr { //乱码异常
+		if randomErr && !nocheck { //乱码异常
 			res = 3
 			errorData(LEVEL_ERROR, false, f, "Field Value Contains Random Code", href, title, &warn, tmp)
 			return
@@ -223,12 +225,12 @@ func NewSaveBidding(tmp map[string]interface{}) (b bool, res int, mgoid, mgocoll
 					if hanArrLen <= 10 { //汉字个数小于10,定为异常无效数据
 						if downloadFileOk { //若有附件,修改正文,有效数据
 							tmp["detail"] = DETAIL_TEXT
-						} else { //无附件,异常数据
+						} else if !nocheck { //无附件,异常数据
 							res = 3
 							errorData(LEVEL_ERROR, false, "projectinfo", "Detail File Err", href, title, &warn, tmp)
 							return
 						}
-					} else { //10-50个汉字且正文中有链接的数据,定为正文疑似无效detail_isvalidity=0
+					} else if !nocheck { //10-50个汉字且正文中有链接的数据,定为正文疑似无效detail_isvalidity=0
 						tmp["detail_isvalidity"] = 0
 					}
 					//}
@@ -344,7 +346,7 @@ func NewSaveBidding(tmp map[string]interface{}) (b bool, res int, mgoid, mgocoll
 	return
 }
 
-//数据判重
+// 数据判重
 func dataRepeat(tmp map[string]interface{}, href, hashHref, uuid string) (b bool, res int) {
 	detail := qutil.ObjToString(tmp["detail"])
 	filterDetail := sp.FilterDetail(detail) //只保留文本内容
@@ -486,7 +488,7 @@ func errorData(level int, entry bool, field, info, href string, title interface{
 	*warn = append(*warn, data)
 }
 
-//公共资源、政府采购等站点替换附件
+// 公共资源、政府采购等站点替换附件
 func ReplaceFile(site, sha string, data map[string]interface{}) {
 	defer qutil.Catch()
 	if siteReg.MatchString(site) {
@@ -553,7 +555,7 @@ func ReplaceFile(site, sha string, data map[string]interface{}) {
 	}
 }
 
-//用于无正文信息、有相同头部、正文为附件的信息,detail参与判重后,summary字段替换到detail
+// 用于无正文信息、有相同头部、正文为附件的信息,detail参与判重后,summary字段替换到detail
 func ReplaceDetail(data map[string]interface{}) {
 	defer qutil.Catch()
 	summary := qutil.ObjToString(data["summary"])