|
@@ -351,7 +351,9 @@ func NewSaveBidding(tmp map[string]interface{}) (b bool, res int, mgoid, mgocoll
|
|
|
// 数据判重
|
|
|
func dataRepeat(tmp map[string]interface{}, href, hashHref, uuid string, downloadFileOk bool) (b bool, res int) {
|
|
|
detail := qutil.ObjToString(tmp["detail"])
|
|
|
- filterDetail := sp.FilterDetail(detail) //只保留文本内容
|
|
|
+ hanArr := reg_han.FindAllString(detail, -1) //获取汉字集合
|
|
|
+ hanArrLen := len(hanArr) //汉字个数
|
|
|
+ filterDetail := sp.FilterDetail(detail) //只保留文本内容
|
|
|
//新版数据判重
|
|
|
publishtime := qutil.Int64All(tmp["publishtime"])
|
|
|
isExist := false //记录判重结果
|
|
@@ -405,10 +407,10 @@ func dataRepeat(tmp map[string]interface{}, href, hashHref, uuid string, downloa
|
|
|
if repeatby == "bloom_detail" || repeatby == "sha_detail" { //被正文判重,保留数据,打上标记
|
|
|
/*
|
|
|
日期:2024-04-26
|
|
|
- 逻辑:含有效附件的数据不进行正文判重
|
|
|
+ 逻辑:含有效附件且正文汉字个数不超过100的数据不进行正文判重
|
|
|
提出者:张金坤
|
|
|
*/
|
|
|
- if !downloadFileOk { //含有效附件,不进行正文判重及extracttype != -1
|
|
|
+ if hanArrLen > 100 || !downloadFileOk {
|
|
|
tmp["extracttype"] = -1
|
|
|
}
|
|
|
return false, 4
|