apple 5 years ago
parent
commit
e45cca97b7
2 changed files with 110 additions and 105 deletions
  1. 106 103
      udpfilterdup/src/datamap.go
  2. 4 2
      udpfilterdup/src/main.go

+ 106 - 103
udpfilterdup/src/datamap.go

@@ -263,90 +263,90 @@ L:
 				if v.id == info.id { //正常重复
 					return false, v, ""
 				}
-				//if v.id == "5c761a4fa5cb26b9b73d9512" &&info.id=="5c767bd1a5cb26b9b7a61597" {
-				//	log.Println("测试数据")
-				//}
-
-				if info.subtype == v.subtype {
-					if info.site != "" {
-						sitelock.Lock()
-						dict := SiteMap[info.site]
-						sitelock.Unlock()
-						if dict != nil {
-							if info.area == "全国" && dict["area"] != "" {
-								info.is_site = true
-								info.area = qutil.ObjToString(dict["area"])
-								info.city = qutil.ObjToString(dict["city"])
-							} else {
-								if info.city == "" && dict["city"] != "" {
-									info.is_site = true
-									info.area = qutil.ObjToString(dict["area"])
-									info.city = qutil.ObjToString(dict["city"])
-								}
-							}
+				if info.site != "" {
+					sitelock.Lock()
+					dict := SiteMap[info.site]
+					sitelock.Unlock()
+					if dict != nil {
+						if (info.area == "全国" && dict["area"] != "")||
+							(info.city == "" && dict["city"] != ""){
+							info.is_site = true
+							info.area = qutil.ObjToString(dict["area"])
+							info.city = qutil.ObjToString(dict["city"])
 						}
 					}
-					//前置条件1 - 站点相关
-					if info.site != "" && info.site == v.site {
-						if info.href != "" && info.href == v.href {
-							reason = "href相同"
-							b = true
-							source = v
-							reasons = reason
-							break L
-						}
-						if info.href != "" && info.href != v.href {
-							reason = "href不同-"
-						}
+				}
+				//前置条件1 - 站点相关
+				if info.site != "" && info.site == v.site {
+					if info.href != "" && info.href == v.href {
+						reason = "href相同"
+						b = true
+						source = v
+						reasons = reason
+						break L
 					}
-
-					//前置条件2 - 标题相关,有且一个关键词
-					if ((info.titleSpecialWord && !v.titleSpecialWord) || (info.specialWord && !v.specialWord)) &&
-						info.title != v.title && v.title != "" && info.title != "" {
-						continue
+					if info.href != "" && info.href != v.href {
+						reason = "href不同-"
 					}
+				}
 
-					//前置条件3 - 标题相关,均含有关键词
-					if ((info.titleSpecialWord && v.titleSpecialWord) || (info.specialWord && v.specialWord)) &&
-						len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 && v.title != "" && info.title != "" {
+				//前置条件2 - 标题相关,有且一个关键词
+				if ((info.titleSpecialWord && !v.titleSpecialWord) || (info.specialWord && !v.specialWord)) &&
+					info.title != v.title && v.title != "" && info.title != "" {
+					continue
+				}
 
-						letter1,letter2:=v.title,info.title
-						res, _ := regexp.Compile("[0-9a-zA-Z]+");
-						if res.MatchString(letter1)||res.MatchString(letter2) {
-							letter1=convertArabicNumeralsAndLetters(letter1)
-							letter2=convertArabicNumeralsAndLetters(letter2)
-						}
-						if strings.Contains(letter1,"重新招标")|| strings.Contains(letter2,"重新招标"){
-							letter1,letter2=dealWithSpecialPhrases(letter1,letter2)
-						}
-						if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
-							continue
-						}else {
-							reason = reason + "标题关键词且包含关系"
-							if !againRepeat(v, info) {//继续二级金额判断
-								b = true
-								source = v
-								reasons = reason
-								break L
-							}
-						}
-					}
+				//前置条件3 - 标题相关,均含有关键词
+				if ((info.titleSpecialWord && v.titleSpecialWord) || (info.specialWord && v.specialWord)) &&
+					len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 && v.title != "" && info.title != "" {
 
-					//新增快速数据过少判重
-					if LowHeavy {
-						repeat := false
-						if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
+					letter1,letter2:=v.title,info.title
+					res, _ := regexp.Compile("[0-9a-zA-Z]+");
+					if res.MatchString(letter1)||res.MatchString(letter2) {
+						letter1=convertArabicNumeralsAndLetters(letter1)
+						letter2=convertArabicNumeralsAndLetters(letter2)
+					}
+					if strings.Contains(letter1,"重新招标")|| strings.Contains(letter2,"重新招标"){
+						letter1,letter2=dealWithSpecialPhrases(letter1,letter2)
+					}
+					if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
+						continue
+					}else {
+						reason = reason + "标题关键词且包含关系"
+						if !againRepeat(v, info) {//继续二级金额判断
 							b = true
 							source = v
 							reasons = reason
 							break L
 						}
 					}
+				}
 
+				//新增快速数据过少判重
+				if LowHeavy {
+					repeat := false
+					if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
+						b = true
+						source = v
+						reasons = reason
+						break L
+					}
+				}
 
-					//代理机构相同-非空相等
-					if v.agency != "" && info.agency != "" && v.agency == info.agency {
-						reason = reason + "同机构-"
+				//代理机构相同-非空相等
+				if v.agency != "" && info.agency != "" && v.agency == info.agency {
+					reason = reason + "同机构-"
+					repeat := false
+					if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
+						b = true
+						source = v
+						reasons = reason
+						break L
+					}
+				} else {
+					reason = reason + "非同机构-"
+					if info.city != "" && info.city == v.city {
+						reason = reason + "同城-"
 						repeat := false
 						if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
 							b = true
@@ -355,25 +355,13 @@ L:
 							break L
 						}
 					} else {
-						reason = reason + "非同机构-"
-						if info.city != "" && info.city == v.city {
-							reason = reason + "同城-"
-							repeat := false
-							if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
-								b = true
-								source = v
-								reasons = reason
-								break L
-							}
-						} else {
-							reason = reason + "不同城-"
-							repeat := false
-							if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
-								b = true
-								source = v
-								reasons = reason
-								break L
-							}
+						reason = reason + "不同城-"
+						repeat := false
+						if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
+							b = true
+							source = v
+							reasons = reason
+							break L
 						}
 					}
 				}
@@ -474,6 +462,21 @@ func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
 			d.data[k] = data
 		}
 	}
+
+	//添加省
+	isAreaExist :=false
+	for _,v:= range d.areakeys {
+		if v==newData.area {
+			isAreaExist = true
+		}
+	}
+
+	if !isAreaExist {
+		areaArr := d.areakeys
+		areaArr = append(areaArr,newData.area)
+		d.areakeys = areaArr
+	}
+
 	d.lock.Unlock()
 }
 
@@ -804,38 +807,38 @@ func tenderRepeat_A(v *Info, info *Info, reason string) (bool, string) {
 	var ss string
 	p1, p2, p3, p4, p9, p10, p11 := false, false, false, false, false, false, false
 	if v.projectname != "" && v.projectname == info.projectname {
-		ss = ss + "p1(名称)-"
+		ss = ss + "p1-名称-"
 		p1 = true
 	}
 	if v.buyer != "" && v.buyer == info.buyer {
-		ss = ss + "p2(单位)-"
+		ss = ss + "p2-单位-"
 		p2 = true
 	}
 	if (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5) ||
 		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5) {
-		ss = ss + "p3(编号组)-"
+		ss = ss + "p3-编号组-"
 		p3 = true
 	}
 	if v.budget != 0 && v.budget == info.budget {
-		ss = ss + "p4(预算)-"
+		ss = ss + "p4-预算-"
 		p4 = true
 	}
 	if v.bidopentime != 0 && v.bidopentime == info.bidopentime {
-		ss = ss + "p9(开标时间)-"
+		ss = ss + "p9-开标时间-"
 		p9 = true
 	}
 	if v.bidopenaddress != "" && v.bidopenaddress == info.bidopenaddress {
-		ss = ss + "p10(开标地点)-"
+		ss = ss + "p10-开标地点-"
 		p10 = true
 	}
 	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
 		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
-		ss = ss + "p11(标题)-"
+		ss = ss + "p11-标题-"
 		p11 = true
 	}
 
-	if  (p1 && p2 && p4) || (p1 && p2 && p9) ||
-		(p1 && p2 && p10) || (p1 && p2 && p11) || (p1 && p3 && p9) || (p1 && p3 && p10) ||
+	if  (p1 && p2 && p3) || (p1 && p2 && p4) || (p1 && p2 && p9) || (p1 && p2 && p10) ||
+		(p1 && p2 && p11) || (p1 && p3 && p9) || (p1 && p3 && p10) ||
 		(p1 && p4 && p9) || (p1 && p4 && p10) || (p2 && p3 && p4) ||
 		(p2 && p3 && p9) || (p2 && p3 && p10) || (p2 && p3 && p11) ||
 		(p2 && p4 && p9) || (p2 && p4 && p10) || (p2 && p4 && p11) ||
@@ -914,31 +917,31 @@ func winningRepeat_A(v *Info, info *Info, reason string) (bool, string) {
 	var ss string
 	p1, p2, p3, p5, p6, p11 := false, false, false, false, false, false
 	if v.projectname != "" && v.projectname == info.projectname {
-		ss = ss + "p1(项目名称)-"
+		ss = ss + "p1-项目名称-"
 		p1 = true
 	}
 	if v.buyer != "" && v.buyer == info.buyer {
-		ss = ss + "p2(单位)-"
+		ss = ss + "p2-单位-"
 		p2 = true
 	}
 	if (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5) ||
 		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5) {
-		ss = ss + "p3(编号组)-"
+		ss = ss + "p3-编号组--"
 		p3 = true
 	}
 	if v.bidamount != 0 && !isBidWinningAmount(v.bidamount,info.bidamount) {
-		ss = ss + "p5(中标金)-"
+		ss = ss + "p5-中标金-"
 		p5 = true
 	}
 	if v.winner != "" && deleteExtraSpace(v.winner) == deleteExtraSpace(info.winner) {
-		ss = ss + "p6(中标人)-"
+		ss = ss + "p6-中标人-"
 		p6 = true
 	}
 
 
 	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
 		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
-		ss = ss + "p11(标题)-"
+		ss = ss + "p11-标题-"
 		p11 = true
 	}
 

+ 4 - 2
udpfilterdup/src/main.go

@@ -737,8 +737,10 @@ func basicDataScore(v *Info, info *Info) bool {
 	/*
 	  权重评估
 	  网站优先级判定规则:
-	  1、中央>省>市>县区
-	  2、政府采购>公共资源>采购单位官网>招标代理公司/平台
+	  1、国家>省级>市级>县区
+	  2、政府采购>公共资源>官方网站|政府门户>社会公共招标平台|企业招标平台
+	  3、同sitetype-分析weight
+	  4、要素打分-分析
 	*/
 	v_score, info_score := -1, -1
 	dict_v := SiteMap[v.site]