Răsfoiți Sursa

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

fengweiqiang 5 ani în urmă
părinte
comite
2d59fd7863

+ 4 - 0
udpcreateindex/src/biddingall.go

@@ -66,6 +66,10 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 	var compare bson.M
 	bnil := false
 	for tmp := make(map[string]interface{}); query.Next(tmp); n++ {
+		// if qutil.IntAll(tmp["dataging"]) == 1 { //dataging=1不生索引
+		// 	tmp = make(map[string]interface{})
+		// 	continue
+		// }
 		update := map[string]interface{}{}
 		del := map[string]interface{}{} //记录extract没有值而bidding中有值的字段
 		//对比方法----------------

+ 4 - 0
udpcreateindex/src/biddingindex.go

@@ -95,6 +95,10 @@ func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interfac
 	log.Println("开始迭代..")
 	for n, tmp := range infos {
 		n1++
+		if qutil.IntAll(tmp["dataging"]) == 1 { //dataging=1不生索引
+			tmp = make(map[string]interface{})
+			continue
+		}
 		update := map[string]interface{}{} //要更新的mongo数据
 		//对比方法----------------
 		tid := qutil.BsonIdToSId(tmp["_id"])

+ 4 - 4
udpcreateindex/src/config.json

@@ -30,11 +30,11 @@
     },
     "bidding": {
         "db": "mxs",
-        "collect": "bidding_test",
+        "collect": "test1",
         "index": "bidding_v2",
         "type": "bidding",
         "extractdb": "mxs",
-        "extractcollect": "extract",
+        "extractcollect": "test2",
         "indexfields":[ 
         "buyerzipcode","winnertel","winnerperson","contractcode","winneraddr","agencyaddr","buyeraddr","signaturedate","projectperiod","projectaddr","agencytel","agencyperson","buyerperson","agency","projectscope","projectcode","bidopentime","supervisorrate","buyertel","bidamount","winner","buyer","budget","projectname","bidstatus","buyerclass","topscopeclass","s_subscopeclass","area","city","district","s_winner","_id","title","detail","site","comeintime","href","infoformat","publishtime","s_sha","spidercode","subtype","toptype","projectinfo"
         ],
@@ -62,7 +62,7 @@
 		"addr": "192.168.3.207:27092",
 		"pool": 5,
         "db": "mxs",
-        "collect": "qyxy",
+        "collect": "test2",
         "index": "qyxy_ent",
         "type": "qyxy_ent"
     },
@@ -92,7 +92,7 @@
         "db": "mxs"
     },
     "elastic": {
-        "addr": "http://127.0.0.1:9800",
+        "addr": "http://192.168.3.128:9800",
         "pool": 12
     }
 }

+ 207 - 1
udpcreateindex/src/qyxyindex.go

@@ -14,7 +14,7 @@ var (
 	timeReg = regexp.MustCompile("[\\d]{4}-[\\d]{2}-[\\d]{2}")
 )
 
-func qyxyTask(q map[string]interface{}) {
+func qyxyTask1(q map[string]interface{}) {
 	defer util.Catch()
 	//	savelock := sync.Mutex{}
 	//连接
@@ -233,3 +233,209 @@ func qyxyTask(q map[string]interface{}) {
 	}
 	log.Println("create qyxy index...over", n)
 }
+
+func qyxyTask(q map[string]interface{}) {
+	defer util.Catch()
+	//	savelock := sync.Mutex{}
+	//连接
+	session := qyxydb.GetMgoConn(86400)
+	defer qyxydb.DestoryMongoConn(session)
+	//
+	c, _ := qyxy_ent["collect"].(string)
+	db, _ := qyxy_ent["db"].(string)
+	index, _ := qyxy_ent["index"].(string)
+	itype, _ := qyxy_ent["type"].(string)
+	count, _ := session.DB(db).C(c).Find(&q).Count()
+	savepool := make(chan bool, 10)
+
+	log.Println("企业信用索引	查询语句:", q, "同步总数:", count, "elastic库:", index)
+	query := session.DB(db).C(c).Find(q).Iter()
+
+	arr := make([]map[string]interface{}, savesizei)
+	var n int
+	i := 0
+	for tmp := make(map[string]interface{}); query.Next(tmp); i = i + 1 {
+		//delete(tmp, "_id")
+		tmp["_id"] = tmp["company_id"]
+		// delete(tmp, "cancels")
+		// delete(tmp, "cancel_date")
+		// delete(tmp, "intellectuals")
+		// delete(tmp, "chattels")
+		// delete(tmp, "checks")
+		// delete(tmp, "revoke_date")
+		delete(tmp, "changes")
+		// delete(tmp, "partners")
+
+		// if tmp["establish_date"] != nil {
+		// 	establish_date_time, ok := tmp["establish_date"].(time.Time)
+		// 	if ok {
+		// 		tmp["establish_date"] = establish_date_time.Unix()
+		// 	} else {
+		// 		tmp["establish_date"] = 0
+		// 		util.Debug(tmp["company_id"], "establish_date")
+		// 	}
+		// }
+
+		// if tmp["lastupdatetime"] != nil {
+		// 	lastupdatetime_time, ok := tmp["lastupdatetime"].(time.Time)
+		// 	if ok {
+		// 		tmp["lastupdatetime"] = lastupdatetime_time.Unix()
+		// 	} else {
+		// 		tmp["lastupdatetime"] = 0
+		// 		util.Debug(tmp["company_id"], "lastupdatetime")
+		// 	}
+		// }
+
+		// if tmp["issue_date"] != nil {
+		// 	issue_date_time, ok := tmp["issue_date"].(time.Time)
+		// 	if ok {
+		// 		tmp["issue_date"] = issue_date_time.Unix()
+		// 	} else {
+		// 		tmp["issue_date"] = 0
+		// 		util.Debug(tmp["company_id"], "issue_date")
+		// 	}
+		// }
+
+		// if operation_startdate, ok := tmp["operation_startdate"].(string); operation_startdate != "" && ok {
+		// 	operation_startdate = timeReg.FindString(operation_startdate)
+		// 	tmp["operation_startdate"] = operation_startdate + " 00:00:00"
+		// }
+
+		// if operation_enddate, ok := tmp["operation_enddate"].(string); operation_enddate != "" && ok {
+		// 	operation_enddate = timeReg.FindString(operation_enddate)
+		// 	tmp["operation_enddate"] = operation_enddate + " 00:00:00"
+		// }
+
+		// //operations
+		// if operations, ok := tmp["operations"].([]interface{}); ok && len(operations) > 0 {
+		// 	for _, operation := range operations {
+		// 		if tmp1, ok := operation.(map[string]interface{}); tmp1 != nil && ok && len(tmp1) > 0 {
+		// 			if included_time, ok := tmp1["included_time"].(string); ok && included_time != "" {
+		// 				included_time = timeReg.FindString(included_time)
+		// 				tmp1["included_time"] = included_time + " 00:00:00"
+		// 			}
+		// 			if removed_time, ok := tmp1["removed_time"].(string); ok && removed_time != "" {
+		// 				removed_time = timeReg.FindString(removed_time)
+		// 				tmp1["removed_time"] = removed_time + " 00:00:00"
+		// 			}
+		// 		}
+		// 	}
+		// }
+		// //punishes
+		// if punishes, ok := tmp["punishes"].([]interface{}); ok && len(punishes) > 0 {
+		// 	for _, punishe := range punishes {
+		// 		if tmp1, ok := punishe.(map[string]interface{}); tmp1 != nil && ok && len(tmp1) > 0 {
+		// 			if public_date, ok := tmp1["public_date"].(string); ok && public_date != "" {
+		// 				public_date = timeReg.FindString(public_date)
+		// 				tmp1["public_date"] = public_date + " 00:00:00"
+		// 			}
+		// 			if punish_date, ok := tmp1["punish_date"].(string); ok && punish_date != "" {
+		// 				punish_date = timeReg.FindString(punish_date)
+		// 				tmp1["punish_date"] = punish_date + " 00:00:00"
+		// 			}
+		// 		}
+		// 	}
+		// }
+		// //annual_reports
+		// if annual_reports, ok := tmp["annual_reports"].([]interface{}); ok && len(annual_reports) > 0 {
+		// 	for _, annual_report := range annual_reports {
+		// 		if tmp1, ok := annual_report.(map[string]interface{}); tmp1 != nil && ok && len(tmp1) > 0 {
+		// 			if report_changes, ok := tmp1["report_changes"].([]interface{}); ok && len(report_changes) > 0 {
+		// 				for _, report_change := range report_changes {
+		// 					if tmp2, ok := report_change.(map[string]interface{}); tmp2 != nil && ok && len(tmp2) > 0 {
+		// 						if change_date, ok := tmp2["change_date"].(string); ok && change_date != "" {
+		// 							change_date = timeReg.FindString(change_date)
+		// 							tmp2["change_date"] = change_date + " 00:00:00"
+		// 						}
+		// 					}
+		// 				}
+		// 			}
+		// 			if report_partners, ok := tmp1["report_partners"].([]interface{}); ok && len(report_partners) > 0 {
+		// 				for _, report_partner := range report_partners {
+		// 					if tmp2, ok := report_partner.(map[string]interface{}); tmp2 != nil && ok && len(tmp2) > 0 {
+		// 						if stock_realdate, ok := tmp2["stock_realdate"].(string); ok && stock_realdate != "" {
+		// 							stock_realdate = timeReg.FindString(stock_realdate)
+		// 							tmp2["stock_realdate"] = stock_realdate + " 00:00:00"
+		// 						}
+		// 						if stock_date, ok := tmp2["stock_date"].(string); ok && stock_date != "" {
+		// 							stock_date = timeReg.FindString(stock_date)
+		// 							tmp2["stock_date"] = stock_date + " 00:00:00"
+		// 						}
+		// 					}
+		// 				}
+		// 			}
+		// 			if report_equity_changes, ok := tmp1["report_equity_changes"].([]interface{}); ok && len(report_equity_changes) > 0 {
+		// 				for _, report_equity_change := range report_equity_changes {
+		// 					if tmp2, ok := report_equity_change.(map[string]interface{}); tmp2 != nil && ok && len(tmp2) > 0 {
+		// 						if change_date, ok := tmp2["change_date"].(string); ok && change_date != "" {
+		// 							change_date = timeReg.FindString(change_date)
+		// 							tmp2["change_date"] = change_date + " 00:00:00"
+		// 						}
+		// 					}
+		// 				}
+		// 			}
+		// 			if report_out_guarantees, ok := tmp1["report_out_guarantees"].([]interface{}); ok && len(report_out_guarantees) > 0 {
+		// 				for _, report_out_guarantee := range report_out_guarantees {
+		// 					if tmp2, ok := report_out_guarantee.(map[string]interface{}); tmp2 != nil && ok && len(tmp2) > 0 {
+		// 						if perform_time, ok := tmp2["perform_time"].(string); ok && perform_time != "" {
+		// 							perform_time = timeReg.FindString(perform_time)
+		// 							tmp2["perform_time"] = perform_time + " 00:00:00"
+		// 						}
+		// 						if guarantee_time, ok := tmp2["guarantee_time"].(string); ok && guarantee_time != "" {
+		// 							guarantee_time = timeReg.FindString(guarantee_time)
+		// 							tmp2["guarantee_time"] = guarantee_time + " 00:00:00"
+		// 						}
+		// 					}
+		// 				}
+		// 			}
+
+		// 		}
+		// 	}
+		// }
+		arr[i] = tmp
+		n++
+		if i == savesizei-1 {
+			savepool <- true
+			tmps := arr
+			go func(tmpn *[]map[string]interface{}) {
+				defer func() {
+					<-savepool
+				}()
+				elastic.BulkSave(index, itype, tmpn, true)
+			}(&tmps)
+			i = 0
+			arr = make([]map[string]interface{}, savesizei)
+		}
+		if n%savesizei == 0 {
+			log.Println("当前:", n)
+		}
+
+		// n++
+		// savelock.Lock()
+		// arr = append(arr, tmp)
+		// //生索引
+		// if len(arr) >= savesizei-1 {
+		// 	tmps := arr
+		// 	elastic.BulkSave(index, itype, &tmps, true)
+		// 	time.Sleep(1 * time.Second)
+		// 	arr = []map[string]interface{}{}
+		// }
+		// savelock.Unlock()
+		// //计数
+		// if n%savesizei == 0 {
+		// 	log.Println("当前:", n)
+		// }
+		tmp = make(map[string]interface{})
+
+	}
+	// savelock.Lock()
+	// if len(arr) > 0 {
+	// 	tmps := arr
+	// 	elastic.BulkSave(index, itype, &tmps, true)
+	// }
+	// savelock.Unlock()
+	if i > 0 {
+		elastic.BulkSave(index, itype, &arr, true)
+	}
+	log.Println("create qyxy index...over", n)
+}

+ 12 - 5
udpfilterdup/src/config.json

@@ -2,10 +2,10 @@
     "udpport": ":1485",
     "dupdays": 5,
     "mongodb": {
-        "addr": "127.0.0.1:27092",
+        "addr": "192.168.3.207:27092",
         "pool": 10,
-        "db": "qfw",
-        "extract": "extract_v20190111",
+        "db": "extract_kf",
+        "extract": "a_testbidding_new",
         "site": {
             "dbname": "qfw",
             "coll": "site"
@@ -17,11 +17,18 @@
     },
     "nextNode": [],
     "isMerger": false,
-    "threads": 1,
+    "threads": 5,
     "isSort":false,
+    "lowHeavy":true,
     "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
+    "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
     "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批)",
-    "specialtitle_2": "项目([0-9a-zA-Z一二三四五六七八九十零123456789])",
+    "specialtitle_2": "项目[(][0-9a-zA-Z一二三四五六七八九十零123456789][)]",
     "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
 }
 
+
+
+
+
+

+ 196 - 18
udpfilterdup/src/datamap.go

@@ -5,6 +5,7 @@ import (
 	"log"
 	qutil "qfw/util"
 	"qfw/util/mongodb"
+	"regexp"
 	"strings"
 	"sync"
 	"time"
@@ -28,7 +29,7 @@ type Info struct {
 	publishtime    int64   //发布时间
 	comeintime     int64   //入库时间
 	bidopentime    int64   //开标时间
-	agencyaddr     string  //开标地点
+	bidopenaddress string  //开标地点
 
 	site string //站点
 	href string //正文的url
@@ -218,13 +219,13 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	info.publishtime = qutil.Int64All(tmp["publishtime"])
 	info.comeintime = qutil.Int64All(tmp["comeintime"])
 	info.bidopentime = qutil.Int64All(tmp["bidopentime"])
-	info.agencyaddr = qutil.ObjToString(tmp["agencyaddr"])
+	info.bidopenaddress = qutil.ObjToString(tmp["bidopenaddress"])
 	info.site = qutil.ObjToString(tmp["site"])
 	info.href = qutil.ObjToString(tmp["href"])
 	info.repeatid = qutil.ObjToString(tmp["repeatid"])
 
 	info.specialWord = FilterRegTitle.MatchString(info.title)
-	info.titleSpecialWord = FilterRegTitle_1.MatchString(info.title) || FilterRegTitle_2.MatchString(info.title)
+	info.titleSpecialWord = FilterRegTitle_0.MatchString(info.title) ||FilterRegTitle_1.MatchString(info.title) || FilterRegTitle_2.MatchString(info.title)
 	info.mergemap = *qutil.ObjToMap(tmp["merge_map"])
 	if info.mergemap == nil {
 		info.mergemap = make(map[string]interface{}, 0)
@@ -316,6 +317,19 @@ L:
 						}
 					}
 
+
+					//新增快速数据过少判重
+					if LowHeavy {
+						repeat := false
+						if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
+							b = true
+							source = v
+							reasons = reason
+							break
+						}
+					}
+
+
 					//代理机构相同-非空相等
 					if v.agency != "" && info.agency != "" && v.agency == info.agency {
 						reason = reason + "同机构-"
@@ -459,6 +473,17 @@ L:
 						}
 					}
 
+					//新增快速数据过少判重
+					if LowHeavy {
+						repeat := false
+						if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
+							b = true
+							source = v
+							reasons = reason
+							break
+						}
+					}
+
 					//代理机构相同-非空相等
 					if v.agency != "" && info.agency != "" && v.agency == info.agency {
 						reason = reason + "同机构-"
@@ -616,7 +641,7 @@ func (d *datamap) update(t int64) {
 	//log.Println("更新前后数据:", all, all1)
 }
 
-func (d *datamap) GetLatelyFiveDay(t int64) []string {
+func (d *datamap) GetLatelyFiveDay(t int64) []string  {
 	array := make([]string, d.days)
 	now := time.Unix(t, 0)
 	for i := 0; i < d.days; i++ {
@@ -628,9 +653,121 @@ func (d *datamap) GetLatelyFiveDay(t int64) []string {
 
 /*
 **************************
-******* 以下为判重 ********
+******** 以下为判重 ********
 **************************
  */
+
+// fastLowQualityHeavy is a quick duplicate check for records that carry very
+// little extracted data.
+//
+// v is the record being compared against (the caller keeps it as the duplicate
+// source on a hit) and info is the record under test. reason is the reason
+// trail built so far; the returned string is that trail, extended when a
+// duplicate is reported.
+//
+// The check only applies when both records have the same agency and the same
+// non-empty title, and info has no project name, project code, contract number
+// or buyer. It then counts how many of info's five elements (budget, bid
+// opening time, bid opening address, winner, bid amount) are set:
+//   - none set: reported as a duplicate;
+//   - exactly one set: reported as a duplicate if judgeLowQualityData agrees;
+//   - more than one set: not handled here.
+func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
+	sameHeader := info.agency == v.agency && info.title != "" && info.title == v.title
+	noKeyFields := info.projectname == "" && info.projectcode == "" &&
+		info.contractnumber == "" && info.buyer == ""
+	if !sameHeader || !noKeyFields {
+		return false, reason
+	}
+
+	// Count which of the five elements are present on info.
+	present := [5]bool{
+		info.budget != 0,         // budget
+		info.bidopentime != 0,    // bid opening time
+		info.bidopenaddress != "", // bid opening address
+		info.winner != "",        // winner
+		info.bidamount != 0,      // bid amount
+	}
+	filled := 0
+	for _, ok := range present {
+		if ok {
+			filled++
+		}
+	}
+
+	switch filled {
+	case 0:
+		return true, reason + "---要素均为空,标题包含关系"
+	case 1:
+		matched, detail := judgeLowQualityData(v, info, reason)
+		reason = detail
+		if matched {
+			log.Println("符合低质量条件条件1", info.id, "--", v.id)
+			return true, reason + "---有且一个要素组合"
+		}
+	}
+	return false, reason
+}
+
+//类别细节原因记录
+func judgeLowQualityData(v *Info, info *Info, reason string) (bool, string) {
+	if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
+		info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
+		info.subtype == "变更" || info.subtype == "其他" {
+		//招标结果
+		if info.budget != 0 && info.budget == v.budget{//预算
+			reason = reason + "---招标类:预算"
+			return true,reason
+		}
+		if info.bidopentime != 0 && info.bidopentime == v.bidopentime{//开标时间
+			reason = reason + "---招标类:开标时间"
+			return true,reason
+		}
+		if info.bidopenaddress!="" && info.bidopenaddress == v.bidopenaddress{//开标地点
+			reason = reason + "---招标类:开标地点"
+			return true,reason
+		}
+	} else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
+		//中标结果
+		if v.winner != "" && info.winner == v.winner{//中标单位
+			reason = reason + "---中标类:中标单位"
+			return true,reason
+		}
+		if v.bidamount != 0 && info.bidamount == v.bidamount{//中标金额
+			reason = reason + "---中标类:中标金额"
+			return true,reason
+		}
+	} else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
+		//合同
+		if info.budget != 0 && info.budget == v.budget{//预算
+			reason = reason + "---合同类:预算"
+			return true,reason
+		}
+		if info.bidopentime != 0 && info.bidopentime == v.bidopentime{//开标时间
+			reason = reason + "---合同类:开标时间"
+			return true,reason
+		}
+		if info.bidopenaddress!="" && info.bidopenaddress == v.bidopenaddress{//开标地点
+			reason = reason + "---合同类:开标地点"
+			return true,reason
+		}
+		if v.winner != "" && info.winner == v.winner{//中标单位
+			reason = reason + "---合同类:中标单位"
+			return true,reason
+		}
+		if v.bidamount != 0 && info.bidamount == v.bidamount{//中标金额
+			reason = reason + "---合同类:中标金额"
+			return true,reason
+		}
+	} else {
+		//招标结果
+		if info.budget != 0 && info.budget == v.budget{//预算
+			reason = reason + "---类别空-招标类:预算"
+			return true,reason
+		}
+		if info.bidopentime != 0 && info.bidopentime == v.bidopentime{//开标时间
+			reason = reason + "---类别空-招标类:开标时间"
+			return true,reason
+		}
+		if info.bidopenaddress!="" && info.bidopenaddress == v.bidopenaddress{//开标地点
+			reason = reason + "---类别空-招标类:开标地点"
+			return true,reason
+		}
+	}
+	return false,reason
+}
+
 //判重方法1
 func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
 
@@ -795,7 +932,7 @@ func tenderRepeat_A(v *Info, info *Info, reason string) (bool, string) {
 		ss = ss + "p9(开标时间)-"
 		p9 = true
 	}
-	if v.agencyaddr != "" && v.agencyaddr == info.agencyaddr {
+	if v.bidopenaddress != "" && v.bidopenaddress == info.bidopenaddress {
 		ss = ss + "p10(开标地点)-"
 		p10 = true
 	}
@@ -839,9 +976,9 @@ func tenderRepeat_B(v *Info, info *Info, reason string) (bool, string) {
 	if v.bidopentime != 0 && v.bidopentime == info.bidopentime {
 		m++
 	}
-	if v.agencyaddr != "" && v.agencyaddr == info.agencyaddr {
-		m++
-	}
+	//if v.bidopenaddress != "" && v.bidopenaddress == info.bidopenaddress {
+	//	m++
+	//}
 	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
 		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
 		m++
@@ -872,7 +1009,7 @@ func tenderRepeat_C(v *Info, info *Info) bool {
 	if v.bidopentime != 0 && info.bidopentime != 0 && v.bidopentime != info.bidopentime {
 		return true
 	}
-	if v.agencyaddr != "" && info.agencyaddr != "" && v.agencyaddr != info.agencyaddr {
+	if v.bidopenaddress != "" && info.bidopenaddress != "" && v.bidopenaddress != info.bidopenaddress {
 		return true
 	}
 
@@ -897,14 +1034,25 @@ func winningRepeat_A(v *Info, info *Info, reason string) (bool, string) {
 		ss = ss + "p3(编号组)-"
 		p3 = true
 	}
-	if v.bidamount != 0 && v.bidamount == info.bidamount {
+	//if v.bidamount != 0 && v.bidamount == info.bidamount {
+	//	ss = ss + "p5(中标金)-"
+	//	p5 = true
+	//}
+	//if v.winner != "" && v.winner == info.winner {
+	//	ss = ss + "p6(中标人)-"
+	//	p6 = true
+	//}
+
+	if v.bidamount != 0 && !isBidWinningAmount(v.bidamount,info.bidamount) {
 		ss = ss + "p5(中标金)-"
 		p5 = true
 	}
-	if v.winner != "" && v.winner == info.winner {
+	if v.winner != "" && deleteExtraSpace(v.winner) == deleteExtraSpace(info.winner) {
 		ss = ss + "p6(中标人)-"
 		p6 = true
 	}
+
+
 	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
 		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
 		ss = ss + "p11(标题)-"
@@ -939,10 +1087,10 @@ func winningRepeat_B(v *Info, info *Info, reason string) (bool, string) {
 		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5) {
 		m++
 	}
-	if v.bidamount != 0 && v.bidamount == info.bidamount {
+	if v.bidamount != 0 && !isBidWinningAmount(v.bidamount,info.bidamount) {
 		m++
 	}
-	if v.winner != "" && v.winner == info.winner {
+	if v.winner != "" && deleteExtraSpace(v.winner) == deleteExtraSpace(info.winner) {
 		m++
 	}
 	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
@@ -964,10 +1112,14 @@ func winningRepeat_B(v *Info, info *Info, reason string) (bool, string) {
 //中标_C
 func winningRepeat_C(v *Info, info *Info) bool {
 
-	if v.bidamount != 0 && info.bidamount != 0 && v.bidamount != info.bidamount {
+	//if v.bidamount != 0 && info.bidamount != 0 && v.bidamount != info.bidamount {
+	//	return true
+	//}
+	if v.bidamount != 0 && info.bidamount != 0 && isBidWinningAmount(v.bidamount,info.bidamount) {
 		return true
 	}
-	if v.winner != "" && info.winner != "" && v.winner != info.winner {
+
+	if v.winner != "" && info.winner != "" && deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) {
 		return true
 	}
 	//原始地址...
@@ -1018,6 +1170,7 @@ func contractRepeat_C(v *Info, info *Info) bool {
 	return false
 }
 
+//再次金额判断
 func againRepeat(v *Info, info *Info) bool {
 	//相同采购单位下
 	if info.buyer != "" && v.buyer == info.buyer {
@@ -1032,8 +1185,8 @@ func againRepeat(v *Info, info *Info) bool {
 			info.subtype == "流标" || info.subtype == "合同" || info.subtype == "验收" ||
 			info.subtype == "违规" {
 			//中标金额单位满足条件
-			if (v.bidamount != info.bidamount && v.bidamount != 0 && info.bidamount != 0) ||
-				(v.winner != info.winner && v.winner != "" && info.winner != "") {
+			if (isBidWinningAmount(v.bidamount,info.bidamount) && v.bidamount != 0 && info.bidamount != 0) ||
+				(deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "") {
 				return true
 			}
 		} else {
@@ -1043,3 +1196,28 @@ func againRepeat(v *Info, info *Info) bool {
 
 	return false
 }
+
+//删除中标单位字符串中多余的空格(含tab)
+func deleteExtraSpace(s string) string {
+	//删除字符串中的多余空格,有多个空格时,仅保留一个空格
+	s1 := strings.Replace(s, "  ", " ", -1)      //替换tab为空格
+	regstr := "\\s{2,}"                          //两个及两个以上空格的正则表达式
+	reg, _ := regexp.Compile(regstr)             //编译正则表达式
+	s2 := make([]byte, len(s1))                  //定义字符数组切片
+	copy(s2, s1)                                 //将字符串复制到切片
+	spc_index := reg.FindStringIndex(string(s2)) //在字符串中搜索
+	for len(spc_index) > 0 {                     //找到适配项
+		s2 = append(s2[:spc_index[0]+1], s2[spc_index[1]:]...) //删除多余空格
+		spc_index = reg.FindStringIndex(string(s2))            //继续在字符串中搜索
+	}
+	return string(s2)
+}
+
+//中标金额倍率:10000
+func isBidWinningAmount(f1 float64 ,f2 float64) bool {
+
+	if f1==f2||f1*10000==f2||f2*10000==f1 {
+		return false
+	}
+	return true
+}

+ 264 - 301
udpfilterdup/src/main.go

@@ -32,6 +32,7 @@ var (
 
 	//正则筛选相关
 	FilterRegTitle   = regexp.MustCompile("^_$")
+	FilterRegTitle_0 = regexp.MustCompile("^_$")
 	FilterRegTitle_1 = regexp.MustCompile("^_$")
 	FilterRegTitle_2 = regexp.MustCompile("^_$")
 
@@ -39,14 +40,14 @@ var (
 	Is_Sort          bool                              //是否排序
 	threadNum        int                               //线程数量
 	SiteMap          map[string]map[string]interface{} //站点map
-	idtype, sid, eid string                            //测试人员判重使用
+	LowHeavy		 bool							   //低质量数据判重
+	sid, eid string                            //测试人员判重使用
 )
 
 func init() {
 	flag.StringVar(&lastid, "id", "", "最后加载id") //以小于等于此id开始加载最近几天的数据
 	flag.StringVar(&sid, "sid", "", "开始id")
 	flag.StringVar(&eid, "eid", "", "结束id")
-	flag.StringVar(&idtype, "idtype", "", "id类型,默认ObjectId:0,String:1")
 	flag.Parse()
 	//172.17.145.163:27080
 	util.ReadConfig(&Sysconfig)
@@ -63,12 +64,13 @@ func init() {
 	//加载数据
 	DM = NewDatamap(dupdays, lastid)
 	FilterRegTitle = regexp.MustCompile(util.ObjToString(Sysconfig["specialwords"]))
+	FilterRegTitle_0 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_0"]))
 	FilterRegTitle_1 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_1"]))
 	FilterRegTitle_2 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_2"]))
 	isMerger = Sysconfig["isMerger"].(bool)
 	Is_Sort = Sysconfig["isSort"].(bool)
 	threadNum = util.IntAllDef(Sysconfig["threads"], 1)
-
+	LowHeavy =  Sysconfig["lowHeavy"].(bool)
 	//站点配置
 	site := mconf["site"].(map[string]interface{})
 	SiteMap = make(map[string]map[string]interface{}, 0)
@@ -103,12 +105,16 @@ func mainT() {
 	/*
 		ObjectId("5da3f31aa5cb26b9b798d3aa")
 		ObjectId("5da418c4a5cb26b9b7e3e9a6")
-		ObjectId("5df5071ce9d1f601e495fa54")
-		ObjectId("5e09c05f0cf41612e0626abc")
+
+		ObjectId("5da3f2c5a5cb26b9b79847fc")
+		ObjectId("5db2735ba5cb26b9b7c99c6f")
 	*/
 	log.Println("测试开始")
-	sid = "5da3f31aa5cb26b9b798d3aa"
-	eid = "5da418c4a5cb26b9b7e3e9a6"
+	sid = "5da3f2c5a5cb26b9b79847fc"
+	eid = "5db2735ba5cb26b9b7c99c6f"
+
+
+
 	mapinfo := map[string]interface{}{}
 	if sid == "" || eid == "" {
 		log.Println("sid,eid参数不能为空")
@@ -160,32 +166,23 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 
 //开始判重程序
 func task(data []byte, mapInfo map[string]interface{}) {
-	fmt.Println("开始数据判重")
+	log.Println("开始数据判重")
 	defer util.Catch()
 	//区间id
-	q := map[string]interface{}{}
-	if idtype == "1" {
-		q = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  mapInfo["gtid"].(string),
-				"$lte": mapInfo["lteid"].(string),
-			},
-		}
-	} else {
-		q = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
-				"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
-			},
-		}
+	q := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
+			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
+		},
 	}
 	log.Println(mgo.DbName, extract, q)
 	sess := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess)
 
 	//是否排序
-	it := sess.DB(mgo.DbName).C(extract).Find(&q).Iter()
+	it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("_id").Iter()
 	if Is_Sort {
+		log.Println("排序:publishtime")
 		it = sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
 	}
 	//it = sess.DB(mgo.DbName).C(extract).Find(&q).Iter()
@@ -193,12 +190,17 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	log.Println("线程数:", threadNum)
 	pool := make(chan bool, threadNum)
 	wg := &sync.WaitGroup{}
-	//mapLock := &sync.Mutex{}
 	n, repeateN := 0, 0
+
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
 		if n%10000 == 0 {
 			log.Println("current:", n, tmp["_id"], "repeateN:", repeateN)
 		}
+		if util.IntAll(tmp["repeat"]) == 1 {
+			tmp = make(map[string]interface{})
+			repeateN++
+			continue
+		}
 		pool <- true
 		wg.Add(1)
 		go func(tmp map[string]interface{}) {
@@ -207,132 +209,123 @@ func task(data []byte, mapInfo map[string]interface{}) {
 				wg.Done()
 			}()
 			info := NewInfo(tmp)
-			//是否为无效数据
-			if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
-				updateExtract = append(updateExtract, []map[string]interface{}{
-					map[string]interface{}{
-						"_id": tmp["_id"],
-					},
-					map[string]interface{}{
-						"$set": map[string]interface{}{
-							"repeat": -1,
+			if !LowHeavy {	//是否进行低质量数据判重
+				if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
+					updateExtract = append(updateExtract, []map[string]interface{}{
+						map[string]interface{}{
+							"_id": tmp["_id"],
 						},
-					},
-				})
-				if len(updateExtract) > 500 {
-					mgo.UpSertBulk(extract, updateExtract...)
-					updateExtract = [][]map[string]interface{}{}
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"repeat": -1,//无效数据标签
+							},
+						},
+					})
+					if len(updateExtract) > 500 {
+						mgo.UpSertBulk(extract, updateExtract...)
+						updateExtract = [][]map[string]interface{}{}
+					}
+					return
 				}
-			} else {
-				b, source, reason := DM.check(info)
-				if b { //有重复,生成更新语句,更新抽取和更新招标
-					repeateN++
-					var is_replace = false
-					var mergeArr = []int64{}                    //更改合并数组记录
-					var newData = &Info{}                       //更换新的数据池数据
-					var repeat_idMap = map[string]interface{}{} //记录判重的
-					var merge_idMap = map[string]interface{}{}  //记录合并的
-					if idtype == "1" {                          //先临时决定一个id
-						repeat_idMap["_id"] = info.id
-						merge_idMap["_id"] = source.id
-					} else {
+			}
+
+			b, source, reason := DM.check(info)
+			if b { //有重复,生成更新语句,更新抽取和更新招标
+				repeateN++
+				var is_replace = false
+				var mergeArr = []int64{}                    //更改合并数组记录
+				var newData = &Info{}                       //更换新的数据池数据
+				var repeat_idMap = map[string]interface{}{} //记录判重的
+				var merge_idMap = map[string]interface{}{}  //记录合并的
+				repeat_idMap["_id"] = StringTOBsonId(info.id)
+				merge_idMap["_id"] = StringTOBsonId(source.id)
+				repeat_id := source.id//初始化一个数据
+
+				if isMerger {//合并相关
+					basic_bool := basicDataScore(source, info)
+					if basic_bool {
+						//已原始数据为标准 - 对比数据打判重标签-
+						newData, mergeArr, is_replace = mergeDataFields(source, info)
+						DM.replaceSourceData(newData, source.id) //替换
+						//对比数据打重复标签的id,原始数据id的记录
 						repeat_idMap["_id"] = StringTOBsonId(info.id)
 						merge_idMap["_id"] = StringTOBsonId(source.id)
+						repeat_id = source.id
+					} else {
+						//已对比数据为标准 ,数据池的数据打判重标签
+						newData, mergeArr, is_replace = mergeDataFields(info, source)
+						DM.replaceSourceData(newData, source.id) //替换
+						//原始数据打重复标签的id,   对比数据id的记录
+						repeat_idMap["_id"] = StringTOBsonId(source.id)
+						merge_idMap["_id"] = StringTOBsonId(info.id)
+						repeat_id = info.id
 					}
-					repeat_id := source.id
-					//以下合并相关
-					if isMerger {
-						basic_bool := basicDataScore(source, info)
-						if basic_bool {
-							//已原始数据为标准 - 对比数据打判重标签-
-							newData, mergeArr, is_replace = mergeDataFields(source, info)
-							DM.replaceSourceData(newData, source.id) //替换
-							//对比数据打重复标签的id,原始数据id的记录
-							if idtype == "1" {
-								repeat_idMap["_id"] = info.id
-								merge_idMap["_id"] = source.id
-							} else {
-								repeat_idMap["_id"] = StringTOBsonId(info.id)
-								merge_idMap["_id"] = StringTOBsonId(source.id)
-							}
-							repeat_id = source.id
-						} else {
-							//已对比数据为标准 ,数据池的数据打判重标签
-							newData, mergeArr, is_replace = mergeDataFields(info, source)
-							DM.replaceSourceData(newData, source.id) //替换
 
-							//原始数据打重复标签的id,   对比数据id的记录
-							if idtype == "1" {
-								repeat_idMap["_id"] = source.id
-								merge_idMap["_id"] = info.id
-							} else {
-								repeat_idMap["_id"] = StringTOBsonId(source.id)
-								merge_idMap["_id"] = StringTOBsonId(info.id)
-							}
-							repeat_id = info.id
+					merge_map := make(map[string]interface{}, 0)
+					if is_replace { //有过合并-更新数据
+						merge_map = map[string]interface{}{
+							"$set": map[string]interface{}{
+								"merge": newData.mergemap,
+							},
 						}
-
-						merge_map := make(map[string]interface{}, 0)
-						if is_replace { //有过合并-更新数据
-
-							merge_map = map[string]interface{}{
-								"$set": map[string]interface{}{
-									"merge": newData.mergemap,
-								},
-							}
-
-							//更新合并后的数据
-							for _, value := range mergeArr {
-								if value == 0 {
-									merge_map["$set"].(map[string]interface{})["area"] = newData.area
-									merge_map["$set"].(map[string]interface{})["city"] = newData.city
-								} else if value == 1 {
-									merge_map["$set"].(map[string]interface{})["area"] = newData.area
-									merge_map["$set"].(map[string]interface{})["city"] = newData.city
-								} else if value == 2 {
-									merge_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
-								} else if value == 3 {
-									merge_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
-								} else if value == 4 {
-									merge_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
-								} else if value == 5 {
-									merge_map["$set"].(map[string]interface{})["budget"] = newData.budget
-								} else if value == 6 {
-									merge_map["$set"].(map[string]interface{})["winner"] = newData.winner
-								} else if value == 7 {
-									merge_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
-								} else if value == 8 {
-									merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-								} else if value == 9 {
-									merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
-								} else if value == 10 {
-									merge_map["$set"].(map[string]interface{})["publishtime"] = newData.publishtime
-								} else if value == 11 {
-									merge_map["$set"].(map[string]interface{})["agency"] = newData.agency
-								} else {
-								}
+						//更新合并后的数据
+						for _, value := range mergeArr {
+							if value == 0 {
+								merge_map["$set"].(map[string]interface{})["area"] = newData.area
+								merge_map["$set"].(map[string]interface{})["city"] = newData.city
+							} else if value == 1 {
+								merge_map["$set"].(map[string]interface{})["area"] = newData.area
+								merge_map["$set"].(map[string]interface{})["city"] = newData.city
+							} else if value == 2 {
+								merge_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
+							} else if value == 3 {
+								merge_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
+							} else if value == 4 {
+								merge_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
+							} else if value == 5 {
+								merge_map["$set"].(map[string]interface{})["budget"] = newData.budget
+							} else if value == 6 {
+								merge_map["$set"].(map[string]interface{})["winner"] = newData.winner
+							} else if value == 7 {
+								merge_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
+							} else if value == 8 {
+								merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
+							} else if value == 9 {
+								merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
+							} else if value == 10 {
+								merge_map["$set"].(map[string]interface{})["publishtime"] = newData.publishtime
+							} else if value == 11 {
+								merge_map["$set"].(map[string]interface{})["agency"] = newData.agency
+							} else {
 							}
-							//模板数据更新
-							updateExtract = append(updateExtract, []map[string]interface{}{
-								merge_idMap,
-								merge_map,
-							})
 						}
+						//模板数据更新
+						updateExtract = append(updateExtract, []map[string]interface{}{
+							merge_idMap,
+							merge_map,
+						})
 					}
+				}else { //高质量数据
+					basic_bool := basicDataScore(source, info)
+					if !basic_bool {
+						DM.replaceSourceData(info, source.id) //替换
+						repeat_idMap["_id"] = StringTOBsonId(source.id)
+						repeat_id = info.id
+					}
+				}
 
-					//重复数据打标签
-					updateExtract = append(updateExtract, []map[string]interface{}{
-						repeat_idMap,
-						map[string]interface{}{
-							"$set": map[string]interface{}{
-								"repeat":        1,
-								"repeat_reason": reason,
-								"repeat_id":     repeat_id,
-							},
+				//重复数据打标签
+				updateExtract = append(updateExtract, []map[string]interface{}{
+					repeat_idMap,
+					map[string]interface{}{
+						"$set": map[string]interface{}{
+							"repeat":        1,
+							"repeat_reason": reason,
+							"repeat_id":     repeat_id,
 						},
-					})
+					},
+				})
 
-				}
 			}
 		}(tmp)
 		if len(updateExtract) > 500 {
@@ -344,7 +337,6 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	wg.Wait()
 	if len(updateExtract) > 0 {
 		mgo.UpSertBulk(extract, updateExtract...)
-		//mgo.UpdateBulk(bidding, updateBidding...)
 	}
 	log.Println("this task over.", n, "repeateN:", repeateN, mapInfo["stop"])
 
@@ -379,28 +371,17 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	sess := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess)
 
-	var q map[string]interface{}
-	if idtype == "1" {
-		q = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  mapInfo["gtid"].(string),
-				"$lte": mapInfo["lteid"].(string),
-			},
-		}
-	} else {
-		q = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
-				"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
-			},
-		}
+	q:= map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
+			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
+		},
 	}
-
 	it := sess.DB(mgo.DbName).C(extract).Find(&q).Iter()
 	minTime, maxTime := int64(0), int64(0)
 	for tmp := make(map[string]interface{}); it.Next(&tmp); {
 		//取出最大最小时间
-		info_time:=tmp["comeintime"]
+		info_time := tmp["comeintime"]
 		if Is_Sort {
 			info_time = tmp["publishtime"]
 		}
@@ -426,27 +407,18 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	gtid, lteid := util.BsonIdToSId(mapInfo["gtid"].(string)), util.BsonIdToSId(mapInfo["lteid"].(string))
 	fmt.Println(gtid, lteid)
 	HM = NewHistorymap(gtid, lteid, minTime, maxTime)
+
 	fmt.Println("开始历史数据判重")
 
 	defer util.Catch()
 	//区间id
 	sess_history := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess_history)
-	var q_history map[string]interface{}
-	if idtype == "1" {
-		q_history = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  mapInfo["gtid"].(string),
-				"$lte": mapInfo["lteid"].(string),
-			},
-		}
-	} else {
-		q_history = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
-				"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
-			},
-		}
+	q_history := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
+			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
+		},
 	}
 	log.Println(mgo.DbName, extract, q_history)
 
@@ -459,7 +431,6 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	log.Println("线程数:", threadNum)
 	pool := make(chan bool, threadNum)
 	wg := &sync.WaitGroup{}
-	//mapLock := &sync.Mutex{}
 	n, repeateN := 0, 0
 	for tmp := make(map[string]interface{}); it_history.Next(&tmp); n++ {
 		if n%10000 == 0 {
@@ -473,147 +444,139 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 				wg.Done()
 			}()
 			info := NewInfo(tmp)
-			if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
-				updateExtract = append(updateExtract, []map[string]interface{}{
-					map[string]interface{}{
-						"_id": tmp["_id"],
-					},
-					map[string]interface{}{
-						"$set": map[string]interface{}{
-							"repeat": -1,
+			if !LowHeavy {	//是否进行低质量数据判重
+				if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
+					updateExtract = append(updateExtract, []map[string]interface{}{
+						map[string]interface{}{
+							"_id": tmp["_id"],
 						},
-					},
-				})
-				if len(updateExtract) > 500 {
-					mgo.UpSertBulk(extract, updateExtract...)
-					updateExtract = [][]map[string]interface{}{}
-				}
-			} else {
-				b, source, reason := HM.checkHistory(info)
-				if b { //有重复,生成更新语句,更新抽取和更新招标
-					if reason == "未判重记录" {
-						fmt.Println("未判重记录")
-						//把info的数据判重的标签更换,并新增字段
-						DM.replaceSourceData(info, info.id) //替换即添加
-						updateExtract = append(updateExtract, []map[string]interface{}{
-							map[string]interface{}{
-								"_id": tmp["_id"],
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"repeat": -1,//无效数据标签
 							},
-							map[string]interface{}{
-								"$set": map[string]interface{}{
-									"repeat":   0,
-									"repeatid": -2,
-								},
+						},
+					})
+					if len(updateExtract) > 500 {
+						mgo.UpSertBulk(extract, updateExtract...)
+						updateExtract = [][]map[string]interface{}{}
+					}
+					return
+				}
+			}
+			b, source, reason := HM.checkHistory(info)
+			if b { //有重复,生成更新语句,更新抽取和更新招标
+				if reason == "未判重记录" {
+					fmt.Println("未判重记录")
+					//把info的数据判重的标签更换,并新增字段
+					HM.replaceSourceData(info, info.id) //替换即添加
+					updateExtract = append(updateExtract, []map[string]interface{}{
+						map[string]interface{}{
+							"_id": tmp["_id"],
+						},
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"repeat":   0,
+								"repeatid": -2,
 							},
-						})
-					} else {
-						repeateN++
-						var is_replace = false
-						var mergeArr = []int64{}                    //更改合并数组记录
-						var newData = &Info{}                       //更换新的数据池数据
-						var repeat_idMap = map[string]interface{}{} //记录判重的
-						var merge_idMap = map[string]interface{}{}  //记录合并的
-						if idtype == "1" {                          //先临时决定一个id
-							repeat_idMap["_id"] = info.id
-							merge_idMap["_id"] = source.id
-						} else {
+						},
+					})
+				} else {
+					repeateN++
+					var is_replace = false
+					var mergeArr = []int64{}                    //更改合并数组记录
+					var newData = &Info{}                       //更换新的数据池数据
+					var repeat_idMap = map[string]interface{}{} //记录判重的
+					var merge_idMap = map[string]interface{}{}  //记录合并的
+					repeat_idMap["_id"] = StringTOBsonId(info.id)
+					merge_idMap["_id"] = StringTOBsonId(source.id)
+					repeat_id := source.id
+					//以下合并相关
+					if isMerger {
+						basic_bool := basicDataScore(source, info)
+						if basic_bool {
+							//已原始数据为标准 - 对比数据打判重标签-
+							newData, mergeArr, is_replace = mergeDataFields(source, info)
+							HM.replaceSourceData(newData, source.id) //替换
+							//对比数据打重复标签的id,原始数据id的记录
 							repeat_idMap["_id"] = StringTOBsonId(info.id)
 							merge_idMap["_id"] = StringTOBsonId(source.id)
+							repeat_id = source.id
+						} else {
+							//已对比数据为标准 ,数据池的数据打判重标签
+							newData, mergeArr, is_replace = mergeDataFields(info, source)
+							HM.replaceSourceData(newData, source.id) //替换
+							//原始数据打重复标签的id,   对比数据id的记录
+							repeat_idMap["_id"] = StringTOBsonId(source.id)
+							merge_idMap["_id"] = StringTOBsonId(info.id)
+							repeat_id = info.id
 						}
-						repeat_id := source.id
-						//以下合并相关
-						if isMerger {
-							basic_bool := basicDataScore(source, info)
-							if basic_bool {
-								//已原始数据为标准 - 对比数据打判重标签-
-								newData, mergeArr, is_replace = mergeDataFields(source, info)
-								DM.replaceSourceData(newData, source.id) //替换
-								//对比数据打重复标签的id,原始数据id的记录
-								if idtype == "1" {
-									repeat_idMap["_id"] = info.id
-									merge_idMap["_id"] = source.id
-								} else {
-									repeat_idMap["_id"] = StringTOBsonId(info.id)
-									merge_idMap["_id"] = StringTOBsonId(source.id)
-								}
-								repeat_id = source.id
-							} else {
-								//已对比数据为标准 ,数据池的数据打判重标签
-								newData, mergeArr, is_replace = mergeDataFields(info, source)
-								DM.replaceSourceData(newData, source.id) //替换
-
-								//原始数据打重复标签的id,   对比数据id的记录
-								if idtype == "1" {
-									repeat_idMap["_id"] = source.id
-									merge_idMap["_id"] = info.id
-								} else {
-									repeat_idMap["_id"] = StringTOBsonId(source.id)
-									merge_idMap["_id"] = StringTOBsonId(info.id)
-								}
-								repeat_id = info.id
-							}
 
-							merge_map := make(map[string]interface{}, 0)
-							if is_replace { //有过合并-更新数据
-
-								merge_map = map[string]interface{}{
-									"$set": map[string]interface{}{
-										"merge": newData.mergemap,
-									},
-								}
+						merge_map := make(map[string]interface{}, 0)
+						if is_replace { //有过合并-更新数据
+							merge_map = map[string]interface{}{
+								"$set": map[string]interface{}{
+									"merge": newData.mergemap,
+								},
+							}
 
-								//更新合并后的数据
-								for _, value := range mergeArr {
-									if value == 0 {
-										merge_map["$set"].(map[string]interface{})["area"] = newData.area
-										merge_map["$set"].(map[string]interface{})["city"] = newData.city
-									} else if value == 1 {
-										merge_map["$set"].(map[string]interface{})["area"] = newData.area
-										merge_map["$set"].(map[string]interface{})["city"] = newData.city
-									} else if value == 2 {
-										merge_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
-									} else if value == 3 {
-										merge_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
-									} else if value == 4 {
-										merge_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
-									} else if value == 5 {
-										merge_map["$set"].(map[string]interface{})["budget"] = newData.budget
-									} else if value == 6 {
-										merge_map["$set"].(map[string]interface{})["winner"] = newData.winner
-									} else if value == 7 {
-										merge_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
-									} else if value == 8 {
-										merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-									} else if value == 9 {
-										merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
-									} else if value == 10 {
-										merge_map["$set"].(map[string]interface{})["publishtime"] = newData.publishtime
-									} else if value == 11 {
-										merge_map["$set"].(map[string]interface{})["agency"] = newData.agency
-									} else {
-									}
+							//更新合并后的数据
+							for _, value := range mergeArr {
+								if value == 0 {
+									merge_map["$set"].(map[string]interface{})["area"] = newData.area
+									merge_map["$set"].(map[string]interface{})["city"] = newData.city
+								} else if value == 1 {
+									merge_map["$set"].(map[string]interface{})["area"] = newData.area
+									merge_map["$set"].(map[string]interface{})["city"] = newData.city
+								} else if value == 2 {
+									merge_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
+								} else if value == 3 {
+									merge_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
+								} else if value == 4 {
+									merge_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
+								} else if value == 5 {
+									merge_map["$set"].(map[string]interface{})["budget"] = newData.budget
+								} else if value == 6 {
+									merge_map["$set"].(map[string]interface{})["winner"] = newData.winner
+								} else if value == 7 {
+									merge_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
+								} else if value == 8 {
+									merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
+								} else if value == 9 {
+									merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
+								} else if value == 10 {
+									merge_map["$set"].(map[string]interface{})["publishtime"] = newData.publishtime
+								} else if value == 11 {
+									merge_map["$set"].(map[string]interface{})["agency"] = newData.agency
+								} else {
 								}
-								//模板数据更新
-								updateExtract = append(updateExtract, []map[string]interface{}{
-									merge_idMap,
-									merge_map,
-								})
 							}
+							//模板数据更新
+							updateExtract = append(updateExtract, []map[string]interface{}{
+								merge_idMap,
+								merge_map,
+							})
 						}
+					}else { //高质量数据
+						basic_bool := basicDataScore(source, info)
+						if !basic_bool {
+							HM.replaceSourceData(info, source.id) //替换
+							repeat_idMap["_id"] = StringTOBsonId(source.id)
+							repeat_id = info.id
+						}
+					}
 
-						//重复数据打标签
-						updateExtract = append(updateExtract, []map[string]interface{}{
-							repeat_idMap,
-							map[string]interface{}{
-								"$set": map[string]interface{}{
-									"repeat":        1,
-									"repeat_reason": reason,
-									"repeat_id":     repeat_id,
-								},
+					//重复数据打标签
+					updateExtract = append(updateExtract, []map[string]interface{}{
+						repeat_idMap,
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"repeat":        1,
+								"repeat_reason": reason,
+								"repeat_id":     repeat_id,
 							},
-						})
+						},
+					})
 
-					}
 				}
 			}
 		}(tmp)
@@ -885,7 +848,7 @@ func basicDataScore(v *Info, info *Info) bool {
 	if v.bidopentime != 0 {
 		m++
 	}
-	if v.agencyaddr != "" {
+	if v.bidopenaddress != "" {
 		m++
 	}
 	if v.agency != "" {
@@ -916,7 +879,7 @@ func basicDataScore(v *Info, info *Info) bool {
 	if info.bidopentime != 0 {
 		n++
 	}
-	if info.agencyaddr != "" {
+	if info.bidopenaddress != "" {
 		n++
 	}
 	if info.agency != "" {