瀏覽代碼

低质量数据判重

apple 5 年之前
父節點
當前提交
64c4288b00
共有 3 個文件被更改,包括 477 次插入 和 315 次删除
  1. 12 5
      udpfilterdup/src/config.json
  2. 189 11
      udpfilterdup/src/datamap.go
  3. 276 299
      udpfilterdup/src/main.go

+ 12 - 5
udpfilterdup/src/config.json

@@ -2,10 +2,10 @@
     "udpport": ":1485",
     "dupdays": 5,
     "mongodb": {
-        "addr": "127.0.0.1:27092",
+        "addr": "192.168.3.207:27092",
         "pool": 10,
-        "db": "qfw",
-        "extract": "extract_v20190111",
+        "db": "extract_kf",
+        "extract": "a_testbidding_new",
         "site": {
             "dbname": "qfw",
             "coll": "site"
@@ -17,11 +17,18 @@
     },
     "nextNode": [],
     "isMerger": false,
-    "threads": 1,
+    "threads": 5,
     "isSort":false,
+    "lowHeavy":true,
     "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
+    "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
     "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批)",
-    "specialtitle_2": "项目([0-9a-zA-Z一二三四五六七八九十零123456789])",
+    "specialtitle_2": "项目[(][0-9a-zA-Z一二三四五六七八九十零123456789][)]",
     "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
 }
 
+
+
+
+
+

+ 189 - 11
udpfilterdup/src/datamap.go

@@ -5,6 +5,7 @@ import (
 	"log"
 	qutil "qfw/util"
 	"qfw/util/mongodb"
+	"regexp"
 	"strings"
 	"sync"
 	"time"
@@ -224,7 +225,7 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	info.repeatid = qutil.ObjToString(tmp["repeatid"])
 
 	info.specialWord = FilterRegTitle.MatchString(info.title)
-	info.titleSpecialWord = FilterRegTitle_1.MatchString(info.title) || FilterRegTitle_2.MatchString(info.title)
+	info.titleSpecialWord = FilterRegTitle_0.MatchString(info.title) ||FilterRegTitle_1.MatchString(info.title) || FilterRegTitle_2.MatchString(info.title)
 	info.mergemap = *qutil.ObjToMap(tmp["merge_map"])
 	if info.mergemap == nil {
 		info.mergemap = make(map[string]interface{}, 0)
@@ -316,6 +317,19 @@ L:
 						}
 					}
 
+
+					//新增快速数据过少判重
+					if LowHeavy {
+						repeat := false
+						if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
+							b = true
+							source = v
+							reasons = reason
+							break
+						}
+					}
+
+
 					//代理机构相同-非空相等
 					if v.agency != "" && info.agency != "" && v.agency == info.agency {
 						reason = reason + "同机构-"
@@ -459,6 +473,17 @@ L:
 						}
 					}
 
+					//新增快速数据过少判重
+					if LowHeavy {
+						repeat := false
+						if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
+							b = true
+							source = v
+							reasons = reason
+							break
+						}
+					}
+
 					//代理机构相同-非空相等
 					if v.agency != "" && info.agency != "" && v.agency == info.agency {
 						reason = reason + "同机构-"
@@ -616,7 +641,7 @@ func (d *datamap) update(t int64) {
 	//log.Println("更新前后数据:", all, all1)
 }
 
-func (d *datamap) GetLatelyFiveDay(t int64) []string {
+func (d *datamap) GetLatelyFiveDay(t int64) []string  {
 	array := make([]string, d.days)
 	now := time.Unix(t, 0)
 	for i := 0; i < d.days; i++ {
@@ -628,9 +653,121 @@ func (d *datamap) GetLatelyFiveDay(t int64) []string {
 
 /*
 **************************
-******* 以下为判重 ********
+******** 以下为判重 ********
 **************************
  */
+
+ // fastLowQualityHeavy is a fast duplicate check for low-quality (sparse) records. info is the record being judged, v the candidate it is compared with. Returns (true, reason plus an explanatory suffix) when info is judged a duplicate of v, otherwise false together with the reason string as the checks left it.
+func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
+	// Low-quality gate: same agency, identical non-empty title, and info carries no project name, project code, contract number or buyer.
+	if info.agency==v.agency&&info.title!=""&&
+		info.title==v.title &&
+		info.projectname==""&&info.projectcode==""&&info.contractnumber==""&&info.buyer=="" {
+		isValue:=0// number of the five key fields that are populated on info
+		if info.budget != 0 {// budget
+			isValue++
+		}
+		if info.bidopentime != 0{// bid opening time
+			isValue++
+		}
+		if info.agencyaddr!=""{// bid opening location (label taken from the original comment; the field is named agencyaddr)
+			isValue++
+		}
+		if info.winner != ""{// winning bidder
+			isValue++
+		}
+		if info.bidamount != 0 {// winning bid amount
+			isValue++
+		}
+		if isValue==0 {
+			//if info.site!=v.site {
+			//	log.Println("符合低质量条件条件0",info.id,"--",v.id)
+			//}
+			//log.Println("符合低质量条件条件0",info.id,"--",v.id)
+			reason = reason + "---要素均为空,标题包含关系" // NOTE(review): the message says title "containment", but the gate above requires exact title equality
+			return true, reason
+		}else if isValue==1 {
+			isMeet := false
+			if isMeet, reason = judgeLowQualityData(v, info, reason); isMeet {
+				log.Println("符合低质量条件条件1",info.id,"--",v.id) // NOTE(review): logs on every single-field match
+				reason = reason + "---有且一个要素组合"
+				return true, reason
+			}
+		}else { // two or more key fields populated: intentionally left empty, falls through to the final return
+
+		}
+	}
+	return false,reason
+}
+
+// judgeLowQualityData compares info against v field by field, choosing the fields by info.subtype, and records which field matched. Returns (true, reason plus a suffix naming the category and field) on the first match, otherwise (false, reason) unchanged.
+func judgeLowQualityData(v *Info, info *Info, reason string) (bool, string) {
+	if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
+		info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
+		info.subtype == "变更" || info.subtype == "其他" {
+		// Tender-type notices: compare budget, bid opening time, bid opening location.
+		if info.budget != 0 && info.budget == v.budget{// budget
+			reason = reason + "---招标类:预算"
+			return true,reason
+		}
+		if info.bidopentime != 0 && info.bidopentime == v.bidopentime{// bid opening time
+			reason = reason + "---招标类:开标时间"
+			return true,reason
+		}
+		if info.agencyaddr!="" && info.agencyaddr == v.agencyaddr{// bid opening location
+			reason = reason + "---招标类:开标地点"
+			return true,reason
+		}
+	} else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
+		// Award-type notices: compare winning bidder and winning bid amount.
+		if v.winner != "" && info.winner == v.winner{// winning bidder; NOTE(review): the non-empty guard is on v here, on info in the tender checks
+			reason = reason + "---中标类:中标单位"
+			return true,reason
+		}
+		if v.bidamount != 0 && info.bidamount == v.bidamount{// winning bid amount
+			reason = reason + "---中标类:中标金额"
+			return true,reason
+		}
+	} else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
+		// Contract-type notices: all five fields are compared.
+		if info.budget != 0 && info.budget == v.budget{// budget
+			reason = reason + "---合同类:预算"
+			return true,reason
+		}
+		if info.bidopentime != 0 && info.bidopentime == v.bidopentime{// bid opening time
+			reason = reason + "---合同类:开标时间"
+			return true,reason
+		}
+		if info.agencyaddr!="" && info.agencyaddr == v.agencyaddr{// bid opening location
+			reason = reason + "---合同类:开标地点"
+			return true,reason
+		}
+		if v.winner != "" && info.winner == v.winner{// winning bidder
+			reason = reason + "---合同类:中标单位"
+			return true,reason
+		}
+		if v.bidamount != 0 && info.bidamount == v.bidamount{// winning bid amount
+			reason = reason + "---合同类:中标金额"
+			return true,reason
+		}
+	} else {
+		// Any other subtype (the reason text labels it "category empty"): same three checks as tender-type notices.
+		if info.budget != 0 && info.budget == v.budget{// budget
+			reason = reason + "---类别空-招标类:预算"
+			return true,reason
+		}
+		if info.bidopentime != 0 && info.bidopentime == v.bidopentime{// bid opening time
+			reason = reason + "---类别空-招标类:开标时间"
+			return true,reason
+		}
+		if info.agencyaddr!="" && info.agencyaddr == v.agencyaddr{// bid opening location
+			reason = reason + "---类别空-招标类:开标地点"
+			return true,reason
+		}
+	}
+	return false,reason
+}
+
 //判重方法1
 func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
 
@@ -897,14 +1034,25 @@ func winningRepeat_A(v *Info, info *Info, reason string) (bool, string) {
 		ss = ss + "p3(编号组)-"
 		p3 = true
 	}
-	if v.bidamount != 0 && v.bidamount == info.bidamount {
+	//if v.bidamount != 0 && v.bidamount == info.bidamount {
+	//	ss = ss + "p5(中标金)-"
+	//	p5 = true
+	//}
+	//if v.winner != "" && v.winner == info.winner {
+	//	ss = ss + "p6(中标人)-"
+	//	p6 = true
+	//}
+
+	if v.bidamount != 0 && !isBidWinningAmount(v.bidamount,info.bidamount) {
 		ss = ss + "p5(中标金)-"
 		p5 = true
 	}
-	if v.winner != "" && v.winner == info.winner {
+	if v.winner != "" && deleteExtraSpace(v.winner) == deleteExtraSpace(info.winner) {
 		ss = ss + "p6(中标人)-"
 		p6 = true
 	}
+
+
 	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
 		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
 		ss = ss + "p11(标题)-"
@@ -939,10 +1087,10 @@ func winningRepeat_B(v *Info, info *Info, reason string) (bool, string) {
 		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5) {
 		m++
 	}
-	if v.bidamount != 0 && v.bidamount == info.bidamount {
+	if v.bidamount != 0 && !isBidWinningAmount(v.bidamount,info.bidamount) {
 		m++
 	}
-	if v.winner != "" && v.winner == info.winner {
+	if v.winner != "" && deleteExtraSpace(v.winner) == deleteExtraSpace(info.winner) {
 		m++
 	}
 	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
@@ -964,10 +1112,14 @@ func winningRepeat_B(v *Info, info *Info, reason string) (bool, string) {
 //中标_C
 func winningRepeat_C(v *Info, info *Info) bool {
 
-	if v.bidamount != 0 && info.bidamount != 0 && v.bidamount != info.bidamount {
+	//if v.bidamount != 0 && info.bidamount != 0 && v.bidamount != info.bidamount {
+	//	return true
+	//}
+	if v.bidamount != 0 && info.bidamount != 0 && isBidWinningAmount(v.bidamount,info.bidamount) {
 		return true
 	}
-	if v.winner != "" && info.winner != "" && v.winner != info.winner {
+
+	if v.winner != "" && info.winner != "" && deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) {
 		return true
 	}
 	//原始地址...
@@ -1018,6 +1170,7 @@ func contractRepeat_C(v *Info, info *Info) bool {
 	return false
 }
 
+//再次金额判断
 func againRepeat(v *Info, info *Info) bool {
 	//相同采购单位下
 	if info.buyer != "" && v.buyer == info.buyer {
@@ -1032,8 +1185,8 @@ func againRepeat(v *Info, info *Info) bool {
 			info.subtype == "流标" || info.subtype == "合同" || info.subtype == "验收" ||
 			info.subtype == "违规" {
 			//中标金额单位满足条件
-			if (v.bidamount != info.bidamount && v.bidamount != 0 && info.bidamount != 0) ||
-				(v.winner != info.winner && v.winner != "" && info.winner != "") {
+			if (isBidWinningAmount(v.bidamount,info.bidamount) && v.bidamount != 0 && info.bidamount != 0) ||
+				(deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "") {
 				return true
 			}
 		} else {
@@ -1043,3 +1196,28 @@ func againRepeat(v *Info, info *Info) bool {
 
 	return false
 }
+
+// deleteExtraSpace collapses every run of two or more whitespace characters in s (used on winner names) down to the first character of that run and returns the result.
+func deleteExtraSpace(s string) string {
+	// Remove redundant whitespace; NOTE(review): a run is reduced to its first character, which is only a space if the run started with one.
+	s1 := strings.Replace(s, "  ", " ", -1)      // original comment: "replace tab with space"; NOTE(review): the literal shown here is two spaces, confirm against the real file
+	regstr := "\\s{2,}"                          // pattern: two or more consecutive whitespace characters
+	reg, _ := regexp.Compile(regstr)             // compiled on every call; the compile error is discarded
+	s2 := make([]byte, len(s1))                  // byte buffer of the same length as s1
+	copy(s2, s1)                                 // copy the string into the buffer
+	spc_index := reg.FindStringIndex(string(s2)) // locate the first whitespace run
+	for len(spc_index) > 0 {                     // a run was found
+		s2 = append(s2[:spc_index[0]+1], s2[spc_index[1]:]...) // keep the run's first byte, drop the rest of the run
+		spc_index = reg.FindStringIndex(string(s2))            // search again from the start of the buffer
+	}
+	return string(s2)
+}
+
+// isBidWinningAmount reports whether two winning-bid amounts DIFFER: it returns false when f1 equals f2 or when one is exactly 10000 times the other, and true otherwise. NOTE(review): the 10000 factor presumably covers a yuan vs. ten-thousand-yuan unit mismatch (confirm); the name reads inverted relative to the result.
+func isBidWinningAmount(f1 float64 ,f2 float64) bool {
+
+	if f1==f2||f1*10000==f2||f2*10000==f1 { // exact float64 equality, no tolerance
+		return false
+	}
+	return true
+}

+ 276 - 299
udpfilterdup/src/main.go

@@ -32,6 +32,7 @@ var (
 
 	//正则筛选相关
 	FilterRegTitle   = regexp.MustCompile("^_$")
+	FilterRegTitle_0 = regexp.MustCompile("^_$")
 	FilterRegTitle_1 = regexp.MustCompile("^_$")
 	FilterRegTitle_2 = regexp.MustCompile("^_$")
 
@@ -39,14 +40,14 @@ var (
 	Is_Sort          bool                              //是否排序
 	threadNum        int                               //线程数量
 	SiteMap          map[string]map[string]interface{} //站点map
-	idtype, sid, eid string                            //测试人员判重使用
+	LowHeavy		 bool							   //低质量数据判重
+	sid, eid string                            //测试人员判重使用
 )
 
 func init() {
 	flag.StringVar(&lastid, "id", "", "最后加载id") //以小于等于此id开始加载最近几天的数据
 	flag.StringVar(&sid, "sid", "", "开始id")
 	flag.StringVar(&eid, "eid", "", "结束id")
-	flag.StringVar(&idtype, "idtype", "", "id类型,默认ObjectId:0,String:1")
 	flag.Parse()
 	//172.17.145.163:27080
 	util.ReadConfig(&Sysconfig)
@@ -63,12 +64,13 @@ func init() {
 	//加载数据
 	DM = NewDatamap(dupdays, lastid)
 	FilterRegTitle = regexp.MustCompile(util.ObjToString(Sysconfig["specialwords"]))
+	FilterRegTitle_0 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_0"]))
 	FilterRegTitle_1 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_1"]))
 	FilterRegTitle_2 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_2"]))
 	isMerger = Sysconfig["isMerger"].(bool)
 	Is_Sort = Sysconfig["isSort"].(bool)
 	threadNum = util.IntAllDef(Sysconfig["threads"], 1)
-
+	LowHeavy =  Sysconfig["lowHeavy"].(bool)
 	//站点配置
 	site := mconf["site"].(map[string]interface{})
 	SiteMap = make(map[string]map[string]interface{}, 0)
@@ -103,12 +105,14 @@ func mainT() {
 	/*
 		ObjectId("5da3f31aa5cb26b9b798d3aa")
 		ObjectId("5da418c4a5cb26b9b7e3e9a6")
-		ObjectId("5df5071ce9d1f601e495fa54")
-		ObjectId("5e09c05f0cf41612e0626abc")
+
+		ObjectId("5da3f2c5a5cb26b9b79847fc")
+		ObjectId("5db2735ba5cb26b9b7c99c6f")
 	*/
-	log.Println("测试开始")
-	sid = "5da3f31aa5cb26b9b798d3aa"
-	eid = "5da418c4a5cb26b9b7e3e9a6"
+
+	//
+	sid = "5da3f2c5a5cb26b9b79847fc"
+	eid = "5db2735ba5cb26b9b7c99c6f"
 	mapinfo := map[string]interface{}{}
 	if sid == "" || eid == "" {
 		log.Println("sid,eid参数不能为空")
@@ -160,32 +164,23 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 
 //开始判重程序
 func task(data []byte, mapInfo map[string]interface{}) {
-	fmt.Println("开始数据判重")
+	log.Println("开始数据判重")
 	defer util.Catch()
 	//区间id
-	q := map[string]interface{}{}
-	if idtype == "1" {
-		q = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  mapInfo["gtid"].(string),
-				"$lte": mapInfo["lteid"].(string),
-			},
-		}
-	} else {
-		q = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
-				"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
-			},
-		}
+	q := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
+			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
+		},
 	}
 	log.Println(mgo.DbName, extract, q)
 	sess := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess)
 
 	//是否排序
-	it := sess.DB(mgo.DbName).C(extract).Find(&q).Iter()
+	it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("_id").Iter()
 	if Is_Sort {
+		log.Println("排序:publishtime")
 		it = sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
 	}
 	//it = sess.DB(mgo.DbName).C(extract).Find(&q).Iter()
@@ -193,8 +188,8 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	log.Println("线程数:", threadNum)
 	pool := make(chan bool, threadNum)
 	wg := &sync.WaitGroup{}
-	//mapLock := &sync.Mutex{}
 	n, repeateN := 0, 0
+
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
 		if n%10000 == 0 {
 			log.Println("current:", n, tmp["_id"], "repeateN:", repeateN)
@@ -207,132 +202,123 @@ func task(data []byte, mapInfo map[string]interface{}) {
 				wg.Done()
 			}()
 			info := NewInfo(tmp)
-			//是否为无效数据
-			if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
-				updateExtract = append(updateExtract, []map[string]interface{}{
-					map[string]interface{}{
-						"_id": tmp["_id"],
-					},
-					map[string]interface{}{
-						"$set": map[string]interface{}{
-							"repeat": -1,
+			if !LowHeavy {	//是否进行低质量数据判重
+				if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
+					updateExtract = append(updateExtract, []map[string]interface{}{
+						map[string]interface{}{
+							"_id": tmp["_id"],
 						},
-					},
-				})
-				if len(updateExtract) > 500 {
-					mgo.UpSertBulk(extract, updateExtract...)
-					updateExtract = [][]map[string]interface{}{}
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"repeat": -1,//无效数据标签
+							},
+						},
+					})
+					if len(updateExtract) > 500 {
+						mgo.UpSertBulk(extract, updateExtract...)
+						updateExtract = [][]map[string]interface{}{}
+					}
+					return
 				}
-			} else {
-				b, source, reason := DM.check(info)
-				if b { //有重复,生成更新语句,更新抽取和更新招标
-					repeateN++
-					var is_replace = false
-					var mergeArr = []int64{}                    //更改合并数组记录
-					var newData = &Info{}                       //更换新的数据池数据
-					var repeat_idMap = map[string]interface{}{} //记录判重的
-					var merge_idMap = map[string]interface{}{}  //记录合并的
-					if idtype == "1" {                          //先临时决定一个id
-						repeat_idMap["_id"] = info.id
-						merge_idMap["_id"] = source.id
-					} else {
+			}
+
+			b, source, reason := DM.check(info)
+			if b { //有重复,生成更新语句,更新抽取和更新招标
+				repeateN++
+				var is_replace = false
+				var mergeArr = []int64{}                    //更改合并数组记录
+				var newData = &Info{}                       //更换新的数据池数据
+				var repeat_idMap = map[string]interface{}{} //记录判重的
+				var merge_idMap = map[string]interface{}{}  //记录合并的
+				repeat_idMap["_id"] = StringTOBsonId(info.id)
+				merge_idMap["_id"] = StringTOBsonId(source.id)
+				repeat_id := source.id//初始化一个数据
+
+				if isMerger {//合并相关
+					basic_bool := basicDataScore(source, info)
+					if basic_bool {
+						//已原始数据为标准 - 对比数据打判重标签-
+						newData, mergeArr, is_replace = mergeDataFields(source, info)
+						DM.replaceSourceData(newData, source.id) //替换
+						//对比数据打重复标签的id,原始数据id的记录
 						repeat_idMap["_id"] = StringTOBsonId(info.id)
 						merge_idMap["_id"] = StringTOBsonId(source.id)
+						repeat_id = source.id
+					} else {
+						//已对比数据为标准 ,数据池的数据打判重标签
+						newData, mergeArr, is_replace = mergeDataFields(info, source)
+						DM.replaceSourceData(newData, source.id) //替换
+						//原始数据打重复标签的id,   对比数据id的记录
+						repeat_idMap["_id"] = StringTOBsonId(source.id)
+						merge_idMap["_id"] = StringTOBsonId(info.id)
+						repeat_id = info.id
 					}
-					repeat_id := source.id
-					//以下合并相关
-					if isMerger {
-						basic_bool := basicDataScore(source, info)
-						if basic_bool {
-							//已原始数据为标准 - 对比数据打判重标签-
-							newData, mergeArr, is_replace = mergeDataFields(source, info)
-							DM.replaceSourceData(newData, source.id) //替换
-							//对比数据打重复标签的id,原始数据id的记录
-							if idtype == "1" {
-								repeat_idMap["_id"] = info.id
-								merge_idMap["_id"] = source.id
-							} else {
-								repeat_idMap["_id"] = StringTOBsonId(info.id)
-								merge_idMap["_id"] = StringTOBsonId(source.id)
-							}
-							repeat_id = source.id
-						} else {
-							//已对比数据为标准 ,数据池的数据打判重标签
-							newData, mergeArr, is_replace = mergeDataFields(info, source)
-							DM.replaceSourceData(newData, source.id) //替换
 
-							//原始数据打重复标签的id,   对比数据id的记录
-							if idtype == "1" {
-								repeat_idMap["_id"] = source.id
-								merge_idMap["_id"] = info.id
-							} else {
-								repeat_idMap["_id"] = StringTOBsonId(source.id)
-								merge_idMap["_id"] = StringTOBsonId(info.id)
-							}
-							repeat_id = info.id
+					merge_map := make(map[string]interface{}, 0)
+					if is_replace { //有过合并-更新数据
+						merge_map = map[string]interface{}{
+							"$set": map[string]interface{}{
+								"merge": newData.mergemap,
+							},
 						}
-
-						merge_map := make(map[string]interface{}, 0)
-						if is_replace { //有过合并-更新数据
-
-							merge_map = map[string]interface{}{
-								"$set": map[string]interface{}{
-									"merge": newData.mergemap,
-								},
-							}
-
-							//更新合并后的数据
-							for _, value := range mergeArr {
-								if value == 0 {
-									merge_map["$set"].(map[string]interface{})["area"] = newData.area
-									merge_map["$set"].(map[string]interface{})["city"] = newData.city
-								} else if value == 1 {
-									merge_map["$set"].(map[string]interface{})["area"] = newData.area
-									merge_map["$set"].(map[string]interface{})["city"] = newData.city
-								} else if value == 2 {
-									merge_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
-								} else if value == 3 {
-									merge_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
-								} else if value == 4 {
-									merge_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
-								} else if value == 5 {
-									merge_map["$set"].(map[string]interface{})["budget"] = newData.budget
-								} else if value == 6 {
-									merge_map["$set"].(map[string]interface{})["winner"] = newData.winner
-								} else if value == 7 {
-									merge_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
-								} else if value == 8 {
-									merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-								} else if value == 9 {
-									merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
-								} else if value == 10 {
-									merge_map["$set"].(map[string]interface{})["publishtime"] = newData.publishtime
-								} else if value == 11 {
-									merge_map["$set"].(map[string]interface{})["agency"] = newData.agency
-								} else {
-								}
+						//更新合并后的数据
+						for _, value := range mergeArr {
+							if value == 0 {
+								merge_map["$set"].(map[string]interface{})["area"] = newData.area
+								merge_map["$set"].(map[string]interface{})["city"] = newData.city
+							} else if value == 1 {
+								merge_map["$set"].(map[string]interface{})["area"] = newData.area
+								merge_map["$set"].(map[string]interface{})["city"] = newData.city
+							} else if value == 2 {
+								merge_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
+							} else if value == 3 {
+								merge_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
+							} else if value == 4 {
+								merge_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
+							} else if value == 5 {
+								merge_map["$set"].(map[string]interface{})["budget"] = newData.budget
+							} else if value == 6 {
+								merge_map["$set"].(map[string]interface{})["winner"] = newData.winner
+							} else if value == 7 {
+								merge_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
+							} else if value == 8 {
+								merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
+							} else if value == 9 {
+								merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
+							} else if value == 10 {
+								merge_map["$set"].(map[string]interface{})["publishtime"] = newData.publishtime
+							} else if value == 11 {
+								merge_map["$set"].(map[string]interface{})["agency"] = newData.agency
+							} else {
 							}
-							//模板数据更新
-							updateExtract = append(updateExtract, []map[string]interface{}{
-								merge_idMap,
-								merge_map,
-							})
 						}
+						//模板数据更新
+						updateExtract = append(updateExtract, []map[string]interface{}{
+							merge_idMap,
+							merge_map,
+						})
+					}
+				}else { //高质量数据
+					basic_bool := basicDataScore(source, info)
+					if !basic_bool {
+						DM.replaceSourceData(info, source.id) //替换
+						repeat_idMap["_id"] = StringTOBsonId(source.id)
+						repeat_id = info.id
 					}
+				}
 
-					//重复数据打标签
-					updateExtract = append(updateExtract, []map[string]interface{}{
-						repeat_idMap,
-						map[string]interface{}{
-							"$set": map[string]interface{}{
-								"repeat":        1,
-								"repeat_reason": reason,
-								"repeat_id":     repeat_id,
-							},
+				//重复数据打标签
+				updateExtract = append(updateExtract, []map[string]interface{}{
+					repeat_idMap,
+					map[string]interface{}{
+						"$set": map[string]interface{}{
+							"repeat":        1,
+							"repeat_reason": reason,
+							"repeat_id":     repeat_id,
 						},
-					})
+					},
+				})
 
-				}
 			}
 		}(tmp)
 		if len(updateExtract) > 500 {
@@ -344,7 +330,6 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	wg.Wait()
 	if len(updateExtract) > 0 {
 		mgo.UpSertBulk(extract, updateExtract...)
-		//mgo.UpdateBulk(bidding, updateBidding...)
 	}
 	log.Println("this task over.", n, "repeateN:", repeateN, mapInfo["stop"])
 
@@ -379,23 +364,12 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	sess := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess)
 
-	var q map[string]interface{}
-	if idtype == "1" {
-		q = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  mapInfo["gtid"].(string),
-				"$lte": mapInfo["lteid"].(string),
-			},
-		}
-	} else {
-		q = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
-				"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
-			},
-		}
+	q:= map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
+			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
+		},
 	}
-
 	it := sess.DB(mgo.DbName).C(extract).Find(&q).Iter()
 	minTime, maxTime := int64(0), int64(0)
 	for tmp := make(map[string]interface{}); it.Next(&tmp); {
@@ -426,27 +400,18 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	gtid, lteid := util.BsonIdToSId(mapInfo["gtid"].(string)), util.BsonIdToSId(mapInfo["lteid"].(string))
 	fmt.Println(gtid, lteid)
 	HM = NewHistorymap(gtid, lteid, minTime, maxTime)
+
 	fmt.Println("开始历史数据判重")
 
 	defer util.Catch()
 	//区间id
 	sess_history := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess_history)
-	var q_history map[string]interface{}
-	if idtype == "1" {
-		q_history = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  mapInfo["gtid"].(string),
-				"$lte": mapInfo["lteid"].(string),
-			},
-		}
-	} else {
-		q_history = map[string]interface{}{
-			"_id": map[string]interface{}{
-				"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
-				"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
-			},
-		}
+	q_history := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
+			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
+		},
 	}
 	log.Println(mgo.DbName, extract, q_history)
 
@@ -459,7 +424,6 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	log.Println("线程数:", threadNum)
 	pool := make(chan bool, threadNum)
 	wg := &sync.WaitGroup{}
-	//mapLock := &sync.Mutex{}
 	n, repeateN := 0, 0
 	for tmp := make(map[string]interface{}); it_history.Next(&tmp); n++ {
 		if n%10000 == 0 {
@@ -473,149 +437,162 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 				wg.Done()
 			}()
 			info := NewInfo(tmp)
-			if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
-				updateExtract = append(updateExtract, []map[string]interface{}{
-					map[string]interface{}{
-						"_id": tmp["_id"],
-					},
-					map[string]interface{}{
-						"$set": map[string]interface{}{
-							"repeat": -1,
+			if !LowHeavy {	//是否进行低质量数据判重
+				if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
+					updateExtract = append(updateExtract, []map[string]interface{}{
+						map[string]interface{}{
+							"_id": tmp["_id"],
 						},
-					},
-				})
-				if len(updateExtract) > 500 {
-					mgo.UpSertBulk(extract, updateExtract...)
-					updateExtract = [][]map[string]interface{}{}
-				}
-			} else {
-				b, source, reason := HM.checkHistory(info)
-				if b { //有重复,生成更新语句,更新抽取和更新招标
-					if reason == "未判重记录" {
-						fmt.Println("未判重记录")
-						//把info的数据判重的标签更换,并新增字段
-						DM.replaceSourceData(info, info.id) //替换即添加
-						updateExtract = append(updateExtract, []map[string]interface{}{
-							map[string]interface{}{
-								"_id": tmp["_id"],
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"repeat": -1,//无效数据标签
 							},
-							map[string]interface{}{
-								"$set": map[string]interface{}{
-									"repeat":   0,
-									"repeatid": -2,
-								},
+						},
+					})
+					if len(updateExtract) > 500 {
+						mgo.UpSertBulk(extract, updateExtract...)
+						updateExtract = [][]map[string]interface{}{}
+					}
+					return
+				}
+			}
+			b, source, reason := HM.checkHistory(info)
+			if b { //有重复,生成更新语句,更新抽取和更新招标
+				if reason == "未判重记录" {
+					fmt.Println("未判重记录")
+					//把info的数据判重的标签更换,并新增字段
+					HM.replaceSourceData(info, info.id) //替换即添加
+					updateExtract = append(updateExtract, []map[string]interface{}{
+						map[string]interface{}{
+							"_id": tmp["_id"],
+						},
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"repeat":   0,
+								"repeatid": -2,
 							},
-						})
-					} else {
-						repeateN++
-						var is_replace = false
-						var mergeArr = []int64{}                    //更改合并数组记录
-						var newData = &Info{}                       //更换新的数据池数据
-						var repeat_idMap = map[string]interface{}{} //记录判重的
-						var merge_idMap = map[string]interface{}{}  //记录合并的
-						if idtype == "1" {                          //先临时决定一个id
-							repeat_idMap["_id"] = info.id
-							merge_idMap["_id"] = source.id
-						} else {
+						},
+					})
+				} else {
+					repeateN++
+					var is_replace = false
+					var mergeArr = []int64{}                    //更改合并数组记录
+					var newData = &Info{}                       //更换新的数据池数据
+					var repeat_idMap = map[string]interface{}{} //记录判重的
+					var merge_idMap = map[string]interface{}{}  //记录合并的
+					repeat_idMap["_id"] = StringTOBsonId(info.id)
+					merge_idMap["_id"] = StringTOBsonId(source.id)
+					repeat_id := source.id
+					//以下合并相关
+					if isMerger {
+						basic_bool := basicDataScore(source, info)
+						if basic_bool {
+							//已原始数据为标准 - 对比数据打判重标签-
+							newData, mergeArr, is_replace = mergeDataFields(source, info)
+							HM.replaceSourceData(newData, source.id) //替换
+							//对比数据打重复标签的id,原始数据id的记录
 							repeat_idMap["_id"] = StringTOBsonId(info.id)
 							merge_idMap["_id"] = StringTOBsonId(source.id)
+							repeat_id = source.id
+						} else {
+							//已对比数据为标准 ,数据池的数据打判重标签
+							newData, mergeArr, is_replace = mergeDataFields(info, source)
+							HM.replaceSourceData(newData, source.id) //替换
+							//原始数据打重复标签的id,   对比数据id的记录
+							repeat_idMap["_id"] = StringTOBsonId(source.id)
+							merge_idMap["_id"] = StringTOBsonId(info.id)
+							repeat_id = info.id
 						}
-						repeat_id := source.id
-						//以下合并相关
-						if isMerger {
-							basic_bool := basicDataScore(source, info)
-							if basic_bool {
-								//已原始数据为标准 - 对比数据打判重标签-
-								newData, mergeArr, is_replace = mergeDataFields(source, info)
-								DM.replaceSourceData(newData, source.id) //替换
-								//对比数据打重复标签的id,原始数据id的记录
-								if idtype == "1" {
-									repeat_idMap["_id"] = info.id
-									merge_idMap["_id"] = source.id
-								} else {
-									repeat_idMap["_id"] = StringTOBsonId(info.id)
-									merge_idMap["_id"] = StringTOBsonId(source.id)
-								}
-								repeat_id = source.id
-							} else {
-								//已对比数据为标准 ,数据池的数据打判重标签
-								newData, mergeArr, is_replace = mergeDataFields(info, source)
-								DM.replaceSourceData(newData, source.id) //替换
-
-								//原始数据打重复标签的id,   对比数据id的记录
-								if idtype == "1" {
-									repeat_idMap["_id"] = source.id
-									merge_idMap["_id"] = info.id
-								} else {
-									repeat_idMap["_id"] = StringTOBsonId(source.id)
-									merge_idMap["_id"] = StringTOBsonId(info.id)
-								}
-								repeat_id = info.id
-							}
 
-							merge_map := make(map[string]interface{}, 0)
-							if is_replace { //有过合并-更新数据
-
-								merge_map = map[string]interface{}{
-									"$set": map[string]interface{}{
-										"merge": newData.mergemap,
-									},
-								}
+						merge_map := make(map[string]interface{}, 0)
+						if is_replace { //有过合并-更新数据
+							merge_map = map[string]interface{}{
+								"$set": map[string]interface{}{
+									"merge": newData.mergemap,
+								},
+							}
 
-								//更新合并后的数据
-								for _, value := range mergeArr {
-									if value == 0 {
-										merge_map["$set"].(map[string]interface{})["area"] = newData.area
-										merge_map["$set"].(map[string]interface{})["city"] = newData.city
-									} else if value == 1 {
-										merge_map["$set"].(map[string]interface{})["area"] = newData.area
-										merge_map["$set"].(map[string]interface{})["city"] = newData.city
-									} else if value == 2 {
-										merge_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
-									} else if value == 3 {
-										merge_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
-									} else if value == 4 {
-										merge_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
-									} else if value == 5 {
-										merge_map["$set"].(map[string]interface{})["budget"] = newData.budget
-									} else if value == 6 {
-										merge_map["$set"].(map[string]interface{})["winner"] = newData.winner
-									} else if value == 7 {
-										merge_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
-									} else if value == 8 {
-										merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-									} else if value == 9 {
-										merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
-									} else if value == 10 {
-										merge_map["$set"].(map[string]interface{})["publishtime"] = newData.publishtime
-									} else if value == 11 {
-										merge_map["$set"].(map[string]interface{})["agency"] = newData.agency
-									} else {
-									}
+							//更新合并后的数据
+							for _, value := range mergeArr {
+								if value == 0 {
+									merge_map["$set"].(map[string]interface{})["area"] = newData.area
+									merge_map["$set"].(map[string]interface{})["city"] = newData.city
+								} else if value == 1 {
+									merge_map["$set"].(map[string]interface{})["area"] = newData.area
+									merge_map["$set"].(map[string]interface{})["city"] = newData.city
+								} else if value == 2 {
+									merge_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
+								} else if value == 3 {
+									merge_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
+								} else if value == 4 {
+									merge_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
+								} else if value == 5 {
+									merge_map["$set"].(map[string]interface{})["budget"] = newData.budget
+								} else if value == 6 {
+									merge_map["$set"].(map[string]interface{})["winner"] = newData.winner
+								} else if value == 7 {
+									merge_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
+								} else if value == 8 {
+									merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
+								} else if value == 9 {
+									merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
+								} else if value == 10 {
+									merge_map["$set"].(map[string]interface{})["publishtime"] = newData.publishtime
+								} else if value == 11 {
+									merge_map["$set"].(map[string]interface{})["agency"] = newData.agency
+								} else {
 								}
-								//模板数据更新
-								updateExtract = append(updateExtract, []map[string]interface{}{
-									merge_idMap,
-									merge_map,
-								})
 							}
+							//模板数据更新
+							updateExtract = append(updateExtract, []map[string]interface{}{
+								merge_idMap,
+								merge_map,
+							})
+						}
+					}else { //高质量数据
+						basic_bool := basicDataScore(source, info)
+						if !basic_bool {
+							HM.replaceSourceData(info, source.id) //替换
+							repeat_idMap["_id"] = StringTOBsonId(source.id)
+							repeat_id = info.id
 						}
+					}
 
-						//重复数据打标签
-						updateExtract = append(updateExtract, []map[string]interface{}{
-							repeat_idMap,
-							map[string]interface{}{
-								"$set": map[string]interface{}{
-									"repeat":        1,
-									"repeat_reason": reason,
-									"repeat_id":     repeat_id,
-								},
+					//重复数据打标签
+					updateExtract = append(updateExtract, []map[string]interface{}{
+						repeat_idMap,
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"repeat":        1,
+								"repeat_reason": reason,
+								"repeat_id":     repeat_id,
 							},
-						})
+						},
+					})
 
-					}
 				}
 			}
+
+
+
+			//if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
+			//	updateExtract = append(updateExtract, []map[string]interface{}{
+			//		map[string]interface{}{
+			//			"_id": tmp["_id"],
+			//		},
+			//		map[string]interface{}{
+			//			"$set": map[string]interface{}{
+			//				"repeat": -1,
+			//			},
+			//		},
+			//	})
+			//	if len(updateExtract) > 500 {
+			//		mgo.UpSertBulk(extract, updateExtract...)
+			//		updateExtract = [][]map[string]interface{}{}
+			//	}
+			//} else {
+			//
+			//}
 		}(tmp)
 		if len(updateExtract) > 500 {
 			mgo.UpSertBulk(extract, updateExtract...)