zhengkun vor 1 Jahr
Ursprung
Commit
85c52903f3

+ 804 - 0
flow_repeat/README.md

@@ -0,0 +1,804 @@
+{
+"udpport": ":1785",
+"dupdays": 7,
+"mongodb": {
+"addr": "172.17.4.85:27080",
+"pool": 10,
+"db": "qfw",
+"extract": "result_20220219",
+"extract_back": "result_20220218",
+"extract_log": "result_replace_log"
+},
+"task_mongodb": {
+"task_addr": "172.17.4.187:27082,172.17.145.163:27083",
+"task_db": "qfw",
+"task_coll": "bidding_processing_ids",
+"task_bidding": "bidding",
+"task_pool": 5
+},
+"spider_mongodb": {
+"spider_addr": "172.17.4.87:27080",
+"spider_db": "editor",
+"spider_coll": "site",
+"spider_pool": 5
+},
+"userName": "zhengkun",
+"passWord": "zk@123123",
+"jkmail": {
+"to": "zhengkun@topnet.net.cn,wangjianghan@topnet.net.cn",
+"api": "http://172.17.145.179:19281/_send/_mail"
+},
+"nextNode": [
+{
+"addr": "172.17.4.196",
+"port": 1787,
+"stype": "bidding",
+"memo": "同步程序id段udp"
+}
+],
+"jyfb_data": [
+"a_jyxxfbpt_gg"
+],
+"threads": 4,
+"lowHeavy":true,
+"timingTask":false,
+"timingSpanDay": 5,
+"timingPubScope": 1440,
+"specialwords": "(重招|重新招标|勘察|施工|监理|总承包|土石方|可研)",
+"specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
+"specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批|期)",
+"specialtitle_2": "项目[((][0-9a-zA-Z一二三四五六七八九十零123456789][))]",
+"beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
+}
+
+
+
+mgo = &MongodbSim{
+		MongodbAddr: "172.17.4.187:27083",
+		DbName:      "qfw",
+		Size:        10,
+	}
+mgo.InitPool()
+	return
+	
+func moveTimeoutData()  {
+	log.Println("部署迁移定时任务")
+	c := cron.New()
+	c.AddFunc("0 0 0 * * ?", func() { moveOnceTimeOut() })
+	c.Start()
+}
+
+func moveOnceTimeOut()  {
+	log.Println("执行一次迁移超时数据")
+
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+	now:=time.Now()
+	move_time := time.Date(now.Year()-2, now.Month(), now.Day(), now.Hour(), 0, 0, 0, time.Local).Unix()
+	q := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$lt": move_time,
+		},
+	}
+	log.Println(q)
+	it := sess.DB(mgo.DbName).C("result_20200714").Find(&q).Iter()
+	index := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
+		if index%10000 == 0 {
+			log.Println("index", index)
+		}
+		del_id:=BsonTOStringId(tmp["_id"])
+		mgo.Save("result_20200713", tmp)
+		mgo.DeleteById("result_20200714",del_id)
+		tmp = map[string]interface{}{}
+	}
+	log.Println("save and delete", " ok index", index)
+}
+
+
+
+
+
+{
+    "udpport": ":1785",
+    "dupdays": 7,
+    "mongodb": {
+        "addr": "172.17.4.85:27080",
+        "pool": 10,
+        "db": "qfw",
+        "extract": "result_20200715",
+        "extract_back": "result_20200714",
+        "site": {
+            "dbname": "qfw",
+            "coll": "site"
+        }
+    },
+    "jkmail": {
+        "to": "zhengkun@topnet.net.cn,zhangjinkun@topnet.net.cn",
+        "api": "http://172.17.145.179:19281/_send/_mail"
+    },
+    "nextNode": [
+        {
+            "addr": "127.0.0.1",
+            "port": 1783,
+            "stype": "bidding",
+            "memo": "创建招标数据索引new"
+        }
+    ],
+    "threads": 1,
+    "lowHeavy":true,
+    "timingTask":false,
+    "timingSpanDay": 5,
+    "timingPubScope": 720,
+    "specialwords": "(重招|重新招标|勘察|施工|监理|总承包|土石方|可研)",
+    "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
+    "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批|期)",
+    "specialtitle_2": "项目[((][0-9a-zA-Z一二三四五六七八九十零123456789][))]",
+    "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
+}
+
+func historyTaskDay() {
+	defer util.Catch()
+
+	for {
+		start:=time.Now().Unix()
+
+		if gtid=="" {
+			log.Println("请传gtid,否则无法运行")
+			os.Exit(0)
+			return
+		}
+		if lteid!="" {
+			//先进行数据迁移
+			log.Println("开启一次迁移任务",gtid,lteid)
+			moveHistoryData(gtid,lteid)
+			gtid = lteid //替换数据
+		}
+
+		//查询表最后一个id
+		task_sess := task_mgo.GetMgoConn()
+		defer task_mgo.DestoryMongoConn(task_sess)
+		q:=map[string]interface{}{
+			"isused":true,
+		}
+		between_time := time.Now().Unix() - (86400 * timingPubScope)//两年周期
+		it_last := task_sess.DB(task_mgo.DbName).C(task_collName).Find(&q).Sort("-_id").Iter()
+		for tmp := make(map[string]interface{}); it_last.Next(&tmp); {
+			lteid = util.ObjToString(tmp["gtid"])
+			log.Println("查询的最后一个任务Id:",lteid)
+			break
+		}
+
+		log.Println("查询完毕-先睡眠5分钟",gtid,lteid)
+		time.Sleep(5 * time.Minute)
+
+		sess := mgo.GetMgoConn()//连接器
+		defer mgo.DestoryMongoConn(sess)
+		//开始判重
+		q = map[string]interface{}{
+			"_id": map[string]interface{}{
+				"$gt": StringTOBsonId(gtid),
+				"$lte": StringTOBsonId(lteid),
+			},
+		}
+		log.Println("历史判重查询条件:",q,"时间:", between_time)
+		it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
+		num,oknum,outnum, deterTime:= int64(0),int64(0),int64(0),int64(0) //计数
+		updateExtract := [][]map[string]interface{}{}//批量更新mongo数组
+		pendAllArr:=[][]map[string]interface{}{}//待处理数组
+		dayArr := []map[string]interface{}{}
+		for tmp := make(map[string]interface{}); it.Next(&tmp); num++ {
+			if num%10000 == 0 {
+				log.Println("正序遍历:", num)
+			}
+			source := util.ObjToMap(tmp["jsondata"])
+			if util.IntAll((*source)["sourcewebsite"]) == 1 {
+				outnum++
+				updatelock.Lock()
+				updateExtract = append(updateExtract, []map[string]interface{}{
+					map[string]interface{}{
+						"_id": tmp["_id"],
+					},
+					map[string]interface{}{
+						"$set": map[string]interface{}{
+							"repeat": 1,
+							"dataging": 0,
+							"repeat_reason": "sourcewebsite为1 重复",
+						},
+					},
+				})
+				if len(updateExtract) >= 200 {
+					log.Println("sourcewebsite,批量更新")
+					mgo.UpSertBulk(extract, updateExtract...)
+					updateExtract = [][]map[string]interface{}{}
+				}
+
+				updatelock.Unlock()
+
+
+				tmp = make(map[string]interface{})
+				continue
+			}
+
+			//取-符合-发布时间X年内的数据
+			updatelock.Lock()
+			if util.IntAll(tmp["dataging"]) == 1 {
+				pubtime := util.Int64All(tmp["publishtime"])
+				if pubtime > 0 && pubtime >= between_time {
+					oknum++
+					if deterTime==0 {
+						log.Println("找到第一条符合条件的数据")
+						deterTime = util.Int64All(tmp["publishtime"])
+						dayArr = append(dayArr,tmp)
+					}else {
+						if pubtime-deterTime >timingSpanDay*86400 {
+							//新数组重新构建,当前组数据加到全部组数据
+							pendAllArr = append(pendAllArr,dayArr)
+							dayArr = []map[string]interface{}{}
+							deterTime = util.Int64All(tmp["publishtime"])
+							dayArr = append(dayArr,tmp)
+						}else {
+							dayArr = append(dayArr,tmp)
+						}
+					}
+				}else {
+					outnum++
+					//不在两年内的也清标记
+					updateExtract = append(updateExtract, []map[string]interface{}{
+						map[string]interface{}{
+							"_id": tmp["_id"],
+						},
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"dataging": 0,
+							},
+						},
+					})
+					if len(updateExtract) >= 200 {
+						log.Println("不在周期内符合dataging==1,批量更新")
+						mgo.UpSertBulk(extract, updateExtract...)
+						updateExtract = [][]map[string]interface{}{}
+					}
+
+				}
+			}
+
+			updatelock.Unlock()
+
+			tmp = make(map[string]interface{})
+		}
+
+
+		//批量更新标记
+		updatelock.Lock()
+
+		if len(updateExtract) > 0 {
+			log.Println("分组后,最后更新不进行判重的数据:",len(updateExtract),oknum+outnum)
+			mgo.UpSertBulk(extract, updateExtract...)
+			updateExtract = [][]map[string]interface{}{}
+		}
+
+		updatelock.Unlock()
+
+
+		if len(dayArr)>0 {
+			pendAllArr = append(pendAllArr,dayArr)
+			dayArr = []map[string]interface{}{}
+		}
+
+		log.Println("查询数量:",num,"符合条件:",oknum)
+
+		if len(pendAllArr) <= 0 {
+			log.Println("没找到dataging==1的数据")
+		}
+
+		//测试分组数量是否正确
+		testNum:=0
+		for k,v:=range pendAllArr {
+			log.Println("第",k,"组--","数量:",len(v))
+			testNum = testNum+len(v)
+		}
+		log.Println("本地构建分组完成:",len(pendAllArr),"组","测试-总计数量:",testNum)
+
+		n, repeateN := 0, 0
+		log.Println("线程数:",threadNum)
+		pool := make(chan bool, threadNum)
+		wg := &sync.WaitGroup{}
+		for k,v:=range pendAllArr { //每组结束更新一波数据
+			pool <- true
+			wg.Add(1)
+			go func(k int, v []map[string]interface{}) {
+				defer func() {
+					<-pool
+					wg.Done()
+				}()
+				//每组临时数组 -  互不干扰
+				groupUpdateExtract := [][]map[string]interface{}{}
+				//
+				groupOtherExtract := [][]map[string]interface{}{}
+
+				//构建当前组的数据池
+				log.Println("构建第", k, "组---(数据池)")
+				//当前组的第一个发布时间
+				first_pt := util.Int64All(v[len(v)-1]["publishtime"])
+				curTM := TimedTaskDatamap(dupdays+int(timingSpanDay)+1, first_pt+86400, int(k))
+				log.Println("开始遍历判重第", k, "组  共计数量:", len(v))
+				n = n + len(v)
+				log.Println("统计目前总数量:", n, "重复数量:", repeateN)
+				for _, tmp := range v {
+					info := NewInfo(tmp)
+					b, source, reason := curTM.check(info)
+					if b { //有重复,生成更新语句,更新抽取和更新招标
+						repeateN++
+						//重复数据打标签
+						repeat_ids:=source.repeat_ids
+						repeat_ids =  append(repeat_ids,info.id)
+						source.repeat_ids = repeat_ids
+						//替换数据池-更新
+						DM.replacePoolData(source)
+						updatelock.Lock()
+
+
+						//更新数据源-   14 或者 15
+						//判断是否在当前段落
+						if judgeIsCurIds(gtid,lteid,source.id) {
+							groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{//重复数据打标签
+								map[string]interface{}{
+									"_id": StringTOBsonId(source.id),
+								},
+								map[string]interface{}{
+									"$set": map[string]interface{}{
+										"repeat_ids": repeat_ids,
+									},
+								},
+							})
+						}else {
+							groupOtherExtract = append(groupOtherExtract, []map[string]interface{}{//重复数据打标签
+								map[string]interface{}{
+									"_id": StringTOBsonId(source.id),
+								},
+								map[string]interface{}{
+									"$set": map[string]interface{}{
+										"repeat_ids": repeat_ids,
+									},
+								},
+							})
+						}
+						groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{
+							map[string]interface{}{
+								"_id": tmp["_id"],
+							},
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"repeat":        1,
+									"repeat_reason": reason,
+									"repeat_id":     source.id,
+									"dataging":      0,
+								},
+							},
+						})
+
+						if len(groupUpdateExtract) >= 500 {
+							mgo.UpSertBulk(extract, groupUpdateExtract...)
+							groupUpdateExtract = [][]map[string]interface{}{}
+						}
+
+						if len(groupOtherExtract) >= 500 {
+							mgo.UpSertBulk(extract_back, groupOtherExtract...)
+							groupOtherExtract = [][]map[string]interface{}{}
+						}
+
+						updatelock.Unlock()
+
+
+					} else {
+						updatelock.Lock()
+
+						groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{
+							map[string]interface{}{
+								"_id": tmp["_id"],
+							},
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"dataging": 0, //符合条件的都为dataging==0
+								},
+							},
+						})
+
+						if len(groupUpdateExtract) >= 500 {
+							mgo.UpSertBulk(extract, groupUpdateExtract...)
+							groupUpdateExtract = [][]map[string]interface{}{}
+						}
+						updatelock.Unlock()
+					}
+				}
+				//每组数据结束-更新数据
+				updatelock.Lock()
+				if len(groupUpdateExtract) > 0 {
+					mgo.UpSertBulk(extract, groupUpdateExtract...)
+				}
+
+				if len(groupOtherExtract) > 0 {
+					mgo.UpSertBulk(extract_back, groupOtherExtract...)
+				}
+				updatelock.Unlock()
+
+			}(k, v)
+
+		}
+
+		wg.Wait()
+
+
+		//任务完成,开始发送广播通知下面节点 发udp 去升索引待定 + 合并
+		if n >= repeateN && gtid!=lteid{
+			for _, to := range nextNode {
+				next_sid := util.BsonIdToSId(gtid)
+				next_eid := util.BsonIdToSId(lteid)
+				key := next_sid + "-" + next_eid + "-" + util.ObjToString(to["stype"])
+				by, _ := json.Marshal(map[string]interface{}{
+					"gtid":  next_sid,
+					"lteid": next_eid,
+					"stype": util.ObjToString(to["stype"]),
+					"key":   key,
+				})
+				addr := &net.UDPAddr{
+					IP:   net.ParseIP(to["addr"].(string)),
+					Port: util.IntAll(to["port"]),
+				}
+				node := &udpNode{by, addr, time.Now().Unix(), 0}
+				udptaskmap.Store(key, node)
+				udpclient.WriteUdp(by, mu.OP_TYPE_DATA, addr)
+			}
+		}
+
+		end:=time.Now().Unix()
+
+		log.Println("this timeTask over.", n, "repeateN:", repeateN,gtid,lteid)
+		log.Println(gtid,lteid)
+		if end-start<60*5 {
+			log.Println("睡眠.............")
+			time.Sleep(5 * time.Minute)
+		}
+		log.Println("继续下一段的历史判重")
+	}
+}func historyTaskDay() {
+ 	defer util.Catch()
+ 
+ 	for {
+ 		start:=time.Now().Unix()
+ 
+ 		if gtid=="" {
+ 			log.Println("请传gtid,否则无法运行")
+ 			os.Exit(0)
+ 			return
+ 		}
+ 		if lteid!="" {
+ 			//先进行数据迁移
+ 			log.Println("开启一次迁移任务",gtid,lteid)
+ 			moveHistoryData(gtid,lteid)
+ 			gtid = lteid //替换数据
+ 		}
+ 
+ 		//查询表最后一个id
+ 		task_sess := task_mgo.GetMgoConn()
+ 		defer task_mgo.DestoryMongoConn(task_sess)
+ 		q:=map[string]interface{}{
+ 			"isused":true,
+ 		}
+ 		between_time := time.Now().Unix() - (86400 * timingPubScope)//两年周期
+ 		it_last := task_sess.DB(task_mgo.DbName).C(task_collName).Find(&q).Sort("-_id").Iter()
+ 		for tmp := make(map[string]interface{}); it_last.Next(&tmp); {
+ 			lteid = util.ObjToString(tmp["gtid"])
+ 			log.Println("查询的最后一个任务Id:",lteid)
+ 			break
+ 		}
+ 
+ 		log.Println("查询完毕-先睡眠5分钟",gtid,lteid)
+ 		time.Sleep(5 * time.Minute)
+ 
+ 		sess := mgo.GetMgoConn()//连接器
+ 		defer mgo.DestoryMongoConn(sess)
+ 		//开始判重
+ 		q = map[string]interface{}{
+ 			"_id": map[string]interface{}{
+ 				"$gt": StringTOBsonId(gtid),
+ 				"$lte": StringTOBsonId(lteid),
+ 			},
+ 		}
+ 		log.Println("历史判重查询条件:",q,"时间:", between_time)
+ 		it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
+ 		num,oknum,outnum, deterTime:= int64(0),int64(0),int64(0),int64(0) //计数
+ 		updateExtract := [][]map[string]interface{}{}//批量更新mongo数组
+ 		pendAllArr:=[][]map[string]interface{}{}//待处理数组
+ 		dayArr := []map[string]interface{}{}
+ 		for tmp := make(map[string]interface{}); it.Next(&tmp); num++ {
+ 			if num%10000 == 0 {
+ 				log.Println("正序遍历:", num)
+ 			}
+ 			source := util.ObjToMap(tmp["jsondata"])
+ 			if util.IntAll((*source)["sourcewebsite"]) == 1 {
+ 				outnum++
+ 				updatelock.Lock()
+ 				updateExtract = append(updateExtract, []map[string]interface{}{
+ 					map[string]interface{}{
+ 						"_id": tmp["_id"],
+ 					},
+ 					map[string]interface{}{
+ 						"$set": map[string]interface{}{
+ 							"repeat": 1,
+ 							"dataging": 0,
+ 							"repeat_reason": "sourcewebsite为1 重复",
+ 						},
+ 					},
+ 				})
+ 				if len(updateExtract) >= 200 {
+ 					log.Println("sourcewebsite,批量更新")
+ 					mgo.UpSertBulk(extract, updateExtract...)
+ 					updateExtract = [][]map[string]interface{}{}
+ 				}
+ 
+ 				updatelock.Unlock()
+ 
+ 
+ 				tmp = make(map[string]interface{})
+ 				continue
+ 			}
+ 
+ 			//取-符合-发布时间X年内的数据
+ 			updatelock.Lock()
+ 			if util.IntAll(tmp["dataging"]) == 1 {
+ 				pubtime := util.Int64All(tmp["publishtime"])
+ 				if pubtime > 0 && pubtime >= between_time {
+ 					oknum++
+ 					if deterTime==0 {
+ 						log.Println("找到第一条符合条件的数据")
+ 						deterTime = util.Int64All(tmp["publishtime"])
+ 						dayArr = append(dayArr,tmp)
+ 					}else {
+ 						if pubtime-deterTime >timingSpanDay*86400 {
+ 							//新数组重新构建,当前组数据加到全部组数据
+ 							pendAllArr = append(pendAllArr,dayArr)
+ 							dayArr = []map[string]interface{}{}
+ 							deterTime = util.Int64All(tmp["publishtime"])
+ 							dayArr = append(dayArr,tmp)
+ 						}else {
+ 							dayArr = append(dayArr,tmp)
+ 						}
+ 					}
+ 				}else {
+ 					outnum++
+ 					//不在两年内的也清标记
+ 					updateExtract = append(updateExtract, []map[string]interface{}{
+ 						map[string]interface{}{
+ 							"_id": tmp["_id"],
+ 						},
+ 						map[string]interface{}{
+ 							"$set": map[string]interface{}{
+ 								"dataging": 0,
+ 							},
+ 						},
+ 					})
+ 					if len(updateExtract) >= 200 {
+ 						log.Println("不在周期内符合dataging==1,批量更新")
+ 						mgo.UpSertBulk(extract, updateExtract...)
+ 						updateExtract = [][]map[string]interface{}{}
+ 					}
+ 
+ 				}
+ 			}
+ 
+ 			updatelock.Unlock()
+ 
+ 			tmp = make(map[string]interface{})
+ 		}
+ 
+ 
+ 		//批量更新标记
+ 		updatelock.Lock()
+ 
+ 		if len(updateExtract) > 0 {
+ 			log.Println("分组后,最后更新不进行判重的数据:",len(updateExtract),oknum+outnum)
+ 			mgo.UpSertBulk(extract, updateExtract...)
+ 			updateExtract = [][]map[string]interface{}{}
+ 		}
+ 
+ 		updatelock.Unlock()
+ 
+ 
+ 		if len(dayArr)>0 {
+ 			pendAllArr = append(pendAllArr,dayArr)
+ 			dayArr = []map[string]interface{}{}
+ 		}
+ 
+ 		log.Println("查询数量:",num,"符合条件:",oknum)
+ 
+ 		if len(pendAllArr) <= 0 {
+ 			log.Println("没找到dataging==1的数据")
+ 		}
+ 
+ 		//测试分组数量是否正确
+ 		testNum:=0
+ 		for k,v:=range pendAllArr {
+ 			log.Println("第",k,"组--","数量:",len(v))
+ 			testNum = testNum+len(v)
+ 		}
+ 		log.Println("本地构建分组完成:",len(pendAllArr),"组","测试-总计数量:",testNum)
+ 
+ 		n, repeateN := 0, 0
+ 		log.Println("线程数:",threadNum)
+ 		pool := make(chan bool, threadNum)
+ 		wg := &sync.WaitGroup{}
+ 		for k,v:=range pendAllArr { //每组结束更新一波数据
+ 			pool <- true
+ 			wg.Add(1)
+ 			go func(k int, v []map[string]interface{}) {
+ 				defer func() {
+ 					<-pool
+ 					wg.Done()
+ 				}()
+ 				//每组临时数组 -  互不干扰
+ 				groupUpdateExtract := [][]map[string]interface{}{}
+ 				//
+ 				groupOtherExtract := [][]map[string]interface{}{}
+ 
+ 				//构建当前组的数据池
+ 				log.Println("构建第", k, "组---(数据池)")
+ 				//当前组的第一个发布时间
+ 				first_pt := util.Int64All(v[len(v)-1]["publishtime"])
+ 				curTM := TimedTaskDatamap(dupdays+int(timingSpanDay)+1, first_pt+86400, int(k))
+ 				log.Println("开始遍历判重第", k, "组  共计数量:", len(v))
+ 				n = n + len(v)
+ 				log.Println("统计目前总数量:", n, "重复数量:", repeateN)
+ 				for _, tmp := range v {
+ 					info := NewInfo(tmp)
+ 					b, source, reason := curTM.check(info)
+ 					if b { //有重复,生成更新语句,更新抽取和更新招标
+ 						repeateN++
+ 						//重复数据打标签
+ 						repeat_ids:=source.repeat_ids
+ 						repeat_ids =  append(repeat_ids,info.id)
+ 						source.repeat_ids = repeat_ids
+ 						//替换数据池-更新
+ 						DM.replacePoolData(source)
+ 						updatelock.Lock()
+ 
+ 
+ 						//更新数据源-   14 或者 15
+ 						//判断是否在当前段落
+ 						if judgeIsCurIds(gtid,lteid,source.id) {
+ 							groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{//重复数据打标签
+ 								map[string]interface{}{
+ 									"_id": StringTOBsonId(source.id),
+ 								},
+ 								map[string]interface{}{
+ 									"$set": map[string]interface{}{
+ 										"repeat_ids": repeat_ids,
+ 									},
+ 								},
+ 							})
+ 						}else {
+ 							groupOtherExtract = append(groupOtherExtract, []map[string]interface{}{//重复数据打标签
+ 								map[string]interface{}{
+ 									"_id": StringTOBsonId(source.id),
+ 								},
+ 								map[string]interface{}{
+ 									"$set": map[string]interface{}{
+ 										"repeat_ids": repeat_ids,
+ 									},
+ 								},
+ 							})
+ 						}
+ 						groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{
+ 							map[string]interface{}{
+ 								"_id": tmp["_id"],
+ 							},
+ 							map[string]interface{}{
+ 								"$set": map[string]interface{}{
+ 									"repeat":        1,
+ 									"repeat_reason": reason,
+ 									"repeat_id":     source.id,
+ 									"dataging":      0,
+ 								},
+ 							},
+ 						})
+ 
+ 						if len(groupUpdateExtract) >= 500 {
+ 							mgo.UpSertBulk(extract, groupUpdateExtract...)
+ 							groupUpdateExtract = [][]map[string]interface{}{}
+ 						}
+ 
+ 						if len(groupOtherExtract) >= 500 {
+ 							mgo.UpSertBulk(extract_back, groupOtherExtract...)
+ 							groupOtherExtract = [][]map[string]interface{}{}
+ 						}
+ 
+ 						updatelock.Unlock()
+ 
+ 
+ 					} else {
+ 						updatelock.Lock()
+ 
+ 						groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{
+ 							map[string]interface{}{
+ 								"_id": tmp["_id"],
+ 							},
+ 							map[string]interface{}{
+ 								"$set": map[string]interface{}{
+ 									"dataging": 0, //符合条件的都为dataging==0
+ 								},
+ 							},
+ 						})
+ 
+ 						if len(groupUpdateExtract) >= 500 {
+ 							mgo.UpSertBulk(extract, groupUpdateExtract...)
+ 							groupUpdateExtract = [][]map[string]interface{}{}
+ 						}
+ 						updatelock.Unlock()
+ 					}
+ 				}
+ 				//每组数据结束-更新数据
+ 				updatelock.Lock()
+ 				if len(groupUpdateExtract) > 0 {
+ 					mgo.UpSertBulk(extract, groupUpdateExtract...)
+ 				}
+ 
+ 				if len(groupOtherExtract) > 0 {
+ 					mgo.UpSertBulk(extract_back, groupOtherExtract...)
+ 				}
+ 				updatelock.Unlock()
+ 
+ 			}(k, v)
+ 
+ 		}
+ 
+ 		wg.Wait()
+ 
+ 
+ 		//任务完成,开始发送广播通知下面节点 发udp 去升索引待定 + 合并
+ 		if n >= repeateN && gtid!=lteid{
+ 			for _, to := range nextNode {
+ 				next_sid := util.BsonIdToSId(gtid)
+ 				next_eid := util.BsonIdToSId(lteid)
+ 				key := next_sid + "-" + next_eid + "-" + util.ObjToString(to["stype"])
+ 				by, _ := json.Marshal(map[string]interface{}{
+ 					"gtid":  next_sid,
+ 					"lteid": next_eid,
+ 					"stype": util.ObjToString(to["stype"]),
+ 					"key":   key,
+ 				})
+ 				addr := &net.UDPAddr{
+ 					IP:   net.ParseIP(to["addr"].(string)),
+ 					Port: util.IntAll(to["port"]),
+ 				}
+ 				node := &udpNode{by, addr, time.Now().Unix(), 0}
+ 				udptaskmap.Store(key, node)
+ 				udpclient.WriteUdp(by, mu.OP_TYPE_DATA, addr)
+ 			}
+ 		}
+ 
+ 		end:=time.Now().Unix()
+ 
+ 		log.Println("this timeTask over.", n, "repeateN:", repeateN,gtid,lteid)
+ 		log.Println(gtid,lteid)
+ 		if end-start<60*5 {
+ 			log.Println("睡眠.............")
+ 			time.Sleep(5 * time.Minute)
+ 		}
+ 		log.Println("继续下一段的历史判重")
+ 	}
+ }	       		
+
+
+
+
+func getDB() *mgo.Database {
+session, err := mgo.Dial("127.0.0.1:27017")
+if err != nil {
+panic(err)
+}
+session.SetMode(mgo.Monotonic, true)
+db := session.DB("zhengkun")
+return db
+}
+

+ 49 - 0
flow_repeat/config.json

@@ -0,0 +1,49 @@
+{
+    "udpport": ":1785",
+    "dupdays": 7,
+    "mongodb": {
+        "addr": "127.0.0.1:27017",
+        "db": "zhengkun",
+        "username": "",
+        "password": "",
+        "extract": "repeat_test",
+        "extract_back": "repeat_test",
+        "extract_log": "result_replace_log",
+        "pool": 5
+    },
+    "task_mongodb": {
+        "task_addr": "127.0.0.1:27017",
+        "task_db": "zhengkun",
+        "username": "",
+        "password": "",
+        "task_coll": "bidding_processing_ids",
+        "task_bidding": "bidding",
+        "task_pool": 5
+    },
+    "spider_mongodb": {
+        "spider_addr": "127.0.0.1:27017",
+        "spider_db": "zhengkun",
+        "username": "",
+        "password": "",
+        "spider_coll": "site",
+        "spider_pool": 5
+    },
+    "jkmail": {
+        "to": "zhengkun@topnet.net.cn,zhangjinkun@topnet.net.cn",
+        "api": "http://172.17.145.179:19281/_send/_mail"
+    },
+    "nextNode": [],
+    "jyfb_data": [
+        "a_jyxxfbpt_gg"
+    ],
+    "threads": 1,
+    "lowHeavy":true,
+    "timingTask":false,
+    "timingSpanDay": 4,
+    "timingPubScope": 720,
+    "specialwords": "(重招|重新招标|勘察|施工|监理|总承包|土石方|可研)",
+    "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
+    "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批|期)",
+    "specialtitle_2": "项目[((][0-9a-zA-Z一二三四五六七八九十零123456789][))]",
+    "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
+}

+ 447 - 0
flow_repeat/dataMethod.go

@@ -0,0 +1,447 @@
+package main
+
+import (
+	qutil "jygit.jydev.jianyu360.cn/data_processing/common_utils"
+	"math"
+	"regexp"
+	"strings"
+)
+
+var cleanNameReg_0 = regexp.MustCompile("([(())::\\s ])")
// Patterns used when normalising titles / project names.
var (
	// cleanNameReg_1 matches, at the end of the text, "项目" followed by up to
	// five arbitrary characters, one announcement keyword and an optional
	// trailing "公告".
	cleanNameReg_1 = regexp.MustCompile("(项目)(.{0,5})(招标|中标|中标结果|成交|候选人|竞谈|竞争性磋商)(公告)?$")

	// cleanNameReg_2 matches a trailing "公告", "公示" or doubled "公告公告".
	cleanNameReg_2 = regexp.MustCompile("(公告|公示|公告公告)$")

	// cleanNameReg_3 matches "公开" or "的" immediately followed by
	// "比选", "招标" or "单一来源".
	cleanNameReg_3 = regexp.MustCompile("(公开|的)(比选|招标|单一来源)")

	// un_cleanNameReg_1 matches a trailing "项目<N>次<keyword>[公告]" where <N>
	// is a single Chinese numeral 一..九 or digit 1-9.
	un_cleanNameReg_1 = regexp.MustCompile("(项目[一二三四五六七八九1-9][次](招标|中标|中标结果|成交|候选人|竞谈|竞争性磋商)(公告)?)$")
)
+
// convertArabicNumeralsAndLetters normalises a string before duplicate
// detection: ASCII letters are upper-cased and the ASCII digits 0-9 are
// replaced by the Chinese numerals 零一二三四五六七八九. Every other byte
// (including full-width letters and digits) is copied through unchanged.
//
// The previous implementation compiled up to twelve regular expressions on
// every call and discarded their errors; a single byte-wise pass yields the
// same output without any regexp work.
func convertArabicNumeralsAndLetters(data string) string {
	chineseDigits := [...]string{"零", "一", "二", "三", "四", "五", "六", "七", "八", "九"}

	var b strings.Builder
	b.Grow(len(data))
	// ASCII bytes never appear inside a multi-byte UTF-8 sequence, so walking
	// the string byte by byte cannot split or corrupt a rune.
	for i := 0; i < len(data); i++ {
		c := data[i]
		switch {
		case 'a' <= c && c <= 'z':
			b.WriteByte(c - 'a' + 'A')
		case '0' <= c && c <= '9':
			b.WriteString(chineseDigits[c-'0'])
		default:
			b.WriteByte(c)
		}
	}
	return b.String()
}
+
// dealWithSpecialPhrases normalises special phrases in both inputs so they
// compare equal: every occurrence of "重新招标" is shortened to "重招".
// The two (possibly rewritten) strings are returned in the same order.
//
// The phrase is a fixed literal, so a plain string replacement is used instead
// of compiling a regular expression on every call.
func dealWithSpecialPhrases(str1 string, str2 string) (string, string) {
	const (
		longForm  = "重新招标"
		shortForm = "重招"
	)
	return strings.Replace(str1, longForm, shortForm, -1),
		strings.Replace(str2, longForm, shortForm, -1)
}
+
+// 关键词数量v
+func dealWithSpecialWordNumber(info *Info, v *Info) int {
+	okNum := 0
+	if info.titleSpecialWord || info.specialWord {
+		okNum++
+	}
+	if v.titleSpecialWord || v.specialWord {
+		okNum++
+	}
+	return okNum
+}
+
+// 关键词再次判断
+func againRepeat(v *Info, info *Info, site bool) bool {
+	if isPublishtimeInterval(info.publishtime, v.publishtime) && site {
+		return true
+	}
+	if isBidopentimeInterval(info.bidopentime, v.bidopentime) {
+		return true
+	}
+	if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
+		return true
+	}
+	if isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0 && info.bidamount != 0 {
+		return true
+	}
+	if v.winner != info.winner && v.winner != "" && info.winner != "" {
+		return true
+	}
+	if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
+		return true
+	}
+	if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
+		return true
+	}
+	if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
+		return true
+	}
+	if v.c_title != "" && info.c_title != "" && v.c_title != info.c_title {
+		if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
+			return true
+		}
+	}
+	return false
+}
+
+// 均含有关键词再次判断
+func againContainSpecialWord(v *Info, info *Info) bool {
+	if isBidopentimeInterval(info.bidopentime, v.bidopentime) {
+		return true
+	}
+	if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
+		return true
+	}
+	if isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0 && info.bidamount != 0 {
+		return true
+	}
+	if v.winner != info.winner && v.winner != "" && info.winner != "" {
+		return true
+	}
+	if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
+		return true
+	}
+	if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
+		return true
+	}
+	//提取标题-标段号处理
+	if dealTitleSpecial(v.title, info.title) {
+		return true
+	}
+
+	return false
+}
+
+// 提取标题-标段号处理
+func dealTitleSpecial(title1 string, title2 string) bool {
+	regular1 := "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789]+[))]?"
+	regular2 := "[0-9a-zA-Z一二三四五六七八九十零123456789]+(包|标段|标包)"
+	regx1_1, _ := regexp.Compile(regular1)
+	str1 := regx1_1.FindString(title1)
+	if str1 == "" {
+		regx1_2, _ := regexp.Compile(regular2)
+		str1 = regx1_2.FindString(title1)
+	}
+	regx2_1, _ := regexp.Compile(regular1)
+	str2 := regx2_1.FindString(title2)
+	if str2 == "" {
+		regx2_2, _ := regexp.Compile(regular2)
+		str2 = regx2_2.FindString(title2)
+	}
+	//根据提取的结果,在进行清洗
+	if str1 != "" {
+		str1 = deleteExtraSpaceName(str1)
+		str1 = cleanNameReg_0.ReplaceAllString(str1, "")
+		str1 = convertArabicNumeralsAndLetters(str1)
+	}
+	if str2 != "" {
+		str2 = deleteExtraSpaceName(str2)
+		str2 = cleanNameReg_0.ReplaceAllString(str2, "")
+		str2 = convertArabicNumeralsAndLetters(str2)
+	}
+	if str1 != str2 {
+		return true
+	} else {
+		return false
+	}
+}
+
// extraSpaceRunReg matches a run of two or more whitespace characters.
var extraSpaceRunReg = regexp.MustCompile("\\s{2,}")

// deleteExtraSpaceName collapses redundant whitespace in s (used for winner
// names and similar labels).
//
// It first replaces every pair of consecutive spaces with a single space, then
// shrinks each remaining run of two or more whitespace characters (space, tab,
// newline, ...) down to the first character of that run. Single whitespace
// characters are kept as they are.
//
// The previous version recompiled the regexp on every call and re-scanned the
// whole string after each removal; one ReplaceAllStringFunc pass produces the
// same result because a collapsed run is always followed by non-whitespace.
func deleteExtraSpaceName(s string) string {
	collapsed := strings.Replace(s, "  ", " ", -1)
	return extraSpaceRunReg.ReplaceAllStringFunc(collapsed, func(run string) string {
		// \s only matches single-byte ASCII whitespace, so the first byte is
		// the first character of the run.
		return run[:1]
	})
}
+
// isBidWinningAmount reports whether two bid amounts are DIFFERENT.
//
// Amounts are treated as the same (false is returned) when they are equal or
// when one equals the other multiplied by 10000, i.e. the same figure written
// in a unit 10000 times larger. Anything else returns true.
//
// Comparisons use a tiny relative tolerance: with exact float64 equality a
// genuine unit conversion such as 0.07 vs 700 was reported as different,
// because 0.07*10000 evaluates to 700.0000000000001.
func isBidWinningAmount(f1 float64, f2 float64) bool {
	const unitFactor = 10000
	same := bidAmountsEqual(f1, f2) ||
		bidAmountsEqual(f1*unitFactor, f2) ||
		bidAmountsEqual(f2*unitFactor, f1)
	return !same
}

// bidAmountsEqual reports whether a and b are equal up to floating-point
// rounding noise (relative tolerance 1e-12). NaN never equals anything and an
// infinity only equals the same infinity.
func bidAmountsEqual(a, b float64) bool {
	if a == b {
		return true
	}
	if math.IsInf(a, 0) || math.IsInf(b, 0) {
		return false
	}
	return math.Abs(a-b) <= 1e-12*math.Max(math.Abs(a), math.Abs(b))
}
+
// isTimeIntervalPeriod reports whether the two Unix timestamps (seconds) lie
// strictly less than 48 hours (172800 seconds) apart.
func isTimeIntervalPeriod(i1 int64, i2 int64) bool {
	const fortyEightHours = 172800.0
	gap := math.Abs(float64(i1 - i2))
	return gap < fortyEightHours
}
+
+// 开标时间区间为一天
+func isBidopentimeInterval(i1 int64, i2 int64) bool {
+	if i1 == 0 || i2 == 0 {
+		return false
+	}
+	//不在同一天-或者同一天间隔超过六小时,属于不相等返回true
+	timeOne, timeTwo := i1, i2
+	day1 := qutil.FormatDateByInt64(&timeOne, qutil.Date_yyyyMMdd)
+	day2 := qutil.FormatDateByInt64(&timeTwo, qutil.Date_yyyyMMdd)
+	if day1 == day2 {
+		//是否间隔超过十二小时
+		if math.Abs(float64(i1-i2)) > 43200.0 {
+			return true
+		} else {
+			return false
+		}
+	} else {
+		return true
+	}
+}
+
+// 发布时间区间为一天
+func isPublishtimeInterval(i1 int64, i2 int64) bool {
+	if i1 == 0 || i2 == 0 {
+		return false
+	}
+	//不在同一天-或者同一天间隔超过12小时,属于不相等返回true
+	timeOne, timeTwo := i1, i2
+	day1 := qutil.FormatDateByInt64(&timeOne, qutil.Date_yyyyMMdd)
+	day2 := qutil.FormatDateByInt64(&timeTwo, qutil.Date_yyyyMMdd)
+	if day1 == day2 {
+		//是否间隔超过十二小时
+		if math.Abs(float64(i1-i2)) >= 43200.0 {
+			return true
+		} else {
+			return false
+		}
+	} else {
+		return true
+	}
+}
+
+// 时间区间为一天
+func isTheSameDay(i1 int64, i2 int64) bool {
+	if i1 == 0 || i2 == 0 {
+		return false
+	}
+	timeOne, timeTwo := i1, i2
+	day1 := qutil.FormatDateByInt64(&timeOne, qutil.Date_yyyyMMdd)
+	day2 := qutil.FormatDateByInt64(&timeTwo, qutil.Date_yyyyMMdd)
+	if day1 == day2 {
+		return true
+	}
+	return false
+}
+
+// 前置0 五要素均相等认为重复
+func leadingElementSame(v *Info, info *Info) bool {
+	isok := 0
+	if info.projectname != "" && v.projectname == info.projectname {
+		isok++
+	}
+	if info.buyer != "" && v.buyer == info.buyer {
+		isok++
+	}
+	if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
+		if info.contractnumber != "" && v.contractnumber == info.contractnumber {
+			isok++
+		}
+	} else {
+		if info.projectcode != "" && v.projectcode == info.projectcode {
+			isok++
+		}
+	}
+	if info.title != "" && v.title == info.title {
+		isok++
+	}
+	if v.agency == info.agency {
+		isok++
+	}
+	if v.winner == info.winner && info.winner != "" {
+		isok++
+	}
+
+	if isok >= 5 { //加一层金额单位的逻辑校验
+		if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
+			return false
+		}
+		if isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0.0 && info.bidamount != 0.0 {
+			return false
+		}
+		if v.winner != "" && info.winner != "" && v.winner != info.winner {
+			return false
+		}
+		return true
+	}
+
+	return false
+}
+
+// 前置0 竞品要素简易计算
+func jingPinElementSame(v *Info, info *Info) bool {
+	if info.projectname != "" && v.projectname != info.projectname {
+		return false
+	}
+	if info.buyer != "" && v.buyer != info.buyer {
+		return false
+	}
+	if info.projectcode != "" && v.projectcode != info.projectcode {
+		return false
+	}
+	if v.agency != info.agency {
+		return false
+	}
+	return true
+}
+
+// buyerIsContinue decides, for two records whose buyers differ, whether the
+// candidate v can be skipped outright. It returns true when the records were not
+// published on the same day, or when the cleaned project names, the contract
+// numbers or the project codes are both present and different.
+//
+// NOTE(review): the original also tested the cleaned titles, but only to repeat
+// the cleaned-project-name test that had already returned; differing titles on
+// their own therefore never cause a skip. Confirm that this is intended.
+func buyerIsContinue(v *Info, info *Info) bool {
+	conflict := func(a, b string) bool {
+		return a != "" && b != "" && a != b
+	}
+	switch {
+	case !isTheSameDay(info.publishtime, v.publishtime):
+		return true
+	case conflict(v.c_projectname, info.c_projectname):
+		return true
+	case conflict(v.contractnumber, info.contractnumber):
+		return true
+	case conflict(v.projectcode, info.projectcode):
+		return true
+	}
+	return false
+}
+
+// 判断~是否需要替换数据相关
+func judgeIsReplaceInfo(s_href string, i_href string) bool {
+	if strings.Contains(s_href, "https://www.jianyu360.cn") && i_href != "" &&
+		!strings.Contains(i_href, "https://www.jianyu360.cn") {
+		return true
+	}
+	return false
+}
+
+// 查询抽取表数据
+func confrimExtractData(source_id string, info_id string) (bool, map[string]interface{}, map[string]interface{}) {
+	source_data := map[string]interface{}{}
+	info_data := map[string]interface{}{}
+	isvalid := false
+	source_data = data_mgo.FindById(extract, source_id)
+	info_data = data_mgo.FindById(extract, info_id)
+	if len(source_data) > 2 && len(info_data) > 2 {
+		isvalid = true
+		ts_id := source_data["_id"]
+		ti_id := info_data["_id"]
+		source_data["_id"] = ti_id
+		info_data["_id"] = ts_id
+	}
+	return isvalid, info_data, source_data
+}
+
+// 查询历史抽取表数据
+func confrimHistoryExtractData(source_id string, info_id string) (bool, bool, map[string]interface{}, map[string]interface{}) {
+	source_data := map[string]interface{}{}
+	info_data := map[string]interface{}{}
+	isvalid := false
+	isexists := false
+	if judgeIsCurIds(gtid, lteid, source_id) {
+		isexists = true
+		source_data = data_mgo.FindById(extract, source_id)
+	} else {
+		source_data = data_mgo.FindById(extract_back, source_id)
+	}
+	info_data = data_mgo.FindById(extract, info_id)
+	if len(source_data) > 2 && len(info_data) > 2 {
+		isvalid = true
+		ts_id := source_data["_id"]
+		ti_id := info_data["_id"]
+		source_data["_id"] = ti_id
+		info_data["_id"] = ts_id
+	}
+	return isvalid, isexists, info_data, source_data
+}
+
+// 查询bidding表数据
+func confrimBiddingData(source_id string, info_id string) (bool, map[string]interface{}, map[string]interface{}) {
+	source_data := map[string]interface{}{}
+	info_data := map[string]interface{}{}
+	isvalid := false
+	source_data = task_mgo.FindById(task_bidding, source_id)
+	info_data = task_mgo.FindById(task_bidding, info_id)
+	if len(source_data) > 2 && len(info_data) > 2 {
+		isvalid = true
+		ts_id := source_data["_id"]
+		ti_id := info_data["_id"]
+		source_data["_id"] = ti_id
+		info_data["_id"] = ts_id
+	}
+	return isvalid, info_data, source_data
+}
+
+// IsJpHref reports whether href is what this file calls a 竞品 (competitor)
+// link, i.e. whether it contains "www.jianyu360". An empty href is never one.
+func IsJpHref(href string) bool {
+	return strings.Contains(href, "www.jianyu360")
+}
+
+// confirmJingPinIsRepeatData verifies a duplicate in 竞品 (competitor-link) mode.
+// Both cleaned titles must be present. If neither cleaned title contains the
+// other, the simplified element comparison must pass instead. After that the
+// records must share the publish day and must not conflict on budget, bid
+// amount, winner, contract number or project code (a conflict needs both sides
+// to carry a value).
+func confirmJingPinIsRepeatData(v *Info, info *Info) bool {
+	if v.c_title == "" || info.c_title == "" {
+		return false
+	}
+	titlesNest := strings.Contains(v.c_title, info.c_title) || strings.Contains(info.c_title, v.c_title)
+	if !titlesNest && !jingPinElementSame(v, info) {
+		return false
+	}
+	switch {
+	case !isTheSameDay(v.publishtime, info.publishtime):
+		return false
+	case v.budget != info.budget && v.budget != 0 && info.budget != 0:
+		return false
+	// presumably isBidWinningAmount reports differing amounts — confirm against its definition
+	case isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0.0 && info.bidamount != 0.0:
+		return false
+	case v.winner != "" && info.winner != "" && v.winner != info.winner:
+		return false
+	case v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber:
+		return false
+	case v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode:
+		return false
+	}
+	return true
+}
+
+// cleanNameFilterRedundant applies the common name clean-up pipeline and returns
+// the result: cleanNameReg_0 matches are removed; unless un_cleanNameReg_1
+// matches, cleanNameReg_1 matches are reduced to their groups 1 and 3;
+// cleanNameReg_2 matches are removed; finally cleanNameReg_3 matches are reduced
+// to their group 2.
+func cleanNameFilterRedundant(name string) string {
+	cleaned := cleanNameReg_0.ReplaceAllString(name, "")
+	if !un_cleanNameReg_1.MatchString(cleaned) {
+		cleaned = cleanNameReg_1.ReplaceAllString(cleaned, "${1}${3}")
+	}
+	cleaned = cleanNameReg_2.ReplaceAllString(cleaned, "")
+	return cleanNameReg_3.ReplaceAllString(cleaned, "${2}")
+}

+ 492 - 0
flow_repeat/dataMethodHeavy.go

@@ -0,0 +1,492 @@
+package main
+
+import (
+	"strings"
+)
+
+// 判重方法1
+func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
+
+	isMeet := false
+	if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
+		info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
+		info.subtype == "变更" || info.subtype == "其他" {
+		//招标结果
+		if isMeet, reason = tenderRepeat_A(v, info, reason); isMeet {
+			if tenderRepeat_C(v, info) {
+				return false, reason
+			} else {
+				reason = reason + "---招标类"
+				return true, reason
+			}
+		} else {
+			return false, reason
+		}
+
+	} else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
+		//中标结果
+		if isMeet, reason = winningRepeat_A(v, info, reason); isMeet {
+			if winningRepeat_C(v, info) {
+				return false, reason
+			} else {
+				reason = reason + "---中标类"
+				return true, reason
+			}
+		} else {
+			return false, reason
+		}
+
+	} else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
+		//合同
+		if isMeet, reason = contractRepeat_A(v, info, reason); isMeet {
+			if contractRepeat_C(v, info) {
+				return false, reason
+			} else {
+				reason = reason + "---合同类"
+				return true, reason
+			}
+		} else {
+			return false, reason
+		}
+	} else {
+		//招标结果
+		if isMeet, reason = tenderRepeat_A(v, info, reason); isMeet {
+			if tenderRepeat_C(v, info) {
+				return false, reason
+			} else {
+				reason = reason + "---类别空-招标类"
+				return true, reason
+			}
+		} else {
+			return false, reason
+		}
+	}
+
+	return false, reason
+}
+
+// quickHeavyMethodTwo is duplicate rule set 2, driven by the agencies.
+//
+//   - If at least one agency is empty it delegates to quickHeavyMethodOne and, on
+//     a hit, appends "---机构最少一个空".
+//   - If both agencies are present, differ, and neither contains the other, the
+//     records are not duplicates.
+//   - Otherwise (agencies equal, or one contains the other) the "_B" counting
+//     rule for info.subtype must match and the "_C" conflict rule must find no
+//     contradiction; other subtypes fall back to the tender rules.
+func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
+	var matched bool
+
+	// at least one agency missing
+	if v.agency == "" || info.agency == "" {
+		if matched, reason = quickHeavyMethodOne(v, info, reason); matched {
+			return true, reason + "---机构最少一个空"
+		}
+		return false, reason
+	}
+
+	// both agencies present: they must be equal or nested inside each other
+	if v.agency != info.agency &&
+		!strings.Contains(v.agency, info.agency) && !strings.Contains(info.agency, v.agency) {
+		return false, reason
+	}
+
+	switch info.subtype {
+	case "招标", "邀标", "询价", "竞谈", "单一", "竞价", "变更", "其他":
+		// tender-type notices
+		if matched, reason = tenderRepeat_B(v, info, reason); !matched || tenderRepeat_C(v, info) {
+			return false, reason
+		}
+		return true, reason + "---招标类"
+	case "中标", "成交", "废标", "流标":
+		// result-type notices
+		if matched, reason = winningRepeat_B(v, info, reason); !matched || winningRepeat_C(v, info) {
+			return false, reason
+		}
+		return true, reason + "---中标类"
+	case "合同", "验收", "违规":
+		// contract-type notices
+		if matched, reason = contractRepeat_B(v, info, reason); !matched || contractRepeat_C(v, info) {
+			return false, reason
+		}
+		return true, reason + "---合同类"
+	default:
+		// unknown or empty subtype: judged with the tender rules
+		if matched, reason = tenderRepeat_B(v, info, reason); !matched || tenderRepeat_C(v, info) {
+			return false, reason
+		}
+		return true, reason + "---类别空-招标类"
+	}
+}
+
+// tenderRepeat_A is the tender element rule. It evaluates seven match flags
+// between v and info:
+//
+//	p1  project name equal (non-empty)
+//	p2  buyer equal (non-empty)
+//	p3  project code or contract number equal, at least 5 bytes long
+//	p4  budget equal (non-zero)
+//	p9  bid-opening time equal (non-zero)
+//	p10 bid-opening address equal (non-empty)
+//	p11 both titles longer than 10 runes and one contains the other, either raw
+//	    or in cleaned form
+//
+// It reports a match when info.subtype is set and p1, p3 and p11 all hold, or
+// when one of the accepted three-flag combinations holds. On a match the reason
+// gets the list of satisfied flags appended; otherwise it is returned unchanged.
+func tenderRepeat_A(v *Info, info *Info, reason string) (bool, string) {
+	var hits string
+
+	p1 := v.projectname != "" && v.projectname == info.projectname
+	if p1 {
+		hits += "p1-名称-"
+	}
+	p2 := v.buyer != "" && v.buyer == info.buyer
+	if p2 {
+		hits += "p2-单位-"
+	}
+	p3 := (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5) ||
+		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5)
+	if p3 {
+		hits += "p3-编号组-"
+	}
+	p4 := v.budget != 0 && v.budget == info.budget
+	if p4 {
+		hits += "p4-预算-"
+	}
+	p9 := v.bidopentime != 0 && v.bidopentime == info.bidopentime
+	if p9 {
+		hits += "p9-开标时间相同-"
+	}
+	p10 := v.bidopenaddress != "" && v.bidopenaddress == info.bidopenaddress
+	if p10 {
+		hits += "p10-开标地点-"
+	}
+	p11 := false
+	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 {
+		rawNested := strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)
+		cleanNested := v.c_title != "" && info.c_title != "" &&
+			(strings.Contains(v.c_title, info.c_title) || strings.Contains(info.c_title, v.c_title))
+		p11 = rawNested || cleanNested
+	}
+	if p11 {
+		hits += "p11-标题-"
+	}
+
+	// The 23 accepted three-flag combinations, factored by their first two flags.
+	combo := (p1 && p2 && (p3 || p4 || p9 || p10 || p11)) ||
+		(p1 && p3 && (p4 || p9 || p10)) ||
+		(p1 && p4 && (p9 || p10)) ||
+		(p2 && p3 && (p4 || p9 || p10 || p11)) ||
+		(p2 && p4 && (p9 || p10 || p11)) ||
+		(p3 && p4 && (p9 || p10 || p11)) ||
+		(p4 && p9 && (p10 || p11)) ||
+		(p9 && p10 && p11)
+
+	if (info.subtype != "" && p1 && p3 && p11) || combo {
+		return true, reason + "满足招标A,3要素组合-" + hits + ","
+	}
+	return false, reason
+}
+
+// tenderRepeat_B is the tender counting rule used when the agencies agree. It
+// counts matching elements out of project name, buyer, number (project code or
+// contract number, at least 5 bytes), budget, bid-opening time and title (both
+// longer than 10 runes, one containing the other). The bid-opening address is
+// deliberately not counted (it was commented out in the original).
+//
+// At least two elements must match, and a match made up of exactly the project
+// name and the title is rejected. On success "满足招标B,六选二," is appended to
+// the reason.
+func tenderRepeat_B(v *Info, info *Info, reason string) (bool, string) {
+	total, nameLike := 0, 0
+
+	if v.projectname != "" && v.projectname == info.projectname {
+		total, nameLike = total+1, nameLike+1
+	}
+	if v.buyer != "" && v.buyer == info.buyer {
+		total++
+	}
+	sameCode := v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5
+	sameContract := v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5
+	if sameCode || sameContract {
+		total++
+	}
+	if v.budget != 0 && v.budget == info.budget {
+		total++
+	}
+	if v.bidopentime != 0 && v.bidopentime == info.bidopentime {
+		total++
+	}
+	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
+		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
+		total, nameLike = total+1, nameLike+1
+	}
+
+	if total < 2 || (total == 2 && nameLike == 2) {
+		return false, reason
+	}
+	return true, reason + "满足招标B,六选二,"
+}
+
+// tenderRepeat_C is the tender conflict rule: it returns true when the two
+// records contradict each other, i.e. both budgets are non-zero and different,
+// or both bid-opening times are non-zero and isBidopentimeInterval holds for
+// them.
+// presumably isBidopentimeInterval reports that the times are too far apart — confirm against its definition
+func tenderRepeat_C(v *Info, info *Info) bool {
+	budgetConflict := v.budget != 0 && info.budget != 0 && v.budget != info.budget
+	if budgetConflict {
+		return true
+	}
+	return v.bidopentime != 0 && info.bidopentime != 0 && isBidopentimeInterval(info.bidopentime, v.bidopentime)
+}
+
+// winningRepeat_A is the result-notice element rule. It evaluates six match
+// flags between v and info:
+//
+//	p1  project name equal (non-empty)
+//	p2  buyer equal (non-empty)
+//	p3  project code or contract number equal, at least 5 bytes long
+//	p5  v.bidamount non-zero and isBidWinningAmount(v.bidamount, info.bidamount) false
+//	p6  winner equal (non-empty)
+//	p11 both titles longer than 10 runes and one contains the other
+//
+// It reports a match when one of the accepted three-flag combinations holds and
+// then appends the satisfied flags to the reason.
+func winningRepeat_A(v *Info, info *Info, reason string) (bool, string) {
+	var hits string
+
+	p1 := v.projectname != "" && v.projectname == info.projectname
+	if p1 {
+		hits += "p1-项目名称-"
+	}
+	p2 := v.buyer != "" && v.buyer == info.buyer
+	if p2 {
+		hits += "p2-单位-"
+	}
+	p3 := (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5) ||
+		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5)
+	if p3 {
+		hits += "p3-编号组--"
+	}
+	p5 := v.bidamount != 0 && !isBidWinningAmount(v.bidamount, info.bidamount)
+	if p5 {
+		hits += "p5-中标金-"
+	}
+	p6 := v.winner != "" && v.winner == info.winner
+	if p6 {
+		hits += "p6-中标人-"
+	}
+	p11 := len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
+		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title))
+	if p11 {
+		hits += "p11-标题-"
+	}
+
+	// The 18 accepted three-flag combinations, factored by their first two flags.
+	combo := (p1 && p2 && (p3 || p5 || p6 || p11)) ||
+		(p1 && p3 && (p5 || p6 || p11)) ||
+		(p1 && p5 && p6) ||
+		(p2 && p3 && (p5 || p6 || p11)) ||
+		(p2 && p5 && (p6 || p11)) ||
+		(p2 && p6 && p11) ||
+		(p3 && p5 && (p6 || p11)) ||
+		(p3 && p6 && p11) ||
+		(p5 && p6 && p11)
+
+	if combo {
+		return true, reason + "满足中标A,3要素组合-" + hits + ","
+	}
+	return false, reason
+}
+
+// winningRepeat_B is the result-notice counting rule used when the agencies
+// agree. It counts matching elements out of project name, buyer, number (project
+// code or contract number, at least 5 bytes), bid amount, winner and title (both
+// longer than 10 runes, one containing the other).
+//
+// At least two elements must match, and a match made up of exactly the project
+// name and the title is rejected. On success "满足中标B.六选二," is appended to
+// the reason.
+func winningRepeat_B(v *Info, info *Info, reason string) (bool, string) {
+	total, nameLike := 0, 0
+
+	if v.projectname != "" && v.projectname == info.projectname {
+		total, nameLike = total+1, nameLike+1
+	}
+	if v.buyer != "" && v.buyer == info.buyer {
+		total++
+	}
+	sameCode := v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5
+	sameContract := v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5
+	if sameCode || sameContract {
+		total++
+	}
+	if v.bidamount != 0 && !isBidWinningAmount(v.bidamount, info.bidamount) {
+		total++
+	}
+	if v.winner != "" && v.winner == info.winner {
+		total++
+	}
+	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
+		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
+		total, nameLike = total+1, nameLike+1
+	}
+
+	if total < 2 || (total == 2 && nameLike == 2) {
+		return false, reason
+	}
+	return true, reason + "满足中标B.六选二,"
+}
+
+// winningRepeat_C is the result-notice conflict rule: it returns true when the
+// records contradict each other.
+//
+// When both bid amounts are non-zero and isBidWinningAmount holds for them, the
+// pair conflicts — unless a number (project code or contract number) and the
+// winner are both present and equal, which is tolerated to avoid wrongly
+// extracted amounts. Otherwise the pair conflicts only when both winners are
+// present and different.
+func winningRepeat_C(v *Info, info *Info) bool {
+	sameNonEmpty := func(a, b string) bool {
+		return a != "" && b != "" && a == b
+	}
+	if v.bidamount != 0 && info.bidamount != 0 && isBidWinningAmount(v.bidamount, info.bidamount) {
+		sameNumber := sameNonEmpty(v.projectcode, info.projectcode) ||
+			sameNonEmpty(v.contractnumber, info.contractnumber)
+		return !(sameNumber && sameNonEmpty(v.winner, info.winner))
+	}
+	return v.winner != "" && info.winner != "" && v.winner != info.winner
+}
+
+// 合同_A
+func contractRepeat_A(v *Info, info *Info, reason string) (bool, string) {
+
+	isMeet_1 := false
+	if isMeet_1, reason = tenderRepeat_A(v, info, reason); isMeet_1 {
+		return true, reason
+	}
+
+	isMeet_2 := false
+	if isMeet_2, reason = winningRepeat_A(v, info, reason); isMeet_2 {
+		return true, reason
+	}
+	return false, reason
+}
+
+// 合同_B
+func contractRepeat_B(v *Info, info *Info, reason string) (bool, string) {
+
+	isMeet_1 := false
+	if isMeet_1, reason = tenderRepeat_B(v, info, reason); isMeet_1 {
+		return true, reason
+	}
+	isMeet_2 := false
+	if isMeet_2, reason = winningRepeat_B(v, info, reason); isMeet_2 {
+		return true, reason
+	}
+	return false, reason
+}
+
+// contractRepeat_C is the contract conflict rule: it returns true when the
+// tender conflict rule or the result-notice conflict rule fires, or when the
+// contract numbers or the project codes are both present and different.
+func contractRepeat_C(v *Info, info *Info) bool {
+	if tenderRepeat_C(v, info) || winningRepeat_C(v, info) {
+		return true
+	}
+	// contract-specific addition: the numbers themselves must not disagree
+	contractConflict := v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber
+	codeConflict := v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode
+	return contractConflict || codeConflict
+}
+
+// isTheSimilarName reports whether one name contains the other. An empty name is
+// contained in every string, so it is "similar" to anything.
+func isTheSimilarName(name1 string, name2 string) bool {
+	return strings.Contains(name1, name2) || strings.Contains(name2, name1)
+}
+
+// 快速低质量数据判重
+func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
+	//区间间隔24小时
+	if !isTimeIntervalPeriod(v.publishtime, info.publishtime) {
+		return false, reason
+	}
+	//首先判定是否为低质量数据    info目标数据
+	if info.title != "" && (info.agency == "" || v.agency == "") &&
+		(info.title == v.title) &&
+		(info.projectcode == "" || info.projectcode == v.projectcode) &&
+		info.contractnumber == "" && info.buyer == "" {
+		isValue := 0                //五要素判断
+		if info.projectname != "" { //项目名称
+			isValue++
+		}
+		if info.budget != 0 { //预算
+			isValue++
+		}
+		if info.winner != "" { //中标单位
+			isValue++
+		}
+		if info.bidamount != 0 { //中标金额
+			isValue++
+		}
+		if isValue == 0 {
+			reason = reason + "---低质量-要素均为空-标题满足"
+			return true, reason
+		} else if isValue == 1 {
+			isMeet := false
+			if isMeet, reason = judgeLowQualityData(v, info, reason); isMeet {
+				reason = reason + "---低质量-有且一个要素组合"
+				return true, reason
+			}
+		} else if isValue == 2 {
+			if info.subtype == "采购意向" { //特殊
+				if info.projectname != "" && info.projectname == v.projectname &&
+					info.budget != 0 && info.budget == v.budget &&
+					info.city != "" && info.city == v.city {
+					reason = reason + "---采购意向~同城~预算~名称均一致"
+					return true, reason
+				}
+			}
+		} else {
+
+		}
+	}
+	return false, reason
+}
+
+// judgeLowQualityData looks for the first matching element between a sparse
+// record info and the candidate v and records which one it was in the reason.
+// The elements are tried in this order: project name (one contains the other),
+// budget, winner, bid amount. Without a match the reason is returned unchanged.
+func judgeLowQualityData(v *Info, info *Info, reason string) (bool, string) {
+	switch {
+	case info.projectname != "" && isTheSimilarName(info.projectname, v.projectname):
+		return true, reason + "---项目名称"
+	case info.budget != 0 && info.budget == v.budget: // budget
+		return true, reason + "---预算"
+	case v.winner != "" && info.winner == v.winner: // winner
+		return true, reason + "---中标单位"
+	case v.bidamount != 0 && info.bidamount == v.bidamount: // bid amount
+		return true, reason + "---中标金额"
+	}
+	return false, reason
+}

+ 583 - 0
flow_repeat/datamap.go

@@ -0,0 +1,583 @@
+package main
+
+import (
+	"fmt"
+	qutil "jygit.jydev.jianyu360.cn/data_processing/common_utils"
+	"log"
+	"reflect"
+	"regexp"
+	"strings"
+	"sync"
+	"time"
+)
+
+// Info is the in-memory form of one record as used by the duplicate checks.
+// NewInfo fills it from a raw document map.
+type Info struct {
+	id               string  // document id
+	title            string  // title
+	spidercode       string  // spider (crawler) code
+	area             string  // province; NewInfo maps "A" to "全国"
+	city             string  // city
+	subtype          string  // notice type; NewInfo folds several tender-like types into "招标"
+	buyer            string  // purchasing entity
+	agency           string  // agency
+	winner           string  // winning bidder
+	budget           float64 // budget amount
+	bidamount        float64 // winning bid amount
+	projectname      string  // project name
+	projectcode      string  // project code
+	contractnumber   string  // contract number
+	publishtime      int64   // publish time (compared against time.Now().Unix() when the pool is built)
+	comeintime       int64   // time the record entered the store
+	bidopentime      int64   // bid-opening time
+	bidopenaddress   string  // bid-opening address
+	site             string  // site
+	href             string  // URL of the notice body
+	repeatid         string  // id of the record this one repeats
+	specialWord      bool    // title matches FilterRegTitle (special words)
+	titleSpecialWord bool    // title matches one of FilterRegTitle_0/_1/_2 (title special words)
+	isJphref         bool    // href is a 竞品 (competitor) link, see IsJpHref
+	c_title          string  // title after cleanNameFilterRedundant
+	c_projectname    string  // project name after cleanNameFilterRedundant
+}
+
+var datelimit = float64(432000) // look-back window in seconds (432000 = five days); overwritten by TimedTaskDatamap and NewDatamap
+var sitelock sync.Mutex         // held by check while it reads SiteMap
+
+// datamap is the in-memory pool used for general duplicate checking. Records are
+// grouped under keys of the form "<yyyyMMdd>_<subtype>_<area>".
+// The struct is created with unkeyed composite literals, so the field order must
+// not change.
+type datamap struct {
+	lock     sync.Mutex // guards the fields below
+	days     int        // number of days of data to keep
+	data     map[string][]*Info // pooled records by "<date>_<subtype>_<area>" key
+	keymap   []string // dates currently retained, recomputed by update
+	areakeys []string // distinct areas seen so far
+	keys     map[string]bool // set of date keys present in the pool
+}
+
+// TimedTaskDatamap builds the pool for historical (stock) processing.
+//
+// It sets the package-level datelimit to days*86400 seconds and then reads the
+// extract_back collection for documents with publishtime < lasttime, newest
+// first. Documents already flagged (repeat 1 or -1, dataging 1), of subtype
+// 拟建/产权, or from spider "sdxzbiddingsjzypc" are ignored, as are documents
+// whose publishtime is a string or lies in the future. Reading stops at the
+// first eligible document that is datelimit seconds or more older than lasttime.
+//
+// A negative lasttime returns an empty pool without touching the database.
+// numIndex is only used in the completion log line, which reports the elapsed
+// seconds and the number of documents scanned.
+func TimedTaskDatamap(days int, lasttime int64, numIndex int) *datamap {
+	datelimit = qutil.Float64All(days * 86400)
+	dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, []string{}, map[string]bool{}}
+	if lasttime < 0 {
+		log.Println("数据池空数据")
+		return dm
+	}
+	start := int(time.Now().Unix())
+	sess := data_mgo.GetMgoConn()
+	defer data_mgo.DestoryMongoConn(sess)
+	query := map[string]interface{}{"publishtime": map[string]interface{}{
+		"$lt": lasttime,
+	}}
+	log.Println("query", query)
+	it := sess.DB(data_mgo.DbName).C(extract_back).Find(query).Sort("-publishtime").Iter()
+	// The loop below leaves early, so the iterator must be closed explicitly;
+	// closing also surfaces any error that ended the iteration.
+	defer func() {
+		if err := it.Close(); err != nil {
+			log.Println("TimedTaskDatamap iterator close:", err)
+		}
+	}()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 || qutil.IntAll(tmp["dataging"]) == 1 ||
+			qutil.ObjToString(tmp["subtype"]) == "拟建" || qutil.ObjToString(tmp["subtype"]) == "产权" ||
+			qutil.ObjToString(tmp["spidercode"]) == "sdxzbiddingsjzypc" {
+			// flagged or excluded document: nothing to pool
+		} else {
+			if fmt.Sprint(reflect.TypeOf(tmp["publishtime"])) == "string" {
+				continue
+			}
+			pt := tmp["publishtime"]
+			pt_time := qutil.Int64All(pt)
+
+			if pt_time > time.Now().Unix() {
+				continue
+			}
+			if qutil.Float64All(lasttime-pt_time) >= datelimit {
+				// Results are sorted newest first, so everything after this is older still.
+				break
+			}
+			info := NewInfo(tmp)
+			dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
+			k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
+			dm.data[k] = append(dm.data[k], info)
+			dm.keys[dkey] = true
+			// remember the area if it is new
+			isAreaExist := false
+			for _, v := range dm.areakeys {
+				if v == info.area {
+					isAreaExist = true
+					break
+				}
+			}
+			if !isAreaExist {
+				dm.areakeys = append(dm.areakeys, info.area)
+			}
+		}
+
+		tmp = make(map[string]interface{})
+	}
+
+	log.Printf("第%d组:数据池构建完成:%d秒,%d个\n", numIndex, int(time.Now().Unix())-start, n)
+
+	return dm
+}
+
+// NewDatamap builds the pool for incremental processing.
+//
+// It sets the package-level datelimit to days*86400*2 seconds and then reads the
+// extract collection for documents with _id <= lastid, newest publishtime first.
+// Documents already flagged (repeat 1 or -1), of subtype 拟建/产权, or from
+// spider "sdxzbiddingsjzypc" are ignored, as are documents whose publishtime is
+// a string or lies in the future. Reading stops at the first eligible document
+// published more than datelimit seconds before the moment the scan started.
+//
+// An empty lastid returns an empty pool without touching the database.
+func NewDatamap(days int, lastid string) *datamap {
+	datelimit = qutil.Float64All(days * 86400 * 2)
+	dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, []string{}, map[string]bool{}}
+	if lastid == "" {
+		log.Println("不构建数据池")
+		return dm
+	}
+	// initial load
+	sess := data_mgo.GetMgoConn()
+	defer data_mgo.DestoryMongoConn(sess)
+	query := map[string]interface{}{"_id": map[string]interface{}{
+		"$lte": StringTOBsonId(lastid),
+	}}
+	log.Println("query", query)
+	it := sess.DB(data_mgo.DbName).C(extract).Find(query).Sort("-publishtime").Iter()
+	// The loop below leaves early, so the iterator must be closed explicitly;
+	// closing also surfaces any error that ended the iteration.
+	defer func() {
+		if err := it.Close(); err != nil {
+			log.Println("NewDatamap iterator close:", err)
+		}
+	}()
+	nowTime := time.Now().Unix() // timestamp the window is measured from
+	n, continuSum := 0, 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 ||
+			qutil.ObjToString(tmp["subtype"]) == "拟建" || qutil.ObjToString(tmp["subtype"]) == "产权" ||
+			qutil.ObjToString(tmp["spidercode"]) == "sdxzbiddingsjzypc" {
+			// flagged or excluded document: nothing to pool
+		} else {
+			if fmt.Sprint(reflect.TypeOf(tmp["publishtime"])) == "string" {
+				continue
+			}
+			pt := tmp["publishtime"]
+			pt_time := qutil.Int64All(pt)
+			if pt_time > time.Now().Unix() {
+				continue
+			}
+			if qutil.Float64All(nowTime-pt_time) > datelimit {
+				// Results are sorted newest first, so everything after this is older still.
+				break
+			}
+			continuSum++
+			info := NewInfo(tmp)
+			dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
+			k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
+			dm.data[k] = append(dm.data[k], info)
+			dm.keys[dkey] = true
+			// remember the area if it is new
+			isAreaExist := false
+			for _, v := range dm.areakeys {
+				if v == info.area {
+					isAreaExist = true
+					break
+				}
+			}
+			if !isAreaExist {
+				dm.areakeys = append(dm.areakeys, info.area)
+			}
+		}
+		if n%10000 == 0 {
+			log.Println("当前 n:", n, "数量:", continuSum, tmp["_id"])
+		}
+		tmp = make(map[string]interface{})
+	}
+	log.Println("load data:", n, "总数:", continuSum)
+	return dm
+}
+
+// NewInfo converts one raw document map into an Info.
+//
+// Two values are normalised on the way: the subtypes 招标/邀标/询价/竞谈/竞价 are
+// all stored as "招标", and the area "A" is stored as "全国". The winner goes
+// through deleteExtraSpaceName. The special-word flags, the competitor-link flag
+// and the cleaned title and project name are derived from the title, href and
+// project name.
+func NewInfo(tmp map[string]interface{}) *Info {
+	str := func(key string) string { return qutil.ObjToString(tmp[key]) }
+
+	subtype := str("subtype")
+	switch subtype {
+	case "招标", "邀标", "询价", "竞谈", "竞价":
+		subtype = "招标"
+	}
+	area := str("area")
+	if area == "A" {
+		area = "全国"
+	}
+	title := str("title")
+	projectname := str("projectname")
+	href := str("href")
+
+	return &Info{
+		id:             BsonTOStringId(tmp["_id"]),
+		title:          title,
+		area:           area,
+		subtype:        subtype,
+		spidercode:     str("spidercode"),
+		buyer:          str("buyer"),
+		projectname:    projectname,
+		contractnumber: str("contractnumber"),
+		projectcode:    str("projectcode"),
+		city:           str("city"),
+		agency:         str("agency"),
+		winner:         deleteExtraSpaceName(str("winner")),
+		budget:         qutil.Float64All(tmp["budget"]),
+		bidamount:      qutil.Float64All(tmp["bidamount"]),
+		publishtime:    qutil.Int64All(tmp["publishtime"]),
+		comeintime:     qutil.Int64All(tmp["comeintime"]),
+		bidopentime:    qutil.Int64All(tmp["bidopentime"]),
+		bidopenaddress: str("bidopenaddress"),
+		site:           str("site"),
+		href:           href,
+		repeatid:       str("repeatid"),
+		specialWord:    FilterRegTitle.MatchString(title),
+		titleSpecialWord: FilterRegTitle_0.MatchString(title) ||
+			FilterRegTitle_1.MatchString(title) ||
+			FilterRegTitle_2.MatchString(title),
+		isJphref: IsJpHref(href),
+
+		// values after the common clean-up
+		c_title:       cleanNameFilterRedundant(title),
+		c_projectname: cleanNameFilterRedundant(projectname),
+	}
+}
+
+// 判重方法
+// 判重方法
+// 判重方法
+func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
+	reason := ""
+	keys := []string{}
+	d.lock.Lock()
+	for k, _ := range d.keys { //不同时间段
+		if info.area == "全国" { //匹配所有省
+			for _, v := range d.areakeys {
+				keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, v))
+			}
+		} else { //匹配指定省
+			keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
+		}
+		keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
+	}
+	d.lock.Unlock()
+L:
+	for _, k := range keys {
+		d.lock.Lock()
+		data := d.data[k]
+		d.lock.Unlock()
+		if len(data) > 0 { //对比v   找到同类型,同省或全国的数据作对比
+			for _, v := range data {
+				reason = ""
+				if v.id == info.id { //正常重复
+					return false, v, ""
+				}
+				//buyer 优先级高,有值且不相等过滤
+				if info.buyer != "" && v.buyer != "" && info.buyer != v.buyer {
+					if buyerIsContinue(v, info) {
+						continue
+					}
+				}
+				// 竞品判重模式
+				if v.isJphref || info.isJphref {
+					if confirmJingPinIsRepeatData(v, info) {
+						reason = "竞品模式~重复"
+						b = true
+						source = v
+						reasons = reason
+						break L
+					}
+				}
+				//站点补城市
+				if info.site != "" { //站点临时赋值
+					if info.area == "全国" || info.city == "" {
+						sitelock.Lock()
+						dict := SiteMap[info.site]
+						sitelock.Unlock()
+						if dict != nil && qutil.ObjToString(dict["city"]) != "" {
+							info.area = qutil.ObjToString(dict["area"])
+							info.city = qutil.ObjToString(dict["city"])
+						}
+					}
+				}
+				//前置条件-五要素均相等
+				if leadingElementSame(v, info) {
+					reason = "五要素-相同-满足"
+					b = true
+					source = v
+					reasons = reason
+					break L
+				}
+				//前置条件 - 站点相关
+				if info.site != "" && info.site == v.site {
+					if info.href != "" && info.href == v.href {
+						reason = "同站点-href相同"
+						b = true
+						source = v
+						reasons = reason
+						break L
+					}
+					//相同发布时间-标题无包含关系 - 项目名称不等
+					if isTheSameDay(info.publishtime, v.publishtime) {
+						if !isTheSimilarName(info.title, v.title) {
+							continue
+						}
+					}
+					//不同href
+					if info.href != "" && info.href != v.href {
+						if v.title == info.title {
+							if !againRepeat(v, info, true) { //进行同站点二次判断
+								reason = "同站点-href不同-标题相同等"
+								b = true
+								source = v
+								reasons = reason
+								break L
+							} else {
+								continue
+							}
+						} else {
+							if againRepeat(v, info, true) {
+								continue
+							}
+						}
+					}
+				}
+				//特殊词处理
+				specialNum := dealWithSpecialWordNumber(info, v)
+				//前置条件 - 标题相关,有且一个关键词
+				if specialNum == 1 {
+					if againRepeat(v, info, false) {
+						continue
+					}
+				}
+				//前置条件 - 标题相关,均含有关键词
+				if specialNum == 2 {
+					if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
+						v.title != "" && info.title != "" {
+						letter1, letter2 := v.title, info.title
+						res, _ := regexp.Compile("[0-9a-zA-Z]+")
+						if res.MatchString(letter1) || res.MatchString(letter2) {
+							letter1 = convertArabicNumeralsAndLetters(letter1)
+							letter2 = convertArabicNumeralsAndLetters(letter2)
+						}
+						if strings.Contains(letter1, "重新招标") || strings.Contains(letter2, "重新招标") {
+							letter1, letter2 = dealWithSpecialPhrases(letter1, letter2)
+						}
+						letter1 = cleanNameFilterRedundant(letter1)
+						letter2 = cleanNameFilterRedundant(letter2)
+						if letter1 == letter2 {
+							reason = reason + "标题关键词相等有效关系"
+							if !againRepeat(v, info, false) { //进行二级金额判断
+								b = true
+								source = v
+								reasons = reason
+								break L
+							}
+						} else {
+							if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
+								if againContainSpecialWord(v, info) { //无包含关系-即不相等
+									continue
+								}
+							}
+						}
+					}
+				}
+				//新增快速数据过少判重
+				if LowHeavy {
+					repeat := false
+					if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
+						b = true
+						source = v
+						reasons = reason
+						break L
+					}
+				}
+
+				//代理机构相同-非空相等
+				if v.agency != "" && info.agency != "" && v.agency == info.agency {
+					reason = reason + "同机构-"
+					repeat := false
+					if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
+						b = true
+						source = v
+						reasons = reason
+						break L
+					}
+				} else {
+					reason = reason + "非同机构-"
+					if info.city != "" && info.city == v.city {
+						reason = reason + "同城-"
+						repeat := false
+						if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
+							b = true
+							source = v
+							reasons = reason
+							break L
+						}
+					} else {
+						reason = reason + "不同城-"
+						repeat := false
+						if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
+							b = true
+							source = v
+							reasons = reason
+							break L
+						}
+					}
+				}
+			}
+
+		}
+	}
+
+	//往预存数据 d 添加
+	if !b {
+		ct := info.publishtime
+		dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
+		k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
+		d.lock.Lock()
+		data := d.data[k]
+		if data == nil {
+			data = []*Info{info}
+			d.data[k] = data
+			if !d.keys[dkey] {
+				d.keys[dkey] = true
+				d.update(ct)
+			}
+		} else {
+			data = append(data, info)
+			d.data[k] = data
+		}
+
+		//添加省
+		isAreaExist := false
+		for _, v := range d.areakeys {
+			if v == info.area {
+				isAreaExist = true
+			}
+		}
+		if !isAreaExist {
+			areaArr := d.areakeys
+			areaArr = append(areaArr, info.area)
+			d.areakeys = areaArr
+		}
+		d.lock.Unlock()
+	}
+	return
+}
+
+// update recomputes the list of retained dates and drops everything outside it.
+// It does nothing when TimingTask is set. Otherwise keymap becomes the result of
+// GetLatelyFiveDay(t) in full mode (IsFull) or GetLatelyFiveDayDouble(t) in
+// incremental mode, and every data key whose first 8 bytes (the date prefix) are
+// not in that list, and every date key not in it, is deleted.
+//
+// A data key shorter than 8 bytes cannot carry a date prefix; it is deleted as
+// well instead of causing a slice-bounds panic.
+//
+// update does not lock; the callers visible in this file invoke it while holding d.lock.
+func (d *datamap) update(t int64) {
+	if TimingTask {
+		return
+	}
+	if IsFull {
+		d.keymap = d.GetLatelyFiveDay(t) // full mode
+	} else {
+		d.keymap = d.GetLatelyFiveDayDouble(t) // incremental mode
+	}
+	keep := make(map[string]bool, len(d.keymap))
+	for _, day := range d.keymap {
+		keep[day] = true
+	}
+	for k := range d.data {
+		if len(k) < 8 || !keep[k[:8]] {
+			delete(d.data, k)
+		}
+	}
+	for k := range d.keys {
+		if !keep[k] {
+			delete(d.keys, k)
+		}
+	}
+}
+
+// GetLatelyFiveDay returns d.days date strings, formatted with
+// qutil.Date_yyyyMMdd, starting at the day of the Unix time t and stepping back
+// one day at a time.
+func (d *datamap) GetLatelyFiveDay(t int64) []string {
+	days := make([]string, d.days)
+	cursor := time.Unix(t, 0)
+	for i := range days {
+		days[i] = cursor.Format(qutil.Date_yyyyMMdd)
+		cursor = cursor.AddDate(0, 0, -1)
+	}
+	return days
+}
+
+// GetLatelyFiveDayDouble returns 2*d.days date strings for incremental mode,
+// formatted with qutil.Date_yyyyMMdd, starting at today and stepping back one
+// day at a time.
+// NOTE(review): the parameter t is ignored and the current time is used instead — confirm this is intended.
+func (d *datamap) GetLatelyFiveDayDouble(t int64) []string {
+	span := d.days * 2
+	days := make([]string, span)
+	for i, cursor := 0, time.Now(); i < span; i, cursor = i+1, cursor.AddDate(0, 0, -1) {
+		days[i] = cursor.Format(qutil.Date_yyyyMMdd)
+	}
+	return days
+}
+
+// replacePoolData swaps the pooled record that has newData's id for newData.
+// Only the bucket derived from newData's own publish date, subtype and area is
+// searched; if no record with that id is found there the bucket is left as it
+// was. The bucket is written back to the map in either case.
+func (d *datamap) replacePoolData(newData *Info) {
+	d.lock.Lock()
+	defer d.lock.Unlock()
+
+	publish := newData.publishtime
+	day := qutil.FormatDateByInt64(&publish, qutil.Date_yyyyMMdd)
+	bucketKey := fmt.Sprintf("%s_%s_%s", day, newData.subtype, newData.area)
+	bucket := d.data[bucketKey]
+	for i := range bucket {
+		if bucket[i].id == newData.id { // replace in place
+			bucket[i] = newData
+			break
+		}
+	}
+	d.data[bucketKey] = bucket
+}
+
+// replaceSourceData removes oldData from the pool and adds newData, each in the
+// bucket derived from its own publish date, subtype and area. Marked as
+// temporarily unused in the original.
+//
+// The whole operation runs under d.lock; previously the removal step modified
+// d.data before the lock was taken.
+func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
+	d.lock.Lock()
+	defer d.lock.Unlock()
+
+	// remove the old record from its bucket
+	ct_old := oldData.publishtime
+	dkey_old := qutil.FormatDateByInt64(&ct_old, qutil.Date_yyyyMMdd)
+	k_old := fmt.Sprintf("%s_%s_%s", dkey_old, oldData.subtype, oldData.area)
+	data_old := d.data[k_old]
+	for i, v := range data_old {
+		if v.id == oldData.id {
+			data_old = append(data_old[:i], data_old[i+1:]...)
+			break
+		}
+	}
+	d.data[k_old] = data_old
+
+	// add the new record
+	ct := newData.publishtime
+	dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
+	k := fmt.Sprintf("%s_%s_%s", dkey, newData.subtype, newData.area)
+	data := d.data[k]
+	if data == nil {
+		data = []*Info{newData}
+		d.data[k] = data
+		if !d.keys[dkey] {
+			// a new date entered the pool: let update drop dates outside the window
+			d.keys[dkey] = true
+			d.update(ct)
+		}
+	} else {
+		data = append(data, newData)
+		d.data[k] = data
+	}
+
+	// remember the area if it is new
+	isAreaExist := false
+	for _, v := range d.areakeys {
+		if v == newData.area {
+			isAreaExist = true
+			break
+		}
+	}
+	if !isAreaExist {
+		d.areakeys = append(d.areakeys, newData.area)
+	}
+}
+
+// currentTotalCount returns the number of records in the pool, summed over all
+// buckets. Marked as temporarily unused in the original.
+// It does not take d.lock, so the caller has to make sure the pool is not being
+// modified concurrently.
+func (d *datamap) currentTotalCount() int {
+	total := 0
+	for _, bucket := range d.data {
+		total += len(bucket)
+	}
+	return total
+}

+ 115 - 0
flow_repeat/fullDataRepeat.go

@@ -0,0 +1,115 @@
+package main
+
+import (
+	"fmt"
+	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
+	"log"
+	"sync"
+	"time"
+)
+
+// timeLayout is the Go reference-time layout (date only) that fullDataRepeat
+// uses to render the start of each day window.
+var timeLayout = "2006-01-02"
+
+// Alternative layout including the time of day, currently disabled:
+//var timeLayout = "2006-01-02 15:04:05"
+
+// 划分时间段落
+func initModelArr() []map[string]interface{} {
+	modelArr := make([]map[string]interface{}, 0)
+	start := time.Date(2021, 12, 15, 0, 0, 0, 0, time.Local).Unix()
+	end := time.Date(2022, 1, 1, 0, 0, 0, 0, time.Local).Unix()
+	gte_time := start
+	lt_time := start + 86400
+	log.Println("开始构建数据池...一周...")
+	FullDM = TimedTaskDatamap(dupdays, start, 1)
+	log.Println("......")
+	log.Println("开启...全量判重...", start, "~", end)
+	for {
+		modelArr = append(modelArr, map[string]interface{}{
+			"publishtime": map[string]interface{}{
+				"$gte": gte_time,
+				"$lt":  lt_time,
+			},
+		})
+		gte_time = lt_time
+		lt_time = gte_time + 86400
+		if lt_time > end {
+			break
+		}
+	}
+	return modelArr
+}
+
+// fullDataRepeat runs the full de-duplication pass: it asks initModelArr for
+// the per-day queries and hands each one, together with its start date
+// rendered with timeLayout, to dealWithfullData in order.
+func fullDataRepeat() {
+	for _, dayQuery := range initModelArr() {
+		window := *qu.ObjToMap(dayQuery["publishtime"])
+		from := qu.Int64All(window["$gte"])
+		dealWithfullData(dayQuery, time.Unix(from, 0).Format(timeLayout))
+	}
+}
+
+// 多线程~处理数据
+func dealWithfullData(query map[string]interface{}, time_str string) {
+
+	log.Println("开始处理~", time_str, "~", query)
+	sess := data_mgo.GetMgoConn()
+	defer data_mgo.DestoryMongoConn(sess)
+	it := sess.DB(data_mgo.DbName).C(extract).Find(&query).Sort("publishtime").Iter()
+	total, isok, repeatN := 0, 0, 0
+	dataAllDict := make(map[string][]map[string]interface{}, 0)
+	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
+		if qu.IntAll(tmp["repeat"]) == 1 || qu.IntAll(tmp["repeat"]) == -1 ||
+			qu.ObjToString(tmp["subtype"]) == "拟建" || qu.ObjToString(tmp["subtype"]) == "产权" ||
+			qu.ObjToString(tmp["spidercode"]) == "sdxzbiddingsjzypc" {
+			tmp = make(map[string]interface{})
+			continue
+		}
+
+		isok++
+		subtype := qu.ObjToString(tmp["subtype"])
+		if subtype == "招标" || subtype == "邀标" || subtype == "询价" ||
+			subtype == "竞谈" || subtype == "竞价" {
+			subtype = "招标"
+		}
+		dataArr := dataAllDict[subtype]
+		if dataArr == nil {
+			dataArr = []map[string]interface{}{}
+		}
+		dataArr = append(dataArr, tmp)
+		dataAllDict[subtype] = dataArr
+		tmp = make(map[string]interface{})
+	}
+	pool := make(chan bool, threadNum)
+	wg := &sync.WaitGroup{}
+	for _, dataArr := range dataAllDict {
+		fmt.Print("...")
+		pool <- true
+		wg.Add(1)
+		go func(dataArr []map[string]interface{}) {
+			defer func() {
+				<-pool
+				wg.Done()
+			}()
+			num := 0
+			for _, tmp := range dataArr {
+				info := NewInfo(tmp)
+				b, source, reason := FullDM.check(info)
+				if b {
+					num++
+					AddGroupPool.pool <- map[string]interface{}{
+						"_id":         StringTOBsonId(info.id),
+						"repeat_id":   source.id,
+						"reason":      reason,
+						"update_time": qu.Int64All(time.Now().Unix()),
+					}
+				}
+			}
+			numlock.Lock()
+			repeatN += num
+			numlock.Unlock()
+		}(dataArr)
+	}
+	wg.Wait()
+	log.Println("处理结束~", time_str, "总计需判重~", isok, "~重复量", repeatN)
+}

+ 29 - 0
flow_repeat/go.mod

@@ -0,0 +1,29 @@
+module flow_repeat
+
+go 1.21.0
+
+require (
+	github.com/nsqio/go-nsq v1.1.0
+	go.mongodb.org/mongo-driver v1.10.1
+	gopkg.in/mgo.v2 v2.0.0-20190816093944-a6b53ec6cb22
+	jygit.jydev.jianyu360.cn/data_processing/common_utils v0.0.0-20230915054514-628d4fe7544c
+)
+
+require (
+	github.com/PuerkitoBio/goquery v1.8.0 // indirect
+	github.com/andybalholm/cascadia v1.3.1 // indirect
+	github.com/dchest/captcha v1.0.0 // indirect
+	github.com/golang/snappy v0.0.1 // indirect
+	github.com/klauspost/compress v1.13.6 // indirect
+	github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect
+	github.com/pkg/errors v0.9.1 // indirect
+	github.com/robfig/cron/v3 v3.0.1 // indirect
+	github.com/xdg-go/pbkdf2 v1.0.0 // indirect
+	github.com/xdg-go/scram v1.1.1 // indirect
+	github.com/xdg-go/stringprep v1.0.3 // indirect
+	github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect
+	golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d // indirect
+	golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd // indirect
+	golang.org/x/sync v0.0.0-20210220032951-036812b2e83c // indirect
+	golang.org/x/text v0.3.7 // indirect
+)

+ 197 - 0
flow_repeat/go.sum

@@ -0,0 +1,197 @@
+cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
+github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
+github.com/BurntSushi/toml v1.2.0/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
+github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
+github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
+github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
+github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
+github.com/aws/aws-sdk-go v1.43.21/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo=
+github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
+github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
+github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
+github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dchest/captcha v1.0.0 h1:vw+bm/qMFvTgcjQlYVTuQBJkarm5R0YSsDKhm1HZI2o=
+github.com/dchest/captcha v1.0.0/go.mod h1:7zoElIawLp7GUMLcj54K9kbw+jEyvz2K0FDdRRYhvWo=
+github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
+github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
+github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
+github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
+github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
+github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
+github.com/go-sql-driver/mysql v1.6.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg=
+github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
+github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
+github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
+github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A=
+github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
+github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
+github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
+github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
+github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w=
+github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0=
+github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8=
+github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
+github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
+github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/gomodule/redigo v1.8.9/go.mod h1:7ArFNvsTjH8GMMzB4uy1snslv2BwmginuMs06a1uzZE=
+github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
+github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
+github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
+github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.7 h1:81/ik6ipDQS2aGcBfIN5dHDB36BwrStyeAQquSYCV4o=
+github.com/google/go-cmp v0.5.7/go.mod h1:n+brtR0CgQNWTVd5ZUFpTBC8YFBDLK/h/bpaJ8/DtOE=
+github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
+github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
+github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
+github.com/klauspost/compress v1.13.6 h1:P76CopJELS0TiO2mebmnzgWaajssP/EszplttgQxcgc=
+github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
+github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
+github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
+github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe h1:iruDEfMl2E6fbMZ9s0scYfZQ84/6SPL6zC8ACM2oIL0=
+github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc=
+github.com/nsqio/go-nsq v1.1.0 h1:PQg+xxiUjA7V+TLdXw7nVrJ5Jbl3sN86EhGCQj4+FYE=
+github.com/nsqio/go-nsq v1.1.0/go.mod h1:vKq36oyeVXgsS5Q8YEO7WghqidAVXQlcFxzQbQTuDEY=
+github.com/olivere/elastic/v7 v7.0.32/go.mod h1:c7PVmLe3Fxq77PIfY/bZmxY/TAamBhCzZ8xDOE09a9k=
+github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc=
+github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
+github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
+github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
+github.com/smartystreets/assertions v1.1.1/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo=
+github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9/go.mod h1:SnhjPscd9TpLiy1LpzGSKh3bXCfxxXuqd9xmQJy3slM=
+github.com/smartystreets/gunit v1.4.2/go.mod h1:ZjM1ozSIMJlAz/ay4SG8PeKF00ckUp+zMHZXV9/bvak=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
+github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4=
+github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
+github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c=
+github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI=
+github.com/xdg-go/scram v1.1.1 h1:VOMT+81stJgXW3CpHyqHN3AXDYIMsx56mEFrB37Mb/E=
+github.com/xdg-go/scram v1.1.1/go.mod h1:RaEWvsqvNKKvBPvcKeFjrG2cJqOkHTiyTpzz23ni57g=
+github.com/xdg-go/stringprep v1.0.3 h1:kdwGpVNwPFtjs98xCGkHjQtGKh86rDcRZN17QEMCOIs=
+github.com/xdg-go/stringprep v1.0.3/go.mod h1:W3f5j4i+9rC0kuIEJL0ky1VpHXQU3ocBgklLGvcBnW8=
+github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d h1:splanxYIlg+5LfHAM6xpdFEAYOk8iySO56hMFq6uLyA=
+github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA=
+github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k=
+go.mongodb.org/mongo-driver v1.10.1 h1:NujsPveKwHaWuKUer/ceo9DzEe7HIj1SlJ6uvXZG0S4=
+go.mongodb.org/mongo-driver v1.10.1/go.mod h1:z4XpeoU6w+9Vht+jAFyLgVrD+jGSQQe0+CBWFHNiHt8=
+go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E=
+go.opentelemetry.io/otel v1.5.0/go.mod h1:Jm/m+rNp/z0eqJc74H7LPwQ3G87qkU/AnnAydAjSAHk=
+go.opentelemetry.io/otel/trace v1.5.0/go.mod h1:sq55kfhjXYr1zVSyexg0w1mpa03AYXR5eyTkB9NPPdE=
+go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
+go.uber.org/goleak v1.1.11/go.mod h1:cwTWslyiVhfpKIDGSZEM2HlOvcqm+tG4zioyIeLoqMQ=
+go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
+go.uber.org/zap v1.22.0/go.mod h1:H4siCOZOrAolnUPJEkfaSjDqyP+BDS0DdDWzwcgt3+U=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d h1:sK3txAijHtOK88l68nt020reeT1ZdKLIYetKl95FzVY=
+golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
+golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
+golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
+golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
+golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
+golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
+golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
+golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
+golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
+golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM=
+golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd h1:O7DYs+zxREGLKzKoMQrtrEacpb0ZVXA5rIwylE2Xchk=
+golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
+golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
+golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20210220032951-036812b2e83c h1:5KslGYwFpkhGh+Q16bwMP3cOontH8FOep7tGV86Y7SQ=
+golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
+golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
+golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
+golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
+google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
+google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc=
+google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
+google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
+google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
+google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
+google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
+google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
+google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc=
+google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
+google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
+google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
+google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE=
+google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo=
+google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
+google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
+google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
+google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
+gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/mgo.v2 v2.0.0-20190816093944-a6b53ec6cb22 h1:VpOs+IwYnYBaFnrNAeB8UUWtL3vEUnzSCL1nVjPhqrw=
+gopkg.in/mgo.v2 v2.0.0-20190816093944-a6b53ec6cb22/go.mod h1:yeKp02qBN3iKW1OzL3MGk2IdtZzaj7SFntXj72NppTA=
+gopkg.in/natefinch/lumberjack.v2 v2.0.0/go.mod h1:l0ndWWf7gzL7RNwBG7wST/UCcT4T24xpD6X8LsfU/+k=
+gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
+gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
+honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
+jygit.jydev.jianyu360.cn/data_processing/common_utils v0.0.0-20230915054514-628d4fe7544c h1:fgH3lMOi2jG3KCejzC3d0otkJB6mJ9eaq1HecMoHG4c=
+jygit.jydev.jianyu360.cn/data_processing/common_utils v0.0.0-20230915054514-628d4fe7544c/go.mod h1:1Rp0ioZBhikjXHYYXmnzL6RNfvTDM/2XvRB+vuPLurI=

+ 383 - 0
flow_repeat/historyRepeat.go

@@ -0,0 +1,383 @@
+package main
+
+import (
+	"encoding/json"
+	"github.com/robfig/cron/v3"
+	"gopkg.in/mgo.v2/bson"
+	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
+	mu "jygit.jydev.jianyu360.cn/data_processing/common_utils/udp"
+	"log"
+	"net"
+	"os"
+	"strconv"
+	"sync"
+	"time"
+)
+
+// 历史判重
+func historyRepeat() {
+	defer qu.Catch()
+	for {
+		start := time.Now().Unix()
+		if gtid == "" {
+			log.Println("请传gtid,否则无法运行")
+			os.Exit(0)
+			return
+		}
+		if lteid != "" && !IsFull { //先进行数据迁移
+			log.Println("开启一次迁移任务", gtid, lteid)
+			moveHistoryData(gtid, lteid)
+			gtid = lteid //替换数据
+		}
+		//查询表最后一个id
+		task_sess := task_mgo.GetMgoConn()
+		defer task_mgo.DestoryMongoConn(task_sess)
+		q := map[string]interface{}{}
+		it_last := task_sess.DB(task_mgo.DbName).C(task_coll).Find(&q).Sort("-_id").Iter()
+		isRepeatStatus := false
+		for tmp := make(map[string]interface{}); it_last.Next(&tmp); {
+			is_repeat_status := qu.IntAll(tmp["repeat_status"])
+			if is_repeat_status == 1 {
+				lteid = qu.ObjToString(tmp["lteid"])
+				log.Println("查询的最后一个已标记的任务lteid:", lteid)
+				isRepeatStatus = true
+				tmp = make(map[string]interface{})
+				break
+			} else {
+				tmp = make(map[string]interface{})
+			}
+		}
+		if !isRepeatStatus {
+			log.Println("查询不到有标记的lteid数据......睡眠......")
+			time.Sleep(30 * time.Second)
+			continue
+		}
+
+		log.Println("查询找到有标记的lteid......睡眠......", gtid, lteid)
+		if isUpdateSite {
+			initSite()
+		}
+		time.Sleep(30 * time.Second)
+
+		sess := data_mgo.GetMgoConn() //连接器
+		defer data_mgo.DestoryMongoConn(sess)
+		between_time := time.Now().Unix() - (86400 * timingPubScope) //两年周期
+
+		//开始判重
+		q = map[string]interface{}{
+			"_id": map[string]interface{}{
+				"$gt":  StringTOBsonId(gtid),
+				"$lte": StringTOBsonId(lteid),
+			},
+		}
+		log.Println("历史判重查询条件:", q, "时间:", between_time)
+		it := sess.DB(data_mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
+		num, oknum, outnum, deterTime := int64(0), int64(0), int64(0), int64(0) //计数
+		pendAllArr := [][]map[string]interface{}{}                              //待处理数组
+		dayArr := []map[string]interface{}{}
+		for tmp := make(map[string]interface{}); it.Next(&tmp); num++ {
+			if num%10000 == 0 {
+				log.Println("正序遍历:", num)
+			}
+			//取-符合-发布时间X年内的数据
+			if qu.IntAll(tmp["dataging"]) == 1 {
+				pubtime := qu.Int64All(tmp["publishtime"])
+				if pubtime > 0 && pubtime >= between_time && qu.ObjToString(tmp["subtype"]) != "拟建" && qu.ObjToString(tmp["subtype"]) != "产权" &&
+					qu.ObjToString(tmp["spidercode"]) != "sdxzbiddingsjzypc" {
+					oknum++
+					if deterTime == 0 {
+						log.Println("找到第一条符合条件的数据")
+						deterTime = qu.Int64All(tmp["publishtime"])
+						dayArr = append(dayArr, tmp)
+					} else {
+						if pubtime-deterTime > timingSpanDay*86400 {
+							//新数组重新构建,当前组数据加到全部组数据
+							pendAllArr = append(pendAllArr, dayArr)
+							dayArr = []map[string]interface{}{}
+							deterTime = qu.Int64All(tmp["publishtime"])
+							dayArr = append(dayArr, tmp)
+						} else {
+							dayArr = append(dayArr, tmp)
+						}
+					}
+				} else {
+					outnum++
+					//不在两年内的也清标记
+					Update.updatePool <- []map[string]interface{}{ //重复数据打标签
+						map[string]interface{}{
+							"_id": tmp["_id"],
+						},
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"dataging":           0,
+								"history_updatetime": qu.Int64All(time.Now().Unix()),
+							},
+						},
+					}
+				}
+			}
+			tmp = make(map[string]interface{})
+		}
+
+		if len(dayArr) > 0 {
+			pendAllArr = append(pendAllArr, dayArr)
+			dayArr = []map[string]interface{}{}
+		}
+
+		log.Println("查询数量:", num, "符合条件:", oknum, "未在两年内:", outnum)
+
+		if len(pendAllArr) <= 0 {
+			log.Println("没找到dataging==1的数据")
+		}
+
+		//测试分组数量是否正确
+		testNum := 0
+		for k, v := range pendAllArr {
+			log.Println("第", k, "组--", "数量:", len(v))
+			testNum = testNum + len(v)
+		}
+		log.Println("本地构建分组完成:", len(pendAllArr), "组", "测试-总计数量:", testNum)
+
+		n, repeateN := 0, 0
+		log.Println("线程数:", threadNum)
+		pool := make(chan bool, threadNum)
+		wg := &sync.WaitGroup{}
+		for k, v := range pendAllArr { //每组结束更新一波数据
+			pool <- true
+			wg.Add(1)
+			go func(k int, v []map[string]interface{}) {
+				defer func() {
+					<-pool
+					wg.Done()
+				}()
+				log.Println("构建第", k, "组---(数据池)")
+				//当前组的第一个发布时间
+				first_pt := qu.Int64All(v[len(v)-1]["publishtime"])
+				curTM := TimedTaskDatamap(dupdays+int(timingSpanDay)+1, first_pt+86400, int(k))
+				log.Println("开始遍历判重第", k, "组  共计数量:", len(v))
+				n = n + len(v)
+				log.Println("统计目前总数量:", n, "重复数量:", repeateN)
+				for _, tmp := range v {
+					info := NewInfo(tmp)
+					b, source, reason := curTM.check(info)
+					if b { //有重复,更新
+						repeateN++
+						Update.updatePool <- []map[string]interface{}{ //重复数据打标签
+							map[string]interface{}{
+								"_id": tmp["_id"],
+							},
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"repeat":             1,
+									"repeat_reason":      reason,
+									"repeat_id":          source.id,
+									"dataging":           0,
+									"history_updatetime": qu.Int64All(time.Now().Unix()),
+								},
+							},
+						}
+						//关闭数据替换功能
+						//if judgeIsReplaceInfo(source.href, info.href) && !IsFull {
+						//	datalock.Lock()
+						//	temp_source_id := source.id
+						//	temp_info_id := info.id
+						//	temp_source := info
+						//	temp_source.id = temp_source_id
+						//	curTM.replacePoolData(temp_source)
+						//	//替换抽取表数据
+						//	is_log, is_exists, ext_s_data, ext_i_data := confrimHistoryExtractData(temp_source_id, temp_info_id)
+						//	is_bid, bid_s_data, bid_i_data := confrimBiddingData(temp_source_id, temp_info_id)
+						//
+						//	if is_log && is_bid {
+						//		data_mgo.Save(extract_log, map[string]interface{}{
+						//			"_id":        StringTOBsonId(temp_info_id),
+						//			"replace_id": temp_source_id,
+						//			"is_history": 1,
+						//		})
+						//		ext_s_data["repeat"] = 0
+						//		ext_s_data["dataging"] = 0
+						//		ext_i_data["repeat"] = 1
+						//		ext_i_data["repeat_id"] = temp_source_id
+						//		ext_i_data["repeat_reason"] = reason
+						//		ext_i_data["dataging"] = 0
+						//		ext_i_data["history_updatetime"] = qu.Int64All(time.Now().Unix())
+						//		if is_exists {
+						//			data_mgo.DeleteById(extract, temp_source_id)
+						//			data_mgo.Save(extract, ext_s_data)
+						//		} else {
+						//			data_mgo.DeleteById(extract_back, temp_source_id)
+						//			data_mgo.Save(extract_back, ext_s_data)
+						//			is_del := data_mgo.DeleteById(extract, temp_source_id)
+						//			if is_del > 0 {
+						//				data_mgo.Save(extract, ext_s_data)
+						//			}
+						//		}
+						//		data_mgo.DeleteById(extract, temp_info_id)
+						//		data_mgo.Save(extract, ext_i_data)
+						//
+						//		task_mgo.DeleteById(task_bidding, temp_source_id)
+						//		task_mgo.Save(task_bidding, bid_s_data)
+						//		task_mgo.DeleteById(task_bidding, temp_info_id)
+						//		task_mgo.Save(task_bidding, bid_i_data)
+						//
+						//		//通道填充数据
+						//		msg := "id=" + temp_source_id
+						//		_ = nspdata_1.Publish(msg)
+						//		_ = nspdata_2.Publish(msg)
+						//
+						//	} else {
+						//		log.Println("替换~相关表~未查询到数据~", temp_source_id, "~", temp_info_id)
+						//	}
+						//
+						//	datalock.Unlock()
+						//} else {
+						//	Update.updatePool <- []map[string]interface{}{ //重复数据打标签
+						//		map[string]interface{}{
+						//			"_id": tmp["_id"],
+						//		},
+						//		map[string]interface{}{
+						//			"$set": map[string]interface{}{
+						//				"repeat":             1,
+						//				"repeat_reason":      reason,
+						//				"repeat_id":          source.id,
+						//				"dataging":           0,
+						//				"history_updatetime": util.Int64All(time.Now().Unix()),
+						//			},
+						//		},
+						//	}
+						//}
+					} else {
+						Update.updatePool <- []map[string]interface{}{ //重复数据打标签
+							map[string]interface{}{
+								"_id": tmp["_id"],
+							},
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"dataging":           0, //符合条件的都为dataging==0
+									"history_updatetime": qu.Int64All(time.Now().Unix()),
+								},
+							},
+						}
+					}
+				}
+			}(k, v)
+		}
+		wg.Wait()
+
+		log.Println("this timeTask over.", n, "repeateN:", repeateN, gtid, lteid)
+
+		time.Sleep(30 * time.Second)
+		//任务完成,开始发送广播通知下面节点 发udp 去升索引待定 + 合并
+		if gtid != lteid {
+			for _, to := range nextNode {
+				next_sid := qu.BsonIdToSId(gtid)
+				next_eid := qu.BsonIdToSId(lteid)
+				key := next_sid + "-" + next_eid + "-" + qu.ObjToString(to["stype"])
+				by, _ := json.Marshal(map[string]interface{}{
+					"gtid":  next_sid,
+					"lteid": next_eid,
+					"stype": qu.ObjToString(to["stype"]),
+					"key":   key,
+				})
+				addr := &net.UDPAddr{
+					IP:   net.ParseIP(to["addr"].(string)),
+					Port: qu.IntAll(to["port"]),
+				}
+				node := &udpNode{by, addr, time.Now().Unix(), 0}
+				udptaskmap.Store(key, node)
+				udpclient.WriteUdp(by, mu.OP_TYPE_DATA, addr)
+			}
+		}
+
+		end := time.Now().Unix()
+
+		log.Println(gtid, lteid)
+
+		if end-start < 60*5 {
+			log.Println("睡眠.............")
+			time.Sleep(5 * time.Minute)
+		}
+		log.Println("继续下一段的历史判重")
+	}
+}
+
+// judgeIsCurIds reports whether curid falls inside the id segment bounded by
+// gtid and lteid.
+//
+// Only the first eight characters of each id are compared, parsed as a
+// hexadecimal number, and both bounds are inclusive. An id that is shorter
+// than eight characters or whose prefix is not valid hexadecimal makes the
+// function return false instead of panicking or comparing as zero.
+func judgeIsCurIds(gtid string, lteid string, curid string) bool {
+	// prefixValue extracts the numeric value of an id's eight-character prefix.
+	prefixValue := func(id string) (int64, bool) {
+		if len(id) < 8 {
+			return 0, false
+		}
+		v, err := strconv.ParseInt(id[:8], 16, 64)
+		return v, err == nil
+	}
+
+	gt_time, ok := prefixValue(gtid)
+	if !ok {
+		return false
+	}
+	lte_time, ok := prefixValue(lteid)
+	if !ok {
+		return false
+	}
+	cur_time, ok := prefixValue(curid)
+	if !ok {
+		return false
+	}
+	return cur_time >= gt_time && cur_time <= lte_time
+}
+
+// moveHistoryData archives the previous id segment and trims the live table.
+//
+// Every record of the extract collection with _id in (startid, endid] is
+// saved into extract_back. Afterwards all records of extract whose comeintime
+// is older than (dupdays+1)*2 days before local midnight of the day the call
+// started are deleted, whether or not they belong to the segment.
+func moveHistoryData(startid string, endid string) {
+	sess := data_mgo.GetMgoConn()
+	defer data_mgo.DestoryMongoConn(sess)
+	// Taken before the copy starts, so the purge cutoff refers to the day the
+	// migration began.
+	startedAt := time.Now()
+	idRange := map[string]interface{}{
+		"$gt":  StringTOBsonId(startid),
+		"$lte": StringTOBsonId(endid),
+	}
+	q := map[string]interface{}{"_id": idRange}
+	log.Println(q)
+
+	copied := 0
+	cursor := sess.DB(data_mgo.DbName).C(extract).Find(&q).Iter()
+	for doc := make(map[string]interface{}); cursor.Next(&doc); copied++ {
+		data_mgo.Save(extract_back, doc)
+		doc = map[string]interface{}{}
+		if copied%1000 == 0 {
+			log.Println("index", copied)
+		}
+	}
+	log.Println("save to", extract_back, " ok index", copied)
+
+	midnight := time.Date(startedAt.Year(), startedAt.Month(), startedAt.Day(), 0, 0, 0, 0, time.Local)
+	keep := time.Duration(dupdays+1) * 24 * time.Hour * 2
+	qv := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$lt": midnight.Add(-keep).Unix(),
+		},
+	}
+	delnum := data_mgo.Delete(extract, qv)
+	log.Println("remove from ", extract, delnum)
+
+}
+
+// moveTimeoutData schedules moveOnceTimeOut to run every day at 00:00:00.
+// The original note marks this function as temporarily unused.
+func moveTimeoutData() {
+	log.Println("部署迁移定时任务")
+	// The spec below has six fields (it starts with a seconds field). The
+	// default parser of cron/v3 only accepts five, so the seconds-aware
+	// parser has to be enabled or AddFunc rejects the spec and the job never
+	// runs.
+	c := cron.New(cron.WithSeconds())
+	if _, err := c.AddFunc("0 0 0 * * ?", func() { moveOnceTimeOut() }); err != nil {
+		log.Println("moveTimeoutData: cannot register cron job:", err)
+		return
+	}
+	c.Start()
+}
+// moveOnceTimeOut moves expired records from the collection "result_20200714"
+// to "result_20200713".
+//
+// A record counts as expired when its _id is lower than the ObjectId generated
+// for local midnight two years before today. Each matching record is saved
+// into the target collection and then deleted from the source by id.
+func moveOnceTimeOut() {
+	log.Println("执行一次迁移超时数据")
+	sess := data_mgo.GetMgoConn()
+	defer data_mgo.DestoryMongoConn(sess)
+
+	today := time.Now()
+	cutoff := time.Date(today.Year()-2, today.Month(), today.Day(), 0, 0, 0, 0, time.Local)
+	cutoffID := qu.BsonIdToSId(bson.NewObjectIdWithTime(cutoff))
+	q := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$lt": StringTOBsonId(cutoffID),
+		},
+	}
+
+	moved := 0
+	cursor := sess.DB(data_mgo.DbName).C("result_20200714").Find(&q).Iter()
+	for doc := make(map[string]interface{}); cursor.Next(&doc); moved++ {
+		if moved%10000 == 0 {
+			log.Println("index", moved)
+		}
+		id := BsonTOStringId(doc["_id"])
+		data_mgo.Save("result_20200713", doc)
+		data_mgo.DeleteById("result_20200714", id)
+		doc = map[string]interface{}{}
+	}
+	log.Println("save and delete", " ok index", moved)
+
+}

+ 267 - 0
flow_repeat/increaseRepeat.go

@@ -0,0 +1,267 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
+	mu "jygit.jydev.jianyu360.cn/data_processing/common_utils/udp"
+	"log"
+	"net"
+	"sync"
+	"time"
+)
+
+// increaseRepeat runs one incremental de-duplication pass.
+//
+// mapInfo must carry string values under "gtid" and "lteid"; documents of the
+// extract collection with _id in (gtid, lteid] are read sorted by publishtime.
+// Documents already flagged repeat==1 are counted and skipped; documents with
+// dataging==1 (unless IsFull), subtype "拟建"/"产权", or spidercode
+// "sdxzbiddingsjzypc" are skipped. The rest are grouped by subtype (five
+// tender-like subtypes are merged under "招标") and each group is checked in
+// its own goroutine, at most threadNum at a time. Afterwards the process
+// record collection is updated, the function sleeps 10 seconds, and a UDP
+// message per nextNode entry is sent and remembered in udptaskmap.
+//
+// Panics (e.g. failed type assertions on mapInfo) are handled by the deferred
+// qu.Catch.
+func increaseRepeat(mapInfo map[string]interface{}) {
+	defer qu.Catch()
+	// id range
+	q := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
+			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
+		},
+	}
+	log.Println("~~~~~~")
+	log.Println("开始增量数据判重~查询条件:", data_mgo.DbName, extract, q)
+	sess := data_mgo.GetMgoConn()
+	defer data_mgo.DestoryMongoConn(sess)
+	it := sess.DB(data_mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
+	// total: documents read; isok: documents queued for checking; repeatN: duplicates.
+	total, isok, repeatN := 0, 0, 0
+	dataAllDict := make(map[string][]map[string]interface{}, 0)
+	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
+		if total%1000 == 0 {
+			log.Println("current index : ", total, isok)
+		}
+		if qu.IntAll(tmp["repeat"]) == 1 {
+			repeatN++
+			tmp = make(map[string]interface{})
+			continue
+		}
+		if qu.IntAll(tmp["dataging"]) == 1 && !IsFull {
+			tmp = make(map[string]interface{})
+			continue
+		}
+		if qu.ObjToString(tmp["subtype"]) == "拟建" || qu.ObjToString(tmp["subtype"]) == "产权" {
+			tmp = make(map[string]interface{})
+			continue
+		}
+		if qu.ObjToString(tmp["spidercode"]) == "sdxzbiddingsjzypc" {
+			tmp = make(map[string]interface{})
+			continue
+		}
+
+		// group the data by category
+		isok++
+		subtype := qu.ObjToString(tmp["subtype"])
+		if subtype == "招标" || subtype == "邀标" || subtype == "询价" ||
+			subtype == "竞谈" || subtype == "竞价" {
+			subtype = "招标"
+		}
+		dataArr := dataAllDict[subtype]
+		if dataArr == nil {
+			dataArr = []map[string]interface{}{}
+		}
+		dataArr = append(dataArr, tmp)
+		dataAllDict[subtype] = dataArr
+		tmp = make(map[string]interface{})
+	}
+	log.Println("类别组:", len(dataAllDict), "组", "~", "总计:", total, "~", "需判重:", isok)
+	// pool bounds the number of group goroutines running at once to threadNum.
+	pool := make(chan bool, threadNum)
+	wg := &sync.WaitGroup{}
+	for _, dataArr := range dataAllDict {
+		fmt.Print("...")
+		pool <- true
+		wg.Add(1)
+		go func(dataArr []map[string]interface{}) {
+			defer func() {
+				<-pool
+				wg.Done()
+			}()
+			num := 0
+			for _, tmp := range dataArr {
+				info := NewInfo(tmp)
+				b, source, reason := DM.check(info)
+				if b {
+					// check whether the record comes from one of the configured jyfb_data spider codes
+					if jyfb_data[info.spidercode] != "" { // pseudo-duplicate marker
+						Update.updatePool <- []map[string]interface{}{ // tag the original record
+							map[string]interface{}{
+								"_id": tmp["_id"],
+							},
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"repeat_jyfb": 1,
+								},
+							},
+						}
+					} else {
+						num++
+						// check whether this is the replace-data mode
+						if judgeIsReplaceInfo(source.href, info.href) && !IsFull {
+							datalock.Lock()
+							temp_source_id := source.id
+							temp_info_id := info.id
+							temp_source := info
+							temp_source.id = temp_source_id
+							DM.replacePoolData(temp_source)
+							// replace the data in the extract collection
+							is_log, ext_s_data, ext_i_data := confrimExtractData(temp_source_id, temp_info_id)
+							is_bid, bid_s_data, bid_i_data := confrimBiddingData(temp_source_id, temp_info_id)
+							if is_log && is_bid {
+								data_mgo.Save(extract_log, map[string]interface{}{
+									"_id":        tmp["_id"],
+									"replace_id": temp_source_id,
+									"is_history": 0,
+								})
+								ext_s_data["repeat"] = 0
+								ext_i_data["repeat"] = 1
+								ext_i_data["repeat_id"] = temp_source_id
+								ext_i_data["repeat_reason"] = reason
+
+								data_mgo.DeleteById(extract, temp_source_id)
+								data_mgo.Save(extract, ext_s_data)
+
+								// extract_back is only rewritten when it actually held the source document
+								is_del := data_mgo.DeleteById(extract_back, temp_source_id)
+								if is_del > 0 {
+									data_mgo.Save(extract_back, ext_s_data)
+								}
+
+								data_mgo.DeleteById(extract, temp_info_id)
+								data_mgo.Save(extract, ext_i_data)
+
+								task_mgo.DeleteById(task_bidding, temp_source_id)
+								task_mgo.Save(task_bidding, bid_s_data)
+								task_mgo.DeleteById(task_bidding, temp_info_id)
+								task_mgo.Save(task_bidding, bid_i_data)
+
+								// push the id onto the channels
+								msg := "id=" + temp_source_id
+								_ = nspdata_1.Publish(msg)
+								_ = nspdata_2.Publish(msg)
+							} else {
+								log.Println("替换~相关表~未查询到数据~", temp_source_id, "~", temp_info_id)
+							}
+							datalock.Unlock()
+						} else {
+							// hand the update to the update pool
+							Update.updatePool <- []map[string]interface{}{ // tag the duplicate record
+								map[string]interface{}{
+									"_id": tmp["_id"],
+								},
+								map[string]interface{}{
+									"$set": map[string]interface{}{
+										"repeat":        1,
+										"repeat_reason": reason,
+										"repeat_id":     source.id,
+									},
+								},
+							}
+						}
+					}
+				}
+			}
+			// repeatN is shared by all group goroutines, hence the lock.
+			numlock.Lock()
+			repeatN += num
+			numlock.Unlock()
+		}(dataArr)
+	}
+	wg.Wait()
+	log.Println("当前~判重~结束~", total, "重复~", repeatN)
+	// update the process record collection
+	updateProcessUdpIdsInfo(qu.ObjToString(mapInfo["gtid"]), qu.ObjToString(mapInfo["lteid"]))
+	time.Sleep(10 * time.Second)
+	log.Println("判重任务完成...发送下节点udp...")
+	for _, to := range nextNode {
+		sid, _ := mapInfo["gtid"].(string)
+		eid, _ := mapInfo["lteid"].(string)
+		key := sid + "-" + eid + "-" + qu.ObjToString(to["stype"])
+		by, _ := json.Marshal(map[string]interface{}{
+			"gtid":  sid,
+			"lteid": eid,
+			"stype": qu.ObjToString(to["stype"]),
+			"key":   key,
+		})
+		addr := &net.UDPAddr{
+			IP:   net.ParseIP(to["addr"].(string)),
+			Port: qu.IntAll(to["port"]),
+		}
+		// The node is remembered under key so the acknowledgement can be matched later.
+		node := &udpNode{by, addr, time.Now().Unix(), 0}
+		udptaskmap.Store(key, node)
+		udpclient.WriteUdp(by, mu.OP_TYPE_DATA, addr)
+	}
+}
+
+// updateProcessUdpIdsInfo marks the id-segment records in task_coll as
+// processed by de-duplication. Segments may have been merged before
+// de-duplication, so every record with gtid >= sid and lteid <= eid is
+// updated, not only an exact match.
+func updateProcessUdpIdsInfo(sid string, eid string) {
+	query := map[string]interface{}{
+		"gtid":  map[string]interface{}{"$gte": sid},
+		"lteid": map[string]interface{}{"$lte": eid},
+	}
+	datas, _ := task_mgo.Find(task_coll, query, nil, nil)
+	if len(datas) == 0 {
+		log.Println("未查询到记录id段落~", query)
+		return
+	}
+
+	log.Println("开始更新流程段落记录~~", len(datas), "段")
+	for _, record := range datas {
+		up_id := BsonTOStringId(record["_id"])
+		if up_id == "" {
+			continue
+		}
+		update := map[string]interface{}{
+			"$set": map[string]interface{}{
+				"dataprocess":   6,
+				"repeat_status": 1,
+				"updatetime":    time.Now().Unix(),
+			},
+		}
+		task_mgo.UpdateById(task_coll, up_id, update)
+		log.Println("流程段落记录~~更新完毕~", update)
+	}
+}
+
+// updateOcrFileData (deprecated) walks task_coll from the newest _id
+// downwards, and for the first record whose lteid equals cur_lteid sets
+// is_repeat_status to 1 and is_repeat_time to the current Unix time. A log
+// line is written when no such record exists.
+func updateOcrFileData(cur_lteid string) {
+	log.Println("开始更新Ocr表-标记", cur_lteid)
+	task_sess := task_mgo.GetMgoConn()
+	defer task_mgo.DestoryMongoConn(task_sess)
+
+	q_task := map[string]interface{}{}
+	it_last := task_sess.DB(task_mgo.DbName).C(task_coll).Find(&q_task).Sort("-_id").Iter()
+
+	found := false
+	pending := [][]map[string]interface{}{}
+	record := make(map[string]interface{})
+	for it_last.Next(&record) {
+		cur_id := BsonTOStringId(record["_id"])
+		if qu.ObjToString(record["lteid"]) != cur_lteid {
+			record = make(map[string]interface{})
+			continue
+		}
+		// Matching segment found: queue its status update and stop scanning.
+		log.Println("找到该lteid数据", cur_lteid, cur_id)
+		found = true
+		filter := map[string]interface{}{"_id": record["_id"]}
+		change := map[string]interface{}{
+			"$set": map[string]interface{}{
+				"is_repeat_status": 1,
+				"is_repeat_time":   qu.Int64All(time.Now().Unix()),
+			},
+		}
+		pending = append(pending, []map[string]interface{}{filter, change})
+		record = make(map[string]interface{})
+		break
+	}
+
+	if !found {
+		log.Println("出现异常问题,查询不到ocr的lteid", cur_lteid)
+		return
+	}
+	if len(pending) > 0 {
+		task_mgo.UpSertBulk(task_coll, pending...)
+	}
+}

+ 121 - 0
flow_repeat/initData.go

@@ -0,0 +1,121 @@
+package main
+
+import (
+	"flow_repeat/nsqdata"
+	"github.com/robfig/cron/v3"
+	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
+	"log"
+	"regexp"
+)
+
+// InitAllInfos runs every start-up initializer in a fixed order: MongoDB
+// connections, configuration variables, NSQ producers, the site table, and
+// finally the de-duplication data structures.
+func InitAllInfos() {
+	steps := []func(){initMgo, initVar, initNsq, initSite, initData}
+	for _, step := range steps {
+		step()
+	}
+}
+
+// initMgo builds the three MongoDB handles (spider, task, data) from
+// Sysconfig, opens their pools and loads the collection names and the
+// next-node list.
+//
+// Addresses, database names and collection names are mandatory: a missing or
+// non-string value panics on the type assertion. Credentials are optional: a
+// missing "username"/"password" key is read through qu.ObjToString (the helper
+// this program already uses for possibly-absent keys) instead of a hard
+// assertion, so an unauthenticated deployment does not crash at start-up;
+// InitPool skips authentication when either value is empty.
+func initMgo() {
+	spider_mconf := Sysconfig["spider_mongodb"].(map[string]interface{})
+	spider_mgo = &MongodbSim{
+		MongodbAddr: spider_mconf["spider_addr"].(string),
+		DbName:      spider_mconf["spider_db"].(string),
+		Size:        qu.IntAllDef(spider_mconf["spider_pool"], 5),
+		UserName:    qu.ObjToString(spider_mconf["username"]),
+		Password:    qu.ObjToString(spider_mconf["password"]),
+	}
+	spider_mgo.InitPool()
+	spider_coll = spider_mconf["spider_coll"].(string)
+
+	task_mconf := Sysconfig["task_mongodb"].(map[string]interface{})
+	task_mgo = &MongodbSim{
+		MongodbAddr: task_mconf["task_addr"].(string),
+		DbName:      task_mconf["task_db"].(string),
+		Size:        qu.IntAllDef(task_mconf["task_pool"], 10),
+		UserName:    qu.ObjToString(task_mconf["username"]),
+		Password:    qu.ObjToString(task_mconf["password"]),
+	}
+	task_mgo.InitPool()
+	task_coll = task_mconf["task_coll"].(string)
+	task_bidding = task_mconf["task_bidding"].(string)
+
+	nextNode = qu.ObjArrToMapArr(Sysconfig["nextNode"].([]interface{}))
+	mconf := Sysconfig["mongodb"].(map[string]interface{})
+	data_mgo = &MongodbSim{
+		MongodbAddr: mconf["addr"].(string),
+		DbName:      mconf["db"].(string),
+		Size:        qu.IntAllDef(mconf["pool"], 10),
+		UserName:    qu.ObjToString(mconf["username"]),
+		Password:    qu.ObjToString(mconf["password"]),
+	}
+	data_mgo.InitPool()
+
+	extract = mconf["extract"].(string)
+	extract_back = mconf["extract_back"].(string)
+	extract_log = mconf["extract_log"].(string)
+}
+// initVar loads the title filter regular expressions, thread count, mode
+// flags, timing parameters and the jyfb_data spider-code set from Sysconfig.
+// An invalid regular expression, or a missing/non-bool "lowHeavy" or
+// "timingTask", panics.
+func initVar() {
+	compileFrom := func(key string) *regexp.Regexp {
+		return regexp.MustCompile(qu.ObjToString(Sysconfig[key]))
+	}
+	FilterRegTitle = compileFrom("specialwords")
+	FilterRegTitle_0 = compileFrom("specialtitle_0")
+	FilterRegTitle_1 = compileFrom("specialtitle_1")
+	FilterRegTitle_2 = compileFrom("specialtitle_2")
+
+	threadNum = qu.IntAllDef(Sysconfig["threads"], 1)
+	LowHeavy = Sysconfig["lowHeavy"].(bool)
+	TimingTask = Sysconfig["timingTask"].(bool)
+	timingSpanDay = qu.Int64All(Sysconfig["timingSpanDay"])
+	timingPubScope = qu.Int64All(Sysconfig["timingPubScope"])
+
+	// jyfb_data maps each configured spider code to itself so it can be used as a set.
+	codes := qu.ObjArrToStringArr(Sysconfig["jyfb_data"].([]interface{}))
+	jyfb_data = make(map[string]string, 0)
+	for _, code := range codes {
+		jyfb_data[code] = code
+	}
+}
+// initSite rebuilds SiteMap (site name -> area/city/district) from the spider
+// site collection and clears the isUpdateSite flag. The whole reload runs
+// under cronlock; the lock is released with defer so a panic while loading
+// cannot leave it held.
+func initSite() {
+	cronlock.Lock()
+	defer cronlock.Unlock()
+	isUpdateSite = false
+	SiteMap = make(map[string]map[string]interface{}, 0)
+	sess := spider_mgo.GetMgoConn()
+	// Release the session through the same handle that created it.
+	defer spider_mgo.DestoryMongoConn(sess)
+	q := map[string]interface{}{}
+	res := sess.DB(spider_mgo.DbName).C(spider_coll).Find(&q).Sort("_id").Iter()
+	for tmp := make(map[string]interface{}); res.Next(&tmp); {
+		data := map[string]interface{}{
+			"area":     qu.ObjToString(tmp["area"]),
+			"city":     qu.ObjToString(tmp["city"]),
+			"district": qu.ObjToString(tmp["district"]),
+		}
+		SiteMap[qu.ObjToString(tmp["site"])] = data
+		// Start each document from an empty map, as the other iterator loops in
+		// this program do, so a field absent from one site cannot be inherited
+		// from the previously decoded one.
+		tmp = make(map[string]interface{})
+	}
+	log.Println("new站点加载完毕~", len(SiteMap))
+}
+// initNsq creates the two NSQ producers (topics "bidding_id" and
+// "project_id") unless the program runs in full mode. Failure to create
+// either producer terminates the process via log.Fatal.
+func initNsq() {
+	if IsFull {
+		return
+	}
+	const nsqAddr = "172.17.162.36:4150"
+	var err error
+
+	if nspdata_1, err = nsqdata.NewProducer(nsqAddr, "bidding_id", true); err != nil {
+		log.Fatal("通道配置异常~", err)
+	}
+	log.Println("通道配置正常")
+
+	if nspdata_2, err = nsqdata.NewProducer(nsqAddr, "project_id", true); err != nil {
+		log.Fatal("通道配置异常~", err)
+	}
+	log.Println("通道配置正常~")
+}
+// initData reads the de-duplication window (dupdays, default 5), builds the
+// in-memory data map and the update pool, starts the update-pool worker, and
+// schedules a daily job at 06:00:00 that sets isUpdateSite so the site table
+// is reloaded on the next incoming task.
+func initData() {
+	dupdays = qu.IntAllDef(Sysconfig["dupdays"], 5)
+	DM = NewDatamap(dupdays, lastid)
+	Update = newUpdatePool()
+	go Update.updateData()
+	// The spec has six fields (with seconds). cron/v3's default parser accepts
+	// only five, so the seconds-aware parser must be requested explicitly;
+	// otherwise AddFunc rejects the spec and the job never runs.
+	c := cron.New(cron.WithSeconds())
+	if _, err := c.AddFunc("0 0 6 * * ?", func() {
+		isUpdateSite = true
+	}); err != nil {
+		log.Println("站点定时加载任务注册失败~", err)
+	}
+	c.Start()
+}

+ 201 - 1
flow_repeat/main.go

@@ -1 +1,201 @@
-package flow_repeat
+package main
+
+/**
+招标信息判重
+**/
+
+import (
+	"encoding/json"
+	"flag"
+	"flow_repeat/nsqdata"
+	"fmt"
+	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
+	mu "jygit.jydev.jianyu360.cn/data_processing/common_utils/udp"
+	"log"
+	"net"
+	"regexp"
+	"sync"
+	"time"
+)
+
+// Package-level state shared by the whole program.
+//
+//   - Sysconfig holds the parsed configuration; the *_mgo handles and the
+//     collection-name strings are filled in by initMgo.
+//   - dupdays, DM and Update are (re)initialized by initData.
+//   - The FilterRegTitle* expressions start as "^_$" placeholders and are
+//     replaced by initVar with the configured patterns.
+//   - gtid, lastid, sec_gtid and sec_lteid are bound to command-line flags in
+//     init.
+//   - taskList is the queue of received id segments, guarded by updatelock.
+//   - lastNodeResponse is the Unix time of the last received segment; see
+//     responselock.
+var (
+	Sysconfig                            map[string]interface{} // configuration file contents
+	data_mgo, task_mgo, spider_mgo       *MongodbSim
+	task_coll, task_bidding, spider_coll string
+	extract, extract_back, extract_log   string
+	udpclient                            mu.UdpClient
+	nextNode                             []map[string]interface{}
+	dupdays                              = 7
+	DM, FullDM                           *datamap
+	Update                               *updateInfo
+	AddGroupPool                         *addGroupInfo
+	// regular-expression filters and related settings
+	FilterRegTitle                             = regexp.MustCompile("^_$")
+	FilterRegTitle_0                           = regexp.MustCompile("^_$")
+	FilterRegTitle_1                           = regexp.MustCompile("^_$")
+	FilterRegTitle_2                           = regexp.MustCompile("^_$")
+	threadNum                                  int
+	SiteMap                                    map[string]map[string]interface{}
+	LowHeavy, TimingTask, IsFull, isUpdateSite bool
+	timingSpanDay, timingPubScope              int64
+	gtid, lastid, sec_gtid, sec_lteid, lteid   string
+	updatelock, datalock, numlock, cronlock    sync.Mutex
+	jyfb_data                                  map[string]string
+	taskList                                   []map[string]interface{}
+	nspdata_1, nspdata_2                       *nsqdata.Producer
+	responselock                               sync.Mutex
+	lastNodeResponse                           int64
+)
+
+// init binds the command-line flags, parses them, reads the configuration
+// into Sysconfig and then runs all initializers.
+func init() {
+	flag.StringVar(&lastid, "id", "", "增量加载的lastid") // incremental mode
+	flag.StringVar(&gtid, "gtid", "", "历史增量的起始id")   // history mode
+	flag.StringVar(&sec_gtid, "sec_gtid", "", "全量分段起始id")
+	flag.StringVar(&sec_lteid, "sec_lteid", "", "全量分段结束id")
+	flag.Parse()
+
+	qu.ReadConfig(&Sysconfig)
+	InitAllInfos() // load everything
+}
+
+// main starts the mail watchdog and the UDP listener, then launches either
+// the history job (TimingTask) or, when not in full mode, the incremental
+// monitor and task dispatcher. It then sleeps to keep the process alive.
+func main() {
+	go checkMailJob()
+	lastNodeResponse = time.Now().Unix()
+
+	updport := Sysconfig["udpport"].(string)
+	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}
+	udpclient.Listen(processUdpMsg)
+	log.Println("Udp服务监听", updport)
+
+	switch {
+	case TimingTask:
+		log.Println("正常历史部署...")
+		go historyRepeat()
+	case !IsFull:
+		log.Println("正常增量部署与监控机制...")
+		go lastUdpJob()
+		go getRepeatTask()
+	}
+	time.Sleep(99999 * time.Hour)
+}
+
+// mainTest is a manual test entry point: it runs one incremental
+// de-duplication pass over a hard-coded id range and then sleeps.
+func mainTest() {
+	segment := map[string]interface{}{
+		"gtid":  "12ec61170ae152a3c2310f02",
+		"lteid": "92ec61170ae152a3c2310f02",
+	}
+	increaseRepeat(segment)
+	time.Sleep(99999 * time.Hour)
+}
+
+// mainTestTest performs the same start-up sequence as main: mail watchdog,
+// UDP listener, then either the history job or the incremental monitor and
+// dispatcher, followed by a long sleep.
+func mainTestTest() {
+	go checkMailJob()
+	lastNodeResponse = time.Now().Unix()
+
+	updport := Sysconfig["udpport"].(string)
+	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}
+	udpclient.Listen(processUdpMsg)
+	log.Println("Udp服务监听", updport)
+
+	switch {
+	case TimingTask:
+		log.Println("正常历史部署...")
+		go historyRepeat()
+	case !IsFull:
+		log.Println("正常增量部署与监控机制...")
+		go lastUdpJob()
+		go getRepeatTask()
+	}
+	time.Sleep(99999 * time.Hour)
+}
+
+// processUdpMsg handles one received UDP packet.
+//
+// OP_TYPE_DATA packets carry a JSON object. A "monitor" stype is answered
+// with its key and nothing else happens. Otherwise the object must have
+// non-empty "gtid" and "lteid": the segment is acknowledged with the key
+// "gtid-lteid-stype", the site table is reloaded if the daily flag is set,
+// and the segment is appended to taskList. OP_NOOP packets are
+// acknowledgements from the next node and remove the matching entry from
+// udptaskmap.
+func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
+	switch act {
+	case mu.OP_TYPE_DATA:
+		var mapInfo map[string]interface{}
+		err := json.Unmarshal(data, &mapInfo)
+		if err != nil {
+			udpclient.WriteUdp([]byte("err:"+err.Error()), mu.OP_NOOP, ra)
+		} else if mapInfo != nil {
+			sid, eid := qu.ObjToString(mapInfo["gtid"]), qu.ObjToString(mapInfo["lteid"])
+			stype := qu.ObjToString(mapInfo["stype"])
+			if stype == "monitor" {
+				log.Println("收到监测......")
+				key := qu.ObjToString(mapInfo["key"])
+				udpclient.WriteUdp([]byte(key), mu.OP_NOOP, ra)
+				return
+			}
+			if sid == "" || eid == "" {
+				log.Println("接收id段异常-err ", "sid=", sid, ",eid=", eid)
+			} else {
+				// lastUdpJob reads and resets this timestamp under responselock,
+				// so the write here takes the same lock.
+				responselock.Lock()
+				lastNodeResponse = time.Now().Unix()
+				responselock.Unlock()
+				key := sid + "-" + eid + "-" + qu.ObjToString(mapInfo["stype"])
+				udpclient.WriteUdp([]byte(key), mu.OP_NOOP, ra)
+				// reload the site table when the once-a-day flag is set
+				if isUpdateSite {
+					initSite()
+				}
+				// enqueue the task
+				updatelock.Lock()
+				taskList = append(taskList, mapInfo)
+				log.Println("udp收到任务...数量:", len(taskList), "具体任务:", taskList)
+				updatelock.Unlock()
+			}
+		}
+	case mu.OP_NOOP: // acknowledgement from the next node
+		log.Println("下节点回应:", string(data))
+		udptaskmap.Delete(string(data))
+	}
+}
+
+// getRepeatTask is the dispatcher loop for incremental de-duplication.
+//
+// When several segments are queued they are merged into one task running
+// from the first segment's gtid to the last segment's lteid; if either bound
+// is empty, only the first segment is processed. With an empty queue the
+// loop sleeps 15 seconds.
+//
+// taskList is only touched while holding updatelock, including the emptiness
+// check. The lock is released while increaseRepeat runs so the UDP receiver
+// can keep queueing segments; only the segments that were part of the
+// snapshot are removed afterwards.
+func getRepeatTask() {
+	for {
+		updatelock.Lock()
+		pending := len(taskList)
+		if pending == 0 {
+			updatelock.Unlock()
+			time.Sleep(15 * time.Second)
+			continue
+		}
+
+		// Default: process the first queued segment alone.
+		task := taskList[0]
+		consumed := 1
+		if pending > 1 {
+			first_id := taskList[0]["gtid"]
+			end_id := taskList[pending-1]["lteid"]
+			if first_id != "" && end_id != "" {
+				log.Println("合并段落~正常~", first_id, "~", end_id)
+				task = map[string]interface{}{
+					"gtid":  first_id,
+					"lteid": end_id,
+				}
+				consumed = pending
+			} else {
+				log.Println("合并段落~错误~正常取段落~~~")
+			}
+		}
+		updatelock.Unlock()
+
+		if task != nil {
+			increaseRepeat(task) // run the de-duplication pass
+		}
+
+		// Segments appended during the pass sit after index consumed-1 and are kept.
+		updatelock.Lock()
+		taskList = taskList[consumed:]
+		log.Println("此段落结束当前任务池...", len(taskList), taskList)
+		updatelock.Unlock()
+	}
+}
+
+// lastUdpJob checks every 300 seconds whether a segment has been received
+// within the last 1800 seconds; if not, it resets the timestamp and sends an
+// alert mail.
+//
+// The timestamp is inspected and reset under responselock, but the mail is
+// sent after the lock is released so a slow mail API cannot stall other
+// users of the lock.
+func lastUdpJob() {
+	for {
+		responselock.Lock()
+		timedOut := time.Now().Unix()-lastNodeResponse >= 1800
+		if timedOut {
+			lastNodeResponse = time.Now().Unix() // reset the timer
+		}
+		responselock.Unlock()
+
+		if timedOut {
+			sendErrMailApi("判重增量~发现处理流程超时~给予告警", fmt.Sprintf("半小时左右~无新段落数据进入判重增量流程...相关人员检查..."))
+		}
+		time.Sleep(300 * time.Second)
+	}
+}

+ 442 - 0
flow_repeat/mgo.go

@@ -0,0 +1,442 @@
+package main
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"math/big"
+	"runtime"
+	"strconv"
+	"strings"
+	"time"
+
+	"go.mongodb.org/mongo-driver/bson"
+	"go.mongodb.org/mongo-driver/bson/primitive"
+	"go.mongodb.org/mongo-driver/mongo"
+	"go.mongodb.org/mongo-driver/mongo/options"
+)
+
+// MgoSess is a query builder: DB, C, Find, Select, Limit, Skip and Sort record
+// their arguments in the fields below, and Iter turns them into a cursor on
+// the client held by M.
+type MgoSess struct {
+	Db     string
+	Coll   string
+	Query  interface{}
+	Sorts  []string
+	fields interface{}
+	limit  int64
+	skip   int64
+	M      *MongodbSim
+}
+
+// MgoIter wraps a driver cursor; Cursor stays nil when the query failed.
+type MgoIter struct {
+	Cursor *mongo.Cursor
+}
+
+// Next decodes the next document of the cursor into result and reports
+// whether one was available. When the cursor is exhausted, or decoding
+// fails, the cursor is closed and false is returned. A nil cursor (failed
+// query) yields false immediately.
+func (mt *MgoIter) Next(result interface{}) bool {
+	if mt.Cursor == nil {
+		return false
+	}
+	// A real context is passed instead of nil.
+	ctx := context.Background()
+	if !mt.Cursor.Next(ctx) {
+		mt.Cursor.Close(ctx)
+		return false
+	}
+	if err := mt.Cursor.Decode(result); err != nil {
+		log.Println("mgo cur err", err.Error())
+		mt.Cursor.Close(ctx)
+		return false
+	}
+	return true
+}
+
+// DB records the database name and returns the session for chaining.
+func (ms *MgoSess) DB(name string) *MgoSess {
+	ms.Db = name
+	return ms
+}
+
+// C records the collection name and returns the session for chaining.
+func (ms *MgoSess) C(name string) *MgoSess {
+	ms.Coll = name
+	return ms
+}
+
+// Find records the query filter and returns the session for chaining.
+func (ms *MgoSess) Find(q interface{}) *MgoSess {
+	ms.Query = q
+	return ms
+}
+
+// Select records the projection and returns the session for chaining.
+func (ms *MgoSess) Select(fields interface{}) *MgoSess {
+	ms.fields = fields
+	return ms
+}
+
+// Limit records the maximum number of documents; Iter ignores values <= 0.
+func (ms *MgoSess) Limit(limit int64) *MgoSess {
+	ms.limit = limit
+	return ms
+}
+// Skip records the number of documents to skip; Iter ignores values <= 0.
+func (ms *MgoSess) Skip(skip int64) *MgoSess {
+	ms.skip = skip
+	return ms
+}
+
+// Sort records the sort keys and returns the session for chaining. A key
+// prefixed with "-" sorts descending; a "+" prefix or no prefix sorts
+// ascending (see Iter).
+func (ms *MgoSess) Sort(sorts ...string) *MgoSess {
+	ms.Sorts = sorts
+	return ms
+}
+
+// Iter executes the query described by the session and returns an iterator
+// over the result. Skip and limit are applied only when positive; the batch
+// size is fixed at 100. If the query fails, the error is logged and the
+// returned iterator has a nil cursor, so its Next returns false.
+//
+// Sort keys are translated into an ordered bson.D so that, with several
+// keys, they are applied in the order given to Sort (a map would not
+// preserve that order). Empty key names are skipped rather than indexed.
+func (ms *MgoSess) Iter() *MgoIter {
+	it := &MgoIter{}
+	find := options.Find()
+	if ms.skip > 0 {
+		find.SetSkip(ms.skip)
+	}
+	if ms.limit > 0 {
+		find.SetLimit(ms.limit)
+	}
+	find.SetBatchSize(100)
+	if len(ms.Sorts) > 0 {
+		sort := bson.D{}
+		for _, k := range ms.Sorts {
+			if k == "" {
+				continue
+			}
+			order := 1
+			switch k[0] {
+			case '-':
+				order = -1
+				k = k[1:]
+			case '+':
+				k = k[1:]
+			}
+			if k == "" {
+				continue
+			}
+			sort = append(sort, bson.E{Key: k, Value: order})
+		}
+		if len(sort) > 0 {
+			find.SetSort(sort)
+		}
+	}
+	if ms.fields != nil {
+		find.SetProjection(ms.fields)
+	}
+	cur, err := ms.M.C.Database(ms.Db).Collection(ms.Coll).Find(ms.M.Ctx, ms.Query, find)
+	if err != nil {
+		log.Println("mgo find err", err.Error())
+	} else {
+		it.Cursor = cur
+	}
+	return it
+}
+
+// MongodbSim bundles one MongoDB client with its settings. MongodbAddr,
+// DbName, Size and the optional UserName/Password are set by the caller;
+// InitPool fills in C, Ctx, ShortCtx and pool. pool is a buffered channel of
+// capacity Size used by Open/Close to bound concurrent operations.
+type MongodbSim struct {
+	MongodbAddr string
+	Size        int
+	//	MinSize     int
+	DbName   string
+	C        *mongo.Client
+	Ctx      context.Context
+	ShortCtx context.Context
+	pool     chan bool
+	UserName string
+	Password string
+}
+
+// GetMgoConn returns a fresh query builder bound to this client. It does not
+// take a slot from the pool.
+func (m *MongodbSim) GetMgoConn() *MgoSess {
+	return &MgoSess{M: m}
+}
+
+// DestoryMongoConn detaches the query builder from its client. It does not
+// return a slot to the pool and does not close any cursor.
+func (m *MongodbSim) DestoryMongoConn(ms *MgoSess) {
+	ms.M = nil
+}
+
+// InitPool configures and connects the client: 3 s connect timeout, URI
+// "mongodb://"+MongodbAddr, pool size Size, optional credentials (only when
+// both UserName and Password are non-empty) and a 2 h idle timeout. On
+// success the client is stored in C; on failure the error is logged and C
+// stays unset.
+//
+// NOTE(review): the cancel functions of both contexts are discarded, and
+// ShortCtx expires one minute after this call — confirm no later operation
+// relies on it.
+func (m *MongodbSim) InitPool() {
+	clientOpts := options.Client()
+	clientOpts.SetConnectTimeout(3 * time.Second)
+	clientOpts.ApplyURI("mongodb://" + m.MongodbAddr)
+	clientOpts.SetMaxPoolSize(uint64(m.Size))
+	m.pool = make(chan bool, m.Size)
+
+	if m.UserName != "" && m.Password != "" {
+		clientOpts.SetAuth(options.Credential{
+			Username: m.UserName,
+			Password: m.Password,
+		})
+	}
+
+	clientOpts.SetMaxConnIdleTime(2 * time.Hour)
+	m.Ctx, _ = context.WithTimeout(context.Background(), 99999*time.Hour)
+	m.ShortCtx, _ = context.WithTimeout(context.Background(), 1*time.Minute)
+
+	client, err := mongo.Connect(m.ShortCtx, clientOpts)
+	if err != nil {
+		log.Println("mgo init error:", err.Error())
+		return
+	}
+	m.C = client
+	log.Println("init success")
+}
+
+// Open takes one slot from the pool channel, blocking while Size operations
+// are already in flight.
+func (m *MongodbSim) Open() {
+	m.pool <- true
+}
+// Close returns the slot taken by Open.
+func (m *MongodbSim) Close() {
+	<-m.pool
+}
+
+// UpSertBulk performs a bulk upsert on collection c. Each element of doc is
+// a pair: doc[i][0] is the filter and doc[i][1] the update document. It
+// returns the upserted ids and true on success, or nil and false after
+// logging the error.
+func (m *MongodbSim) UpSertBulk(c string, doc ...[]map[string]interface{}) (map[int64]interface{}, bool) {
+	m.Open()
+	defer m.Close()
+
+	var models []mongo.WriteModel
+	for _, pair := range doc {
+		model := mongo.NewUpdateOneModel().
+			SetFilter(pair[0]).
+			SetUpdate(pair[1]).
+			SetUpsert(true)
+		models = append(models, model)
+	}
+
+	res, err := m.C.Database(m.DbName).Collection(c).BulkWrite(m.Ctx, models)
+	if err != nil {
+		log.Println("mgo upsert error:", err.Error())
+		return nil, false
+	}
+	return res.UpsertedIDs, true
+}
+
+// SaveBulk inserts all given documents into collection c with one bulk
+// write. It reports whether the write succeeded; failures are logged.
+func (m *MongodbSim) SaveBulk(c string, doc ...map[string]interface{}) bool {
+	m.Open()
+	defer m.Close()
+
+	var models []mongo.WriteModel
+	for _, d := range doc {
+		models = append(models, mongo.NewInsertOneModel().SetDocument(d))
+	}
+
+	if _, err := m.C.Database(m.DbName).Collection(c).BulkWrite(m.Ctx, models); err != nil {
+		log.Println("mgo savebulk error:", err.Error())
+		return false
+	}
+	return true
+}
+
+// Save inserts doc into collection c and returns the inserted id, or nil
+// when the insert fails. The failure is logged, as the other write helpers
+// of this type do, so a lost write is visible.
+func (m *MongodbSim) Save(c string, doc map[string]interface{}) interface{} {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	r, err := coll.InsertOne(m.Ctx, doc)
+	if err != nil {
+		log.Println("mgo save error:", err.Error())
+		return nil
+	}
+	return r.InsertedID
+}
+
+// Update applies update u to the documents of collection c matching q.
+// upsert enables insert-if-missing; multi selects UpdateMany instead of
+// UpdateOne. q and u are normalized through ObjToM. It reports whether the
+// update succeeded; a panic is handled by the deferred catch, in which case
+// false is returned.
+func (m *MongodbSim) Update(c string, q, u interface{}, upsert bool, multi bool) bool {
+	defer catch()
+	m.Open()
+	defer m.Close()
+	ct := options.Update()
+	if upsert {
+		ct.SetUpsert(true)
+	}
+	coll := m.C.Database(m.DbName).Collection(c)
+	var err error
+	if multi {
+		_, err = coll.UpdateMany(m.Ctx, ObjToM(q), ObjToM(u), ct)
+	} else {
+		_, err = coll.UpdateOne(m.Ctx, ObjToM(q), ObjToM(u), ct)
+	}
+	if err != nil {
+		// The message names the operation that actually failed (update, not delete).
+		log.Println("更新错误", err.Error())
+		return false
+	}
+	return true
+}
+
+// UpdateById applies the update document doc to the single document of
+// collection c whose _id is the ObjectId parsed from id. It reports whether
+// the update call succeeded; failures are logged.
+func (m *MongodbSim) UpdateById(c, id string, doc map[string]interface{}) bool {
+	m.Open()
+	defer m.Close()
+
+	filter := map[string]interface{}{"_id": StringTOBsonId(id)}
+	if _, err := m.C.Database(m.DbName).Collection(c).UpdateOne(m.Ctx, filter, doc); err != nil {
+		log.Println(err)
+		return false
+	}
+	return true
+}
+
+// DeleteById deletes the document of collection c whose _id is the ObjectId
+// parsed from id and returns the number of deleted documents. On error it
+// logs the failure and returns 0.
+func (m *MongodbSim) DeleteById(c, id string) int64 {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	r, err := coll.DeleteOne(m.Ctx, map[string]interface{}{"_id": StringTOBsonId(id)})
+	if err != nil {
+		log.Println("mgo delete by id error:", err.Error())
+		return 0
+	}
+	return r.DeletedCount
+}
+
+// Delete removes every document of collection c matching query and returns
+// the number of deleted documents. On error it logs the failure and returns
+// 0.
+func (m *MongodbSim) Delete(c string, query map[string]interface{}) int64 {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	r, err := coll.DeleteMany(m.Ctx, query)
+	if err != nil {
+		log.Println("mgo delete error:", err.Error())
+		return 0
+	}
+	return r.DeletedCount
+}
+
+// FindById returns the document of collection c whose _id is the ObjectId
+// parsed from id. The decode error is not inspected, so a missing document
+// yields an empty map.
+func (m *MongodbSim) FindById(c, id string) map[string]interface{} {
+	m.Open()
+	defer m.Close()
+
+	doc := map[string]interface{}{}
+	filter := map[string]interface{}{"_id": StringTOBsonId(id)}
+	m.C.Database(m.DbName).Collection(c).FindOne(m.Ctx, filter).Decode(&doc)
+	return doc
+}
+
+// FindOne returns the first document of collection c matching query. The
+// decode error is not inspected, so no match yields an empty map.
+func (m *MongodbSim) FindOne(c string, query map[string]interface{}) map[string]interface{} {
+	m.Open()
+	defer m.Close()
+
+	doc := map[string]interface{}{}
+	m.C.Database(m.DbName).Collection(c).FindOne(m.Ctx, query).Decode(&doc)
+	return doc
+}
+
+// Find returns all documents of collection c matching query. sort and
+// fields are optional (nil means unset) and are passed to the driver as the
+// sort specification and projection.
+//
+// A failing query or cursor read is logged and returned to the caller
+// instead of terminating the whole process, so the error return is actually
+// reachable.
+func (m *MongodbSim) Find(c string, query map[string]interface{}, sort, fields interface{}) ([]map[string]interface{}, error) {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	op := options.Find()
+	if sort != nil {
+		op.SetSort(sort)
+	}
+	if fields != nil {
+		op.SetProjection(fields)
+	}
+	r, err := coll.Find(m.Ctx, query, op)
+	if err != nil {
+		log.Println("mgo find error:", err.Error())
+		return nil, err
+	}
+	var results []map[string]interface{}
+	if err = r.All(m.Ctx, &results); err != nil {
+		log.Println("mgo find decode error:", err.Error())
+		return nil, err
+	}
+	return results, nil
+}
+
+// ObjToOth converts query with ObjToMQ without treating it as a query, i.e.
+// a string "_id" is left as is.
+func ObjToOth(query interface{}) *bson.M {
+	return ObjToMQ(query, false)
+}
+// ObjToM converts query with ObjToMQ in query mode, i.e. "_id" values parsed
+// from a JSON string are turned into ObjectIds.
+func ObjToM(query interface{}) *bson.M {
+	return ObjToMQ(query, true)
+}
+
+// ObjToMQ converts a query given as a JSON-like string, a map, a bson.M or
+// a pointer to one of those into a *bson.M.
+//
+// Pointers to bson.M/primitive.M are returned unchanged. A string has its
+// single quotes replaced by double quotes and is parsed as JSON; when
+// isQuery is true and the result has an "_id" that is a hex string, or a map
+// of operator -> hex string, those strings are converted to ObjectIds. Any
+// other input type yields a pointer to a nil map. A panic (e.g. a non-string
+// value inside an "_id" operator map) is handled by the deferred catch, in
+// which case nil is returned.
+func ObjToMQ(query interface{}, isQuery bool) *bson.M {
+	data := make(bson.M)
+	defer catch()
+
+	if mapPtr, ok := query.(*map[string]interface{}); ok {
+		data = bson.M(*mapPtr)
+		return &data
+	}
+	if bsonPtr, ok := query.(*bson.M); ok {
+		return bsonPtr
+	}
+	if primPtr, ok := query.(*primitive.M); ok {
+		return primPtr
+	}
+	if text, ok := query.(string); ok {
+		json.Unmarshal([]byte(strings.Replace(text, "'", "\"", -1)), &data)
+		rawID, hasID := data["_id"]
+		if !hasID || !isQuery {
+			return &data
+		}
+		switch id := rawID.(type) {
+		case string:
+			data["_id"], _ = primitive.ObjectIDFromHex(id)
+		case map[string]interface{}:
+			for op, hex := range id {
+				id[op], _ = primitive.ObjectIDFromHex(hex.(string))
+			}
+			data["_id"] = id
+		}
+		return &data
+	}
+	if plain, ok := query.(map[string]interface{}); ok {
+		data = plain
+		return &data
+	}
+	if bm, ok := query.(bson.M); ok {
+		data = bm
+		return &data
+	}
+	if pm, ok := query.(primitive.M); ok {
+		data = pm
+		return &data
+	}
+	data = nil
+	return &data
+}
+func intAllDef(num interface{}, defaultNum int) int {
+	if i, ok := num.(int); ok {
+		return int(i)
+	} else if i0, ok0 := num.(int32); ok0 {
+		return int(i0)
+	} else if i1, ok1 := num.(float64); ok1 {
+		return int(i1)
+	} else if i2, ok2 := num.(int64); ok2 {
+		return int(i2)
+	} else if i3, ok3 := num.(float32); ok3 {
+		return int(i3)
+	} else if i4, ok4 := num.(string); ok4 {
+		in, _ := strconv.Atoi(i4)
+		return int(in)
+	} else if i5, ok5 := num.(int16); ok5 {
+		return int(i5)
+	} else if i6, ok6 := num.(int8); ok6 {
+		return int(i6)
+	} else if i7, ok7 := num.(*big.Int); ok7 {
+		in, _ := strconv.Atoi(fmt.Sprint(i7))
+		return int(in)
+	} else if i8, ok8 := num.(*big.Float); ok8 {
+		in, _ := strconv.Atoi(fmt.Sprint(i8))
+		return int(in)
+	} else {
+		return defaultNum
+	}
+}
+
+// NewObjectId generates a new ObjectID for use as a document _id.
+func NewObjectId() primitive.ObjectID {
+	return primitive.NewObjectID()
+}
+
+// StringTOBsonId parses a hex string into an ObjectID. The parse error is
+// not reported; whatever ObjectIDFromHex returns alongside the error is
+// handed back to the caller.
+func StringTOBsonId(id string) primitive.ObjectID {
+	parsed, _ := primitive.ObjectIDFromHex(id)
+	return parsed
+}
+
+// BsonTOStringId returns the hex form of id when it holds an ObjectID.
+// For any other value (nil, a string id, ...) it returns "" instead of
+// panicking, which is the value a caller in this program already checks for.
+func BsonTOStringId(id interface{}) string {
+	oid, ok := id.(primitive.ObjectID)
+	if !ok {
+		return ""
+	}
+	return oid.Hex()
+}
+
+// catch is meant to be deferred: it recovers from a panic, logs the panic
+// value and then logs the file and line of every frame on the call stack.
+// Frames are logged synchronously so they appear in stack order and are not
+// lost if the process exits right afterwards.
+func catch() {
+	if r := recover(); r != nil {
+		log.Println(r)
+		for skip := 0; ; skip++ {
+			_, file, line, ok := runtime.Caller(skip)
+			if !ok {
+				break
+			}
+			log.Printf("%v,%v\n", file, line)
+		}
+	}
+}

+ 91 - 0
flow_repeat/nsqdata/consumer.go

@@ -0,0 +1,91 @@
+package nsqdata
+
+import (
+	"encoding/json"
+	"github.com/nsqio/go-nsq"
+	"strings"
+	"time"
+)
+
+// Consumer receives NSQ messages and forwards them on Ch. NewConsumer builds
+// it with a positional struct literal, so the field order must not change.
+type Consumer struct {
+	Ch           chan interface{}
+	C            *nsq.Consumer
+	Topic        string
+	Channel      string
+	IsJsonEncode bool
+	Conf         *Cconfig
+}
+
+// Cconfig is the configuration passed to NewConsumer.
+type Cconfig struct {
+	IsJsonEncode         bool   // whether messages are JSON-encoded and must be decoded on receipt; off by default
+	ConnectType          int    // connection type: 0 connects to nsqd, 1 connects to nsqlookupd
+	Interval             int    // lookupd poll interval in seconds (e.g. to discover new nsqd nodes); defaults to 10
+	Addr, Topic, Channel string // address (comma-separated list supported), topic and channel
+	Concurrent           int    // number of concurrent handlers; defaults to 1
+}
+
+// HandleMessage forwards one NSQ message to c.Ch.
+//
+// Without JSON encoding the raw body is forwarded. With JSON encoding, bodies
+// of length <= 1 are dropped; otherwise the first byte selects the decoding:
+// 0x00 decodes the rest as a generic JSON value, 0x01 decodes the rest as a
+// byte slice, and any other first byte decodes the whole body as a generic
+// JSON value. Only non-nil decoded values are forwarded; the decode error,
+// if any, is returned.
+func (c *Consumer) HandleMessage(msg *nsq.Message) error {
+	if !c.IsJsonEncode {
+		c.Ch <- msg.Body
+		return nil
+	}
+	if len(msg.Body) <= 1 {
+		return nil
+	}
+
+	// forwardAny decodes payload as an arbitrary JSON value and forwards it.
+	forwardAny := func(payload []byte) error {
+		var decoded interface{}
+		err := json.Unmarshal(payload, &decoded)
+		if err == nil && decoded != nil {
+			c.Ch <- decoded
+		}
+		return err
+	}
+
+	switch msg.Body[0] {
+	case 0x00:
+		return forwardAny(msg.Body[1:])
+	case 0x01: // byte slice payload
+		var raw []byte
+		err := json.Unmarshal(msg.Body[1:], &raw)
+		if err == nil && raw != nil {
+			c.Ch <- raw
+		}
+		return err
+	default:
+		return forwardAny(msg.Body)
+	}
+}
+
+// NewConsumer creates an NSQ consumer for cc.Topic/cc.Channel, registers
+// cc.Concurrent handlers and connects to the comma-separated addresses in
+// cc.Addr, directly to nsqd (ConnectType 0) or via nsqlookupd (ConnectType
+// 1). Zero Interval and Concurrent are replaced in cc by 10 and 1. The
+// consumer is returned together with the connection error, if any; for any
+// other ConnectType no connection is attempted.
+func NewConsumer(cc *Cconfig) (*Consumer, error) {
+	if cc.Interval == 0 {
+		cc.Interval = 10
+	}
+	cfg := nsq.NewConfig()
+	// poll interval for service discovery, e.g. to notice newly added nsqd nodes
+	cfg.LookupdPollInterval = time.Duration(cc.Interval) * time.Second
+
+	nc, err := nsq.NewConsumer(cc.Topic, cc.Channel, cfg)
+	if err != nil {
+		return nil, err
+	}
+	if cc.Concurrent == 0 {
+		cc.Concurrent = 1
+	}
+
+	consumer := &Consumer{make(chan interface{}, cc.Concurrent), nc, cc.Topic, cc.Channel, cc.IsJsonEncode, cc}
+	nc.AddConcurrentHandlers(consumer, cc.Concurrent)
+
+	addrs := strings.Split(cc.Addr, ",")
+	var connErr error
+	switch cc.ConnectType {
+	case 0:
+		connErr = nc.ConnectToNSQDs(addrs)
+	case 1:
+		connErr = nc.ConnectToNSQLookupds(addrs)
+	}
+	return consumer, connErr
+}
+
+// 处理消息
+func (c *Consumer) Close(msg *nsq.Message) error {
+	if c.Conf.ConnectType == 1 {
+		return c.C.DisconnectFromNSQLookupd(c.Conf.Addr)
+	}
+	return c.C.DisconnectFromNSQD(c.Conf.Addr)
+}

+ 50 - 0
flow_repeat/nsqdata/producer.go

@@ -0,0 +1,50 @@
+package nsqdata
+
+import (
+	"encoding/json"
+	"errors"
+	"github.com/nsqio/go-nsq"
+)
+
+// Producer publishes messages to one NSQ topic. NewProducer builds it with a
+// positional struct literal, so the field order must not change.
+type Producer struct {
+	//Ch    chan interface{}
+	P            *nsq.Producer
+	Topic        string
+	IsJsonEncode bool // whether to JSON-encode messages; if false the message must be passed as []byte, if true the consumer must be configured to decode JSON as well
+}
+
+// NewProducer creates a producer for the nsqd at addr that publishes to
+// topic toppic, JSON-encoding messages when IsJsonEncode is true.
+func NewProducer(addr, toppic string, IsJsonEncode bool) (*Producer, error) {
+	np, err := nsq.NewProducer(addr, nsq.NewConfig())
+	if err != nil {
+		return nil, err
+	}
+	return &Producer{np, toppic, IsJsonEncode}, nil
+}
+
+// Publish sends msg to the producer's topic. With JSON encoding enabled,
+// msg is marshalled and the JSON bytes are published as is (no type prefix
+// byte is added). Without it, msg must be a []byte, otherwise an error is
+// returned.
+func (p *Producer) Publish(msg interface{}) error {
+	if !p.IsJsonEncode {
+		// raw mode: the caller must supply []byte
+		raw, ok := msg.([]byte)
+		if !ok {
+			return errors.New("producer msg err: no []byte")
+		}
+		return p.P.Publish(p.Topic, raw)
+	}
+
+	encoded, err := json.Marshal(msg)
+	switch {
+	case err != nil:
+		return err
+	case len(encoded) == 0:
+		return errors.New("producer msg err")
+	}
+	return p.P.Publish(p.Topic, encoded)
+}

+ 73 - 0
flow_repeat/udptaskmap.go

@@ -0,0 +1,73 @@
+package main
+
+import (
+	"fmt"
+	"io/ioutil"
+	"log"
+	"net"
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+)
+
+// udptaskmap holds the UDP messages sent to next nodes that have not been
+// acknowledged yet, keyed by the message key.
+var udptaskmap = &sync.Map{}
+// tomail and api are the alert recipients and the mail API URL, loaded from
+// the "jkmail" section of Sysconfig.
+var tomail string
+var api string
+
+// udpNode is one pending UDP message: its payload, destination and the Unix
+// time it was sent. It is built with a positional struct literal elsewhere,
+// so the field order must not change.
+type udpNode struct {
+	data      []byte
+	addr      *net.UDPAddr
+	timestamp int64
+	retry     int
+}
+
+// checkMailJob loads the mail settings from Sysconfig["jkmail"] and then,
+// once a minute, scans udptaskmap: every entry sent more than 120 seconds
+// ago is removed and an alert is sent through the mail API. The alert text
+// differs depending on whether the key contains "project".
+func checkMailJob() {
+	// Mail cannot be sent directly from the Aliyun intranet, hence the HTTP mail API.
+	if jkmail, _ := Sysconfig["jkmail"].(map[string]interface{}); jkmail != nil {
+		tomail, _ = jkmail["to"].(string)
+		api, _ = jkmail["api"].(string)
+	}
+	log.Println("start checkMailJob", tomail, Sysconfig["jkmail"])
+
+	for {
+		udptaskmap.Range(func(k, v interface{}) bool {
+			node, _ := v.(*udpNode)
+			if time.Now().Unix()-node.timestamp <= 120 {
+				return true
+			}
+			udptaskmap.Delete(k)
+
+			key := k.(string)
+			format := "下节点~同步数据~未响应~相关人员检查~%s"
+			if strings.Contains(key, "project") {
+				format = "下节点~项目合并~未响应~相关人员检查~%s"
+			}
+			info_str := fmt.Sprintf(format, key)
+
+			res, err := http.Get(fmt.Sprintf("%s?to=%s&title=%s&body=%s", api, tomail, "增量判重程序~严重警告", info_str))
+			if err != nil {
+				return true
+			}
+			defer res.Body.Close()
+			read, err := ioutil.ReadAll(res.Body)
+			log.Println("邮件发送:", string(read), err)
+			return true
+		})
+		time.Sleep(60 * time.Second)
+	}
+}
+
+// sendErrMailApi reloads the mail settings from Sysconfig["jkmail"] and
+// sends one alert with the given title and body through the mail API,
+// logging the API response or the request error.
+func sendErrMailApi(title, body string) {
+	if jkmail, _ := Sysconfig["jkmail"].(map[string]interface{}); jkmail != nil {
+		tomail, _ = jkmail["to"].(string)
+		api, _ = jkmail["api"].(string)
+	}
+	log.Println(tomail, api)
+
+	res, err := http.Get(fmt.Sprintf("%s?to=%s&title=%s&body=%s", api, tomail, title, body))
+	if err != nil {
+		log.Println("邮件发送失败:", err)
+		return
+	}
+	defer res.Body.Close()
+	read, err := ioutil.ReadAll(res.Body)
+	log.Println("邮件发送成功:", string(read), err)
+}

+ 102 - 0
flow_repeat/updateMethod.go

@@ -0,0 +1,102 @@
+package main
+
+import (
+	"log"
+	"time"
+)
+
+// sp is used as a counting semaphore: updateData and addGroupData send on it
+// before starting a bulk-write goroutine and receive from it when that
+// goroutine finishes, so at most 5 bulk writes run at the same time.
+var sp = make(chan bool, 5)
+
+// updateInfo is the queue consumed by updateData.
+type updateInfo struct { // channel of pending updates or inserts
+	// updatePool carries the queued items; updateData passes each received
+	// []map[string]interface{} as one argument to data_mgo.UpSertBulk.
+	updatePool chan []map[string]interface{}
+	// saveSize is the number of queued items that triggers a bulk write.
+	saveSize   int
+}
+
+// newUpdatePool returns an updateInfo whose channel buffers up to 50000
+// entries and whose batch size (saveSize) is 200.
+func newUpdatePool() *updateInfo {
+	return &updateInfo{
+		updatePool: make(chan []map[string]interface{}, 50000),
+		saveSize:   200,
+	}
+}
+
+// addGroupInfo is the queue consumed by addGroupData (marked as temporary by
+// the original author: "add group").
+type addGroupInfo struct {
+	// pool carries the documents that addGroupData collects and hands to
+	// data_mgo.SaveBulk.
+	pool     chan map[string]interface{}
+	// saveSize is the number of queued documents that triggers a bulk save.
+	saveSize int
+}
+
+// newAddGroupPool returns an addGroupInfo whose channel buffers up to 50000
+// documents and whose batch size (saveSize) is 200.
+func newAddGroupPool() *addGroupInfo {
+	return &addGroupInfo{
+		pool:     make(chan map[string]interface{}, 50000),
+		saveSize: 200,
+	}
+}
+
+// 监听更新
+func (update *updateInfo) updateData() {
+	log.Println("开始不断监听--待更新数据")
+	tmpArr := make([][]map[string]interface{}, update.saveSize)
+	tmpIndex := 0
+	for {
+		select {
+		case value := <-update.updatePool:
+			tmpArr[tmpIndex] = value
+			tmpIndex++
+			if tmpIndex == update.saveSize {
+				sp <- true
+				go func(dataArr [][]map[string]interface{}) {
+					defer func() {
+						<-sp
+					}()
+					data_mgo.UpSertBulk(extract, dataArr...)
+				}(tmpArr)
+				tmpArr = make([][]map[string]interface{}, update.saveSize)
+				tmpIndex = 0
+			}
+		case <-time.After(5 * time.Second): //无反应时每x秒检测一次
+			if tmpIndex > 0 {
+				sp <- true
+				go func(dataArr [][]map[string]interface{}) {
+					defer func() {
+						<-sp
+					}()
+					data_mgo.UpSertBulk(extract, dataArr...)
+				}(tmpArr[:tmpIndex])
+				tmpArr = make([][]map[string]interface{}, update.saveSize)
+				tmpIndex = 0
+			}
+		}
+	}
+}
+
+// 监听新增
+func (info *addGroupInfo) addGroupData() {
+	tmpArr := make([]map[string]interface{}, info.saveSize)
+	tmpIndex := 0
+	for {
+		select {
+		case value := <-info.pool:
+			tmpArr[tmpIndex] = value
+			tmpIndex++
+			if tmpIndex == info.saveSize {
+				sp <- true
+				go func(dataArr []map[string]interface{}) {
+					defer func() {
+						<-sp
+					}()
+					data_mgo.SaveBulk("zktes_full_repeat", dataArr...)
+				}(tmpArr)
+				tmpArr = make([]map[string]interface{}, info.saveSize)
+				tmpIndex = 0
+			}
+		case <-time.After(7 * time.Second): //无反应时每x秒检测一次
+			if tmpIndex > 0 {
+				sp <- true
+				go func(dataArr []map[string]interface{}) {
+					defer func() {
+						<-sp
+					}()
+					data_mgo.SaveBulk("zktes_full_repeat", dataArr...)
+				}(tmpArr[:tmpIndex])
+				tmpArr = make([]map[string]interface{}, info.saveSize)
+				tmpIndex = 0
+			}
+		}
+	}
+}