浏览代码

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

fengweiqiang 5 年之前
父节点
当前提交
9e6ec15004
共有 5 个文件被更改,包括 890 次插入和 554 次删除
  1. 3 3
      udpfilterdup/src/config.json
  2. 260 288
      udpfilterdup/src/datamap.go
  3. 262 263
      udpfilterdup/src/main.go
  4. 365 0
      udpprojectset/src/heavy_test.go
  5. 二进制
      udpprojectset/src/zheng.xlsx

+ 3 - 3
udpfilterdup/src/config.json

@@ -2,10 +2,10 @@
     "udpport": ":1485",
     "dupdays": 5,
     "mongodb": {
-        "addr": "192.168.3.207:27082",
+        "addr": "192.168.3.207:27092",
         "pool": 15,
-        "db": "extract_kf",
-        "extract": "a_testbidding_new",
+        "db": "zhaolongyue",
+        "extract": "kedaxunfei_zhengfa_gnq",
         "extract_copy": "a_testbidding",
         "bidding": "bidding_126"
     },

文件差异内容过多而无法显示
+ 260 - 288
udpfilterdup/src/datamap.go


+ 262 - 263
udpfilterdup/src/main.go

@@ -18,13 +18,10 @@ import (
 	"time"
 )
 
-
-
-
 var (
-	Sysconfig    map[string]interface{} //配置文件
-	mconf        map[string]interface{} //mongodb配置信息
-	mgo          *mongodb.MongodbSim    //mongodb操作对象
+	Sysconfig map[string]interface{} //配置文件
+	mconf     map[string]interface{} //mongodb配置信息
+	mgo       *mongodb.MongodbSim    //mongodb操作对象
 	//siteMgo      *mongodb.MongodbSim
 	extract      string
 	extract_copy string
@@ -33,17 +30,17 @@ var (
 	nextNode     []map[string]interface{} //下节点数组
 	dupdays      = 5                      //初始化判重范围
 	DM           *datamap                 //
-	HM           *historymap                 //判重数据
+	HM           *historymap              //判重数据
 	lastid       = ""
 	/*
-	5da3f2c5a5cb26b9b79847fc
+		5da3f2c5a5cb26b9b79847fc
 	*/
 	//正则筛选相关
-	FilterRegTitle = regexp.MustCompile("^_$")
+	FilterRegTitle   = regexp.MustCompile("^_$")
 	FilterRegTitle_1 = regexp.MustCompile("^_$")
 	FilterRegTitle_2 = regexp.MustCompile("^_$")
 
-	isMerger bool //是否合并
+	isMerger bool                              //是否合并
 	SiteMap  map[string]map[string]interface{} //站点map
 )
 
@@ -63,7 +60,6 @@ func init() {
 	extract_copy = mconf["extract_copy"].(string)
 	mgo.InitPool()
 
-
 	//测试可以临时注释
 	dupdays = util.IntAllDef(Sysconfig["dupdays"], 3)
 	//加载数据
@@ -73,37 +69,36 @@ func init() {
 	FilterRegTitle_2 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_2"]))
 	isMerger = Sysconfig["isMerger"].(bool)
 
-
 	//配置站点Map
-	SiteMap = make(map[string]map[string]interface{},0)
+	SiteMap = make(map[string]map[string]interface{}, 0)
 	start := int(time.Now().Unix())
 	//站点配置
 	sess_site := mgo.GetMgoConn()
 	defer sess_site.Close()
 	res_site := sess_site.DB("zhaolongyue").C("site").Find(nil).Sort("_id").Iter()
 	for site_dict := make(map[string]interface{}); res_site.Next(&site_dict); {
-			data_map := map[string]interface{}{
-				"area":util.ObjToString(site_dict["area"]),
-				"city":util.ObjToString(site_dict["city"]),
-				"district":util.ObjToString(site_dict["district"]),
-				"sitetype":util.ObjToString(site_dict["sitetype"]),
-				"level":util.ObjToString(site_dict["level"]),
-			}
-		SiteMap[util.ObjToString(site_dict["site"])]= data_map
+		data_map := map[string]interface{}{
+			"area":     util.ObjToString(site_dict["area"]),
+			"city":     util.ObjToString(site_dict["city"]),
+			"district": util.ObjToString(site_dict["district"]),
+			"sitetype": util.ObjToString(site_dict["sitetype"]),
+			"level":    util.ObjToString(site_dict["level"]),
+		}
+		SiteMap[util.ObjToString(site_dict["site"])] = data_map
 	}
-	fmt.Printf("用时:%d秒,%d个",int(time.Now().Unix())-start,len(SiteMap))
-
+	fmt.Printf("用时:%d秒,%d个", int(time.Now().Unix())-start, len(SiteMap))
 
 }
 
-
 func main() {
+
 	go checkMapJob()
 
 	updport := Sysconfig["udpport"].(string)
 	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}
 	udpclient.Listen(processUdpMsg)
 	log.Println("Udp服务监听", updport)
+
 	time.Sleep(99999 * time.Hour)
 }
 
@@ -120,14 +115,14 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 			udpclient.WriteUdp([]byte("err:"+err.Error()), mu.OP_NOOP, ra)
 		} else if mapInfo != nil {
 
-			taskType:= util.ObjToString(mapInfo["stype"])
+			taskType := util.ObjToString(mapInfo["stype"])
 			if taskType == "historyTask" {
 				//更新流程
-				go historyTask(data,mapInfo)
-			}else if taskType == "normalTask" {
+				go historyTask(data, mapInfo)
+			} else if taskType == "normalTask" {
 				//判重流程
 				go task(data, mapInfo)
-			}else {
+			} else {
 				//其他
 				go task(data, mapInfo)
 			}
@@ -161,17 +156,16 @@ func task(data []byte, mapInfo map[string]interface{}) {
 			"$lte": util.StringTOBsonId(mapInfo["lteid"].(string)),
 		},
 	}
+
 	it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
 	updateExtract := [][]map[string]interface{}{}
 	pool := make(chan bool, 16)
 	wg := &sync.WaitGroup{}
 	mapLock := &sync.Mutex{}
 	n, repeateN := 0, 0
-
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
-
 		if n%10000 == 0 {
-			log.Println("current:", n, tmp["_id"],"repeateN:",repeateN)
+			log.Println("current:", n, tmp["_id"], "repeateN:", repeateN)
 		}
 		pool <- true
 		wg.Add(1)
@@ -183,7 +177,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 			info := NewInfo(tmp)
 
 			//是否为无效数据
-			if invalidData(info.buyer,info.projectname,info.projectcode) {
+			if invalidData(info.buyer, info.projectname, info.projectcode) {
 				mapLock.Lock()
 				updateExtract = append(updateExtract, []map[string]interface{}{
 					map[string]interface{}{
@@ -191,7 +185,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 					},
 					map[string]interface{}{
 						"$set": map[string]interface{}{
-							"repeat":-1,
+							"repeat": -1,
 						},
 					},
 				})
@@ -201,96 +195,88 @@ func task(data []byte, mapInfo map[string]interface{}) {
 					updateExtract = [][]map[string]interface{}{}
 				}
 				mapLock.Unlock()
-			}else  {
+			} else {
 				//判重原因 reason  tmp["_id"] 对比id   id原始id
-				mapLock.Lock()
-				b, source,reason := DM.check(info)
+				//mapLock.Lock()
+				b, source, reason := DM.check(info)
 				if b { //有重复,生成更新语句,更新抽取和更新招标
 					repeateN++
-					var mergeArr  = []int64{} 	//更改合并数组记录
-					var newData  = &Info{}		//更换新的数据池数据
-					var id_map  = map[string]interface{}{}
+					var mergeArr = []int64{} //更改合并数组记录
+					var newData = &Info{}    //更换新的数据池数据
+					var id_map = map[string]interface{}{}
 					repeat_id := source.id
-					id_map["_id"]= util.StringTOBsonId(info.id)
-
-					if isMerger{
+					id_map["_id"] = util.StringTOBsonId(info.id)
+					if isMerger {
 						//需要合并相关操作
 						//合并操作--评功权重打分-合并完替换原始数据池
-						basic_bool := basicDataScore(source,info)
+						basic_bool := basicDataScore(source, info)
 						if basic_bool {
 							//已原始数据为标准-对比数据打判重标签
-							newData,mergeArr= mergeDataFields(source,info)
-							DM.replaceSourceData(newData,source.id) //替换
-							id_map["_id"]= util.StringTOBsonId(source.id)
+							newData, mergeArr = mergeDataFields(source, info)
+							DM.replaceSourceData(newData, source.id) //替换
+							id_map["_id"] = util.StringTOBsonId(source.id)
 							repeat_id = source.id
-						}else {
+						} else {
 							//已对比数据为标准 ,数据池的数据打判重标签
-							newData,mergeArr= mergeDataFields(info,source)
-							DM.replaceSourceData(newData,source.id)//替换
-							id_map["_id"]= util.StringTOBsonId(info.id)
+							newData, mergeArr = mergeDataFields(info, source)
+							DM.replaceSourceData(newData, source.id) //替换
+							id_map["_id"] = util.StringTOBsonId(info.id)
 							repeat_id = info.id
 						}
 					}
 
-
-
-					var update_map  = map[string]interface{}{
+					var update_map = map[string]interface{}{
 						"$set": map[string]interface{}{
-							"repeat_reason":reason,
-							"repeat":1,
-							"repeatid":repeat_id,
+							"repeat_reason": reason,
+							"repeat":        1,
+							"repeatid":      repeat_id,
 						},
 					}
 
 					if isMerger {
 						//合并记录
-						if len(newData.mergemap)>0 {
+						if len(newData.mergemap) > 0 {
 							update_map["$set"].(map[string]interface{})["merge"] = newData.mergemap
 							//fmt.Println("合并长度:",len(newData.mergemap))
 						}
 
 						//更新合并后的数据
-						for _,value :=range mergeArr {
-							if value==1 {
+						for _, value := range mergeArr {
+							if value == 1 {
 								update_map["$set"].(map[string]interface{})["area"] = newData.area
 								update_map["$set"].(map[string]interface{})["city"] = newData.city
-							}else if value==2 {
+							} else if value == 2 {
 								update_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
-							}else if value==3 {
+							} else if value == 3 {
 								update_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
-							}else if value==4 {
+							} else if value == 4 {
 								update_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
-							}else if value==5 {
+							} else if value == 5 {
 								update_map["$set"].(map[string]interface{})["budget"] = newData.budget
-							}else if value==6 {
+							} else if value == 6 {
 								update_map["$set"].(map[string]interface{})["winner"] = newData.winner
-							}else if value==7 {
+							} else if value == 7 {
 								update_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
-							}else if value==8 {
+							} else if value == 8 {
 								update_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-							}else {
+							} else {
 
 							}
 						}
 
 					}
-
 					//构建数据库更新用到的
 					updateExtract = append(updateExtract, []map[string]interface{}{
 						id_map,
 						update_map,
 					})
-					if len(updateExtract) > 500 {
-						mgo.UpdateBulk(extract, updateExtract...)
-						updateExtract = [][]map[string]interface{}{}
-					}
-					mapLock.Unlock()
-
-				} else {
-					mapLock.Unlock()
 				}
 			}
 		}(tmp)
+		if len(updateExtract) > 500 {
+			mgo.UpdateBulk(extract, updateExtract...)
+			updateExtract = [][]map[string]interface{}{}
+		}
 		tmp = make(map[string]interface{})
 	}
 	wg.Wait()
@@ -338,26 +324,26 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	}
 
 	it := sess.DB(mgo.DbName).C(extract).Find(&q).Iter()
-	minTime,maxTime:=int64(0),int64(0)
-	for tmp := make(map[string]interface{}); it.Next(&tmp);{
+	minTime, maxTime := int64(0), int64(0)
+	for tmp := make(map[string]interface{}); it.Next(&tmp); {
 		//取出最大最小时间
-		if minTime==0||maxTime ==0 {
+		if minTime == 0 || maxTime == 0 {
 			minTime = util.Int64All(tmp["comeintime"])
 			maxTime = util.Int64All(tmp["comeintime"])
-		}else {
+		} else {
 			t := util.Int64All(tmp["comeintime"])
-			if t<minTime&&t!=0 {
+			if t < minTime && t != 0 {
 				minTime = t
 			}
-			if t>maxTime&&t!=0 {
+			if t > maxTime && t != 0 {
 				maxTime = t
 			}
 		}
 	}
-	fmt.Println("最小时间==",minTime,"最大时间==",maxTime)
+	fmt.Println("最小时间==", minTime, "最大时间==", maxTime)
 	//最小时间== 1568087634 最大时间== 1568103381
 	HM = NewHistorymap(util.ObjToString(mapInfo["gtid"]),
-		util.ObjToString(mapInfo["lteid"]),minTime,maxTime)
+		util.ObjToString(mapInfo["lteid"]), minTime, maxTime)
 
 	//return
 	//开始判重...
@@ -380,7 +366,7 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	for tmp := make(map[string]interface{}); it_task.Next(&tmp); n++ {
 
 		if n%10000 == 0 {
-			log.Println("current:", n, tmp["_id"],"repeateN:",repeateN)
+			log.Println("current:", n, tmp["_id"], "repeateN:", repeateN)
 		}
 		pool <- true
 		wg.Add(1)
@@ -391,7 +377,7 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 			}()
 			info := NewInfo(tmp)
 			//是否为无效数据
-			if invalidData(info.buyer,info.projectname,info.projectcode) {
+			if invalidData(info.buyer, info.projectname, info.projectcode) {
 				mapLock.Lock()
 				updateExtract = append(updateExtract, []map[string]interface{}{
 					map[string]interface{}{
@@ -399,7 +385,7 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 					},
 					map[string]interface{}{
 						"$set": map[string]interface{}{
-							"repeat":   -1,
+							"repeat": -1,
 						},
 					},
 				})
@@ -408,9 +394,8 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 					updateExtract = [][]map[string]interface{}{}
 				}
 				mapLock.Unlock()
-			}else  {
-				mapLock.Lock()
-				b, source,reason := HM.checkHistory(info)
+			} else {
+				b, source, reason := HM.checkHistory(info)
 				if b { //有重复,生成更新语句,更新抽取和更新招标
 					if reason == "未判重记录" {
 						fmt.Println("未判重记录")
@@ -427,97 +412,86 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 								},
 							},
 						})
-						if len(updateExtract) > 500 {
-							mgo.UpdateBulk(extract, updateExtract...)
-							updateExtract = [][]map[string]interface{}{}
-						}
-						mapLock.Unlock()
-					}else {
+					} else {
 						repeateN++
-						var mergeArr  = []int64{} 	//更改合并数组记录
-						var newData  = &Info{}		//更换新的数据池数据
-						var id_map  = map[string]interface{}{}
+						var mergeArr = []int64{} //更改合并数组记录
+						var newData = &Info{}    //更换新的数据池数据
+						var id_map = map[string]interface{}{}
 						repeat_id := source.id
-						id_map["_id"]= util.StringTOBsonId(info.id)
+						id_map["_id"] = util.StringTOBsonId(info.id)
 
-						if isMerger{
+						if isMerger {
 							//需要合并相关操作
 							//合并操作--评功权重打分-合并完替换原始数据池
-							basic_bool := basicDataScore(source,info)
+							basic_bool := basicDataScore(source, info)
 							if basic_bool {
 								//已原始数据为标准-对比数据打判重标签
-								newData,mergeArr= mergeDataFields(source,info)
-								DM.replaceSourceData(newData,source.id) //替换
-								id_map["_id"]= util.StringTOBsonId(source.id)
+								newData, mergeArr = mergeDataFields(source, info)
+								DM.replaceSourceData(newData, source.id) //替换
+								id_map["_id"] = util.StringTOBsonId(source.id)
 								repeat_id = source.id
-							}else {
+							} else {
 								//已对比数据为标准 ,数据池的数据打判重标签
-								newData,mergeArr= mergeDataFields(info,source)
-								DM.replaceSourceData(newData,source.id)//替换
-								id_map["_id"]= util.StringTOBsonId(info.id)
+								newData, mergeArr = mergeDataFields(info, source)
+								DM.replaceSourceData(newData, source.id) //替换
+								id_map["_id"] = util.StringTOBsonId(info.id)
 								repeat_id = info.id
 							}
 						}
 
-
-
-						var update_map  = map[string]interface{}{
+						var update_map = map[string]interface{}{
 							"$set": map[string]interface{}{
-								"repeat_reason":reason,
-								"repeat":1,
-								"repeatid":repeat_id,
+								"repeat_reason": reason,
+								"repeat":        1,
+								"repeatid":      repeat_id,
 							},
 						}
 
 						if isMerger {
 							//合并记录
-							if len(newData.mergemap)>0 {
+							if len(newData.mergemap) > 0 {
 								update_map["$set"].(map[string]interface{})["merge"] = newData.mergemap
 								//fmt.Println("合并长度:",len(newData.mergemap))
 							}
 
 							//更新合并后的数据
-							for _,value :=range mergeArr {
-								if value==1 {
+							for _, value := range mergeArr {
+								if value == 1 {
 									update_map["$set"].(map[string]interface{})["area"] = newData.area
 									update_map["$set"].(map[string]interface{})["city"] = newData.city
-								}else if value==2 {
+								} else if value == 2 {
 									update_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
-								}else if value==3 {
+								} else if value == 3 {
 									update_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
-								}else if value==4 {
+								} else if value == 4 {
 									update_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
-								}else if value==5 {
+								} else if value == 5 {
 									update_map["$set"].(map[string]interface{})["budget"] = newData.budget
-								}else if value==6 {
+								} else if value == 6 {
 									update_map["$set"].(map[string]interface{})["winner"] = newData.winner
-								}else if value==7 {
+								} else if value == 7 {
 									update_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
-								}else if value==8 {
+								} else if value == 8 {
 									update_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-								}else {
+								} else {
 
 								}
 							}
 
 						}
-
 						//构建数据库更新用到的
 						updateExtract = append(updateExtract, []map[string]interface{}{
 							id_map,
 							update_map,
 						})
-						if len(updateExtract) > 500 {
-							mgo.UpdateBulk(extract, updateExtract...)
-							updateExtract = [][]map[string]interface{}{}
-						}
-						mapLock.Unlock()
 					}
-				}else {
-					mapLock.Unlock()
 				}
 			}
 		}(tmp)
+		if len(updateExtract) > 500 {
+			mgo.UpdateBulk(extract, updateExtract...)
+			updateExtract = [][]map[string]interface{}{}
+		}
 		tmp = make(map[string]interface{})
 	}
 	wg.Wait()
@@ -527,10 +501,8 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	}
 	log.Println("this task over.", n, "repeateN:", repeateN, mapInfo["stop"])
 
-
-
 	//任务完成,开始发送广播通知下面节点
-	if n > repeateN &&mapInfo["stop"] == nil {
+	if n > repeateN && mapInfo["stop"] == nil {
 		for _, to := range nextNode {
 			sid, _ := mapInfo["gtid"].(string)
 			eid, _ := mapInfo["lteid"].(string)
@@ -552,293 +524,320 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	}
 }
 
-
-
-
-
-
-
-
-
 //合并字段
-func mergeDataFields(source *Info, info *Info) (*Info,[]int64){
+func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 	var mergeArr []int64
-	mergeArr = make([]int64,0)
+	mergeArr = make([]int64, 0)
 	//1、城市
-	if (source.area==""||source.area=="全国")&&info.area!="全国"&&info.area!=""{
+	if (source.area == "" || source.area == "全国") && info.area != "全国" && info.area != "" {
 		var arrA []string
-		if source.mergemap["area"]==nil {
+		if source.mergemap["area"] == nil {
 			arrA = make([]string, 0)
-		}else {
+		} else {
 			arrA = source.mergemap["area"].([]string)
 		}
-		arrA = append(arrA,source.area)
+		arrA = append(arrA, source.area)
 		source.mergemap["area"] = arrA
 
 		var arrC []string
-		if source.mergemap["city"]==nil {
+		if source.mergemap["city"] == nil {
 			arrC = make([]string, 0)
-		}else {
+		} else {
 			arrC = source.mergemap["city"].([]string)
 		}
-		arrC = append(arrC,source.city)
+		arrC = append(arrC, source.city)
 		source.mergemap["city"] = arrC
 
 		source.area = info.area
 		source.city = info.city
-		mergeArr = append(mergeArr,1)
+		mergeArr = append(mergeArr, 1)
 
 		//fmt.Println("合并-城市")
 	}
 	//2、项目名称
-	if source.projectname==""&&info.projectname!=""{
+	if source.projectname == "" && info.projectname != "" {
 		var arr []string
-		if source.mergemap["projectname"]==nil {
+		if source.mergemap["projectname"] == nil {
 			arr = make([]string, 0)
-		}else {
+		} else {
 			arr = source.mergemap["projectname"].([]string)
 		}
-		arr = append(arr,source.projectname)
+		arr = append(arr, source.projectname)
 		source.mergemap["projectname"] = arr
 
 		source.projectname = info.projectname
-		mergeArr = append(mergeArr,2)
+		mergeArr = append(mergeArr, 2)
 		//fmt.Println("合并-项目名称")
 	}
 	//3、项目编号
-	if source.projectcode==""&&info.projectcode!=""{
+	if source.projectcode == "" && info.projectcode != "" {
 		var arr []string
-		if source.mergemap["projectcode"]==nil {
+		if source.mergemap["projectcode"] == nil {
 			arr = make([]string, 0)
-		}else {
+		} else {
 			arr = source.mergemap["projectcode"].([]string)
 		}
-		arr = append(arr,source.projectcode)
+		arr = append(arr, source.projectcode)
 		source.mergemap["projectcode"] = arr
 
 		source.projectcode = info.projectcode
-		mergeArr = append(mergeArr,3)
+		mergeArr = append(mergeArr, 3)
 		//fmt.Println("合并-项目标号")
 	}
 	//4、采购单位
-	if source.buyer==""&&info.buyer!=""{
+	if source.buyer == "" && info.buyer != "" {
 		var arr []string
-		if source.mergemap["buyer"]==nil {
+		if source.mergemap["buyer"] == nil {
 			arr = make([]string, 0)
-		}else {
+		} else {
 			arr = source.mergemap["buyer"].([]string)
 		}
-		arr = append(arr,source.buyer)
+		arr = append(arr, source.buyer)
 		source.mergemap["buyer"] = arr
 
 		source.buyer = info.buyer
-		mergeArr = append(mergeArr,4)
+		mergeArr = append(mergeArr, 4)
 		//fmt.Println("合并-采购单位")
 	}
 	//5、预算
-	if source.budget==0&&info.budget!=0{
+	if source.budget == 0 && info.budget != 0 {
 		var arr []float64
-		if source.mergemap["budget"]==nil {
+		if source.mergemap["budget"] == nil {
 			arr = make([]float64, 0)
-		}else {
+		} else {
 			arr = source.mergemap["budget"].([]float64)
 		}
-		arr = append(arr,source.budget)
+		arr = append(arr, source.budget)
 		source.mergemap["budget"] = arr
 
 		source.budget = info.budget
-		mergeArr = append(mergeArr,5)
+		mergeArr = append(mergeArr, 5)
 		//fmt.Println("合并-预算")
 	}
 	//6、中标单位
-	if source.winner==""&&info.winner!=""{
+	if source.winner == "" && info.winner != "" {
 		var arr []string
-		if source.mergemap["winner"]==nil {
+		if source.mergemap["winner"] == nil {
 			arr = make([]string, 0)
-		}else {
+		} else {
 			arr = source.mergemap["winner"].([]string)
 		}
-		arr = append(arr,source.winner)
+		arr = append(arr, source.winner)
 		source.mergemap["winner"] = arr
 
 		source.winner = info.winner
-		mergeArr = append(mergeArr,6)
+		mergeArr = append(mergeArr, 6)
 		//fmt.Println("合并-中标单位")
 	}
 	//7、中标金额
-	if source.bidamount==0&&info.bidamount!=0{
+	if source.bidamount == 0 && info.bidamount != 0 {
 		var arr []float64
-		if source.mergemap["bidamount"]==nil {
+		if source.mergemap["bidamount"] == nil {
 			arr = make([]float64, 0)
-		}else {
+		} else {
 			arr = source.mergemap["bidamount"].([]float64)
 		}
-		arr = append(arr,source.bidamount)
+		arr = append(arr, source.bidamount)
 		source.mergemap["bidamount"] = arr
 
 		source.bidamount = info.bidamount
-		mergeArr = append(mergeArr,7)
+		mergeArr = append(mergeArr, 7)
 		//fmt.Println("合并-中标金额")
 	}
 	//8、开天时间-地点
-	if source.bidopentime==0&&info.bidopentime!=0{
+	if source.bidopentime == 0 && info.bidopentime != 0 {
 		var arr []int64
-		if source.mergemap["bidopentime"]==nil {
+		if source.mergemap["bidopentime"] == nil {
 			arr = make([]int64, 0)
-		}else {
+		} else {
 			arr = source.mergemap["bidopentime"].([]int64)
 		}
-		arr = append(arr,source.bidopentime)
+		arr = append(arr, source.bidopentime)
 		source.mergemap["bidopentime"] = arr
 
 		source.bidopentime = info.bidopentime
-		mergeArr = append(mergeArr,8)
+		mergeArr = append(mergeArr, 8)
 		//fmt.Println("合并-开标时间")
 	}
 
 	//以上合并过于简单,待进一步优化
-	return source,mergeArr
+	return source, mergeArr
 }
 
-
 // basicDataScore reports whether v should be kept as the merge base over info: sites are compared by level, then by site type, then by the number of populated fields; a full tie goes to v when v.comeintime >= info.comeintime.
-func basicDataScore(v *Info, info *Info) bool  {
+func basicDataScore(v *Info, info *Info) bool {
 
 	// Weight evaluation.
 	/*
-	网站优先级判定规则:
-    1、中央>省>市>县区
-    2、政府采购>公共资源>采购单位官网>招标代理公司/平台
+			Site priority rules:
+		    1. central > provincial > municipal > county/district
+		    2. government procurement > public resources > buyer's official site > bidding agency/platform
 	*/
 
-
-	v_score,info_score :=-1,-1
+	v_score, info_score := -1, -1
 	dict_v := SiteMap[v.site]
 	dict_info := SiteMap[info.site]
 	// Compare the site level first; a site missing from SiteMap keeps the score -1.
-	if dict_v !=nil {
+	if dict_v != nil {
 		v_level := util.ObjToString(dict_v["level"])
-		if v_level =="中央" {
+		if v_level == "中央" {
 			v_score = 4
-		}else if v_level =="省级" {
+		} else if v_level == "省级" {
 			v_score = 3
-		}else if v_level =="市级" {
+		} else if v_level == "市级" {
 			v_score = 2
-		}else if v_level =="县区" {
+		} else if v_level == "县区" {
 			v_score = 1
-		}else if v_level =="" {
-		}else {
+		} else if v_level == "" {
+		} else {
 			v_score = 0
 		}
 	}
 
-	if dict_info !=nil {
+	if dict_info != nil {
 		info_level := util.ObjToString(dict_info["level"])
-		if info_level =="中央" {
+		if info_level == "中央" {
 			info_score = 4
-		}else if info_level =="省级" {
+		} else if info_level == "省级" {
 			info_score = 3
-		}else if info_level =="市级" {
+		} else if info_level == "市级" {
 			info_score = 2
-		}else if info_level =="县区" {
+		} else if info_level == "县区" {
 			info_score = 1
-		}else if info_level == ""{
+		} else if info_level == "" {
 
-		}else {
+		} else {
-			v_score = 0
+			info_score = 0 // an unrecognised level scores info itself; this branch used to reset v_score
 		}
 	}
 
-	if v_score>info_score{
+	if v_score > info_score {
 		return true
 	}
-	if v_score<info_score{
+	if v_score < info_score {
 		return false
 	}
 
 	// Levels tie: compare the site type next.
-	if dict_v !=nil {
+	if dict_v != nil {
 		v_sitetype := util.ObjToString(dict_v["sitetype"])
-		if v_sitetype =="政府采购"||v_sitetype=="政府门户" {
+		if v_sitetype == "政府采购" || v_sitetype == "政府门户" {
 			v_score = 4
-		}else if v_sitetype =="公共资源" {
+		} else if v_sitetype == "公共资源" {
 			v_score = 3
-		}else if v_sitetype =="官方网站" {
+		} else if v_sitetype == "官方网站" {
 			v_score = 2
-		}else if v_sitetype =="社会公共招标平台"||v_sitetype =="企业招标平台" {
+		} else if v_sitetype == "社会公共招标平台" || v_sitetype == "企业招标平台" {
 			v_score = 1
-		}else if v_sitetype =="" {
-		}else {
+		} else if v_sitetype == "" {
+		} else {
 			v_score = 0
 		}
 	}
 
-	if dict_info !=nil {
+	if dict_info != nil {
 		info_sitetype := util.ObjToString(dict_info["sitetype"])
-		if info_sitetype =="政府采购"||info_sitetype=="政府门户" {
+		if info_sitetype == "政府采购" || info_sitetype == "政府门户" {
 			info_score = 4
-		}else if info_sitetype =="公共资源" {
+		} else if info_sitetype == "公共资源" {
 			info_score = 3
-		}else if info_sitetype =="官方网站" {
+		} else if info_sitetype == "官方网站" {
 			info_score = 2
-		}else if info_sitetype =="社会公共招标平台"||info_sitetype =="企业招标平台" {
+		} else if info_sitetype == "社会公共招标平台" || info_sitetype == "企业招标平台" {
 			info_score = 1
-		}else if info_sitetype =="" {
-		}else {
+		} else if info_sitetype == "" {
+		} else {
 			info_score = 0
 		}
 	}
 
-	if v_score>info_score{
+	if v_score > info_score {
 		return true
 	}
-	if v_score<info_score{
+	if v_score < info_score {
 		return false
 	}
 
+	// Site scores tie: count populated fields, m for v and n for info (agency and city weigh 2).
+	m, n := 0, 0
+	if v.projectname != "" {
+		m++
+	}
+	if v.buyer != "" {
+		m++
+	}
+	if v.projectcode != "" {
+		m++
+	}
+	if v.budget != 0 {
+		m++
+	}
+	if v.bidamount != 0 {
+		m++
+	}
+	if v.winner != "" {
+		m++
+	}
+	if v.bidopentime != 0 {
+		m++
+	}
+	if v.agencyaddr != "" {
+		m++
+	}
+	if v.agency != "" {
+		m = m + 2
+	}
+	if v.city != "" {
+		m = m + 2
+	}
 
+	if info.projectname != "" {
+		n++
+	}
+	if info.buyer != "" {
+		n++
+	}
+	if info.projectcode != "" {
+		n++
+	}
+	if info.budget != 0 {
+		n++
+	}
+	if info.bidamount != 0 {
+		n++
+	}
+	if info.winner != "" {
+		n++
+	}
+	if info.bidopentime != 0 {
+		n++
+	}
+	if info.agencyaddr != "" {
+		n++
+	}
+	if info.agency != "" {
+		n = n + 2 // was n = m + 2, which replaced info's count with v's
+	}
+	if info.city != "" {
+		n = n + 2 // was n = m + 2, which replaced info's count with v's
+	}
 
-	//网站评估
-	m,n:=0,0
-	if v.projectname!="" {m++}
-	if v.buyer!="" {m++}
-	if v.projectcode!="" {m++}
-	if v.budget!=0 {m++}
-	if v.bidamount!=0 {m++}
-	if v.winner!="" {m++}
-	if v.bidopentime!=0 {m++}
-	if v.agencyaddr!="" {m++}
-	if v.agency!="" {m=m+2}
-	if v.city!="" {m=m+2}
-
-	if info.projectname!="" {n++}
-	if info.buyer!="" {n++}
-	if info.projectcode!="" {n++}
-	if info.budget!=0 {n++}
-	if info.bidamount!=0 {n++}
-	if info.winner!="" {n++}
-	if info.bidopentime!=0 {n++}
-	if info.agencyaddr!="" {n++}
-	if info.agency!="" {n=m+2}
-	if info.city!="" {n=m+2}
-
-	if m>n {
+	if m > n {
 		return true
-	}else if m==n {
-		if v.comeintime>=info.comeintime {
+	} else if m == n {
+		if v.comeintime >= info.comeintime {
 			return true
-		}else {
+		} else {
 			return false
 		}
-	}else {
+	} else {
 		return false
 	}
 }
 
-
 //无效数据
-func invalidData(d1 string,d2 string,d3 string)  bool{
+func invalidData(d1 string, d2 string, d3 string) bool {
 	var n int
 	if d1 != "" {
 		n++
@@ -849,8 +848,8 @@ func invalidData(d1 string,d2 string,d3 string)  bool{
 	if d3 != "" {
 		n++
 	}
-	if n==0 {
+	if n == 0 {
 		return true
 	}
 	return false
-}
+}

+ 365 - 0
udpprojectset/src/heavy_test.go

@@ -0,0 +1,365 @@
+package main
+
+import (
+	"fmt"
+	"github.com/tealeg/xlsx"
+	"log"
+	"qfw/util"
+	"qfw/util/mongodb"
+	"sync"
+	"testing"
+	"time"
+)
+
+var (
+	mgo          *mongodb.MongodbSim    //mongodb操作对象
+)
+
+// Test_heavy is a scratch test: every statement in its body is commented out, so running it currently does nothing.
+func Test_heavy(t *testing.T) {
+	// Disabled experiment 1: run task over a fixed gtid/lteid range.
+	//mapinfo := map[string]interface{}{
+	//	"gtid":  "586b6d7061a0721f15b8f264",
+	//	"lteid": "5e0b2b780cf41612e0639460",
+	//}
+	//task([]byte{}, mapinfo)
+
+	// Disabled experiment 2: copy a comeintime window of qfw.bidding into extract_kf.a_testbidding.
+	//log.Println("1")
+	// copy the data in code
+	//sessTest :=mgoTest.GetMgoConn()
+	//defer sessTest.Close()
+	//
+	//sess := mgo.GetMgoConn()
+	//defer sess.Close()
+	//
+	////var arr []map[string]interface{}
+	//
+	//res_test := sessTest.DB("qfw").C("bidding").Find(mongodb.ObjToMQ(`{"comeintime":{"$gte": 1571025600, "$lte": 1571976000}}`, true)).Iter()
+	//res :=sess.DB("extract_kf").C("a_testbidding")
+	//5
+	//
+	//
+	//
+	//
+	//i:=0
+	//for dict := make(map[string]interface{}); res_test.Next(&dict); i++{
+	//
+	//	// insert
+	//	if i%2000==0 {
+	//		log.Println("当前:",i)
+	//	}
+	//	res.Insert(dict)
+	//	//if len(arr)>=500 {
+	//	//	arr = make([]map[string]interface{},0)
+	//	//}else {
+	//	//	arr = append(arr,dict)
+	//	//}
+	//}
+	//
+	// Disabled experiment 3: compare the "repeat" value of every _id between a_testbidding and a_testbidding_new and log the counts per combination.
+	//extract,extract_copy:="a_testbidding_new","a_testbidding"
+	//
+	//sess := mgo.GetMgoConn()
+	//defer mgo.DestoryMongoConn(sess)
+	//res_copy := sess.DB("extract_kf").C(extract_copy).Find(nil).Iter()
+	//
+	//m1 :=map[string]int{} // old version
+	//m2 :=map[string]int{} // new version
+	//
+	//i:=0
+	//j:=0
+	//for v1 := make(map[string]interface{}); res_copy.Next(&v1); i++{
+	//	if i%2000==0 {
+	//		log.Println("当前i:",i)
+	//	}
+	//	m1[(v1["_id"].(bson.ObjectId).Hex())]= util.IntAll(v1["repeat"])
+	//}
+	//
+	//sesss := mgo.GetMgoConn()
+	//defer mgo.DestoryMongoConn(sesss)
+	//res := sesss.DB("extract_kf").C(extract).Find(nil).Iter()
+	//
+	//
+	//for v2 := make(map[string]interface{}); res.Next(&v2); j++{
+	//	if j%2000==0 {
+	//		log.Println("当前j:",j)
+	//	}
+	//	m2[(v2["_id"].(bson.ObjectId).Hex())]= util.IntAll(v2["repeat"])
+	//}
+	//
+	//fmt.Println(len(m1),len(m2))
+	//n1:=0
+	//n2:=0
+	//n3:=0
+	//n4:=0
+	//n5:=0
+	//n6:=0
+	//
+	//var arr1 []string
+	//var arr2 []string
+	//for k,v:=range m1{
+	//
+	//	if m2[k]==1&&v==0{//0:1
+	//		n1++
+	//		arr2 = append(arr2,fmt.Sprintf("目标_id:%s",k))
+	//	}
+	//	if m2[k]==0&&v==1{ //1:0
+	//		n2++
+	//		arr1 = append(arr1,fmt.Sprintf("目标_id:%s",k))
+	//	}
+	//	if m2[k]==0&&v==0{ //0:0
+	//		n3++
+	//	}
+	//	if m2[k]==1&&v==1{//1:1
+	//		n4++
+	//	}
+	//	if m2[k]==-1&&v==0{ //0:-1
+	//		n5++
+	//	}
+	//	if m2[k]==-1&&v==1{//1:-1
+	//		n6++
+	//	}
+	//
+	//}
+	//// print the 1:0 cases (every 200th entry)
+	//mm:=0
+	//for _,v:=range arr1 {
+	//	mm++
+	//	if mm%200==0 {
+	//		log.Println(v)
+	//	}
+	//}
+	//
+	//log.Println("分割线---------------")
+	//log.Println("分割线---------------")
+	//
+	//
+	//// print the 0:1 cases (every 200th entry)
+	//nn:=0
+	//for _,v:=range arr2 {
+	//	nn++
+	//	if nn%200==0 {
+	//		log.Println(v)
+	//	}
+	//}
+	//
+	//log.Println("V1 0:1---",n1)
+	//log.Println("V1 1:0---",n2)
+	//log.Println("V1 0:0---",n3)
+	//log.Println("V1 1:1---",n4)
+	//log.Println("V1 0:-1---",n5)
+	//log.Println("V1 1:-1---",n6)
+}
+
+
+
+
+// Test_field is a one-off export job rather than an assertion-based test: it
+// groups every non-empty jsondata value of a fixed a_testbidding _id range by
+// spidercode, ranks the spider codes by value count and writes the ranking plus
+// per-spider detail sheets to zheng.xlsx. It talks to the MongoDB instance
+// configured below, so it cannot run offline.
+func Test_field(t *testing.T) {
+	mgo = &mongodb.MongodbSim{
+		MongodbAddr: "192.168.3.207:27092",
+		DbName:      "extract_kf",
+		Size:        util.IntAllDef(15, 10),
+	}
+	mgo.InitPool()
+
+	// Debug export:
+	// 1: taking the extracted fields as the baseline, count how many crawler
+	//    fields exist and produce a ranking sheet (top 100).
+	// 2: manual spot checks of data quality, used to weight jsondata.
+
+	// Collect the known field names; "1" marks a field as present.
+	field_map := make(map[string]string, 0)
+	// NOTE(review): this session is closed directly, while the one further
+	// down is returned through mgo.DestoryMongoConn — confirm which release
+	// path the pool expects.
+	sess_field := mgo.GetMgoConn()
+	defer sess_field.Close()
+	res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
+	for dict := make(map[string]interface{}); res_field.Next(&dict); {
+		// A document without a string s_field is skipped; the unchecked
+		// assertion used before would panic on it.
+		if name, ok := dict["s_field"].(string); ok {
+			field_map[name] = "1"
+		}
+	}
+
+	/*	ObjectId("5da3f2c5a5cb26b9b79847fc")
+		ObjectId("5da3fd6da5cb26b9b7a8683c")
+		ObjectId("5da40bdaa5cb26b9b7bea472")
+	*/
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+	q := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt":  util.StringTOBsonId("5da3f2c5a5cb26b9b79847fc"),
+			"$lte": util.StringTOBsonId("5da3fd6da5cb26b9b7a8683c"),
+		},
+	}
+	it := sess.DB(mgo.DbName).C("a_testbidding").Find(&q).Sort("_id").Iter()
+
+	// crawlerMap: spidercode -> (jsondata key -> every non-empty value seen).
+	crawlerMap, n := make(map[string]map[string]interface{}, 0), 0
+
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		// A missing spidercode is nil, which is != "", so the old check let it
+		// through and the following .(string) assertion panicked. Skip instead.
+		code, ok := tmp["spidercode"].(string)
+		if !ok || code == "" {
+			continue
+		}
+		dict := crawlerMap[code]
+		if dict == nil {
+			dict = make(map[string]interface{}, 0)
+		}
+		if jsonData := util.ObjToMap(tmp["jsondata"]); jsonData != nil {
+			for k, v := range *jsonData {
+				val := fmt.Sprint(v)
+				if val == "" {
+					continue // empty values are not counted
+				}
+				vals, _ := dict[k].([]string) // nil on first sight of k
+				dict[k] = append(vals, val)
+			}
+		}
+		crawlerMap[code] = dict
+	}
+
+	log.Println("总计", n, "条数据")
+	log.Println("判重类别个数:", len(crawlerMap))
+
+	// Attach the total value count and the spider code to every group.
+	arr := make([]map[string]interface{}, 0, len(crawlerMap))
+	for k, v := range crawlerMap {
+		total := 0
+		for _, v1 := range v {
+			total += len(v1.([]string))
+		}
+		v["total"] = total
+		v["key"] = k
+		arr = append(arr, v)
+	}
+
+	// Rank the groups by total, largest first. quickSort indexes the slice
+	// before checking its bounds, so it is only called when there is
+	// something to sort.
+	start := time.Now().Unix()
+	if len(arr) > 1 {
+		quickSort(0, len(arr)-1, &arr)
+	}
+	end := time.Now().Unix()
+	fmt.Println("耗时:", end-start, "秒")
+
+	f := xlsx.NewFile()
+	sheet, err := f.AddSheet("排序")
+	if err != nil {
+		t.Fatalf("AddSheet 排序: %v", err)
+	}
+
+	// Header row.
+	row1 := sheet.AddRow()
+	row1.AddCell().Value = "排名"
+	row1.AddCell().Value = "爬虫类"
+	row1.AddCell().Value = "字段有效数"
+
+	// The loop is single-goroutine; the mutex is kept only to leave the
+	// original locking (and the file's sync import) in place.
+	mapLock := &sync.Mutex{}
+	limit := 0
+	for _, v := range arr {
+		limit++
+		row := sheet.AddRow()
+		row.AddCell().SetInt(limit)
+		row.AddCell().SetString(v["key"].(string))
+		row.AddCell().SetInt(v["total"].(int))
+
+		mapLock.Lock()
+		// NOTE(review): Excel sheet names may not contain ':' and are capped
+		// at 31 characters; whether AddSheet rejects such names depends on
+		// the xlsx version in use — confirm.
+		sheetName := "排名:" + util.ObjToString(v["key"])
+		sheet_detail, err := f.AddSheet(sheetName)
+		if err != nil {
+			// Previously ignored, which silently dropped the detail sheet.
+			log.Println("AddSheet", sheetName, err)
+		} else {
+			// One column per jsondata key: key in row 0, values below it.
+			col_num := 0
+			for k1, v1 := range v {
+				a, ok := v1.([]string)
+				if !ok {
+					continue // skips the "total" and "key" bookkeeping entries
+				}
+				sheet_detail.Cell(0, col_num).Value = util.ObjToString(k1)
+				for k2, v2 := range a {
+					sheet_detail.Cell(k2+1, col_num).Value = v2
+				}
+				col_num++
+			}
+		}
+		mapLock.Unlock()
+
+		// Stops after the 11th row (limit runs 1..11).
+		if limit > 10 {
+			break
+		}
+	}
+
+	if err := f.Save("zheng.xlsx"); err != nil {
+		log.Println("保存xlsx失败:", err)
+		return
+	}
+	log.Println("xlsx保存成功")
+}
+
+
+// quickSort sorts (*array)[left..right] in place by the "total" entry of each
+// map, largest first. A nil slice pointer or a range of fewer than two
+// elements is a no-op; without that guard an empty slice (left=0, right=-1)
+// panicked on the pivot lookup below.
+func quickSort(left int, right int, array *[]map[string]interface{}) {
+	if array == nil || left >= right {
+		return
+	}
+	l, r := left, right
+	pivot := util.IntAll((*array)[(left+right)/2]["total"]) // middle element
+	// Partition: totals above pivot end up on the left, below it on the right.
+	for l < r {
+		// Advance l to the first element that is not greater than pivot.
+		for util.IntAll((*array)[l]["total"]) > pivot {
+			l++
+		}
+		// Retreat r to the first element that is not smaller than pivot.
+		for util.IntAll((*array)[r]["total"]) < pivot {
+			r--
+		}
+		if l >= r {
+			break // the two cursors met: this pass is done
+		}
+		(*array)[l], (*array)[r] = (*array)[r], (*array)[l]
+		// Step past elements equal to pivot so duplicates cannot stall the loop.
+		if util.IntAll((*array)[l]["total"]) == pivot {
+			r--
+		}
+		if util.IntAll((*array)[r]["total"]) == pivot {
+			l++
+		}
+	}
+	if l == r {
+		l++
+		r--
+	}
+	// Recurse into the left part.
+	if left < r {
+		quickSort(left, r, array)
+	}
+	// Recurse into the right part.
+	if right > l {
+		quickSort(l, right, array)
+	}
+}

二进制
udpprojectset/src/zheng.xlsx


部分文件因为文件数量过多而无法显示