apple 5 жил өмнө
parent
commit
7c1d57f795

+ 2 - 13
udpfilterdup/src/main.go

@@ -91,9 +91,7 @@ func init() {
 }
 
 func main() {
-
 	go checkMapJob()
-
 	updport := Sysconfig["udpport"].(string)
 	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}
 	udpclient.Listen(processUdpMsg)
@@ -552,8 +550,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 		source.area = info.area
 		source.city = info.city
 		mergeArr = append(mergeArr, 1)
-
-		//fmt.Println("合并-城市")
 	}
 	//2、项目名称
 	if source.projectname == "" && info.projectname != "" {
@@ -568,7 +564,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.projectname = info.projectname
 		mergeArr = append(mergeArr, 2)
-		//fmt.Println("合并-项目名称")
 	}
 	//3、项目编号
 	if source.projectcode == "" && info.projectcode != "" {
@@ -583,7 +578,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.projectcode = info.projectcode
 		mergeArr = append(mergeArr, 3)
-		//fmt.Println("合并-项目标号")
 	}
 	//4、采购单位
 	if source.buyer == "" && info.buyer != "" {
@@ -598,7 +592,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.buyer = info.buyer
 		mergeArr = append(mergeArr, 4)
-		//fmt.Println("合并-采购单位")
 	}
 	//5、预算
 	if source.budget == 0 && info.budget != 0 {
@@ -613,7 +606,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.budget = info.budget
 		mergeArr = append(mergeArr, 5)
-		//fmt.Println("合并-预算")
 	}
 	//6、中标单位
 	if source.winner == "" && info.winner != "" {
@@ -628,7 +620,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.winner = info.winner
 		mergeArr = append(mergeArr, 6)
-		//fmt.Println("合并-中标单位")
 	}
 	//7、中标金额
 	if source.bidamount == 0 && info.bidamount != 0 {
@@ -643,7 +634,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.bidamount = info.bidamount
 		mergeArr = append(mergeArr, 7)
-		//fmt.Println("合并-中标金额")
 	}
 	//8、开标时间-地点
 	if source.bidopentime == 0 && info.bidopentime != 0 {
@@ -658,7 +648,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.bidopentime = info.bidopentime
 		mergeArr = append(mergeArr, 8)
-		//fmt.Println("合并-开标时间")
 	}
 
 	//以上合并过于简单,待进一步优化
@@ -817,10 +806,10 @@ func basicDataScore(v *Info, info *Info) bool {
 		n++
 	}
 	if info.agency != "" {
-		n = m + 2
+		n = n + 2
 	}
 	if info.city != "" {
-		n = m + 2
+		n = n + 2
 	}
 
 	if m > n {

+ 93 - 50
udpprojectset/src/heavy_test.go

@@ -158,8 +158,8 @@ func Test_heavy(t *testing.T) {
 func Test_field(t *testing.T) {
 
 	mgo = &mongodb.MongodbSim{
-		MongodbAddr: "192.168.3.207:27092",
-		DbName:      "extract_kf",
+		MongodbAddr: "192.168.3.207:27081",
+		DbName:      "qfw",
 		Size:        util.IntAllDef(15, 10),
 	}
 	mgo.InitPool()
@@ -172,39 +172,76 @@ func Test_field(t *testing.T) {
 	//now := int64(time.Now().Unix())
 	//date_time := int64(86400*2)
 
-	field_map := make(map[string]string,0)
-	sess_field := mgo.GetMgoConn()
-	defer sess_field.Close()
-	res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
-	for dict := make(map[string]interface{}); res_field.Next(&dict); {
-		field_map[dict["s_field"].(string)] = "1"
-	}
+	//field_map := make(map[string]string,0)
+	//sess_field := mgo.GetMgoConn()
+	//defer sess_field.Close()
+	//res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
+	//for dict := make(map[string]interface{}); res_field.Next(&dict); {
+	//	field_map[dict["s_field"].(string)] = "1"
+	//}
 
 	//固定死的需要分析的字段
+	field_map := map[string]string{
+		"title":"1",
+		"area":"1",
+		"city":"1",
+		"subtype":"1",
+		"buyer":"1",
+		"agency":"1",
+		"winner":"1",
+		"budget":"1",
+		"bidamount":"1",
+		"projectname":"1",
+		"projectcode":"1",
+		"publishtime":"1",
+		"comeintime":"1",
+		"bidopentime":"1",
+		"agencyaddr":"1",
+		"site":"1",
+		"href":"1",
+	}
+
+	/*	ObjectId("5da3f2c5a5cb26b9b79847fc") 0
+		ObjectId("5da3fd6da5cb26b9b7a8683c") 5000
+		ObjectId("5da40bdaa5cb26b9b7bea472") 10000
+		ObjectId("5da44deaa5cb26b9b75efb38") 50000
+		ObjectId("5da53440a5cb26b9b7d3f9aa") 100000
+		ObjectId("5db2735ba5cb26b9b7c99c6f") 761414
+	*/
 
+	/*
+	qfw-bidding
 
+	ObjectId("5e0d4cdd0cf41612e063fc65")  -1
+	ObjectId("5df8bfe4e9d1f601e4e87431") 一百万
+	ObjectId("5dea080ce9d1f601e45cb838") 二百万
 
 
-	/*	ObjectId("5da3f2c5a5cb26b9b79847fc")
-		ObjectId("5da3fd6da5cb26b9b7a8683c")
-		ObjectId("5da40bdaa5cb26b9b7bea472")
 	*/
 	sess := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess)
-	q := map[string]interface{}{
-		"_id": map[string]interface{}{
-			"$gt":  util.StringTOBsonId("5da3f2c5a5cb26b9b79847fc"),
-			"$lte": util.StringTOBsonId("5da3fd6da5cb26b9b7a8683c"),
-		},
-	}
-	it := sess.DB(mgo.DbName).C("a_testbidding").Find(&q).Sort("_id").Iter()
+	//q := map[string]interface{}{
+	//	"_id": map[string]interface{}{
+	//		"$gt":  util.StringTOBsonId("5dea080ce9d1f601e45cb838"),
+	//		"$lte": util.StringTOBsonId("5e0d4cdd0cf41612e063fc65"),
+	//	},
+	//}
+	it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
 
 	//爬虫组
 	crawlerMap,n := make(map[string]map[string]interface{},0),0
 
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		if n%10000==0 {
+			log.Println("当前n:",n)
+		}
+
+		if n>3000000 {
+			break
+		}
+
 		if tmp["spidercode"]!="" {
-			//判断是否有次类别分组
+			//判断是否有类别分组
 			dict := make(map[string]interface{},0)
 			if crawlerMap[tmp["spidercode"].(string)]!= nil {
 				dict = crawlerMap[tmp["spidercode"].(string)]
@@ -213,18 +250,17 @@ func Test_field(t *testing.T) {
 
 			if jsonData!=nil {
 				for k,v :=range *jsonData  {
-					if fmt.Sprint(v) =="" {
+					if fmt.Sprint(v) ==""{
 						//无效数据
 					}else {
-						arr := dict[k]
-						if arr==nil {
-							dict[k] = make([]string,0)
-							dict[k] = append(dict[k].([]string),fmt.Sprint(v))
-						}else {
-							//if a,ok :=arr.([]string);ok{
-							//	a = append(a,fmt.Sprint(v))
-							//}
-							dict[k] = append(dict[k].([]string),fmt.Sprint(v))
+						if field_map[k]=="1" {
+							arr := dict[k]
+							if arr==nil {
+								dict[k] = make([]string,0)
+								dict[k] = append(dict[k].([]string),fmt.Sprint(v))
+							}else {
+								dict[k] = append(dict[k].([]string),fmt.Sprint(v))
+							}
 						}
 					}
 				}
@@ -236,12 +272,12 @@ func Test_field(t *testing.T) {
 	}
 
 	log.Println("总计",n,"条数据")
-	log.Println("判重类别个数:",len(crawlerMap))
+	log.Println("爬虫类别个数:",len(crawlerMap))
 
 
 	//计算每个爬虫分类的总数-并添加
 
-	//
+	//ObjectId("5e0d4cdd0cf41612e063fc65")
 	arr :=make([]map[string]interface{},0)
 	for k,v :=range crawlerMap  {
 		total :=0
@@ -278,32 +314,39 @@ func Test_field(t *testing.T) {
 		row.AddCell().SetString(v["key"].(string))
 		row.AddCell().SetInt(v["total"].(int))
 
-		mapLock.Lock()
-		sheetName := "排名:"+util.ObjToString(v["key"])
-		sheet_detail, err := f.AddSheet(sheetName)
-		if err==nil {
-			row_num,col_num :=0,0
-			for k1,v1 := range v {
-				if a,ok :=v1.([]string);ok {
-					for k2, v2 := range a {
-						if k2==0 {
-							sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
+		if limit <=20 {
+			mapLock.Lock()
+			sheetName := "排名"+util.ObjToString(limit)+":"+util.ObjToString(v["key"])
+			sheet_detail, err := f.AddSheet(sheetName)
+			if err==nil {
+				row_num,col_num :=0,0
+				for k1,v1 := range v {
+					if a,ok :=v1.([]string);ok {
+						for k2, v2 := range a {
+							if k2==0 {
+								sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
+								row_num++
+								sheet_detail.Cell(row_num, col_num).Value = v2
+							}else {
+								if row_num>2000 {
+									continue
+								}
+								sheet_detail.Cell(row_num, col_num).Value = v2
+							}
 							row_num++
-							sheet_detail.Cell(row_num, col_num).Value = v2
-						}else {
-							sheet_detail.Cell(row_num, col_num).Value = v2
 						}
-						row_num++
+						row_num = 0
+						col_num++
 					}
-					row_num = 0
-					col_num++
 				}
 			}
+
+			mapLock.Unlock()
 		}
 
-		mapLock.Unlock()
 
-		if limit >10{
+
+		if limit >99{
 			break
 		}
 	}

BIN
udpprojectset/src/zheng.xlsx