
jsondata weight comparison analysis

apple committed 5 years ago
commit 5cdba1c90f
3 changed files with 89 additions and 105 deletions
  1. udpfilterdup/src/datamap.go (+0 -2)
  2. udpprojectset/src/heavy_test.go (+89 -103)
  3. udpprojectset/src/zheng_test.xlsx (BIN)

+ 0 - 2
udpfilterdup/src/datamap.go

@@ -28,7 +28,6 @@ type Info struct {
 	comeintime  int64   //collection time
 	bidopentime int64   //bid opening time
 	agencyaddr  string  //bid opening place
-	detail      string  //tender content
 	site        string  //site
 	href        string  //URL of the full text
 	repeatid    string  //duplicate id
@@ -210,7 +209,6 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	info.publishtime = qutil.Int64All(tmp["publishtime"])
 	info.bidopentime = qutil.Int64All(tmp["bidopentime"])
 	info.agencyaddr = qutil.ObjToString(tmp["agencyaddr"])
-	info.detail = qutil.ObjToString(tmp["detail"])
 	info.site = qutil.ObjToString(tmp["site"])
 	info.href = qutil.ObjToString(tmp["href"])
 	info.repeatid = qutil.ObjToString(tmp["repeatid"])

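Note on the change above: the commit drops the detail field (the tender content) from Info and from NewInfo, so the dedup record no longer carries the full article text. For reference, a minimal, self-contained sketch of the constructor pattern; objToString and int64All are hypothetical stand-ins for qutil.ObjToString and qutil.Int64All, which are not part of this diff.

package main

import "fmt"

// Hypothetical stand-ins for qutil.ObjToString / qutil.Int64All:
// tolerant converters that never panic on missing or mistyped keys.
func objToString(v interface{}) string {
	s, _ := v.(string)
	return s
}

func int64All(v interface{}) int64 {
	switch n := v.(type) {
	case int64:
		return n
	case int:
		return int64(n)
	case float64:
		return int64(n)
	}
	return 0
}

// A trimmed Info with a subset of the fields the commit keeps.
type Info struct {
	comeintime  int64  // collection time
	bidopentime int64  // bid opening time
	site        string // site
	href        string // URL of the full text
}

// newInfo mirrors NewInfo after the change: every remaining field
// is still copied straight out of the raw bson map.
func newInfo(tmp map[string]interface{}) *Info {
	return &Info{
		comeintime:  int64All(tmp["comeintime"]),
		bidopentime: int64All(tmp["bidopentime"]),
		site:        objToString(tmp["site"]),
		href:        objToString(tmp["href"]),
	}
}

func main() {
	info := newInfo(map[string]interface{}{"site": "example-site", "comeintime": int64(1577836800)})
	fmt.Println(info.site, info.comeintime)
}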
+ 89 - 103
udpprojectset/src/heavy_test.go

@@ -6,7 +6,6 @@ import (
 	"log"
 	"qfw/util"
 	"qfw/util/mongodb"
-	"sync"
 	"testing"
 )
 
@@ -36,9 +35,9 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 			log.Println("当前n:",n)
 		}
 
-		if n>2000 {
-			break
-		}
+		//if n>2000 {
+		//	break
+		//}
 		crawlerMap[util.BsonIdToSId(tmp["_id"])] = util.ObjToString(tmp["spidercode"])
 	}
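The hunk above comments out the 2 000-record sampling cap (the later hunks do the same for n1 and n2), so the _id-to-spidercode index is now built over the full collection. A self-contained sketch of that indexing step, assuming the Mongo cursor yields map[string]interface{} documents; a plain slice stands in for the mongodb iterator, and the type assertions stand in for util.BsonIdToSId / util.ObjToString:

package main

import (
	"fmt"
	"log"
)

// buildCrawlerMap indexes documents by _id -> spidercode. debugCap
// mirrors the `if n > 2000 { break }` guard this commit disables;
// pass 0 to scan everything, as the test now does.
func buildCrawlerMap(docs []map[string]interface{}, debugCap int) map[string]string {
	crawlerMap := map[string]string{}
	n := 0
	for _, tmp := range docs {
		n++
		if n%10000 == 0 {
			log.Println("current n:", n)
		}
		if debugCap > 0 && n > debugCap {
			break // sampling cut-off, useful only while debugging
		}
		id, _ := tmp["_id"].(string)          // stand-in for util.BsonIdToSId
		code, _ := tmp["spidercode"].(string) // stand-in for util.ObjToString
		crawlerMap[id] = code
	}
	return crawlerMap
}

func main() {
	docs := []map[string]interface{}{
		{"_id": "id-1", "spidercode": "a_zgzfcgw_zfcghtgg_new"},
		{"_id": "id-2", "spidercode": "gd_gdszfcgw_dscght"},
	}
	fmt.Println(buildCrawlerMap(docs, 0))
}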
 
@@ -53,9 +52,9 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 			log.Println("当前n1:",n1)
 		}
 
-		if n1>2000 {
-			break
-		}
+		//if n1>2000 {
+		//	break
+		//}
 
 		//category
 		dic :=map[string]interface{}{
@@ -94,15 +93,14 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 			log.Println("当前n2:",n2)
 		}
 
-		if n2>2000 {
-			break
-		}
+		//if n2>1000 {
+		//	break
+		//}
 
 		//category
 		dic :=map[string]interface{}{
 			"_id":util.BsonIdToSId(tmp["_id"]),
 			"href":util.ObjToString(tmp["href"]),
-			"title":util.ObjToString(tmp["title"]),
 			"buyer":util.ObjToString(tmp["buyer"]),
 			"agency":util.ObjToString(tmp["agency"]),
 			"winner":util.ObjToString(tmp["winner"]),
@@ -110,9 +108,6 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 			"bidamount":util.ObjToString(tmp["bidamount"]),
 			"projectname":util.ObjToString(tmp["projectname"]),
 			"projectcode":util.ObjToString(tmp["projectcode"]),
-			"publishtime":util.ObjToString(tmp["publishtime"]),
-			"bidopentime":util.ObjToString(tmp["bidopentime"]),
-			"agencyaddr":util.ObjToString(tmp["agencyaddr"]),
 		}
 		value :=crawlerMap[util.BsonIdToSId(tmp["_id"])]
 		arr := crawlerMap_2[value]
@@ -133,7 +128,6 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 		return
 	}
 	var list = []string{
-		"title",
 		"buyer",
 		"agency",
 		"winner",
@@ -141,11 +135,7 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 		"bidamount",
 		"projectname",
 		"projectcode",
-		"publishtime",
-		"bidopentime",
-		"agencyaddr",
 	}
-	fmt.Println(len(list))
 
 	var crawlerArr = []string{
 		"a_zgzfcgw_zfcghtgg_new",
@@ -156,107 +146,103 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 	}
 
 
-	////add the header row first
-	f :=xlsx.NewFile()
-
-	//	////write the header in the first row
-	//	//row1 := sheet.AddRow()
-	//	//row1.AddCell().Value = "rank"
-	//	//row1.AddCell().Value = "crawler type"
-	//	//row1.AddCell().Value = "valid field count"
-	//	//"site" , "href" handled separately
-	mapLock := &sync.Mutex{}
-
+	//count statistics
+	AnaNumMap :=map[string]map[string][]int{
+		"a_zgzfcgw_zfcghtgg_new": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
+		"gd_gdszfcgw_dscght": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
+		"a_zgzfcgw_bid_tender_new": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
+		"a_ztxygjzbtbzxyxgs_zbxx": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
+		"sd_zgsdzfcgw_xxgk_sxhtgk": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
+	}
 
-	//compare data
+	fmt.Println(len(AnaNumMap))
+	//compare data
 	for _,v:=range crawlerArr {
 		if crawlerMap_1[v]==nil||crawlerMap_2[v]==nil {
 			continue
 		}
-		log.Println("当前爬虫类:",v)
 		//取数组数据
-		arr1 := crawlerMap_1[v]
-		arr2 := crawlerMap_2[v]
-
-		log.Println(len(arr1))
-
-		if len(arr1)!=len(arr2) {
-			log.Println("数据个数错误")
-			continue
-		}else {
-			mapLock.Lock()
-			row_num,col_num :=1,2//starting values
-			sheet, err := f.AddSheet(util.ObjToString(v))
-			if err==nil {
-				for i:=0;i<len(arr1);i++ {
-					dict1 := arr1[i]
-					dict2 := arr2[i]
-					if i==0 {
-						//write the header only once
-						for j:=0;j<len(list)+2 ;j++  {
-							if j==0 {
-								sheet.Cell(0, j).Value = "_id"
-							} else if j==1 {
-								sheet.Cell(0, j).Value = "href"
-							}else {
-								sheet.Cell(0, j).Value = list[j-2]
-							}
-						}
-
-						isAdd := false
-						for k:=0;k<len(list) ;k++  {
-							if k==0 {
-								sheet.Cell(1, 0).Value = util.BsonIdToSId(dict1["_id"])
-							}
-							string_1:=util.ObjToString(dict1[list[k]])
-							string_2:=util.ObjToString(dict2[list[k]])
-							if string_1!=string_2 {
-								isAdd = true
-								sheet.Cell(1, col_num).Value = string_1+"~"+string_2
-							}
-							col_num++
-						}
-
-						if isAdd {
-							row_num = 2
-						}
-
-					}else {
-						col_num = 2
-						isAdd := false
-						for l:=0;l<len(list) ;l++  {
-							if l==0 {
-								sheet.Cell(row_num, 0).Value = util.BsonIdToSId(dict1["_id"])
-							}
-
-							string_1:=util.ObjToString(dict1[list[l]])
-							string_2:=util.ObjToString(dict2[list[l]])
-							if string_1!=string_2 {
-								isAdd = true
-								sheet.Cell(row_num, col_num).Value = string_1+"~"+string_2
-							}
-							col_num++
-						}
-						if isAdd {
-							row_num++
-						}
+		arr_1 := crawlerMap_1[v]
+		arr_2 := crawlerMap_2[v]
+
+
+		log.Println("数据总量:",len(arr_1))
+		AnaNumMap[v]["total"][0] = len(arr_1)
+		f :=xlsx.NewFile()
+		//创建7个表格
+		for i:=0;i<len(list) ;i++  {
+			isTitle :=false
+			row:=0
+			for j:=0;j<len(arr_1);j++ {
+				string_1 := fmt.Sprint(arr_1[j][list[i]])
+				string_2 := fmt.Sprint(arr_2[j][list[i]])
+				if string_1!=string_2 {
+					if !isTitle{
+						sheet, _ := f.AddSheet(list[i])
+						sheet.Cell(row, 0).Value = "_id"
+						sheet.Cell(row, 1).Value = "href"
+						sheet.Cell(row, 2).Value = fmt.Sprint(list[i])+"_V1"
+						sheet.Cell(row, 3).Value = fmt.Sprint(list[i])+"_V2"
+						isTitle = true
+						row++
 					}
+					sheet :=f.Sheet[list[i]]
+					sheet.Cell(row, 0).Value = util.BsonIdToSId(arr_1[j]["_id"])
+					sheet.Cell(row, 1).Value = util.ObjToString(arr_1[j]["href"])
+					sheet.Cell(row, 2).Value = string_1
+					sheet.Cell(row, 3).Value = string_2
+					row++
+					AnaNumMap[v]["diff"][i] = AnaNumMap[v]["diff"][i]+1
+				}else {
+					AnaNumMap[v]["same"][i] = AnaNumMap[v]["same"][i]+1
 				}
 			}
+		}
+
+		path:="zk_"+v+".xlsx"
+		err := f.Save(path)
+		if err != nil {
+			log.Println("failed to save xlsx:", err)
+			return
+		}
+	}
+
+
 
+	f :=xlsx.NewFile()
+	sheet, _ := f.AddSheet("摘要")
+	for i:=0;i<len(list) ;i++ {
+		sheet.Cell(1, i+3).Value = list[i]
+	}
 
-			mapLock.Unlock()
+	for i:=0;i<len(crawlerArr) ;i++ {
+		sheet.Cell(i+2, 0).Value = crawlerArr[i]
+		total:= fmt.Sprint(AnaNumMap[crawlerArr[i]]["total"][0])
+		sheet.Cell(i+2, 1).Value = total
+		same:=AnaNumMap[crawlerArr[i]]["same"]
+		diff:=AnaNumMap[crawlerArr[i]]["diff"]
+		for j:=0;j<len(same) ;j++  {
+			sheet.Cell(i+2, j+3).Value = fmt.Sprint(same[j])+"~"+fmt.Sprint(diff[j])
 		}
+
 	}
+	sheet.Cell(1, 0).Value = "爬虫代码"
+	sheet.Cell(1, 1).Value = "数据总量"
+	sheet.Cell(1, 2).Value = "相同字段对比"
+	sheet.Cell(2, 2).Value = "相同数量~不同数量"
 
 
 
-	err := f.Save("zheng_test.xlsx")
-	if err != nil {
-		log.Println("保存xlsx失败:", err)
-		return
+	fmt.Println(AnaNumMap)
+
+
+
+	path:="摘要"+".xlsx"
+	error := f.Save(path)
+	if error != nil {
+		log.Println("保存xlsx失败:", error)
 	}
-	log.Println("xlsx保存成功")
+
 }
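Taken together, the rewrite replaces the single mutex-guarded workbook with one workbook per crawler, one sheet per field that actually differs, and a same/diff tally per field (AnaNumMap) that feeds the final summary sheet. A condensed, runnable sketch of that count-and-report pattern, using the same tealeg/xlsx calls the diff relies on (NewFile, AddSheet, Cell, Save); the sample records and the three-field list are illustrative only:

package main

import (
	"fmt"
	"log"

	"github.com/tealeg/xlsx"
)

func main() {
	// Illustrative subset of the seven compared fields.
	fields := []string{"buyer", "agency", "winner"}

	// The same records as produced by the two extraction pipelines.
	arr1 := []map[string]interface{}{{"_id": "id-1", "href": "http://example.com/1", "buyer": "A", "agency": "B", "winner": "C"}}
	arr2 := []map[string]interface{}{{"_id": "id-1", "href": "http://example.com/1", "buyer": "A", "agency": "B2", "winner": "C"}}

	// Per-field tallies, mirroring AnaNumMap's "same"/"diff" arrays.
	same := make([]int, len(fields))
	diff := make([]int, len(fields))

	f := xlsx.NewFile()
	for i, field := range fields {
		row := 0
		var sheet *xlsx.Sheet
		for j := range arr1 {
			s1 := fmt.Sprint(arr1[j][field])
			s2 := fmt.Sprint(arr2[j][field])
			if s1 == s2 {
				same[i]++
				continue
			}
			if sheet == nil { // create the sheet lazily, like the isTitle flag above
				sheet, _ = f.AddSheet(field)
				sheet.Cell(row, 0).Value = "_id"
				sheet.Cell(row, 1).Value = "href"
				sheet.Cell(row, 2).Value = field + "_V1"
				sheet.Cell(row, 3).Value = field + "_V2"
				row++
			}
			sheet.Cell(row, 0).Value = fmt.Sprint(arr1[j]["_id"])
			sheet.Cell(row, 1).Value = fmt.Sprint(arr1[j]["href"])
			sheet.Cell(row, 2).Value = s1
			sheet.Cell(row, 3).Value = s2
			row++
			diff[i]++
		}
	}
	if err := f.Save("zk_example.xlsx"); err != nil {
		log.Println("failed to save xlsx:", err)
	}
	// The summary sheet then records each field as "same~diff".
	fmt.Println("same:", same, "diff:", diff)
}

On the real data the outer loop runs once per spidercode, saving zk_<spidercode>.xlsx each time, and the tallies are written into the summary workbook as "same count~diff count" cells.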
 
 

BIN
udpprojectset/src/zheng_test.xlsx