@@ -6,7 +6,6 @@ import (
 	"log"
 	"qfw/util"
 	"qfw/util/mongodb"
-	"sync"
 	"testing"
 )
@@ -36,9 +35,9 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 			log.Println("current n:", n)
 		}
-		if n>2000 {
-			break
-		}
+		//if n>2000 {
+		//	break
+		//}
 		crawlerMap[util.BsonIdToSId(tmp["_id"])] = util.ObjToString(tmp["spidercode"])
 	}
@@ -53,9 +52,9 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 			log.Println("current n1:", n1)
 		}
-		if n1>2000 {
-			break
-		}
+		//if n1>2000 {
+		//	break
+		//}

 		//category
 		dic :=map[string]interface{}{
@@ -94,15 +93,14 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 			log.Println("current n2:", n2)
 		}
-		if n2>2000 {
-			break
-		}
+		//if n2>2000 {
+		//	break
+		//}
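+		// the commented-out 2000-record caps above are presumably kept for quick sample runs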

 		//category
 		dic :=map[string]interface{}{
 			"_id":util.BsonIdToSId(tmp["_id"]),
 			"href":util.ObjToString(tmp["href"]),
-			"title":util.ObjToString(tmp["title"]),
 			"buyer":util.ObjToString(tmp["buyer"]),
 			"agency":util.ObjToString(tmp["agency"]),
 			"winner":util.ObjToString(tmp["winner"]),
@@ -110,9 +108,6 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 			"bidamount":util.ObjToString(tmp["bidamount"]),
 			"projectname":util.ObjToString(tmp["projectname"]),
 			"projectcode":util.ObjToString(tmp["projectcode"]),
-			"publishtime":util.ObjToString(tmp["publishtime"]),
-			"bidopentime":util.ObjToString(tmp["bidopentime"]),
-			"agencyaddr":util.ObjToString(tmp["agencyaddr"]),
 		}
 		value :=crawlerMap[util.BsonIdToSId(tmp["_id"])]
 		arr := crawlerMap_2[value]
@@ -133,7 +128,6 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 		return
 	}
 	var list = []string{
-		"title",
 		"buyer",
 		"agency",
 		"winner",
@@ -141,11 +135,7 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 		"bidamount",
 		"projectname",
 		"projectcode",
-		"publishtime",
-		"bidopentime",
-		"agencyaddr",
 	}
-	fmt.Println(len(list))

 	var crawlerArr = []string{
 		"a_zgzfcgw_zfcghtgg_new",
@@ -156,107 +146,103 @@ func Test_crawlerExtractitCompare(t *testing.T) {
 	}


-	////add the header row first
-	f :=xlsx.NewFile()
-
-	// ////write the title in the first row
-	// //row1 := sheet.AddRow()
-	// //row1.AddCell().Value = "rank"
-	// //row1.AddCell().Value = "crawler"
-	// //row1.AddCell().Value = "valid field count"
-	// //"site", "href" handled separately
-	mapLock := &sync.Mutex{}
-
+	//count statistics
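+	// "same"/"diff" hold one counter per compared field in list; "total" holds the record count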
+	AnaNumMap := map[string]map[string][]int{
+		"a_zgzfcgw_zfcghtgg_new":   {"same": []int{0, 0, 0, 0, 0, 0, 0}, "diff": []int{0, 0, 0, 0, 0, 0, 0}, "total": []int{0}},
+		"gd_gdszfcgw_dscght":       {"same": []int{0, 0, 0, 0, 0, 0, 0}, "diff": []int{0, 0, 0, 0, 0, 0, 0}, "total": []int{0}},
+		"a_zgzfcgw_bid_tender_new": {"same": []int{0, 0, 0, 0, 0, 0, 0}, "diff": []int{0, 0, 0, 0, 0, 0, 0}, "total": []int{0}},
+		"a_ztxygjzbtbzxyxgs_zbxx":  {"same": []int{0, 0, 0, 0, 0, 0, 0}, "diff": []int{0, 0, 0, 0, 0, 0, 0}, "total": []int{0}},
+		"sd_zgsdzfcgw_xxgk_sxhtgk": {"same": []int{0, 0, 0, 0, 0, 0, 0}, "diff": []int{0, 0, 0, 0, 0, 0, 0}, "total": []int{0}},
+	}

-	//compare the data
+	fmt.Println(len(AnaNumMap))
+	//compare the data
 	for _,v:=range crawlerArr {
 		if crawlerMap_1[v]==nil||crawlerMap_2[v]==nil {
 			continue
 		}
-		log.Println("current crawler:", v)
 		//fetch the arrays
-		arr1 := crawlerMap_1[v]
-		arr2 := crawlerMap_2[v]
-
-		log.Println(len(arr1))
-
-		if len(arr1)!=len(arr2) {
-			log.Println("record counts differ")
-			continue
-		}else {
-			mapLock.Lock()
-			row_num,col_num :=1,2 //starting values
-			sheet, err := f.AddSheet(util.ObjToString(v))
-			if err==nil {
-				for i:=0;i<len(arr1);i++ {
-					dict1 := arr1[i]
-					dict2 := arr2[i]
-					if i==0 {
-						//write the header only once
-						for j:=0;j<len(list)+2 ;j++ {
-							if j==0 {
-								sheet.Cell(0, j).Value = "_id"
-							} else if j==1 {
-								sheet.Cell(0, j).Value = "href"
-							}else {
-								sheet.Cell(0, j).Value = list[j-2]
-							}
-						}
-
-						isAdd := false
-						for k:=0;k<len(list) ;k++ {
-							if k==0 {
-								sheet.Cell(1, 0).Value = util.BsonIdToSId(dict1["_id"])
-							}
-							string_1:=util.ObjToString(dict1[list[k]])
-							string_2:=util.ObjToString(dict2[list[k]])
-							if string_1!=string_2 {
-								isAdd = true
-								sheet.Cell(1, col_num).Value = string_1+"~"+string_2
-							}
-							col_num++
-						}
-
-						if isAdd {
-							row_num = 2
-						}
-
-					}else {
-						col_num = 2
-						isAdd := false
-						for l:=0;l<len(list) ;l++ {
-							if l==0 {
-								sheet.Cell(row_num, 0).Value = util.BsonIdToSId(dict1["_id"])
-							}
-
-							string_1:=util.ObjToString(dict1[list[l]])
-							string_2:=util.ObjToString(dict2[list[l]])
-							if string_1!=string_2 {
-								isAdd = true
-								sheet.Cell(row_num, col_num).Value = string_1+"~"+string_2
-							}
-							col_num++
-						}
-						if isAdd {
-							row_num++
-						}
-					}
+		arr_1 := crawlerMap_1[v]
+		arr_2 := crawlerMap_2[v]
+
+		log.Println("total records:", len(arr_1))
+		AnaNumMap[v]["total"][0] = len(arr_1)
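+		// per-crawler workbook; sheets are created lazily, only for fields with differences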
+		f := xlsx.NewFile()
+		//walk the 7 compared fields
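+		// a mismatch creates that field's sheet on first occurrence, then appends one row per differing record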
+		for i := 0; i < len(list); i++ {
+			isTitle := false
+			row := 0
+			for j := 0; j < len(arr_1); j++ {
+				string_1 := fmt.Sprint(arr_1[j][list[i]])
+				string_2 := fmt.Sprint(arr_2[j][list[i]])
+				if string_1 != string_2 {
+					if !isTitle {
+						sheet, _ := f.AddSheet(list[i])
+						sheet.Cell(row, 0).Value = "_id"
+						sheet.Cell(row, 1).Value = "href"
+						sheet.Cell(row, 2).Value = list[i] + "_V1"
+						sheet.Cell(row, 3).Value = list[i] + "_V2"
+						isTitle = true
+						row++
+					}
+					sheet := f.Sheet[list[i]]
+					sheet.Cell(row, 0).Value = util.BsonIdToSId(arr_1[j]["_id"])
+					sheet.Cell(row, 1).Value = util.ObjToString(arr_1[j]["href"])
+					sheet.Cell(row, 2).Value = string_1
+					sheet.Cell(row, 3).Value = string_2
+					row++
+					AnaNumMap[v]["diff"][i]++
+				} else {
+					AnaNumMap[v]["same"][i]++
+				}
+			}
+		}
+
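+		// one output workbook per crawler, named "zk_<crawler>.xlsx"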
+		path := "zk_" + v + ".xlsx"
+		err := f.Save(path)
+		if err != nil {
+			log.Println("failed to save xlsx:", err)
+			return
+		}
+	}
+
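+	// summary workbook: one row per crawler, one "same~diff" cell per compared field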
+	f := xlsx.NewFile()
+	sheet, _ := f.AddSheet("summary")
+	for i := 0; i < len(list); i++ {
+		sheet.Cell(1, i+3).Value = list[i]
+	}
+
-			mapLock.Unlock()
+	for i := 0; i < len(crawlerArr); i++ {
+		sheet.Cell(i+2, 0).Value = crawlerArr[i]
+		total := fmt.Sprint(AnaNumMap[crawlerArr[i]]["total"][0])
+		sheet.Cell(i+2, 1).Value = total
+		same := AnaNumMap[crawlerArr[i]]["same"]
+		diff := AnaNumMap[crawlerArr[i]]["diff"]
+		for j := 0; j < len(same); j++ {
+			sheet.Cell(i+2, j+3).Value = fmt.Sprint(same[j]) + "~" + fmt.Sprint(diff[j])
 		}
+
 	}
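+	// header labels go to fixed cells in rows 1-2; data rows never touch column 2, so writing them last is safe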
+	sheet.Cell(1, 0).Value = "crawler code"
+	sheet.Cell(1, 1).Value = "total records"
+	sheet.Cell(1, 2).Value = "field comparison"
+	sheet.Cell(2, 2).Value = "same count~diff count"
-	err := f.Save("zheng_test.xlsx")
-	if err != nil {
-		log.Println("failed to save xlsx:", err)
-		return
+	fmt.Println(AnaNumMap)
+
+	path := "summary.xlsx"
+	err := f.Save(path)
+	if err != nil {
+		log.Println("failed to save xlsx:", err)
 	}
-	log.Println("xlsx saved successfully")
+
 }