|
@@ -0,0 +1,688 @@
|
|
|
+package main
|
|
|
+
|
|
|
+import (
|
|
|
+ "fmt"
|
|
|
+ "github.com/tealeg/xlsx"
|
|
|
+ "log"
|
|
|
+ "qfw/util"
|
|
|
+ "qfw/util/mongodb"
|
|
|
+ "testing"
|
|
|
+)
|
|
|
+
|
|
|
+var (
|
|
|
+ mgo *mongodb.MongodbSim // MongoDB access object; assigned and pooled (InitPool) inside the tests below
|
|
|
+ //mgo_copy *mongodb.MongodbSim // second MongoDB access object, only referenced by the commented-out tests
|
|
|
+)
|
|
|
+
|
|
|
+
|
|
|
+//分类爬虫抽取统计
|
|
|
+func Test_crawlerExtractitCompare(t *testing.T) {
|
|
|
+
|
|
|
+ mgo = &mongodb.MongodbSim{
|
|
|
+ MongodbAddr: "192.168.3.207:27092",
|
|
|
+ DbName: "extract_kf",
|
|
|
+ Size: util.IntAllDef(15, 10),
|
|
|
+ }
|
|
|
+ mgo.InitPool()
|
|
|
+
|
|
|
+ sess := mgo.GetMgoConn()
|
|
|
+ defer mgo.DestoryMongoConn(sess)
|
|
|
+ it :=sess.DB("extract_kf").C("zheng_test_1").Find(nil).Sort("_id").Iter()
|
|
|
+ n:=0
|
|
|
+ crawlerMap := make(map[string]string,0)
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
|
|
|
+ if n%10000==0 {
|
|
|
+ log.Println("当前n:",n)
|
|
|
+ }
|
|
|
+
|
|
|
+ //if n>2000 {
|
|
|
+ // break
|
|
|
+ //}
|
|
|
+ crawlerMap[util.BsonIdToSId(tmp["_id"])] = util.ObjToString(tmp["spidercode"])
|
|
|
+ }
|
|
|
+
|
|
|
+ sess_1 := mgo.GetMgoConn()
|
|
|
+ defer mgo.DestoryMongoConn(sess_1)
|
|
|
+ it_1 :=sess_1.DB("extract_kf").C("zheng_test1_jd1").Find(nil).Sort("_id").Iter()
|
|
|
+ n1:=0
|
|
|
+ crawlerMap_1 := make(map[string][]map[string]interface{},0)
|
|
|
+
|
|
|
+ for tmp := make(map[string]interface{});it_1.Next(&tmp);n1++{
|
|
|
+ if n1%10000==0 {
|
|
|
+ log.Println("当前n1:",n1)
|
|
|
+ }
|
|
|
+
|
|
|
+ //if n1>2000 {
|
|
|
+ // break
|
|
|
+ //}
|
|
|
+
|
|
|
+ //类别
|
|
|
+ dic :=map[string]interface{}{
|
|
|
+ "_id":util.BsonIdToSId(tmp["_id"]),
|
|
|
+ "href":util.ObjToString(tmp["href"]),
|
|
|
+ "title":util.ObjToString(tmp["title"]),
|
|
|
+ "buyer":util.ObjToString(tmp["buyer"]),
|
|
|
+ "agency":util.ObjToString(tmp["agency"]),
|
|
|
+ "winner":util.ObjToString(tmp["winner"]),
|
|
|
+ "budget":util.ObjToString(tmp["budget"]),
|
|
|
+ "bidamount":util.ObjToString(tmp["bidamount"]),
|
|
|
+ "projectname":util.ObjToString(tmp["projectname"]),
|
|
|
+ "projectcode":util.ObjToString(tmp["projectcode"]),
|
|
|
+ "publishtime":util.ObjToString(tmp["publishtime"]),
|
|
|
+ "bidopentime":util.ObjToString(tmp["bidopentime"]),
|
|
|
+ "agencyaddr":util.ObjToString(tmp["agencyaddr"]),
|
|
|
+ }
|
|
|
+ value :=crawlerMap[util.BsonIdToSId(tmp["_id"])]
|
|
|
+ arr := crawlerMap_1[value]
|
|
|
+ if arr==nil {
|
|
|
+ crawlerMap_1[value] = make([]map[string]interface{},0)
|
|
|
+ crawlerMap_1[value] = append(crawlerMap_1[value],dic)
|
|
|
+ }else {
|
|
|
+ crawlerMap_1[value] = append(crawlerMap_1[value],dic)
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ sess_2 :=mgo.GetMgoConn()
|
|
|
+ defer mgo.DestoryMongoConn(sess_2)
|
|
|
+ it_2 :=sess_2.DB("extract_kf").C("zheng_test1_jd2").Find(nil).Sort("_id").Iter()
|
|
|
+ n2:=0
|
|
|
+ crawlerMap_2 := make(map[string][]map[string]interface{})
|
|
|
+ for tmp := make(map[string]interface{}); it_2.Next(&tmp); n2++ {
|
|
|
+ if n2%10000==0 {
|
|
|
+ log.Println("当前n2:",n2)
|
|
|
+ }
|
|
|
+
|
|
|
+ //if n2>1000 {
|
|
|
+ // break
|
|
|
+ //}
|
|
|
+
|
|
|
+ //类别
|
|
|
+ dic :=map[string]interface{}{
|
|
|
+ "_id":util.BsonIdToSId(tmp["_id"]),
|
|
|
+ "href":util.ObjToString(tmp["href"]),
|
|
|
+ "buyer":util.ObjToString(tmp["buyer"]),
|
|
|
+ "agency":util.ObjToString(tmp["agency"]),
|
|
|
+ "winner":util.ObjToString(tmp["winner"]),
|
|
|
+ "budget":util.ObjToString(tmp["budget"]),
|
|
|
+ "bidamount":util.ObjToString(tmp["bidamount"]),
|
|
|
+ "projectname":util.ObjToString(tmp["projectname"]),
|
|
|
+ "projectcode":util.ObjToString(tmp["projectcode"]),
|
|
|
+ }
|
|
|
+ value :=crawlerMap[util.BsonIdToSId(tmp["_id"])]
|
|
|
+ arr := crawlerMap_2[value]
|
|
|
+ if arr==nil {
|
|
|
+ crawlerMap_2[value] = make([]map[string]interface{},0)
|
|
|
+ crawlerMap_2[value] = append(crawlerMap_2[value],dic)
|
|
|
+ }else {
|
|
|
+ crawlerMap_2[value] = append(crawlerMap_2[value],dic)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ log.Println("爬虫类个数分别为:",len(crawlerMap_1),len(crawlerMap_2))
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ if len(crawlerMap_1)!=len(crawlerMap_2)||len(crawlerMap_1)==0 {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ var list = []string{
|
|
|
+ "buyer",
|
|
|
+ "agency",
|
|
|
+ "winner",
|
|
|
+ "budget",
|
|
|
+ "bidamount",
|
|
|
+ "projectname",
|
|
|
+ "projectcode",
|
|
|
+ }
|
|
|
+
|
|
|
+ var crawlerArr = []string{
|
|
|
+ "a_zgzfcgw_zfcghtgg_new",
|
|
|
+ "gd_gdszfcgw_dscght",
|
|
|
+ "a_zgzfcgw_bid_tender_new",
|
|
|
+ "a_ztxygjzbtbzxyxgs_zbxx",
|
|
|
+ "sd_zgsdzfcgw_xxgk_sxhtgk",
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ //数量统计
|
|
|
+ AnaNumMap :=map[string]map[string][]int{
|
|
|
+ "a_zgzfcgw_zfcghtgg_new": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
|
|
|
+ "gd_gdszfcgw_dscght": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
|
|
|
+ "a_zgzfcgw_bid_tender_new": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
|
|
|
+ "a_ztxygjzbtbzxyxgs_zbxx": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
|
|
|
+ "sd_zgsdzfcgw_xxgk_sxhtgk": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
|
|
|
+ }
|
|
|
+
|
|
|
+ fmt.Println(len(AnaNumMap))
|
|
|
+ //-对比数据
|
|
|
+ for _,v:=range crawlerArr {
|
|
|
+ if crawlerMap_1[v]==nil||crawlerMap_2[v]==nil {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ //取数组数据
|
|
|
+ arr_1 := crawlerMap_1[v]
|
|
|
+ arr_2 := crawlerMap_2[v]
|
|
|
+
|
|
|
+
|
|
|
+ log.Println("数据总量:",len(arr_1))
|
|
|
+ AnaNumMap[v]["total"][0] = len(arr_1)
|
|
|
+ f :=xlsx.NewFile()
|
|
|
+ //创建7个表格
|
|
|
+ for i:=0;i<len(list) ;i++ {
|
|
|
+ isTitle :=false
|
|
|
+ row:=0
|
|
|
+ for j:=0;j<len(arr_1);j++ {
|
|
|
+ string_1 := fmt.Sprint(arr_1[j][list[i]])
|
|
|
+ string_2 := fmt.Sprint(arr_2[j][list[i]])
|
|
|
+ if string_1!=string_2 {
|
|
|
+ if !isTitle{
|
|
|
+ sheet, _ := f.AddSheet(list[i])
|
|
|
+ sheet.Cell(row, 0).Value = "_id"
|
|
|
+ sheet.Cell(row, 1).Value = "href"
|
|
|
+ sheet.Cell(row, 2).Value = fmt.Sprint(list[i])+"_V1"
|
|
|
+ sheet.Cell(row, 3).Value = fmt.Sprint(list[i])+"_V2"
|
|
|
+ isTitle = true
|
|
|
+ row++
|
|
|
+ }
|
|
|
+ sheet :=f.Sheet[list[i]]
|
|
|
+ sheet.Cell(row, 0).Value = util.BsonIdToSId(arr_1[j]["_id"])
|
|
|
+ sheet.Cell(row, 1).Value = util.ObjToString(arr_1[j]["href"])
|
|
|
+ sheet.Cell(row, 2).Value = string_1
|
|
|
+ sheet.Cell(row, 3).Value = string_2
|
|
|
+ row++
|
|
|
+ AnaNumMap[v]["diff"][i] = AnaNumMap[v]["diff"][i]+1
|
|
|
+ }else {
|
|
|
+ AnaNumMap[v]["same"][i] = AnaNumMap[v]["same"][i]+1
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ path:="zk_"+v+".xlsx"
|
|
|
+ error := f.Save(path)
|
|
|
+ if error != nil {
|
|
|
+ log.Println("保存xlsx失败:", error)
|
|
|
+ return
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ f :=xlsx.NewFile()
|
|
|
+ sheet, _ := f.AddSheet("摘要")
|
|
|
+ for i:=0;i<len(list) ;i++ {
|
|
|
+ sheet.Cell(1, i+3).Value = list[i]
|
|
|
+ }
|
|
|
+
|
|
|
+ for i:=0;i<len(crawlerArr) ;i++ {
|
|
|
+ sheet.Cell(i+2, 0).Value = crawlerArr[i]
|
|
|
+ total:= fmt.Sprint(AnaNumMap[crawlerArr[i]]["total"][0])
|
|
|
+ sheet.Cell(i+2, 1).Value = total
|
|
|
+ same:=AnaNumMap[crawlerArr[i]]["same"]
|
|
|
+ diff:=AnaNumMap[crawlerArr[i]]["diff"]
|
|
|
+ for j:=0;j<len(same) ;j++ {
|
|
|
+ sheet.Cell(i+2, j+3).Value = fmt.Sprint(same[j])+"~"+fmt.Sprint(diff[j])
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ sheet.Cell(1, 0).Value = "爬虫代码"
|
|
|
+ sheet.Cell(1, 1).Value = "数据总量"
|
|
|
+ sheet.Cell(1, 2).Value = "相同字段对比"
|
|
|
+ sheet.Cell(2, 2).Value = "相同数量~不同数量"
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ fmt.Println(AnaNumMap)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ path:="摘要"+".xlsx"
|
|
|
+ error := f.Save(path)
|
|
|
+ if error != nil {
|
|
|
+ log.Println("保存xlsx失败:", error)
|
|
|
+ }
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+//对比判重区别
|
|
|
+//func Test_heavy(t *testing.T) {
|
|
|
+
|
|
|
+ //mapinfo := map[string]interface{}{
|
|
|
+ // "gtid": "586b6d7061a0721f15b8f264",
|
|
|
+ // "lteid": "5e0b2b780cf41612e0639460",
|
|
|
+ //}
|
|
|
+ //task([]byte{}, mapinfo)
|
|
|
+
|
|
|
+ //extract,extract_copy:="a_testbidding_new","a_testbidding"
|
|
|
+ //
|
|
|
+ //sess := mgo.GetMgoConn()
|
|
|
+ //defer mgo.DestoryMongoConn(sess)
|
|
|
+ //res_copy := sess.DB("extract_kf").C(extract_copy).Find(nil).Iter()
|
|
|
+ //
|
|
|
+ //m1 :=map[string]int{} //老版本
|
|
|
+ //m2 :=map[string]int{} //新版本
|
|
|
+ //
|
|
|
+ //i:=0
|
|
|
+ //j:=0
|
|
|
+ //for v1 := make(map[string]interface{}); res_copy.Next(&v1); i++{
|
|
|
+ // if i%2000==0 {
|
|
|
+ // log.Println("当前i:",i)
|
|
|
+ // }
|
|
|
+ // m1[(v1["_id"].(bson.ObjectId).Hex())]= util.IntAll(v1["repeat"])
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //sesss := mgo.GetMgoConn()
|
|
|
+ //defer mgo.DestoryMongoConn(sesss)
|
|
|
+ //res := sesss.DB("extract_kf").C(extract).Find(nil).Iter()
|
|
|
+ //
|
|
|
+ //
|
|
|
+ //for v2 := make(map[string]interface{}); res.Next(&v2); j++{
|
|
|
+ // if j%2000==0 {
|
|
|
+ // log.Println("当前j:",j)
|
|
|
+ // }
|
|
|
+ // m2[(v2["_id"].(bson.ObjectId).Hex())]= util.IntAll(v2["repeat"])
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //fmt.Println(len(m1),len(m2))
|
|
|
+ //n1:=0
|
|
|
+ //n2:=0
|
|
|
+ //n3:=0
|
|
|
+ //n4:=0
|
|
|
+ //n5:=0
|
|
|
+ //n6:=0
|
|
|
+ //
|
|
|
+ //var arr1 []string
|
|
|
+ //var arr2 []string
|
|
|
+ //for k,v:=range m1{
|
|
|
+ //
|
|
|
+ // if m2[k]==1&&v==0{//0:1
|
|
|
+ // n1++
|
|
|
+ // arr2 = append(arr2,fmt.Sprintf("目标_id:%s",k))
|
|
|
+ // }
|
|
|
+ // if m2[k]==0&&v==1{ //1:0
|
|
|
+ // n2++
|
|
|
+ // arr1 = append(arr1,fmt.Sprintf("目标_id:%s",k))
|
|
|
+ // }
|
|
|
+ // if m2[k]==0&&v==0{ //0:0
|
|
|
+ // n3++
|
|
|
+ // }
|
|
|
+ // if m2[k]==1&&v==1{//1:1
|
|
|
+ // n4++
|
|
|
+ // }
|
|
|
+ // if m2[k]==-1&&v==0{ //0:-1
|
|
|
+ // n5++
|
|
|
+ // }
|
|
|
+ // if m2[k]==-1&&v==1{//1:-1
|
|
|
+ // n6++
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ //}
|
|
|
+ ////打印 1:0情况 ;
|
|
|
+ //mm:=0
|
|
|
+ //for _,v:=range arr1 {
|
|
|
+ // mm++
|
|
|
+ // if mm%200==0 {
|
|
|
+ // log.Println(v)
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //log.Println("分割线---------------")
|
|
|
+ //log.Println("分割线---------------")
|
|
|
+ //
|
|
|
+ //
|
|
|
+ ////打印 0:1情况
|
|
|
+ //nn:=0
|
|
|
+ //for _,v:=range arr2 {
|
|
|
+ // nn++
|
|
|
+ // if nn%200==0 {
|
|
|
+ // log.Println(v)
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //log.Println("V1 0:1---",n1)
|
|
|
+ //log.Println("V1 1:0---",n2)
|
|
|
+ //log.Println("V1 0:0---",n3)
|
|
|
+ //log.Println("V1 1:1---",n4)
|
|
|
+ //log.Println("V1 0:-1---",n5)
|
|
|
+ //log.Println("V1 1:-1---",n6)
|
|
|
+//}
|
|
|
+
|
|
|
+//糅合数据
|
|
|
+//func Test_specifiedField(t *testing.T) {
|
|
|
+
|
|
|
+ //mgo = &mongodb.MongodbSim{
|
|
|
+ // MongodbAddr: "192.168.3.207:27081",
|
|
|
+ // DbName: "qfw",
|
|
|
+ // Size: util.IntAllDef(15, 10),
|
|
|
+ //}
|
|
|
+ //mgo.InitPool()
|
|
|
+ //
|
|
|
+ //mgo_copy = &mongodb.MongodbSim{
|
|
|
+ // MongodbAddr: "192.168.3.207:27092",
|
|
|
+ // DbName: "extract_kf",
|
|
|
+ // Size: util.IntAllDef(15, 10),
|
|
|
+ //}
|
|
|
+ //mgo_copy.InitPool()
|
|
|
+ //
|
|
|
+ //
|
|
|
+ ////固定死的需要分析的字段
|
|
|
+ //field_map := map[string]string{
|
|
|
+ // "title":"1",
|
|
|
+ // "area":"1",
|
|
|
+ // "city":"1",
|
|
|
+ // "subtype":"1",
|
|
|
+ // "buyer":"1",
|
|
|
+ // "agency":"1",
|
|
|
+ // "winner":"1",
|
|
|
+ // "budget":"1",
|
|
|
+ // "bidamount":"1",
|
|
|
+ // "projectname":"1",
|
|
|
+ // "projectcode":"1",
|
|
|
+ // "publishtime":"1",
|
|
|
+ // "comeintime":"1",
|
|
|
+ // "bidopentime":"1",
|
|
|
+ // "agencyaddr":"1",
|
|
|
+ // "site":"1",
|
|
|
+ // "href":"1",
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //
|
|
|
+ //sess := mgo.GetMgoConn()
|
|
|
+ //defer mgo.DestoryMongoConn(sess)
|
|
|
+ //
|
|
|
+ //sess_1 :=mgo_copy.GetMgoConn()
|
|
|
+ //defer mgo_copy.DestoryMongoConn(sess_1)
|
|
|
+ //
|
|
|
+ //sess_2 :=mgo_copy.GetMgoConn()
|
|
|
+ //defer mgo_copy.DestoryMongoConn(sess_2)
|
|
|
+ //
|
|
|
+ //
|
|
|
+ //it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
|
|
|
+ //it_1 :=sess_1.DB("extract_kf").C("zheng_test_1")
|
|
|
+ //it_2 :=sess_2.DB("extract_kf").C("zheng_test_2")
|
|
|
+ //n:=0
|
|
|
+ //for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
|
|
|
+ // if n%10000==0 {
|
|
|
+ // log.Println("当前n:",n)
|
|
|
+ // }
|
|
|
+ // if n>1000000 { //约半月数据
|
|
|
+ // break
|
|
|
+ // }
|
|
|
+ // if tmp["spidercode"]=="a_zgzfcgw_zfcghtgg_new"|| tmp["spidercode"]=="gd_gdszfcgw_dscght"||
|
|
|
+ // tmp["spidercode"]=="a_zgzfcgw_bid_tender_new"||tmp["spidercode"]=="a_ztxygjzbtbzxyxgs_zbxx"||
|
|
|
+ // tmp["spidercode"]=="sd_zgsdzfcgw_xxgk_sxhtgk"{
|
|
|
+ // jsonData := util.ObjToMap(tmp["jsondata"])
|
|
|
+ // if jsonData!=nil {
|
|
|
+ // for k,v :=range *jsonData {
|
|
|
+ // if fmt.Sprint(v) !=""{
|
|
|
+ // if field_map[k]=="1" {
|
|
|
+ // it_1.Insert(tmp)
|
|
|
+ // it_2.Insert(tmp)
|
|
|
+ // break
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ //log.Println("总计",n,"条数据")
|
|
|
+
|
|
|
+//}
|
|
|
+
|
|
|
+
|
|
|
+//统计字段
|
|
|
+//func Test_field(t *testing.T) {
|
|
|
+
|
|
|
+ //mgo = &mongodb.MongodbSim{
|
|
|
+ // MongodbAddr: "192.168.3.207:27081",
|
|
|
+ // DbName: "qfw",
|
|
|
+ // Size: util.IntAllDef(15, 10),
|
|
|
+ //}
|
|
|
+ //mgo.InitPool()
|
|
|
+ //
|
|
|
+ ////调试 - 导出数据
|
|
|
+ ////1:已抽取字段为准,统计对应爬虫字段存在个数,出个结果表格统计(前100名)
|
|
|
+ ////2:人工抽查数据质量,用于jsondata权重评估
|
|
|
+ //
|
|
|
+ ////取 固有字段 1-为存在
|
|
|
+ ////now := int64(time.Now().Unix())
|
|
|
+ ////date_time := int64(86400*2)
|
|
|
+ //
|
|
|
+ ////field_map := make(map[string]string,0)
|
|
|
+ ////sess_field := mgo.GetMgoConn()
|
|
|
+ ////defer sess_field.Close()
|
|
|
+ ////res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
|
|
|
+ ////for dict := make(map[string]interface{}); res_field.Next(&dict); {
|
|
|
+ //// field_map[dict["s_field"].(string)] = "1"
|
|
|
+ ////}
|
|
|
+ //
|
|
|
+ ////固定死的需要分析的字段
|
|
|
+ //field_map := map[string]string{
|
|
|
+ // "title":"1",
|
|
|
+ // "area":"1",
|
|
|
+ // "city":"1",
|
|
|
+ // "subtype":"1",
|
|
|
+ // "buyer":"1",
|
|
|
+ // "agency":"1",
|
|
|
+ // "winner":"1",
|
|
|
+ // "budget":"1",
|
|
|
+ // "bidamount":"1",
|
|
|
+ // "projectname":"1",
|
|
|
+ // "projectcode":"1",
|
|
|
+ // "publishtime":"1",
|
|
|
+ // "comeintime":"1",
|
|
|
+ // "bidopentime":"1",
|
|
|
+ // "agencyaddr":"1",
|
|
|
+ // "site":"1",
|
|
|
+ // "href":"1",
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ ///* ObjectId("5da3f2c5a5cb26b9b79847fc") 0
|
|
|
+ // ObjectId("5da3fd6da5cb26b9b7a8683c") 5000
|
|
|
+ // ObjectId("5da40bdaa5cb26b9b7bea472") 10000
|
|
|
+ // ObjectId("5da44deaa5cb26b9b75efb38") 50000
|
|
|
+ // ObjectId("5da53440a5cb26b9b7d3f9aa") 100000
|
|
|
+ // ObjectId("5db2735ba5cb26b9b7c99c6f") 761414
|
|
|
+ //*/
|
|
|
+ //
|
|
|
+ ///*
|
|
|
+ //qfw-bidding
|
|
|
+ //
|
|
|
+ //ObjectId("5e0d4cdd0cf41612e063fc65") -1
|
|
|
+ //ObjectId("5df8bfe4e9d1f601e4e87431") 一百万
|
|
|
+ //ObjectId("5dea080ce9d1f601e45cb838") 二百万
|
|
|
+ //
|
|
|
+ //5df834dd // 半月 大约100万条
|
|
|
+ //
|
|
|
+ //*/
|
|
|
+ //sess := mgo.GetMgoConn()
|
|
|
+ //defer mgo.DestoryMongoConn(sess)
|
|
|
+ ////q := map[string]interface{}{
|
|
|
+ //// "_id": map[string]interface{}{
|
|
|
+ //// "$gt": util.StringTOBsonId("5dea080ce9d1f601e45cb838"),
|
|
|
+ //// "$lte": util.StringTOBsonId("5e0d4cdd0cf41612e063fc65"),
|
|
|
+ //// },
|
|
|
+ ////}
|
|
|
+ //it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
|
|
|
+ //
|
|
|
+ ////爬虫组
|
|
|
+ //crawlerMap,n := make(map[string]map[string]interface{},0),0
|
|
|
+ //
|
|
|
+ //for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
|
|
|
+ // if n%10000==0 {
|
|
|
+ // log.Println("当前n:",n)
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ // if n>3000000 {
|
|
|
+ // break
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ // if tmp["spidercode"]!="" {
|
|
|
+ // //判断是否有此类别分组
|
|
|
+ // dict := make(map[string]interface{},0)
|
|
|
+ // if crawlerMap[tmp["spidercode"].(string)]!= nil {
|
|
|
+ // dict = crawlerMap[tmp["spidercode"].(string)]
|
|
|
+ // }
|
|
|
+ // jsonData := util.ObjToMap(tmp["jsondata"])
|
|
|
+ //
|
|
|
+ // if jsonData!=nil {
|
|
|
+ // for k,v :=range *jsonData {
|
|
|
+ // if fmt.Sprint(v) ==""{
|
|
|
+ // //无效数据
|
|
|
+ // }else {
|
|
|
+ // if field_map[k]=="1" {
|
|
|
+ // arr := dict[k]
|
|
|
+ // if arr==nil {
|
|
|
+ // dict[k] = make([]string,0)
|
|
|
+ // dict[k] = append(dict[k].([]string),fmt.Sprint(v))
|
|
|
+ // }else {
|
|
|
+ // dict[k] = append(dict[k].([]string),fmt.Sprint(v))
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // if dict!=nil {
|
|
|
+ // crawlerMap[tmp["spidercode"].(string)] = dict
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //log.Println("总计",n,"条数据")
|
|
|
+ //log.Println("爬虫类别个数:",len(crawlerMap))
|
|
|
+ //
|
|
|
+ //
|
|
|
+ ////计算每个爬虫分类的总数-并添加
|
|
|
+ //
|
|
|
+ ////ObjectId("5e0d4cdd0cf41612e063fc65")
|
|
|
+ //arr :=make([]map[string]interface{},0)
|
|
|
+ //for k,v :=range crawlerMap {
|
|
|
+ // total :=0
|
|
|
+ // for _,v1 :=range v {
|
|
|
+ // total =total + len(v1.([]string))
|
|
|
+ // }
|
|
|
+ // v["total"]= total
|
|
|
+ // v["key"] = k
|
|
|
+ // arr = append(arr,v)
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //
|
|
|
+ ////爬虫类别下-有效字段总数排列 前100
|
|
|
+ //start := time.Now().Unix()
|
|
|
+ //quickSort(0,len(arr)-1,&arr)
|
|
|
+ //end :=time.Now().Unix()
|
|
|
+ //fmt.Println("耗时:",end-start,"秒")
|
|
|
+ //
|
|
|
+ //f :=xlsx.NewFile()
|
|
|
+ //sheet, _ := f.AddSheet("排序")
|
|
|
+ //
|
|
|
+ ////第一行先写标题
|
|
|
+ //row1 := sheet.AddRow()
|
|
|
+ //row1.AddCell().Value = "排名"
|
|
|
+ //row1.AddCell().Value = "爬虫类"
|
|
|
+ //row1.AddCell().Value = "字段有效数"
|
|
|
+ //
|
|
|
+ //mapLock := &sync.Mutex{}
|
|
|
+ //limit :=0
|
|
|
+ //for _,v :=range arr {
|
|
|
+ // limit++
|
|
|
+ // row := sheet.AddRow()
|
|
|
+ // row.AddCell().SetInt(limit)
|
|
|
+ // row.AddCell().SetString(v["key"].(string))
|
|
|
+ // row.AddCell().SetInt(v["total"].(int))
|
|
|
+ //
|
|
|
+ // if limit <=20 {
|
|
|
+ // mapLock.Lock()
|
|
|
+ // sheetName := "排名"+util.ObjToString(limit)+":"+util.ObjToString(v["key"])
|
|
|
+ // sheet_detail, err := f.AddSheet(sheetName)
|
|
|
+ // if err==nil {
|
|
|
+ // row_num,col_num :=0,0
|
|
|
+ // for k1,v1 := range v {
|
|
|
+ // if a,ok :=v1.([]string);ok {
|
|
|
+ // for k2, v2 := range a {
|
|
|
+ // if k2==0 {
|
|
|
+ // sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
|
|
|
+ // row_num++
|
|
|
+ // sheet_detail.Cell(row_num, col_num).Value = v2
|
|
|
+ // }else {
|
|
|
+ // if row_num>2000 {
|
|
|
+ // continue
|
|
|
+ // }
|
|
|
+ // sheet_detail.Cell(row_num, col_num).Value = v2
|
|
|
+ // }
|
|
|
+ // row_num++
|
|
|
+ // }
|
|
|
+ // row_num = 0
|
|
|
+ // col_num++
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ // mapLock.Unlock()
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ //
|
|
|
+ //
|
|
|
+ // if limit >99{
|
|
|
+ // break
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //
|
|
|
+ //err := f.Save("zheng.xlsx")
|
|
|
+ //if err != nil {
|
|
|
+ // log.Println("保存xlsx失败:", err)
|
|
|
+ // return
|
|
|
+ //}
|
|
|
+ //log.Println("xlsx保存成功")
|
|
|
+//}
|
|
|
+
|
|
|
+
|
|
|
+func quickSort(left int,right int ,array *[]map[string]interface{}) {
|
|
|
+
|
|
|
+ l:=left
|
|
|
+ r:=right
|
|
|
+
|
|
|
+ pivot := util.IntAll((*array)[(left+right)/2]["total"])//中轴
|
|
|
+ //for 的目标 将比pivot小的左边 反之右边
|
|
|
+ for ;l<r;{
|
|
|
+ //左半区找到大于等于pivot的数
|
|
|
+ for ;util.IntAll((*array)[l]["total"]) > pivot; {
|
|
|
+ l++
|
|
|
+ }
|
|
|
+ //右半区找到小于等于pivot的数
|
|
|
+ for ;util.IntAll((*array)[r]["total"])<pivot; {
|
|
|
+ r--
|
|
|
+ }
|
|
|
+ //本次分解任务完成
|
|
|
+ if l>=r {
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ (*array)[l],(*array)[r] = (*array)[r],(*array)[l]
|
|
|
+ //优化相等的情况
|
|
|
+ if util.IntAll((*array)[l]["total"]) == pivot {
|
|
|
+ r--
|
|
|
+ }
|
|
|
+ if util.IntAll((*array)[r]["total"]) == pivot {
|
|
|
+ l++
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ if l==r {
|
|
|
+ l++
|
|
|
+ r--
|
|
|
+ }
|
|
|
+ //向左递归
|
|
|
+ if left<r {
|
|
|
+ quickSort(left,r,array)
|
|
|
+ }
|
|
|
+ //向右递归
|
|
|
+ if right>l {
|
|
|
+ quickSort(l,right,array)
|
|
|
+ }
|
|
|
+
|
|
|
+}
|