|
@@ -6,17 +6,249 @@ import (
|
|
|
"log"
|
|
|
"qfw/util"
|
|
|
"qfw/util/mongodb"
|
|
|
- "sync"
|
|
|
"testing"
|
|
|
- "time"
|
|
|
)
|
|
|
|
|
|
var (
|
|
|
mgo *mongodb.MongodbSim //mongodb操作对象
|
|
|
+ //mgo_copy *mongodb.MongodbSim // MongoDB client (unused)
|
|
|
)
|
|
|
|
|
|
|
|
|
-func Test_heavy(t *testing.T) {
|
|
|
+// Extraction comparison statistics per crawler category
|
|
|
+func Test_crawlerExtractitCompare(t *testing.T) {
|
|
|
+
|
|
|
+ mgo = &mongodb.MongodbSim{
|
|
|
+ MongodbAddr: "192.168.3.207:27092",
|
|
|
+ DbName: "extract_kf",
|
|
|
+ Size: util.IntAllDef(15, 10),
|
|
|
+ }
|
|
|
+ mgo.InitPool()
|
|
|
+
|
|
|
+ sess := mgo.GetMgoConn()
|
|
|
+ defer mgo.DestoryMongoConn(sess)
|
|
|
+ it := sess.DB("extract_kf").C("zheng_test_1").Find(nil).Sort("_id").Iter()
|
|
|
+ n := 0
|
|
|
+ crawlerMap := make(map[string]string)
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
|
|
|
+ if n%10000 == 0 {
|
|
|
+ log.Println("current n:", n)
|
|
|
+ }
|
|
|
+
|
|
|
+ //if n>2000 {
|
|
|
+ // break
|
|
|
+ //}
|
|
|
+ crawlerMap[util.BsonIdToSId(tmp["_id"])] = util.ObjToString(tmp["spidercode"])
|
|
|
+ }
|
|
|
+
|
|
|
+ sess_1 := mgo.GetMgoConn()
|
|
|
+ defer mgo.DestoryMongoConn(sess_1)
|
|
|
+ it_1 := sess_1.DB("extract_kf").C("zheng_test1_jd1").Find(nil).Sort("_id").Iter()
|
|
|
+ n1 := 0
|
|
|
+ crawlerMap_1 := make(map[string][]map[string]interface{})
|
|
|
+
|
|
|
+ for tmp := make(map[string]interface{}); it_1.Next(&tmp); n1++ {
|
|
|
+ if n1%10000 == 0 {
|
|
|
+ log.Println("current n1:", n1)
|
|
|
+ }
|
|
|
+
|
|
|
+ //if n1>2000 {
|
|
|
+ // break
|
|
|
+ //}
|
|
|
+
|
|
|
+ // per-record field values
|
|
|
+ dic :=map[string]interface{}{
|
|
|
+ "_id":util.BsonIdToSId(tmp["_id"]),
|
|
|
+ "href":util.ObjToString(tmp["href"]),
|
|
|
+ "title":util.ObjToString(tmp["title"]),
|
|
|
+ "buyer":util.ObjToString(tmp["buyer"]),
|
|
|
+ "agency":util.ObjToString(tmp["agency"]),
|
|
|
+ "winner":util.ObjToString(tmp["winner"]),
|
|
|
+ "budget":util.ObjToString(tmp["budget"]),
|
|
|
+ "bidamount":util.ObjToString(tmp["bidamount"]),
|
|
|
+ "projectname":util.ObjToString(tmp["projectname"]),
|
|
|
+ "projectcode":util.ObjToString(tmp["projectcode"]),
|
|
|
+ "publishtime":util.ObjToString(tmp["publishtime"]),
|
|
|
+ "bidopentime":util.ObjToString(tmp["bidopentime"]),
|
|
|
+ "agencyaddr":util.ObjToString(tmp["agencyaddr"]),
|
|
|
+ }
|
|
|
+ value :=crawlerMap[util.BsonIdToSId(tmp["_id"])]
|
|
|
+ arr := crawlerMap_1[value]
|
|
|
+ if arr==nil {
|
|
|
+ crawlerMap_1[value] = make([]map[string]interface{},0)
|
|
|
+ crawlerMap_1[value] = append(crawlerMap_1[value],dic)
|
|
|
+ }else {
|
|
|
+ crawlerMap_1[value] = append(crawlerMap_1[value],dic)
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ sess_2 := mgo.GetMgoConn()
|
|
|
+ defer mgo.DestoryMongoConn(sess_2)
|
|
|
+ it_2 := sess_2.DB("extract_kf").C("zheng_test1_jd2").Find(nil).Sort("_id").Iter()
|
|
|
+ n2 := 0
|
|
|
+ crawlerMap_2 := make(map[string][]map[string]interface{})
|
|
|
+ for tmp := make(map[string]interface{}); it_2.Next(&tmp); n2++ {
|
|
|
+ if n2%10000 == 0 {
|
|
|
+ log.Println("current n2:", n2)
|
|
|
+ }
|
|
|
+
|
|
|
+ //if n2>1000 {
|
|
|
+ // break
|
|
|
+ //}
|
|
|
+
|
|
|
+ // per-record field values
|
|
|
+ dic :=map[string]interface{}{
|
|
|
+ "_id":util.BsonIdToSId(tmp["_id"]),
|
|
|
+ "href":util.ObjToString(tmp["href"]),
|
|
|
+ "buyer":util.ObjToString(tmp["buyer"]),
|
|
|
+ "agency":util.ObjToString(tmp["agency"]),
|
|
|
+ "winner":util.ObjToString(tmp["winner"]),
|
|
|
+ "budget":util.ObjToString(tmp["budget"]),
|
|
|
+ "bidamount":util.ObjToString(tmp["bidamount"]),
|
|
|
+ "projectname":util.ObjToString(tmp["projectname"]),
|
|
|
+ "projectcode":util.ObjToString(tmp["projectcode"]),
|
|
|
+ }
|
|
|
+ value :=crawlerMap[util.BsonIdToSId(tmp["_id"])]
|
|
|
+ arr := crawlerMap_2[value]
|
|
|
+ if arr==nil {
|
|
|
+ crawlerMap_2[value] = make([]map[string]interface{},0)
|
|
|
+ crawlerMap_2[value] = append(crawlerMap_2[value],dic)
|
|
|
+ }else {
|
|
|
+ crawlerMap_2[value] = append(crawlerMap_2[value],dic)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ log.Println("爬虫类个数分别为:",len(crawlerMap_1),len(crawlerMap_2))
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ if len(crawlerMap_1)!=len(crawlerMap_2)||len(crawlerMap_1)==0 {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ var list = []string{
|
|
|
+ "buyer",
|
|
|
+ "agency",
|
|
|
+ "winner",
|
|
|
+ "budget",
|
|
|
+ "bidamount",
|
|
|
+ "projectname",
|
|
|
+ "projectcode",
|
|
|
+ }
|
|
|
+
|
|
|
+ var crawlerArr = []string{
|
|
|
+ "a_zgzfcgw_zfcghtgg_new",
|
|
|
+ "gd_gdszfcgw_dscght",
|
|
|
+ "a_zgzfcgw_bid_tender_new",
|
|
|
+ "a_ztxygjzbtbzxyxgs_zbxx",
|
|
|
+ "sd_zgsdzfcgw_xxgk_sxhtgk",
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ // counters per crawler: "same"/"diff" slices indexed like list, plus "total" record count
|
|
|
+ AnaNumMap :=map[string]map[string][]int{
|
|
|
+ "a_zgzfcgw_zfcghtgg_new": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
|
|
|
+ "gd_gdszfcgw_dscght": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
|
|
|
+ "a_zgzfcgw_bid_tender_new": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
|
|
|
+ "a_ztxygjzbtbzxyxgs_zbxx": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
|
|
|
+ "sd_zgsdzfcgw_xxgk_sxhtgk": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
|
|
|
+ }
|
|
|
+
|
|
|
+ fmt.Println(len(AnaNumMap))
|
|
|
+ // compare the two result sets crawler by crawler
|
|
|
+ for _,v:=range crawlerArr {
|
|
|
+ if crawlerMap_1[v]==nil||crawlerMap_2[v]==nil {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ // records for this crawler from each result set
|
|
|
+ arr_1 := crawlerMap_1[v]
|
|
|
+ arr_2 := crawlerMap_2[v]
|
|
|
+
|
|
|
+
|
|
|
+ log.Println("数据总量:",len(arr_1))
|
|
|
+ AnaNumMap[v]["total"][0] = len(arr_1)
|
|
|
+ f := xlsx.NewFile()
|
|
|
+ // create up to 7 sheets, one per compared field that has differences
|
|
|
+ for i := 0; i < len(list); i++ {
|
|
|
+ isTitle := false
|
|
|
+ row := 0
|
|
|
+ for j := 0; j < len(arr_1); j++ { // assumes arr_1 and arr_2 are index-aligned and of equal length
|
|
|
+ string_1 := fmt.Sprint(arr_1[j][list[i]])
|
|
|
+ string_2 := fmt.Sprint(arr_2[j][list[i]])
|
|
|
+ if string_1!=string_2 {
|
|
|
+ if !isTitle{
|
|
|
+ sheet, _ := f.AddSheet(list[i])
|
|
|
+ sheet.Cell(row, 0).Value = "_id"
|
|
|
+ sheet.Cell(row, 1).Value = "href"
|
|
|
+ sheet.Cell(row, 2).Value = fmt.Sprint(list[i])+"_V1"
|
|
|
+ sheet.Cell(row, 3).Value = fmt.Sprint(list[i])+"_V2"
|
|
|
+ isTitle = true
|
|
|
+ row++
|
|
|
+ }
|
|
|
+ sheet :=f.Sheet[list[i]]
|
|
|
+ sheet.Cell(row, 0).Value = util.BsonIdToSId(arr_1[j]["_id"])
|
|
|
+ sheet.Cell(row, 1).Value = util.ObjToString(arr_1[j]["href"])
|
|
|
+ sheet.Cell(row, 2).Value = string_1
|
|
|
+ sheet.Cell(row, 3).Value = string_2
|
|
|
+ row++
|
|
|
+ AnaNumMap[v]["diff"][i] = AnaNumMap[v]["diff"][i]+1
|
|
|
+ }else {
|
|
|
+ AnaNumMap[v]["same"][i] = AnaNumMap[v]["same"][i]+1
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ path:="zk_"+v+".xlsx"
|
|
|
+ err := f.Save(path)
|
|
|
+ if err != nil {
|
|
|
+ log.Println("failed to save xlsx:", err)
|
|
|
+ return
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ f :=xlsx.NewFile()
|
|
|
+ sheet, _ := f.AddSheet("摘要")
|
|
|
+ for i:=0;i<len(list) ;i++ {
|
|
|
+ sheet.Cell(1, i+3).Value = list[i]
|
|
|
+ }
|
|
|
+
|
|
|
+ for i:=0;i<len(crawlerArr) ;i++ {
|
|
|
+ sheet.Cell(i+2, 0).Value = crawlerArr[i]
|
|
|
+ total:= fmt.Sprint(AnaNumMap[crawlerArr[i]]["total"][0])
|
|
|
+ sheet.Cell(i+2, 1).Value = total
|
|
|
+ same:=AnaNumMap[crawlerArr[i]]["same"]
|
|
|
+ diff:=AnaNumMap[crawlerArr[i]]["diff"]
|
|
|
+ for j:=0;j<len(same) ;j++ {
|
|
|
+ sheet.Cell(i+2, j+3).Value = fmt.Sprint(same[j])+"~"+fmt.Sprint(diff[j])
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ sheet.Cell(1, 0).Value = "spider code"
|
|
|
+ sheet.Cell(1, 1).Value = "total records"
|
|
|
+ sheet.Cell(1, 2).Value = "field comparison"
|
|
|
+ sheet.Cell(2, 2).Value = "same count~diff count"
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ fmt.Println(AnaNumMap)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ path:="摘要"+".xlsx"
|
|
|
+ error := f.Save(path)
|
|
|
+ if error != nil {
|
|
|
+ log.Println("保存xlsx失败:", error)
|
|
|
+ }
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+// Compare deduplication differences
|
|
|
+//func Test_heavy(t *testing.T) {
|
|
|
|
|
|
//mapinfo := map[string]interface{}{
|
|
|
// "gtid": "586b6d7061a0721f15b8f264",
|
|
@@ -24,40 +256,6 @@ func Test_heavy(t *testing.T) {
|
|
|
//}
|
|
|
//task([]byte{}, mapinfo)
|
|
|
|
|
|
-
|
|
|
- //log.Println("1")
|
|
|
- //代码copy数据
|
|
|
- //sessTest :=mgoTest.GetMgoConn()
|
|
|
- //defer sessTest.Close()
|
|
|
- //
|
|
|
- //sess := mgo.GetMgoConn()
|
|
|
- //defer sess.Close()
|
|
|
- //
|
|
|
- ////var arr []map[string]interface{}
|
|
|
- //
|
|
|
- //res_test := sessTest.DB("qfw").C("bidding").Find(mongodb.ObjToMQ(`{"comeintime":{"$gte": 1571025600, "$lte": 1571976000}}`, true)).Iter()
|
|
|
- //res :=sess.DB("extract_kf").C("a_testbidding")
|
|
|
- //5
|
|
|
- //
|
|
|
- //
|
|
|
- //
|
|
|
- //
|
|
|
- //i:=0
|
|
|
- //for dict := make(map[string]interface{}); res_test.Next(&dict); i++{
|
|
|
- //
|
|
|
- // //插入
|
|
|
- // if i%2000==0 {
|
|
|
- // log.Println("当前:",i)
|
|
|
- // }
|
|
|
- // res.Insert(dict)
|
|
|
- // //if len(arr)>=500 {
|
|
|
- // // arr = make([]map[string]interface{},0)
|
|
|
- // //}else {
|
|
|
- // // arr = append(arr,dict)
|
|
|
- // //}
|
|
|
- //}
|
|
|
- //
|
|
|
-
|
|
|
//extract,extract_copy:="a_testbidding_new","a_testbidding"
|
|
|
//
|
|
|
//sess := mgo.GetMgoConn()
|
|
@@ -150,215 +348,297 @@ func Test_heavy(t *testing.T) {
|
|
|
//log.Println("V1 1:1---",n4)
|
|
|
//log.Println("V1 0:-1---",n5)
|
|
|
//log.Println("V1 1:-1---",n6)
|
|
|
-}
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
+//}
|
|
|
|
|
|
-func Test_field(t *testing.T) {
|
|
|
+// Prepare merged sample data
|
|
|
+//func Test_specifiedField(t *testing.T) {
|
|
|
|
|
|
- mgo = &mongodb.MongodbSim{
|
|
|
- MongodbAddr: "192.168.3.207:27081",
|
|
|
- DbName: "qfw",
|
|
|
- Size: util.IntAllDef(15, 10),
|
|
|
- }
|
|
|
- mgo.InitPool()
|
|
|
-
|
|
|
- //调试 - 导出数据
|
|
|
- //1:已抽取字段为准,统计对应爬虫字段存在个数,出个结果表格统计(前100名)
|
|
|
- //2:人工抽查数据质量,用于jsondata权重评估
|
|
|
-
|
|
|
- //取 固有字段 1-为存在
|
|
|
- //now := int64(time.Now().Unix())
|
|
|
- //date_time := int64(86400*2)
|
|
|
-
|
|
|
- //field_map := make(map[string]string,0)
|
|
|
- //sess_field := mgo.GetMgoConn()
|
|
|
- //defer sess_field.Close()
|
|
|
- //res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
|
|
|
- //for dict := make(map[string]interface{}); res_field.Next(&dict); {
|
|
|
- // field_map[dict["s_field"].(string)] = "1"
|
|
|
+ //mgo = &mongodb.MongodbSim{
|
|
|
+ // MongodbAddr: "192.168.3.207:27081",
|
|
|
+ // DbName: "qfw",
|
|
|
+ // Size: util.IntAllDef(15, 10),
|
|
|
//}
|
|
|
-
|
|
|
- //固定死的需要分析的字段
|
|
|
- field_map := map[string]string{
|
|
|
- "title":"1",
|
|
|
- "area":"1",
|
|
|
- "city":"1",
|
|
|
- "subtype":"1",
|
|
|
- "buyer":"1",
|
|
|
- "agency":"1",
|
|
|
- "winner":"1",
|
|
|
- "budget":"1",
|
|
|
- "bidamount":"1",
|
|
|
- "projectname":"1",
|
|
|
- "projectcode":"1",
|
|
|
- "publishtime":"1",
|
|
|
- "comeintime":"1",
|
|
|
- "bidopentime":"1",
|
|
|
- "agencyaddr":"1",
|
|
|
- "site":"1",
|
|
|
- "href":"1",
|
|
|
- }
|
|
|
-
|
|
|
- /* ObjectId("5da3f2c5a5cb26b9b79847fc") 0
|
|
|
- ObjectId("5da3fd6da5cb26b9b7a8683c") 5000
|
|
|
- ObjectId("5da40bdaa5cb26b9b7bea472") 10000
|
|
|
- ObjectId("5da44deaa5cb26b9b75efb38") 50000
|
|
|
- ObjectId("5da53440a5cb26b9b7d3f9aa") 100000
|
|
|
- ObjectId("5db2735ba5cb26b9b7c99c6f") 761414
|
|
|
- */
|
|
|
-
|
|
|
- /*
|
|
|
- qfw-bidding
|
|
|
-
|
|
|
- ObjectId("5e0d4cdd0cf41612e063fc65") -1
|
|
|
- ObjectId("5df8bfe4e9d1f601e4e87431") 一百万
|
|
|
- ObjectId("5dea080ce9d1f601e45cb838") 二百万
|
|
|
-
|
|
|
-
|
|
|
- */
|
|
|
- sess := mgo.GetMgoConn()
|
|
|
- defer mgo.DestoryMongoConn(sess)
|
|
|
- //q := map[string]interface{}{
|
|
|
- // "_id": map[string]interface{}{
|
|
|
- // "$gt": util.StringTOBsonId("5dea080ce9d1f601e45cb838"),
|
|
|
- // "$lte": util.StringTOBsonId("5e0d4cdd0cf41612e063fc65"),
|
|
|
- // },
|
|
|
+ //mgo.InitPool()
|
|
|
+ //
|
|
|
+ //mgo_copy = &mongodb.MongodbSim{
|
|
|
+ // MongodbAddr: "192.168.3.207:27092",
|
|
|
+ // DbName: "extract_kf",
|
|
|
+ // Size: util.IntAllDef(15, 10),
|
|
|
//}
|
|
|
- it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
|
|
|
-
|
|
|
- //爬虫组
|
|
|
- crawlerMap,n := make(map[string]map[string]interface{},0),0
|
|
|
-
|
|
|
- for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
|
|
|
- if n%10000==0 {
|
|
|
- log.Println("当前n:",n)
|
|
|
- }
|
|
|
-
|
|
|
- if n>3000000 {
|
|
|
- break
|
|
|
- }
|
|
|
-
|
|
|
- if tmp["spidercode"]!="" {
|
|
|
- //判断是否有此类别分组
|
|
|
- dict := make(map[string]interface{},0)
|
|
|
- if crawlerMap[tmp["spidercode"].(string)]!= nil {
|
|
|
- dict = crawlerMap[tmp["spidercode"].(string)]
|
|
|
- }
|
|
|
- jsonData := util.ObjToMap(tmp["jsondata"])
|
|
|
-
|
|
|
- if jsonData!=nil {
|
|
|
- for k,v :=range *jsonData {
|
|
|
- if fmt.Sprint(v) ==""{
|
|
|
- //无效数据
|
|
|
- }else {
|
|
|
- if field_map[k]=="1" {
|
|
|
- arr := dict[k]
|
|
|
- if arr==nil {
|
|
|
- dict[k] = make([]string,0)
|
|
|
- dict[k] = append(dict[k].([]string),fmt.Sprint(v))
|
|
|
- }else {
|
|
|
- dict[k] = append(dict[k].([]string),fmt.Sprint(v))
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- if dict!=nil {
|
|
|
- crawlerMap[tmp["spidercode"].(string)] = dict
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- log.Println("总计",n,"条数据")
|
|
|
- log.Println("爬虫类别个数:",len(crawlerMap))
|
|
|
-
|
|
|
-
|
|
|
- //计算每个爬虫分类的总数-并添加
|
|
|
-
|
|
|
- //ObjectId("5e0d4cdd0cf41612e063fc65")
|
|
|
- arr :=make([]map[string]interface{},0)
|
|
|
- for k,v :=range crawlerMap {
|
|
|
- total :=0
|
|
|
- for _,v1 :=range v {
|
|
|
- total =total + len(v1.([]string))
|
|
|
- }
|
|
|
- v["total"]= total
|
|
|
- v["key"] = k
|
|
|
- arr = append(arr,v)
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- //爬虫类别下-有效字段总数排列 前100
|
|
|
- start := time.Now().Unix()
|
|
|
- quickSort(0,len(arr)-1,&arr)
|
|
|
- end :=time.Now().Unix()
|
|
|
- fmt.Println("耗时:",end-start,"秒")
|
|
|
-
|
|
|
- f :=xlsx.NewFile()
|
|
|
- sheet, _ := f.AddSheet("排序")
|
|
|
-
|
|
|
- //第一行先写标题
|
|
|
- row1 := sheet.AddRow()
|
|
|
- row1.AddCell().Value = "排名"
|
|
|
- row1.AddCell().Value = "爬虫类"
|
|
|
- row1.AddCell().Value = "字段有效数"
|
|
|
-
|
|
|
- mapLock := &sync.Mutex{}
|
|
|
- limit :=0
|
|
|
- for _,v :=range arr {
|
|
|
- limit++
|
|
|
- row := sheet.AddRow()
|
|
|
- row.AddCell().SetInt(limit)
|
|
|
- row.AddCell().SetString(v["key"].(string))
|
|
|
- row.AddCell().SetInt(v["total"].(int))
|
|
|
-
|
|
|
- if limit <=20 {
|
|
|
- mapLock.Lock()
|
|
|
- sheetName := "排名"+util.ObjToString(limit)+":"+util.ObjToString(v["key"])
|
|
|
- sheet_detail, err := f.AddSheet(sheetName)
|
|
|
- if err==nil {
|
|
|
- row_num,col_num :=0,0
|
|
|
- for k1,v1 := range v {
|
|
|
- if a,ok :=v1.([]string);ok {
|
|
|
- for k2, v2 := range a {
|
|
|
- if k2==0 {
|
|
|
- sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
|
|
|
- row_num++
|
|
|
- sheet_detail.Cell(row_num, col_num).Value = v2
|
|
|
- }else {
|
|
|
- if row_num>2000 {
|
|
|
- continue
|
|
|
- }
|
|
|
- sheet_detail.Cell(row_num, col_num).Value = v2
|
|
|
- }
|
|
|
- row_num++
|
|
|
- }
|
|
|
- row_num = 0
|
|
|
- col_num++
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- mapLock.Unlock()
|
|
|
- }
|
|
|
-
|
|
|
+ //mgo_copy.InitPool()
|
|
|
+ //
|
|
|
+ //
|
|
|
+ ////固定死的需要分析的字段
|
|
|
+ //field_map := map[string]string{
|
|
|
+ // "title":"1",
|
|
|
+ // "area":"1",
|
|
|
+ // "city":"1",
|
|
|
+ // "subtype":"1",
|
|
|
+ // "buyer":"1",
|
|
|
+ // "agency":"1",
|
|
|
+ // "winner":"1",
|
|
|
+ // "budget":"1",
|
|
|
+ // "bidamount":"1",
|
|
|
+ // "projectname":"1",
|
|
|
+ // "projectcode":"1",
|
|
|
+ // "publishtime":"1",
|
|
|
+ // "comeintime":"1",
|
|
|
+ // "bidopentime":"1",
|
|
|
+ // "agencyaddr":"1",
|
|
|
+ // "site":"1",
|
|
|
+ // "href":"1",
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //
|
|
|
+ //sess := mgo.GetMgoConn()
|
|
|
+ //defer mgo.DestoryMongoConn(sess)
|
|
|
+ //
|
|
|
+ //sess_1 :=mgo_copy.GetMgoConn()
|
|
|
+ //defer mgo_copy.DestoryMongoConn(sess_1)
|
|
|
+ //
|
|
|
+ //sess_2 :=mgo_copy.GetMgoConn()
|
|
|
+ //defer mgo_copy.DestoryMongoConn(sess_2)
|
|
|
+ //
|
|
|
+ //
|
|
|
+ //it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
|
|
|
+ //it_1 :=sess_1.DB("extract_kf").C("zheng_test_1")
|
|
|
+ //it_2 :=sess_2.DB("extract_kf").C("zheng_test_2")
|
|
|
+ //n:=0
|
|
|
+ //for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
|
|
|
+ // if n%10000==0 {
|
|
|
+ // log.Println("当前n:",n)
|
|
|
+ // }
|
|
|
+ // if n>1000000 { //约半月数据
|
|
|
+ // break
|
|
|
+ // }
|
|
|
+ // if tmp["spidercode"]=="a_zgzfcgw_zfcghtgg_new"|| tmp["spidercode"]=="gd_gdszfcgw_dscght"||
|
|
|
+ // tmp["spidercode"]=="a_zgzfcgw_bid_tender_new"||tmp["spidercode"]=="a_ztxygjzbtbzxyxgs_zbxx"||
|
|
|
+ // tmp["spidercode"]=="sd_zgsdzfcgw_xxgk_sxhtgk"{
|
|
|
+ // jsonData := util.ObjToMap(tmp["jsondata"])
|
|
|
+ // if jsonData!=nil {
|
|
|
+ // for k,v :=range *jsonData {
|
|
|
+ // if fmt.Sprint(v) !=""{
|
|
|
+ // if field_map[k]=="1" {
|
|
|
+ // it_1.Insert(tmp)
|
|
|
+ // it_2.Insert(tmp)
|
|
|
+ // break
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ //log.Println("总计",n,"条数据")
|
|
|
|
|
|
+//}
|
|
|
|
|
|
- if limit >99{
|
|
|
- break
|
|
|
- }
|
|
|
- }
|
|
|
|
|
|
+// Field statistics
|
|
|
+//func Test_field(t *testing.T) {
|
|
|
|
|
|
- err := f.Save("zheng.xlsx")
|
|
|
- if err != nil {
|
|
|
- log.Println("保存xlsx失败:", err)
|
|
|
- return
|
|
|
- }
|
|
|
- log.Println("xlsx保存成功")
|
|
|
-}
|
|
|
+ //mgo = &mongodb.MongodbSim{
|
|
|
+ // MongodbAddr: "192.168.3.207:27081",
|
|
|
+ // DbName: "qfw",
|
|
|
+ // Size: util.IntAllDef(15, 10),
|
|
|
+ //}
|
|
|
+ //mgo.InitPool()
|
|
|
+ //
|
|
|
+ ////调试 - 导出数据
|
|
|
+ ////1:已抽取字段为准,统计对应爬虫字段存在个数,出个结果表格统计(前100名)
|
|
|
+ ////2:人工抽查数据质量,用于jsondata权重评估
|
|
|
+ //
|
|
|
+ ////取 固有字段 1-为存在
|
|
|
+ ////now := int64(time.Now().Unix())
|
|
|
+ ////date_time := int64(86400*2)
|
|
|
+ //
|
|
|
+ ////field_map := make(map[string]string,0)
|
|
|
+ ////sess_field := mgo.GetMgoConn()
|
|
|
+ ////defer sess_field.Close()
|
|
|
+ ////res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
|
|
|
+ ////for dict := make(map[string]interface{}); res_field.Next(&dict); {
|
|
|
+ //// field_map[dict["s_field"].(string)] = "1"
|
|
|
+ ////}
|
|
|
+ //
|
|
|
+ ////固定死的需要分析的字段
|
|
|
+ //field_map := map[string]string{
|
|
|
+ // "title":"1",
|
|
|
+ // "area":"1",
|
|
|
+ // "city":"1",
|
|
|
+ // "subtype":"1",
|
|
|
+ // "buyer":"1",
|
|
|
+ // "agency":"1",
|
|
|
+ // "winner":"1",
|
|
|
+ // "budget":"1",
|
|
|
+ // "bidamount":"1",
|
|
|
+ // "projectname":"1",
|
|
|
+ // "projectcode":"1",
|
|
|
+ // "publishtime":"1",
|
|
|
+ // "comeintime":"1",
|
|
|
+ // "bidopentime":"1",
|
|
|
+ // "agencyaddr":"1",
|
|
|
+ // "site":"1",
|
|
|
+ // "href":"1",
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ ///* ObjectId("5da3f2c5a5cb26b9b79847fc") 0
|
|
|
+ // ObjectId("5da3fd6da5cb26b9b7a8683c") 5000
|
|
|
+ // ObjectId("5da40bdaa5cb26b9b7bea472") 10000
|
|
|
+ // ObjectId("5da44deaa5cb26b9b75efb38") 50000
|
|
|
+ // ObjectId("5da53440a5cb26b9b7d3f9aa") 100000
|
|
|
+ // ObjectId("5db2735ba5cb26b9b7c99c6f") 761414
|
|
|
+ //*/
|
|
|
+ //
|
|
|
+ ///*
|
|
|
+ //qfw-bidding
|
|
|
+ //
|
|
|
+ //ObjectId("5e0d4cdd0cf41612e063fc65") -1
|
|
|
+ //ObjectId("5df8bfe4e9d1f601e4e87431") 一百万
|
|
|
+ //ObjectId("5dea080ce9d1f601e45cb838") 二百万
|
|
|
+ //
|
|
|
+ //5df834dd // 半月 大约100万条
|
|
|
+ //
|
|
|
+ //*/
|
|
|
+ //sess := mgo.GetMgoConn()
|
|
|
+ //defer mgo.DestoryMongoConn(sess)
|
|
|
+ ////q := map[string]interface{}{
|
|
|
+ //// "_id": map[string]interface{}{
|
|
|
+ //// "$gt": util.StringTOBsonId("5dea080ce9d1f601e45cb838"),
|
|
|
+ //// "$lte": util.StringTOBsonId("5e0d4cdd0cf41612e063fc65"),
|
|
|
+ //// },
|
|
|
+ ////}
|
|
|
+ //it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
|
|
|
+ //
|
|
|
+ ////爬虫组
|
|
|
+ //crawlerMap,n := make(map[string]map[string]interface{},0),0
|
|
|
+ //
|
|
|
+ //for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
|
|
|
+ // if n%10000==0 {
|
|
|
+ // log.Println("当前n:",n)
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ // if n>3000000 {
|
|
|
+ // break
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ // if tmp["spidercode"]!="" {
|
|
|
+ // //判断是否有此类别分组
|
|
|
+ // dict := make(map[string]interface{},0)
|
|
|
+ // if crawlerMap[tmp["spidercode"].(string)]!= nil {
|
|
|
+ // dict = crawlerMap[tmp["spidercode"].(string)]
|
|
|
+ // }
|
|
|
+ // jsonData := util.ObjToMap(tmp["jsondata"])
|
|
|
+ //
|
|
|
+ // if jsonData!=nil {
|
|
|
+ // for k,v :=range *jsonData {
|
|
|
+ // if fmt.Sprint(v) ==""{
|
|
|
+ // //无效数据
|
|
|
+ // }else {
|
|
|
+ // if field_map[k]=="1" {
|
|
|
+ // arr := dict[k]
|
|
|
+ // if arr==nil {
|
|
|
+ // dict[k] = make([]string,0)
|
|
|
+ // dict[k] = append(dict[k].([]string),fmt.Sprint(v))
|
|
|
+ // }else {
|
|
|
+ // dict[k] = append(dict[k].([]string),fmt.Sprint(v))
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // if dict!=nil {
|
|
|
+ // crawlerMap[tmp["spidercode"].(string)] = dict
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //log.Println("总计",n,"条数据")
|
|
|
+ //log.Println("爬虫类别个数:",len(crawlerMap))
|
|
|
+ //
|
|
|
+ //
|
|
|
+ ////计算每个爬虫分类的总数-并添加
|
|
|
+ //
|
|
|
+ ////ObjectId("5e0d4cdd0cf41612e063fc65")
|
|
|
+ //arr :=make([]map[string]interface{},0)
|
|
|
+ //for k,v :=range crawlerMap {
|
|
|
+ // total :=0
|
|
|
+ // for _,v1 :=range v {
|
|
|
+ // total =total + len(v1.([]string))
|
|
|
+ // }
|
|
|
+ // v["total"]= total
|
|
|
+ // v["key"] = k
|
|
|
+ // arr = append(arr,v)
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //
|
|
|
+ ////爬虫类别下-有效字段总数排列 前100
|
|
|
+ //start := time.Now().Unix()
|
|
|
+ //quickSort(0,len(arr)-1,&arr)
|
|
|
+ //end :=time.Now().Unix()
|
|
|
+ //fmt.Println("耗时:",end-start,"秒")
|
|
|
+ //
|
|
|
+ //f :=xlsx.NewFile()
|
|
|
+ //sheet, _ := f.AddSheet("排序")
|
|
|
+ //
|
|
|
+ ////第一行先写标题
|
|
|
+ //row1 := sheet.AddRow()
|
|
|
+ //row1.AddCell().Value = "排名"
|
|
|
+ //row1.AddCell().Value = "爬虫类"
|
|
|
+ //row1.AddCell().Value = "字段有效数"
|
|
|
+ //
|
|
|
+ //mapLock := &sync.Mutex{}
|
|
|
+ //limit :=0
|
|
|
+ //for _,v :=range arr {
|
|
|
+ // limit++
|
|
|
+ // row := sheet.AddRow()
|
|
|
+ // row.AddCell().SetInt(limit)
|
|
|
+ // row.AddCell().SetString(v["key"].(string))
|
|
|
+ // row.AddCell().SetInt(v["total"].(int))
|
|
|
+ //
|
|
|
+ // if limit <=20 {
|
|
|
+ // mapLock.Lock()
|
|
|
+ // sheetName := "排名"+util.ObjToString(limit)+":"+util.ObjToString(v["key"])
|
|
|
+ // sheet_detail, err := f.AddSheet(sheetName)
|
|
|
+ // if err==nil {
|
|
|
+ // row_num,col_num :=0,0
|
|
|
+ // for k1,v1 := range v {
|
|
|
+ // if a,ok :=v1.([]string);ok {
|
|
|
+ // for k2, v2 := range a {
|
|
|
+ // if k2==0 {
|
|
|
+ // sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
|
|
|
+ // row_num++
|
|
|
+ // sheet_detail.Cell(row_num, col_num).Value = v2
|
|
|
+ // }else {
|
|
|
+ // if row_num>2000 {
|
|
|
+ // continue
|
|
|
+ // }
|
|
|
+ // sheet_detail.Cell(row_num, col_num).Value = v2
|
|
|
+ // }
|
|
|
+ // row_num++
|
|
|
+ // }
|
|
|
+ // row_num = 0
|
|
|
+ // col_num++
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ // mapLock.Unlock()
|
|
|
+ // }
|
|
|
+ //
|
|
|
+ //
|
|
|
+ //
|
|
|
+ // if limit >99{
|
|
|
+ // break
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ //
|
|
|
+ //
|
|
|
+ //err := f.Save("zheng.xlsx")
|
|
|
+ //if err != nil {
|
|
|
+ // log.Println("保存xlsx失败:", err)
|
|
|
+ // return
|
|
|
+ //}
|
|
|
+ //log.Println("xlsx保存成功")
|
|
|
+//}
|
|
|
|
|
|
|
|
|
func quickSort(left int,right int ,array *[]map[string]interface{}) {
|