123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365 |
- package main
- import (
- "fmt"
- "github.com/tealeg/xlsx"
- "log"
- "qfw/util"
- "qfw/util/mongodb"
- "sync"
- "testing"
- "time"
- )
- var (
- mgo *mongodb.MongodbSim //mongodb操作对象
- )
// Test_heavy is a scratch harness whose body is entirely commented out, so
// running it currently does nothing. The disabled snippets are kept for
// reference and grouped by what they would do if re-enabled.
func Test_heavy(t *testing.T) {
	// --- Disabled: run a single task over a fixed _id range ---------------
	//
	// mapinfo := map[string]interface{}{
	// 	"gtid":  "586b6d7061a0721f15b8f264",
	// 	"lteid": "5e0b2b780cf41612e0639460",
	// }
	// task([]byte{}, mapinfo)
	// log.Println("1")

	// --- Disabled: copy data in code --------------------------------------
	// Copies qfw.bidding documents in a comeintime window into
	// extract_kf.a_testbidding, logging progress every 2000 documents.
	//
	// sessTest := mgoTest.GetMgoConn()
	// defer sessTest.Close()
	//
	// sess := mgo.GetMgoConn()
	// defer sess.Close()
	//
	// // var arr []map[string]interface{}
	//
	// res_test := sessTest.DB("qfw").C("bidding").Find(mongodb.ObjToMQ(`{"comeintime":{"$gte": 1571025600, "$lte": 1571976000}}`, true)).Iter()
	// res := sess.DB("extract_kf").C("a_testbidding")
	//
	// i := 0
	// for dict := make(map[string]interface{}); res_test.Next(&dict); i++ {
	// 	// insert
	// 	if i%2000 == 0 {
	// 		log.Println("当前:", i)
	// 	}
	// 	res.Insert(dict)
	// 	// if len(arr) >= 500 {
	// 	// 	arr = make([]map[string]interface{}, 0)
	// 	// } else {
	// 	// 	arr = append(arr, dict)
	// 	// }
	// }

	// --- Disabled: compare "repeat" flags between two collections ---------
	// Loads _id -> repeat from a_testbidding (old version) and
	// a_testbidding_new (new version), then counts each old:new combination.
	//
	// extract, extract_copy := "a_testbidding_new", "a_testbidding"
	//
	// sess := mgo.GetMgoConn()
	// defer mgo.DestoryMongoConn(sess)
	// res_copy := sess.DB("extract_kf").C(extract_copy).Find(nil).Iter()
	//
	// m1 := map[string]int{} // old version
	// m2 := map[string]int{} // new version
	//
	// i := 0
	// j := 0
	// for v1 := make(map[string]interface{}); res_copy.Next(&v1); i++ {
	// 	if i%2000 == 0 {
	// 		log.Println("当前i:", i)
	// 	}
	// 	m1[(v1["_id"].(bson.ObjectId).Hex())] = util.IntAll(v1["repeat"])
	// }
	//
	// sesss := mgo.GetMgoConn()
	// defer mgo.DestoryMongoConn(sesss)
	// res := sesss.DB("extract_kf").C(extract).Find(nil).Iter()
	//
	// for v2 := make(map[string]interface{}); res.Next(&v2); j++ {
	// 	if j%2000 == 0 {
	// 		log.Println("当前j:", j)
	// 	}
	// 	m2[(v2["_id"].(bson.ObjectId).Hex())] = util.IntAll(v2["repeat"])
	// }
	//
	// fmt.Println(len(m1), len(m2))
	// n1 := 0
	// n2 := 0
	// n3 := 0
	// n4 := 0
	// n5 := 0
	// n6 := 0
	//
	// var arr1 []string
	// var arr2 []string
	// for k, v := range m1 {
	// 	if m2[k] == 1 && v == 0 { // 0:1
	// 		n1++
	// 		arr2 = append(arr2, fmt.Sprintf("目标_id:%s", k))
	// 	}
	// 	if m2[k] == 0 && v == 1 { // 1:0
	// 		n2++
	// 		arr1 = append(arr1, fmt.Sprintf("目标_id:%s", k))
	// 	}
	// 	if m2[k] == 0 && v == 0 { // 0:0
	// 		n3++
	// 	}
	// 	if m2[k] == 1 && v == 1 { // 1:1
	// 		n4++
	// 	}
	// 	if m2[k] == -1 && v == 0 { // 0:-1
	// 		n5++
	// 	}
	// 	if m2[k] == -1 && v == 1 { // 1:-1
	// 		n6++
	// 	}
	// }
	//
	// // print a sample of the 1:0 cases
	// mm := 0
	// for _, v := range arr1 {
	// 	mm++
	// 	if mm%200 == 0 {
	// 		log.Println(v)
	// 	}
	// }
	//
	// log.Println("分割线---------------")
	// log.Println("分割线---------------")
	//
	// // print a sample of the 0:1 cases
	// nn := 0
	// for _, v := range arr2 {
	// 	nn++
	// 	if nn%200 == 0 {
	// 		log.Println(v)
	// 	}
	// }
	//
	// log.Println("V1 0:1---", n1)
	// log.Println("V1 1:0---", n2)
	// log.Println("V1 0:0---", n3)
	// log.Println("V1 1:1---", n4)
	// log.Println("V1 0:-1---", n5)
	// log.Println("V1 1:-1---", n6)
}
- func Test_field(t *testing.T) {
- mgo = &mongodb.MongodbSim{
- MongodbAddr: "192.168.3.207:27092",
- DbName: "extract_kf",
- Size: util.IntAllDef(15, 10),
- }
- mgo.InitPool()
- //调试 - 导出数据
- //1:已抽取字段为准,统计对应爬虫字段存在个数,出个结果表格统计(前100名)
- //2:人工抽查数据质量,用于jsondata权重评估
- //取 固有字段 1-为存在
- //now := int64(time.Now().Unix())
- //date_time := int64(86400*2)
- field_map := make(map[string]string,0)
- sess_field := mgo.GetMgoConn()
- defer sess_field.Close()
- res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
- for dict := make(map[string]interface{}); res_field.Next(&dict); {
- field_map[dict["s_field"].(string)] = "1"
- }
- //固定死的需要分析的字段
- /* ObjectId("5da3f2c5a5cb26b9b79847fc")
- ObjectId("5da3fd6da5cb26b9b7a8683c")
- ObjectId("5da40bdaa5cb26b9b7bea472")
- */
- sess := mgo.GetMgoConn()
- defer mgo.DestoryMongoConn(sess)
- q := map[string]interface{}{
- "_id": map[string]interface{}{
- "$gt": util.StringTOBsonId("5da3f2c5a5cb26b9b79847fc"),
- "$lte": util.StringTOBsonId("5da3fd6da5cb26b9b7a8683c"),
- },
- }
- it := sess.DB(mgo.DbName).C("a_testbidding").Find(&q).Sort("_id").Iter()
- //爬虫组
- crawlerMap,n := make(map[string]map[string]interface{},0),0
- for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
- if tmp["spidercode"]!="" {
- //判断是否有次类别分组
- dict := make(map[string]interface{},0)
- if crawlerMap[tmp["spidercode"].(string)]!= nil {
- dict = crawlerMap[tmp["spidercode"].(string)]
- }
- jsonData := util.ObjToMap(tmp["jsondata"])
- if jsonData!=nil {
- for k,v :=range *jsonData {
- if fmt.Sprint(v) =="" {
- //无效数据
- }else {
- arr := dict[k]
- if arr==nil {
- dict[k] = make([]string,0)
- dict[k] = append(dict[k].([]string),fmt.Sprint(v))
- }else {
- //if a,ok :=arr.([]string);ok{
- // a = append(a,fmt.Sprint(v))
- //}
- dict[k] = append(dict[k].([]string),fmt.Sprint(v))
- }
- }
- }
- }
- if dict!=nil {
- crawlerMap[tmp["spidercode"].(string)] = dict
- }
- }
- }
- log.Println("总计",n,"条数据")
- log.Println("判重类别个数:",len(crawlerMap))
- //计算每个爬虫分类的总数-并添加
- //
- arr :=make([]map[string]interface{},0)
- for k,v :=range crawlerMap {
- total :=0
- for _,v1 :=range v {
- total =total + len(v1.([]string))
- }
- v["total"]= total
- v["key"] = k
- arr = append(arr,v)
- }
- //爬虫类别下-有效字段总数排列 前100
- start := time.Now().Unix()
- quickSort(0,len(arr)-1,&arr)
- end :=time.Now().Unix()
- fmt.Println("耗时:",end-start,"秒")
- f :=xlsx.NewFile()
- sheet, _ := f.AddSheet("排序")
- //第一行先写标题
- row1 := sheet.AddRow()
- row1.AddCell().Value = "排名"
- row1.AddCell().Value = "爬虫类"
- row1.AddCell().Value = "字段有效数"
- mapLock := &sync.Mutex{}
- limit :=0
- for _,v :=range arr {
- limit++
- row := sheet.AddRow()
- row.AddCell().SetInt(limit)
- row.AddCell().SetString(v["key"].(string))
- row.AddCell().SetInt(v["total"].(int))
- mapLock.Lock()
- sheetName := "排名:"+util.ObjToString(v["key"])
- sheet_detail, err := f.AddSheet(sheetName)
- if err==nil {
- row_num,col_num :=0,0
- for k1,v1 := range v {
- if a,ok :=v1.([]string);ok {
- for k2, v2 := range a {
- if k2==0 {
- sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
- row_num++
- sheet_detail.Cell(row_num, col_num).Value = v2
- }else {
- sheet_detail.Cell(row_num, col_num).Value = v2
- }
- row_num++
- }
- row_num = 0
- col_num++
- }
- }
- }
- mapLock.Unlock()
- if limit >10{
- break
- }
- }
- err := f.Save("zheng.xlsx")
- if err != nil {
- log.Println("保存xlsx失败:", err)
- return
- }
- log.Println("xlsx保存成功")
- }
- func quickSort(left int,right int ,array *[]map[string]interface{}) {
- l:=left
- r:=right
- pivot := util.IntAll((*array)[(left+right)/2]["total"])//中轴
- //for 的目标 将比pivot小的左边 反之右边
- for ;l<r;{
- //左半区找到大于等于pivot的数
- for ;util.IntAll((*array)[l]["total"]) > pivot; {
- l++
- }
- //右半区找到小于等于pivot的数
- for ;util.IntAll((*array)[r]["total"])<pivot; {
- r--
- }
- //本次分解任务完成
- if l>=r {
- break
- }
- (*array)[l],(*array)[r] = (*array)[r],(*array)[l]
- //优化相等的情况
- if util.IntAll((*array)[l]["total"]) == pivot {
- r--
- }
- if util.IntAll((*array)[r]["total"]) == pivot {
- l++
- }
- }
- if l==r {
- l++
- r--
- }
- //向左递归
- if left<r {
- quickSort(left,r,array)
- }
- //向右递归
- if right>l {
- quickSort(l,right,array)
- }
- }
|