heavy_test.go 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. package main
  2. import (
  3. "fmt"
  4. "github.com/tealeg/xlsx"
  5. "log"
  6. "qfw/util"
  7. "qfw/util/mongodb"
  8. "sync"
  9. "testing"
  10. "time"
  11. )
  12. var (
  13. mgo *mongodb.MongodbSim //mongodb操作对象
  14. )
  15. func Test_heavy(t *testing.T) {
  16. //mapinfo := map[string]interface{}{
  17. // "gtid": "586b6d7061a0721f15b8f264",
  18. // "lteid": "5e0b2b780cf41612e0639460",
  19. //}
  20. //task([]byte{}, mapinfo)
  21. //log.Println("1")
  22. //代码copy数据
  23. //sessTest :=mgoTest.GetMgoConn()
  24. //defer sessTest.Close()
  25. //
  26. //sess := mgo.GetMgoConn()
  27. //defer sess.Close()
  28. //
  29. ////var arr []map[string]interface{}
  30. //
  31. //res_test := sessTest.DB("qfw").C("bidding").Find(mongodb.ObjToMQ(`{"comeintime":{"$gte": 1571025600, "$lte": 1571976000}}`, true)).Iter()
  32. //res :=sess.DB("extract_kf").C("a_testbidding")
  33. //5
  34. //
  35. //
  36. //
  37. //
  38. //i:=0
  39. //for dict := make(map[string]interface{}); res_test.Next(&dict); i++{
  40. //
  41. // //插入
  42. // if i%2000==0 {
  43. // log.Println("当前:",i)
  44. // }
  45. // res.Insert(dict)
  46. // //if len(arr)>=500 {
  47. // // arr = make([]map[string]interface{},0)
  48. // //}else {
  49. // // arr = append(arr,dict)
  50. // //}
  51. //}
  52. //
  53. //extract,extract_copy:="a_testbidding_new","a_testbidding"
  54. //
  55. //sess := mgo.GetMgoConn()
  56. //defer mgo.DestoryMongoConn(sess)
  57. //res_copy := sess.DB("extract_kf").C(extract_copy).Find(nil).Iter()
  58. //
  59. //m1 :=map[string]int{} //老版本
  60. //m2 :=map[string]int{} //新版本
  61. //
  62. //i:=0
  63. //j:=0
  64. //for v1 := make(map[string]interface{}); res_copy.Next(&v1); i++{
  65. // if i%2000==0 {
  66. // log.Println("当前i:",i)
  67. // }
  68. // m1[(v1["_id"].(bson.ObjectId).Hex())]= util.IntAll(v1["repeat"])
  69. //}
  70. //
  71. //sesss := mgo.GetMgoConn()
  72. //defer mgo.DestoryMongoConn(sesss)
  73. //res := sesss.DB("extract_kf").C(extract).Find(nil).Iter()
  74. //
  75. //
  76. //for v2 := make(map[string]interface{}); res.Next(&v2); j++{
  77. // if j%2000==0 {
  78. // log.Println("当前j:",j)
  79. // }
  80. // m2[(v2["_id"].(bson.ObjectId).Hex())]= util.IntAll(v2["repeat"])
  81. //}
  82. //
  83. //fmt.Println(len(m1),len(m2))
  84. //n1:=0
  85. //n2:=0
  86. //n3:=0
  87. //n4:=0
  88. //n5:=0
  89. //n6:=0
  90. //
  91. //var arr1 []string
  92. //var arr2 []string
  93. //for k,v:=range m1{
  94. //
  95. // if m2[k]==1&&v==0{//0:1
  96. // n1++
  97. // arr2 = append(arr2,fmt.Sprintf("目标_id:%s",k))
  98. // }
  99. // if m2[k]==0&&v==1{ //1:0
  100. // n2++
  101. // arr1 = append(arr1,fmt.Sprintf("目标_id:%s",k))
  102. // }
  103. // if m2[k]==0&&v==0{ //0:0
  104. // n3++
  105. // }
  106. // if m2[k]==1&&v==1{//1:1
  107. // n4++
  108. // }
  109. // if m2[k]==-1&&v==0{ //0:-1
  110. // n5++
  111. // }
  112. // if m2[k]==-1&&v==1{//1:-1
  113. // n6++
  114. // }
  115. //
  116. //}
  117. ////打印 1:0情况 ;
  118. //mm:=0
  119. //for _,v:=range arr1 {
  120. // mm++
  121. // if mm%200==0 {
  122. // log.Println(v)
  123. // }
  124. //}
  125. //
  126. //log.Println("分割线---------------")
  127. //log.Println("分割线---------------")
  128. //
  129. //
  130. ////打印 0:1情况
  131. //nn:=0
  132. //for _,v:=range arr2 {
  133. // nn++
  134. // if nn%200==0 {
  135. // log.Println(v)
  136. // }
  137. //}
  138. //
  139. //log.Println("V1 0:1---",n1)
  140. //log.Println("V1 1:0---",n2)
  141. //log.Println("V1 0:0---",n3)
  142. //log.Println("V1 1:1---",n4)
  143. //log.Println("V1 0:-1---",n5)
  144. //log.Println("V1 1:-1---",n6)
  145. }
  146. func Test_field(t *testing.T) {
  147. mgo = &mongodb.MongodbSim{
  148. MongodbAddr: "192.168.3.207:27092",
  149. DbName: "extract_kf",
  150. Size: util.IntAllDef(15, 10),
  151. }
  152. mgo.InitPool()
  153. //调试 - 导出数据
  154. //1:已抽取字段为准,统计对应爬虫字段存在个数,出个结果表格统计(前100名)
  155. //2:人工抽查数据质量,用于jsondata权重评估
  156. //取 固有字段 1-为存在
  157. //now := int64(time.Now().Unix())
  158. //date_time := int64(86400*2)
  159. field_map := make(map[string]string,0)
  160. sess_field := mgo.GetMgoConn()
  161. defer sess_field.Close()
  162. res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
  163. for dict := make(map[string]interface{}); res_field.Next(&dict); {
  164. field_map[dict["s_field"].(string)] = "1"
  165. }
  166. //固定死的需要分析的字段
  167. /* ObjectId("5da3f2c5a5cb26b9b79847fc")
  168. ObjectId("5da3fd6da5cb26b9b7a8683c")
  169. ObjectId("5da40bdaa5cb26b9b7bea472")
  170. */
  171. sess := mgo.GetMgoConn()
  172. defer mgo.DestoryMongoConn(sess)
  173. q := map[string]interface{}{
  174. "_id": map[string]interface{}{
  175. "$gt": util.StringTOBsonId("5da3f2c5a5cb26b9b79847fc"),
  176. "$lte": util.StringTOBsonId("5da3fd6da5cb26b9b7a8683c"),
  177. },
  178. }
  179. it := sess.DB(mgo.DbName).C("a_testbidding").Find(&q).Sort("_id").Iter()
  180. //爬虫组
  181. crawlerMap,n := make(map[string]map[string]interface{},0),0
  182. for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
  183. if tmp["spidercode"]!="" {
  184. //判断是否有次类别分组
  185. dict := make(map[string]interface{},0)
  186. if crawlerMap[tmp["spidercode"].(string)]!= nil {
  187. dict = crawlerMap[tmp["spidercode"].(string)]
  188. }
  189. jsonData := util.ObjToMap(tmp["jsondata"])
  190. if jsonData!=nil {
  191. for k,v :=range *jsonData {
  192. if fmt.Sprint(v) =="" {
  193. //无效数据
  194. }else {
  195. arr := dict[k]
  196. if arr==nil {
  197. dict[k] = make([]string,0)
  198. dict[k] = append(dict[k].([]string),fmt.Sprint(v))
  199. }else {
  200. //if a,ok :=arr.([]string);ok{
  201. // a = append(a,fmt.Sprint(v))
  202. //}
  203. dict[k] = append(dict[k].([]string),fmt.Sprint(v))
  204. }
  205. }
  206. }
  207. }
  208. if dict!=nil {
  209. crawlerMap[tmp["spidercode"].(string)] = dict
  210. }
  211. }
  212. }
  213. log.Println("总计",n,"条数据")
  214. log.Println("判重类别个数:",len(crawlerMap))
  215. //计算每个爬虫分类的总数-并添加
  216. //
  217. arr :=make([]map[string]interface{},0)
  218. for k,v :=range crawlerMap {
  219. total :=0
  220. for _,v1 :=range v {
  221. total =total + len(v1.([]string))
  222. }
  223. v["total"]= total
  224. v["key"] = k
  225. arr = append(arr,v)
  226. }
  227. //爬虫类别下-有效字段总数排列 前100
  228. start := time.Now().Unix()
  229. quickSort(0,len(arr)-1,&arr)
  230. end :=time.Now().Unix()
  231. fmt.Println("耗时:",end-start,"秒")
  232. f :=xlsx.NewFile()
  233. sheet, _ := f.AddSheet("排序")
  234. //第一行先写标题
  235. row1 := sheet.AddRow()
  236. row1.AddCell().Value = "排名"
  237. row1.AddCell().Value = "爬虫类"
  238. row1.AddCell().Value = "字段有效数"
  239. mapLock := &sync.Mutex{}
  240. limit :=0
  241. for _,v :=range arr {
  242. limit++
  243. row := sheet.AddRow()
  244. row.AddCell().SetInt(limit)
  245. row.AddCell().SetString(v["key"].(string))
  246. row.AddCell().SetInt(v["total"].(int))
  247. mapLock.Lock()
  248. sheetName := "排名:"+util.ObjToString(v["key"])
  249. sheet_detail, err := f.AddSheet(sheetName)
  250. if err==nil {
  251. row_num,col_num :=0,0
  252. for k1,v1 := range v {
  253. if a,ok :=v1.([]string);ok {
  254. for k2, v2 := range a {
  255. if k2==0 {
  256. sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
  257. row_num++
  258. sheet_detail.Cell(row_num, col_num).Value = v2
  259. }else {
  260. sheet_detail.Cell(row_num, col_num).Value = v2
  261. }
  262. row_num++
  263. }
  264. row_num = 0
  265. col_num++
  266. }
  267. }
  268. }
  269. mapLock.Unlock()
  270. if limit >10{
  271. break
  272. }
  273. }
  274. err := f.Save("zheng.xlsx")
  275. if err != nil {
  276. log.Println("保存xlsx失败:", err)
  277. return
  278. }
  279. log.Println("xlsx保存成功")
  280. }
  281. func quickSort(left int,right int ,array *[]map[string]interface{}) {
  282. l:=left
  283. r:=right
  284. pivot := util.IntAll((*array)[(left+right)/2]["total"])//中轴
  285. //for 的目标 将比pivot小的左边 反之右边
  286. for ;l<r;{
  287. //左半区找到大于等于pivot的数
  288. for ;util.IntAll((*array)[l]["total"]) > pivot; {
  289. l++
  290. }
  291. //右半区找到小于等于pivot的数
  292. for ;util.IntAll((*array)[r]["total"])<pivot; {
  293. r--
  294. }
  295. //本次分解任务完成
  296. if l>=r {
  297. break
  298. }
  299. (*array)[l],(*array)[r] = (*array)[r],(*array)[l]
  300. //优化相等的情况
  301. if util.IntAll((*array)[l]["total"]) == pivot {
  302. r--
  303. }
  304. if util.IntAll((*array)[r]["total"]) == pivot {
  305. l++
  306. }
  307. }
  308. if l==r {
  309. l++
  310. r--
  311. }
  312. //向左递归
  313. if left<r {
  314. quickSort(left,r,array)
  315. }
  316. //向右递归
  317. if right>l {
  318. quickSort(l,right,array)
  319. }
  320. }