main.go

package main

import (
	"fmt"
	"time"

	"github.com/robfig/cron/v3"
	"go.mongodb.org/mongo-driver/bson"
	"go.uber.org/zap"

	utils "jygit.jydev.jianyu360.cn/data_processing/common_utils"
	"jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
	"jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
)

var (
	MgoB *mongodb.MongodbSim
	MgoC *mongodb.MongodbSim
	Rest = make(map[string]interface{}) // aggregated indicator document, written back to MongoDB each run
	// Qianlima bidding announcement channels
	channels = []string{"招标公告", "重新招标", "意见征集", "招标预告", "信息变更", "答疑公告", "废标公告", "流标公告",
		"开标公示", "候选人公示", "中标通知", "合同公告", "验收合同", "违规公告", "其他公告", "预告", "公告", "变更", "结果", "其他"}
	channels2 = []string{"可研", "立项", "核准", "备案", "环评", "审批", "施工许可"} // proposed-project (拟建) channels
	channels3 = []string{"国土"} // property-rights (产权) channels
	// bidding data subdivisions: tender forecast, tender announcement, result announcement
	predictionChannels = []string{"预告", "招标预告", "意见征集"} // tender forecast
	biddingChannels = []string{"公告", "变更", "招标公告", "重新招标", "信息变更", "答疑公告"} // tender announcement
	resultChannels = []string{"废标公告", "流标公告", "结果", "开标公示", "候选人公示", "中标通知", "合同公告"} // result announcement
	Yesterday time.Time
	Today time.Time
	dataSource = make(map[string]interface{}) // data source inclusion metrics
	dataCollection = make(map[string]interface{}) // data collection metrics
	dataCompete = make(map[string]interface{}) // competitor comparison metrics
	dataTime = make(map[string]interface{}) // data timeliness metrics
	dataQuality = make(map[string]interface{}) // data quality metrics
	// competitor sites
	competeSites = []string{"元博网(采购与招标网)", "中国招标与采购网", "北京隆道网络科技有限公司", "友云采"}
)
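
// NOTE: GF is the global configuration loaded elsewhere in this package; this file only
// shows which fields it must expose. A minimal sketch of the assumed shape (field names
// come from the usages below; the types are inferred from usage and may differ from the
// real definition):
//
//	var GF struct {
//		Cron struct {
//			Spec           string      // cron expression passed to cron.AddFunc
//			Start, End     int         // day offsets defining the statistics window
//			Week           bool        // force the weekly statistics to run regardless of weekday
//			QualityRate    interface{} // hard-coded data row quality pass rate
//			CollectionRate interface{} // hard-coded 应采尽采率
//		}
//		MongoB struct{ DB string }
//		MongoC struct {
//			Username, Password string
//			Direct             bool
//		}
//	}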
func main() {
	local, _ := time.LoadLocation("Asia/Shanghai")
	c := cron.New(cron.WithLocation(local), cron.WithSeconds())
	_, err := c.AddFunc(GF.Cron.Spec, getIndicators)
	if err != nil {
		log.Error("main", zap.Error(err))
	}
	log.Info("main", zap.String("spec", GF.Cron.Spec))
	c.Start()
	defer c.Stop()
	select {}
}
// getIndicators collects all indicator data.
func getIndicators() {
	// compute the timestamps for the statistics window (by default yesterday 00:00 to today 00:00)
	now := time.Now()
	start := GF.Cron.Start
	end := GF.Cron.End
	if start == 0 {
		start = -1
	}
	Yesterday = time.Date(now.Year(), now.Month(), now.Day()+start, 0, 0, 0, 0, time.Local)
	Today = time.Date(now.Year(), now.Month(), now.Day()+end, 0, 0, 0, 0, time.Local)
	dataSource = make(map[string]interface{})     // data source inclusion metrics
	dataCollection = make(map[string]interface{}) // data collection metrics
	dataCompete = make(map[string]interface{})    // competitor comparison metrics
	dataTime = make(map[string]interface{})       // data timeliness metrics
	dataQuality = make(map[string]interface{})    // data quality metrics
	// 1. data collection metrics
	getCollection()
	// 2. competitor comparison metrics (run once a week, or when forced via config)
	dayOfWeek := Today.Weekday()
	if dayOfWeek == time.Wednesday || GF.Cron.Week {
		coverageA()
		coverageB()
	}
	// 3. data timeliness metrics
	getTimeLines()
	// 4. data row quality pass rate (hard-coded for now)
	dataQuality["数据行质量合格率"] = GF.Cron.QualityRate
	// 5. data source inclusion metrics
	getCollectionData()
	Rest["数据源收录指标"] = dataSource
	Rest["数据采集指标"] = dataCollection
	Rest["竞品对比指标"] = dataCompete
	Rest["数据时效指标"] = dataTime
	Rest["数据质量指标"] = dataQuality
	Rest["日期"] = Yesterday.Format("2006-01-02")
	MgoB.Save("bidding_zhibiao", Rest)
	fmt.Println("over")
}
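
// For reference, a worked example of how GF.Cron.Start / GF.Cron.End select the window
// above (illustrative dates; the server's local time zone is assumed): with the default
// Start=0 (treated as -1) and End=0, a run on 2024-05-21 uses
//
//	Yesterday = 2024-05-20 00:00:00
//	Today     = 2024-05-21 00:00:00
//
// so every query filtering comeintime with {$gt: Yesterday.Unix(), $lte: Today.Unix()}
// covers exactly yesterday's documents.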
// getCollection computes the data collection metrics.
func getCollection() {
	// 1. daily collection volume
	whereBidding := map[string]interface{}{
		"comeintime": map[string]interface{}{
			"$gt":  Yesterday.Unix(),
			"$lte": Today.Unix(),
		},
	}
	biddingCount := MgoB.Count("bidding", whereBidding)
	if biddingCount == 0 {
		SendMail("数据昨日采为0", "请检查相关流程")
		return
	}
	dataCollection["数据采集日采集量"] = biddingCount
	log.Info("getCollection", zap.Int("数据日采集量", biddingCount))
	// 2. total number of spiders (excluding states 4 and 10)
	whereT := map[string]interface{}{
		"state": map[string]interface{}{
			"$nin": []interface{}{4, 10},
		},
	}
	collectAll := MgoC.Count("luaconfig", whereT)
	dataCollection["爬虫总量"] = collectAll
	log.Info("getCollection", zap.Int("爬虫总量", collectAll))
	// 3. number of spider exceptions for the day
	whereCollectErr := map[string]interface{}{
		"l_comeintime": map[string]interface{}{
			"$gt":  Yesterday.Unix(),
			"$lte": Today.Unix(),
		},
	}
	collectErrCount := MgoC.Count("task", whereCollectErr)
	dataCollection["爬虫日异常量"] = collectErrCount
	errPercentage := (float64(collectErrCount) / float64(collectAll)) * 100.0
	dataCollection["爬虫日异常量比例"] = fmt.Sprintf("%.2f%%", errPercentage)
	log.Info("getCollection", zap.Int("爬虫日异常量", collectErrCount))
	// 4. spider launch turnaround (hours)
	dayOfWeek := Today.Weekday() // day of week
	lastSunday := time.Date(Today.Year(), Today.Month(), Today.Day()-1, 0, 0, 0, 0, time.Local) // last Sunday
	lastMonday := time.Date(Today.Year(), Today.Month(), Today.Day()-7, 0, 0, 0, 0, time.Local) // last Monday
	// on Mondays (or when forced) compute spider launch / maintenance turnaround (hours) for last Monday through Sunday
	if dayOfWeek == time.Monday || GF.Cron.Week {
		// 4.1 spider launch turnaround (hours)
		whereShelves := map[string]interface{}{
			"comeintime": map[string]interface{}{
				"$gte": lastMonday.Unix(),
				"$lte": lastSunday.Unix(),
			},
		}
		shelves, _ := MgoC.Find("luaconfig", whereShelves, nil, map[string]interface{}{"code": 1, "comeintime": 1}, false, -1, -1)
		if len(*shelves) > 0 {
			shelvesCount := int64(0)
			shelvesTime := int64(0)
			for _, v := range *shelves {
				code := utils.ObjToString(v["code"])
				shelveNew, _ := MgoC.FindOne("lua_logs_auditor_new", map[string]interface{}{"code": code, "types": "审核"})
				if shelveNew == nil {
					continue
				} else {
					comeintimeNew := utils.Int64All((*shelveNew)["comeintime"])
					comeintime := utils.Int64All(v["comeintime"])
					if comeintimeNew == 0 {
						continue
					}
					if comeintimeNew-comeintime > 0 {
						shelvesTime = shelvesTime + comeintimeNew - comeintime
						shelvesCount++
					}
				}
			}
			if shelvesCount > 0 {
				dataCollection["爬虫上架时效(小时)"] = (shelvesTime / shelvesCount) / 3600
				log.Info("getCollection", zap.Any("爬虫上架时效", (shelvesTime/shelvesCount)/3600))
			} else {
				dataCollection["爬虫上架时效(小时)"] = ""
			}
		}
		// 4.2 spider maintenance turnaround (hours)
		whereAuditor := map[string]interface{}{
			"comeintime": map[string]interface{}{
				"$gte": lastMonday.Unix(),
				"$lte": lastSunday.Unix(),
			},
			"types": "审核",
		}
		maintainCount := int64(0) // number of maintained spiders
		maintainTime := int64(0)  // total maintenance time (seconds)
		auditors, _ := MgoC.Find("lua_logs_auditor", whereAuditor, nil, nil, false, -1, -1)
		if len(*auditors) > 0 {
			for _, v := range *auditors {
				code := utils.ObjToString(v["code"])
				shelveNew, _ := MgoC.FindOne("lua_logs_auditor_new", map[string]interface{}{"code": code, "types": "审核"})
				if shelveNew == nil || len(*shelveNew) == 0 {
					taskWhere := map[string]interface{}{
						"s_code":  code,
						"i_state": 4,
					}
					tasks, _ := MgoC.Find("task", taskWhere, map[string]interface{}{"l_complete": -1}, nil, false, -1, -1)
					if len(*tasks) > 0 {
						completeTime := utils.Int64All((*tasks)[0]["l_complete"])
						comeinTime := utils.Int64All(v["comeintime"])
						diff := completeTime - comeinTime
						if diff > 0 {
							maintainCount++
							maintainTime += diff
						}
					}
				}
			}
			if maintainCount > 0 {
				dataCollection["爬虫维护时效(小时)"] = (maintainTime / maintainCount) / 3600
				log.Info("getCollection", zap.Any("爬虫维护时效(小时)", (maintainTime/maintainCount)/3600))
			} else {
				dataCollection["爬虫维护时效(小时)"] = ""
			}
		}
	}
}
// coverageA computes Jianyu's coverage of Qianlima data.
func coverageA() {
	// 5. competitor coverage, computed once a week against last week's data
	sessC := MgoC.GetMgoConn()
	defer MgoC.DestoryMongoConn(sessC)
	// fetch Qianlima's data for last Wednesday, then fetch Jianyu bidding data for the surrounding
	// 7-day window (±3 days) and check whether each title or project name exists there
	lastWednesday := time.Date(Today.Year(), Today.Month(), Today.Day()-7, 0, 0, 0, 0, time.Local)
	whereQlm := map[string]interface{}{
		"publishtime": lastWednesday.Format("2006-01-02"),
		"site":        "千里马",
	}
	query := sessC.DB("qlm").C("data_merge").Find(whereQlm).Select(map[string]interface{}{"title": 1, "projectname": 1, "channel": 1, "qlm_toptype": 1}).Iter()
	count := 0
	qlmData := make([]map[string]interface{}, 0)     // all bidding data
	njData := make([]map[string]interface{}, 0)      // proposed-project data
	cqData := make([]map[string]interface{}, 0)      // property-rights data
	preData := make([]map[string]interface{}, 0)     // tender forecast data
	biddingData := make([]map[string]interface{}, 0) // tender announcement data
	resultData := make([]map[string]interface{}, 0)  // result announcement data
	for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
		data := map[string]interface{}{
			"title":       tmp["title"],
			"projectname": tmp["projectname"],
		}
		channel := utils.ObjToString(tmp["channel"])
		qlmToptype := utils.ObjToString(tmp["qlm_toptype"])
		// all bidding data
		if IsInStringArray(channel, channels) {
			qlmData = append(qlmData, data)
		}
		if IsInStringArray(channel, predictionChannels) {
			preData = append(preData, data)
		}
		if IsInStringArray(channel, biddingChannels) {
			biddingData = append(biddingData, data)
		}
		if IsInStringArray(channel, resultChannels) {
			resultData = append(resultData, data)
		}
		// proposed-project data
		if IsInStringArray(channel, channels2) {
			njData = append(njData, data)
		}
		// property-rights data
		if IsInStringArray(channel, channels3) || qlmToptype == "产权" {
			cqData = append(cqData, data)
		}
	}
	log.Info("coverageA", zap.Int("千里马上周三总数", count))
	biddingWhere := map[string]interface{}{
		"publishtime": map[string]interface{}{
			"$gte": lastWednesday.AddDate(0, 0, -3).Unix(),
			"$lte": lastWednesday.AddDate(0, 0, 3).Unix(),
		},
	}
	biddingDatas, _ := MgoB.Find("bidding", biddingWhere, nil, map[string]interface{}{"title": 1, "projectname": 1}, false, -1, -1)
	log.Info("coverageA", zap.Int("标讯一周总数", len(*biddingDatas)))
	// store the titles and project names from the Jianyu result set in hash sets
	titlesInB, projectsInB := getUniqueFields(*biddingDatas)
	// 5.1.1 bidding - overall
	matches := countMatches(qlmData, titlesInB, projectsInB)
	matchesA := map[string]interface{}{
		"标讯整体": map[string]interface{}{
			"date":      lastWednesday.Format("2006-01-02"),
			"count":     len(qlmData),
			"match":     matches,
			"no-match":  len(qlmData) - matches,
			"qlm-total": count,
			"rate":      fmt.Sprintf("%.2f%%", float64(matches)/float64(len(qlmData))*100),
		},
	}
	dataCompete["千里马对剑鱼多出数比例(标讯)"] = fmt.Sprintf("%.2f%%", float64(len(qlmData)-matches)/float64(count)*100)
	// 5.1.2 bidding - tender forecasts
	matchesPre := countMatches(preData, titlesInB, projectsInB)
	matchesA["招标预告"] = map[string]interface{}{
		"match":    matchesPre,
		"no-match": len(preData) - matchesPre,
		"total":    len(preData),
		"rate":     fmt.Sprintf("%.2f%%", float64(matchesPre)/float64(len(preData))*100),
	}
	// 5.1.3 bidding - tender announcements
	matchBidding := countMatches(biddingData, titlesInB, projectsInB)
	matchesA["招标公告"] = map[string]interface{}{
		"match":    matchBidding,
		"no-match": len(biddingData) - matchBidding,
		"total":    len(biddingData),
		"rate":     fmt.Sprintf("%.2f%%", float64(matchBidding)/float64(len(biddingData))*100),
	}
	// 5.1.4 bidding - result announcements
	matchResult := countMatches(resultData, titlesInB, projectsInB)
	matchesA["结果公告"] = map[string]interface{}{
		"match":    matchResult,
		"no-match": len(resultData) - matchResult,
		"total":    len(resultData),
		"rate":     fmt.Sprintf("%.2f%%", float64(matchResult)/float64(len(resultData))*100),
	}
	dataCompete["剑鱼对千里马覆盖率(标讯)"] = matchesA
	log.Info("coverageA", zap.String("剑鱼对千里马覆盖率-标讯", "处理完毕"))
	// 5.2 proposed-project coverage
	matches2 := countMatches(njData, titlesInB, projectsInB)
	matchesB := map[string]interface{}{
		"match":    matches2,
		"total":    len(njData),
		"no-match": len(njData) - matches2,
		"date":     lastWednesday.Format("2006-01-02"),
		"rate":     fmt.Sprintf("%.2f%%", float64(matches2)/float64(len(njData))*100),
	}
	dataCompete["剑鱼对千里马覆盖率(拟建)"] = matchesB
	// 5.3 property-rights coverage
	matches3 := countMatches(cqData, titlesInB, projectsInB)
	matchesC := map[string]interface{}{
		"match":    matches3,
		"total":    len(cqData),
		"no-match": len(cqData) - matches3,
		"date":     lastWednesday.Format("2006-01-02"),
		"rate":     fmt.Sprintf("%.2f%%", float64(matches3)/float64(len(cqData))*100),
	}
	dataCompete["剑鱼对千里马覆盖率(产权)"] = matchesC
	log.Info("coverageA", zap.String("剑鱼对千里马覆盖率-产权", "处理完毕"))
}
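
// getUniqueFields, countMatches, IsInStringArray and SendMail are helpers defined elsewhere
// in this package. From the call sites above, the matching helpers are assumed to behave
// roughly like the sketch below (hash-set lookups on title / projectname); this is an
// illustrative sketch under that assumption, not the actual implementation:
//
//	func getUniqueFields(docs []map[string]interface{}) (titles, projects map[string]bool) {
//		titles, projects = map[string]bool{}, map[string]bool{}
//		for _, d := range docs {
//			if t := utils.ObjToString(d["title"]); t != "" {
//				titles[t] = true
//			}
//			if p := utils.ObjToString(d["projectname"]); p != "" {
//				projects[p] = true
//			}
//		}
//		return
//	}
//
//	// countMatches counts documents whose title or projectname appears in the given sets.
//	func countMatches(docs []map[string]interface{}, titles, projects map[string]bool) int {
//		n := 0
//		for _, d := range docs {
//			if titles[utils.ObjToString(d["title"])] || projects[utils.ObjToString(d["projectname"])] {
//				n++
//			}
//		}
//		return n
//	}
//
//	// IsInStringArray reports whether s is one of the values in arr.
//	func IsInStringArray(s string, arr []string) bool {
//		for _, v := range arr {
//			if v == s {
//				return true
//			}
//		}
//		return false
//	}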
// coverageB computes Qianlima's coverage of Jianyu data.
func coverageB() {
	sessB := MgoB.GetMgoConn()
	defer MgoB.DestoryMongoConn(sessB)
	lastWednesday := time.Date(Today.Year(), Today.Month(), Today.Day()-7, 0, 0, 0, 0, time.Local)
	lastThursday := time.Date(Today.Year(), Today.Month(), Today.Day()-6, 0, 0, 0, 0, time.Local)
	whereQlm := map[string]interface{}{
		"publishtime": map[string]interface{}{
			"$gt":  lastWednesday.Unix(),
			"$lte": lastThursday.Unix(),
		},
	}
	query := sessB.DB(GF.MongoB.DB).C("bidding").Find(whereQlm).Select(map[string]interface{}{"title": 1, "projectname": 1, "toptype": 1, "infoformat": 1}).Iter()
	count := 0
	qlmData := make([]map[string]interface{}, 0)     // all bidding data (Jianyu side in this function)
	njData := make([]map[string]interface{}, 0)      // proposed-project data
	cqData := make([]map[string]interface{}, 0)      // property-rights data
	preData := make([]map[string]interface{}, 0)     // tender forecast data
	biddingData := make([]map[string]interface{}, 0) // tender announcement data
	resultData := make([]map[string]interface{}, 0)  // result announcement data
	for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
		data := map[string]interface{}{
			"title":       tmp["title"],
			"projectname": tmp["projectname"],
		}
		toptype := utils.ObjToString(tmp["toptype"])
		// all bidding data
		if utils.IntAll(tmp["infoformat"]) == 1 {
			qlmData = append(qlmData, data)
		}
		if utils.IntAll(tmp["infoformat"]) == 2 {
			njData = append(njData, data)
		}
		if utils.IntAll(tmp["infoformat"]) == 3 {
			cqData = append(cqData, data)
		}
		if toptype == "预告" || toptype == "采购意向" {
			preData = append(preData, data)
		}
		if toptype == "招标" {
			biddingData = append(biddingData, data)
		}
		if toptype == "结果" {
			resultData = append(resultData, data)
		}
	}
	log.Info("coverageB", zap.Int("剑鱼上周三总数", count))
	biddingWhere := map[string]interface{}{
		"publishtime": map[string]interface{}{
			"$gte": lastWednesday.AddDate(0, 0, -3).Format("2006-01-02"),
			"$lte": lastWednesday.AddDate(0, 0, 3).Format("2006-01-02"),
		},
	}
	// competitor (qlm) database
	mgoQ := mongodb.MongodbSim{
		MongodbAddr: MgoC.MongodbAddr,
		DbName:      "qlm",
		Size:        10,
		UserName:    GF.MongoC.Username,
		Password:    GF.MongoC.Password,
		Direct:      GF.MongoC.Direct,
	}
	mgoQ.InitPool()
	biddingDatas, _ := mgoQ.Find("data_merge", biddingWhere, nil, map[string]interface{}{"title": 1, "projectname": 1}, false, -1, -1)
	log.Info("coverageB", zap.Int("千里马一周总数", len(*biddingDatas)))
	// store the titles and project names from the Qianlima result set in hash sets
	titlesInB, projectsInB := getUniqueFields(*biddingDatas)
	// 5.1.1 bidding - overall
	matches := countMatches(qlmData, titlesInB, projectsInB)
	matchesA := map[string]interface{}{
		"标讯整体": map[string]interface{}{
			"date":         lastWednesday.Format("2006-01-02"),
			"count":        len(qlmData),
			"match":        matches,
			"no-match":     len(qlmData) - matches,
			"jianyu-total": count,
			"rate":         fmt.Sprintf("%.2f%%", float64(matches)/float64(len(qlmData))*100),
		},
	}
	dataCompete["剑鱼对千里马多出数据量(标讯)"] = fmt.Sprintf("%.2f%%", float64(len(qlmData)-matches)/float64(count)*100)
	// 5.1.2 bidding - tender forecasts
	matchesPre := countMatches(preData, titlesInB, projectsInB)
	matchesA["招标预告"] = map[string]interface{}{
		"match":    matchesPre,
		"no-match": len(preData) - matchesPre,
		"total":    len(preData),
		"rate":     fmt.Sprintf("%.2f%%", float64(matchesPre)/float64(len(preData))*100),
	}
	// 5.1.3 bidding - tender announcements
	matchBidding := countMatches(biddingData, titlesInB, projectsInB)
	matchesA["招标公告"] = map[string]interface{}{
		"match":    matchBidding,
		"no-match": len(biddingData) - matchBidding,
		"total":    len(biddingData),
		"rate":     fmt.Sprintf("%.2f%%", float64(matchBidding)/float64(len(biddingData))*100),
	}
	// 5.1.4 bidding - result announcements
	matchResult := countMatches(resultData, titlesInB, projectsInB)
	matchesA["结果公告"] = map[string]interface{}{
		"match":    matchResult,
		"no-match": len(resultData) - matchResult,
		"total":    len(resultData),
		"rate":     fmt.Sprintf("%.2f%%", float64(matchResult)/float64(len(resultData))*100),
	}
	dataCompete["千里马对剑鱼覆盖率(标讯)"] = matchesA
	log.Info("coverageB", zap.String("千里马对剑鱼覆盖率-标讯", "处理完毕"))
	// 5.2 proposed-project coverage
	matches2 := countMatches(njData, titlesInB, projectsInB)
	matchesB := map[string]interface{}{
		"match":    matches2,
		"total":    len(njData),
		"no-match": len(njData) - matches2,
		"date":     lastWednesday.Format("2006-01-02"),
		"rate":     fmt.Sprintf("%.2f%%", float64(matches2)/float64(len(njData))*100),
	}
	dataCompete["千里马对剑鱼覆盖率(拟建)"] = matchesB
	// 5.3 property-rights coverage
	matches3 := countMatches(cqData, titlesInB, projectsInB)
	matchesC := map[string]interface{}{
		"match":    matches3,
		"total":    len(cqData),
		"no-match": len(cqData) - matches3,
		"date":     lastWednesday.Format("2006-01-02"),
		"rate":     fmt.Sprintf("%.2f%%", float64(matches3)/float64(len(cqData))*100),
	}
	dataCompete["千里马对剑鱼覆盖率(产权)"] = matchesC
	log.Info("coverageB", zap.String("千里马对剑鱼覆盖率-产权", "处理完毕"))
}
// getTimeLines computes the data timeliness metrics.
func getTimeLines() {
	type MaxDifference struct {
		Bidding    map[string]interface{}
		Difference int64
	}
	// keep the 1000 documents with the largest time difference, together with the bidding data
	var maxDifferences []MaxDifference
	quantileMap := make(map[string]int) // quantile statistics
	quantileTotal := 0
	whereAuditor := map[string]interface{}{
		"comeintime": map[string]interface{}{
			"$gt": time.Now().AddDate(0, -3, 0).Unix(),
		},
		"types": "审核",
	}
	// spider codes launched or maintained recently (the query looks back 3 months): lua and python scripts
	auditors, _ := MgoC.Find("lua_logs_auditor", whereAuditor, nil, map[string]interface{}{"code": 1, "comeintime": 1}, false, -1, -1)
	codeMap := make([]string, 0)
	for _, v := range *auditors {
		code := utils.ObjToString(v["code"])
		codeMap = append(codeMap, code)
	}
	auditors2, _ := MgoC.Find("python_logs_auditor", whereAuditor, nil, map[string]interface{}{"spidercode": 1, "comeintime": 1}, false, -1, -1)
	for _, v := range *auditors2 {
		code := utils.ObjToString(v["spidercode"])
		codeMap = append(codeMap, code)
	}
	log.Info("最近3天上架或者维护的采集", zap.Int("脚本总数是:", len(codeMap)))
	// 6. average end-to-end processing time (minutes)
	whereBidding := map[string]interface{}{
		"comeintime": map[string]interface{}{
			"$gt":  Yesterday.Unix(),
			"$lte": Today.Unix(),
		},
	}
	sessB := MgoB.GetMgoConn()
	defer MgoB.DestoryMongoConn(sessB)
	fd := bson.M{"extracttype": 1, "sensitive": 1, "dataging": 1, "site": 1, "infoformat": 1, "comeintime": 1, "pici": 1, "publishtime": 1, "competehref": 1, "attach_text": 1, "spidercode": 1, "href": 1, "title": 1, "projectname": 1}
	queryB := sessB.DB("qfw").C("bidding").Find(whereBidding).Select(fd).Iter()
	esCount := 0 // number of collected documents that should be indexed
	biddingRealCount := 0
	pici_publish_totaltime := int64(0) // sum of differences between indexing time (pici) and publishtime
	pici_comein_totaltime := int64(0)  // sum of differences between indexing time (pici) and comeintime
	for tmp := make(map[string]interface{}); queryB.Next(tmp); {
		if utils.IntAll(tmp["extracttype"]) != -1 && utils.ObjToString(tmp["sensitive"]) != "测试" && utils.IntAll(tmp["dataging"]) != 1 && utils.Float64All(tmp["infoformat"]) != 3 {
			comeintime := utils.Int64All(tmp["comeintime"])
			publishtime := utils.Int64All(tmp["publishtime"])
			pici := utils.Int64All(tmp["pici"])
			if pici == 0 || publishtime == 0 || comeintime == 0 {
				continue
			}
			if pici > 0 {
				esCount++
			}
			if (comeintime-publishtime) < 12*60*60 && pici > 0 {
				biddingRealCount++
				diff1 := pici - publishtime
				diff2 := pici - comeintime
				pici_publish_totaltime += diff1
				pici_comein_totaltime += diff2
			}
			// exclude competitor sites
			if !IsInStringArray(utils.ObjToString(tmp["site"]), competeSites) {
				diff := pici - publishtime
				if diff < 0 {
					continue
				} else if diff < 5*60 {
					quantileMap["a1"]++
				} else if diff < 15*60 {
					quantileMap["a2"]++
				} else if diff < 30*60 {
					quantileMap["a3"]++
				} else if diff < 60*60 {
					quantileMap["a4"]++
				} else if diff < 3*60*60 {
					quantileMap["a5"]++
				} else if diff < 7*60*60 {
					quantileMap["a6"]++
				} else if diff < 15*60*60 {
					quantileMap["a7"]++
				} else if diff < 24*60*60 {
					quantileMap["a8"]++
				} else if diff < 48*60*60 {
					quantileMap["a9"]++
				} else if diff < 72*60*60 {
					quantileMap["a10"]++
				} else {
					quantileMap["a11"]++
				}
				quantileTotal++
				spiderCode := utils.ObjToString(tmp["spidercode"])
				// only count spiders that were not recently launched or maintained (not in codeMap)
				if !IsInStringArray(spiderCode, codeMap) {
					if diff > 7*24*3600 {
						continue
					}
					// if the slice is not full yet, just append
					if len(maxDifferences) < 1000 {
						maxDifferences = append(maxDifferences, MaxDifference{Bidding: tmp, Difference: diff})
					} else {
						// otherwise replace the smallest value in the slice
						minIndex := 0
						for j, d := range maxDifferences {
							if d.Difference < maxDifferences[minIndex].Difference {
								minIndex = j
							}
						}
						if diff > maxDifferences[minIndex].Difference {
							maxDifferences[minIndex] = MaxDifference{Bidding: tmp, Difference: diff}
						}
					}
				}
			}
		}
		tmp = make(map[string]interface{}) // reset so fields from the previous document don't leak into the next
	}
	// aggregate the top 1000 largest time differences
	diffTotal := int64(0)
	for k := range maxDifferences {
		data := maxDifferences[k]
		diffTotal += data.Difference
		data.Bidding["top_time"] = time.Now().Format("2006-01-02")
		data.Bidding["diff"] = data.Difference
		MgoB.SaveByOriID("bidding_top_1000", data.Bidding)
	}
	dataTime["数据时效极值(用时最长)top1000"] = fmt.Sprintf("%.2f", float64(diffTotal)/float64(1000*60))
	quantileData := make(map[string]interface{})
	// timeliness quantiles
	for k, v := range quantileMap {
		quantileData[k] = fmt.Sprintf("%.2f", float64(v)/float64(quantileTotal))
	}
	dataTime["数据时效分位数统计"] = quantileData
	dataCollection["数据采集日索引量"] = esCount // collection metric: daily indexed volume
	if biddingRealCount > 0 {
		pici_publish_avgtime := pici_publish_totaltime / int64(biddingRealCount)
		pici_comein_avgtime := pici_comein_totaltime / int64(biddingRealCount)
		dataTime["数据整体流程均耗时(分钟)"] = fmt.Sprintf("%.2f", float64(pici_publish_avgtime)/float64(60))
		dataTime["数据处理均耗时(分钟)"] = fmt.Sprintf("%.2f", float64(pici_comein_avgtime)/float64(60))
		dataTime["数据采集均耗时(分钟)"] = fmt.Sprintf("%.2f", float64(pici_publish_avgtime-pici_comein_avgtime)/float64(60))
	}
}
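
// The top-1000 selection in getTimeLines scans the whole slice for the current minimum on
// every replacement (O(n*1000) overall). A min-heap on Difference would do the same job in
// O(n log 1000). A sketch of that alternative, assuming MaxDifference were lifted to package
// scope and "container/heap" added to the imports (hypothetical helper, not used above):
//
//	type diffHeap []MaxDifference
//
//	func (h diffHeap) Len() int            { return len(h) }
//	func (h diffHeap) Less(i, j int) bool  { return h[i].Difference < h[j].Difference }
//	func (h diffHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
//	func (h *diffHeap) Push(x interface{}) { *h = append(*h, x.(MaxDifference)) }
//	func (h *diffHeap) Pop() interface{} {
//		old := *h
//		x := old[len(old)-1]
//		*h = old[:len(old)-1]
//		return x
//	}
//
//	// pushTopK keeps only the k entries with the largest Difference.
//	func pushTopK(h *diffHeap, d MaxDifference, k int) {
//		if h.Len() < k {
//			heap.Push(h, d)
//		} else if d.Difference > (*h)[0].Difference {
//			(*h)[0] = d
//			heap.Fix(h, 0)
//		}
//	}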
// getCollectionData computes the data source inclusion metrics.
func getCollectionData() {
	// 1. number of newly included data sources
	newCollectionWhere := map[string]interface{}{
		"comeintime": map[string]interface{}{
			"$gt": Yesterday.Unix(),
		},
	}
	newCount := MgoC.Count("site", newCollectionWhere)
	dataSource["新收录数据源数量"] = newCount
	// 2. total number of included data sources
	siteCount := MgoC.Count("site", nil)
	dataSource["已收录数据源数量"] = siteCount
	// 3. number of data sources pending development
	whereConfig := map[string]interface{}{
		"state": 0,
	}
	unSiteCount := int64(0) // data sources pending development
	unSites, _ := MgoC.Find("luaconfig", whereConfig, nil, nil, false, -1, -1)
	if len(*unSites) > 0 {
		for _, v := range *unSites {
			code := utils.ObjToString(v["code"])
			num := MgoC.Count("lua_logs_auditor", map[string]interface{}{"code": code})
			if num == 0 {
				unSiteCount++
			}
		}
	}
	dataSource["待开发数据源数量"] = unSiteCount
	// 4. number of data sources per site category
	// nested map: primary category -> secondary category -> count
	categoryCounts := make(map[string]map[string]int)
	classes, _ := MgoC.Find("site", nil, nil, map[string]interface{}{"site_type": 1, "second_type": 1}, false, -1, -1)
	for _, v := range *classes {
		siteType := utils.ObjToString(v["site_type"])
		secondType := utils.ObjToString(v["second_type"])
		if _, ok := categoryCounts[siteType]; !ok {
			categoryCounts[siteType] = make(map[string]int)
		}
		categoryCounts[siteType][secondType]++
	}
	dataSource["网站分类数据源数量"] = categoryCounts
	// 5. full-coverage collection rate (应采尽采率), hard-coded from config
	dataSource["应采尽采率"] = GF.Cron.CollectionRate
}
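
// For reference, a rough sketch of the document this job writes to the bidding_zhibiao
// collection on each run (keys taken from the assignments above; values are placeholders):
//
//	{
//		"日期":          "2006-01-02",
//		"数据采集指标":   {"数据采集日采集量": ..., "爬虫总量": ..., "爬虫日异常量": ..., "爬虫日异常量比例": "x.xx%", "数据采集日索引量": ...},
//		"竞品对比指标":   {"剑鱼对千里马覆盖率(标讯)": {...}, "千里马对剑鱼覆盖率(标讯)": {...}, ...},
//		"数据时效指标":   {"数据整体流程均耗时(分钟)": "x.xx", "数据时效分位数统计": {...}, ...},
//		"数据质量指标":   {"数据行质量合格率": ...},
//		"数据源收录指标": {"新收录数据源数量": ..., "已收录数据源数量": ..., "待开发数据源数量": ..., "网站分类数据源数量": {...}, "应采尽采率": ...},
//	}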