// sitecount.go
package luatask

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"strconv"
	"sync"
	"time"

	"github.com/donnie4w/go-logger/logger"
	"github.com/imroc/req"
	"github.com/tealeg/xlsx"

	qu "qfw/util"
	"util"
)

// SiteInfo aggregates one site's daily collection statistics.
type SiteInfo struct {
	Site              string `json:"site"`              // site name (the original tag was empty; "site" is assumed intended)
	Num               int    `json:"averagenum"`        // average number of items the site publishes per day
	Modifyuser        string `json:"modifyuser"`        // maintainer
	State             string `json:"state"`             // site status
	Domain            string `json:"domain"`            // domain
	Stype             string `json:"stype"`             // site type
	Platform          string `json:"platform"`          // platform the site belongs to
	Coverage          string `json:"coverage"`          // coverage rate
	ListAllNum        int    `json:"listallnum"`        // href-deduplicated: volume collected today
	ListSuccessNum    int    `json:"listsuccessnum"`    // href-deduplicated: volume collected successfully today
	PTimeSuccessNum   int    `json:"ptimesuccessnum"`   // href-deduplicated: volume published today and collected successfully
	PTimeSuccessDbNum int    `json:"ptimesuccessdbnum"` // href-deduplicated: volume in data_bak published today and collected successfully
	Comeintime        int64  `json:"comeintime"`        // time this statistics record was created
}

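// A site_baseinfo document is mapped onto SiteInfo through the json tags
// above (the bson map is marshalled, then unmarshalled into the struct).
// An illustrative, hypothetical document:
//
//	{
//	    "site": "example-site",
//	    "averagenum": 200,
//	    "modifyuser": "alice",
//	    "state": "online",
//	    "domain": "example.com",
//	    "stype": "portal",
//	    "platform": "lua"
//	}
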
// SiteInfoModel is the WeChat Work webhook payload for a file message; the
// placeholder is filled with the uploaded workbook's media_id.
var SiteInfoModel = `{
	"msgtype": "file",
	"file": {
		"media_id": "%s"
	}
}`

// Daily download totals for the Lua and Python platforms; SendLuaPythonAllNum
// reports and then resets them.
var LuaListDownloadAllNum int64
var LuaListDownloadSuccessAllNum int64
var LuaBiddingDownloadAllNum int64
var PythonListDownloadAllNum int64
var PythonListDownloadSuccessAllNum int64
var PythonBiddingDownloadAllNum int64

// LuaPythonNumModel is the WeChat Work webhook payload for a plain-text message.
var LuaPythonNumModel = `{
	"msgtype": "text",
	"text": {
		"content": "%s"
	}
}`

// MarkdownModel is the WeChat Work webhook payload for a markdown message.
var MarkdownModel = `{
	"msgtype": "markdown",
	"markdown": {
		"content": "%s"
	}
}`

// NumContentModel is the markdown body for one platform's daily summary.
var NumContentModel = `
>Platform: <font color=\"warning\">%s</font>
>List pages collected: <font color=\"warning\">%d</font>
>List pages collected successfully: <font color=\"warning\">%d</font>\n
>Bidding successes: <font color=\"warning\">%d</font>\n
`

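// For reference, a fully assembled markdown payload built from MarkdownModel
// and NumContentModel looks roughly like this (values illustrative):
//
//	{
//	    "msgtype": "markdown",
//	    "markdown": {
//	        "content": ">Platform: <font color=\"warning\">Lua</font>\n>List pages collected: <font color=\"warning\">12345</font>\n..."
//	    }
//	}
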
//var AllHref map[string]string

// SendInfoToWxWork_SiteDataCount compiles the daily collection statistics for
// the key sites and pushes the resulting workbook to WeChat Work.
func SendInfoToWxWork_SiteDataCount() {
	defer qu.Catch()
	//AllHref = map[string]string{} // initialize
	// I. Collect the statistics
	// 1. Load the sites' base information
	//siteInfoMap := map[string]*SiteInfo{}
	//siteInfoMap_Back := map[string]*SiteInfo{}
	allSpiderMap := map[string]*SiteInfo{}
	list, _ := util.MgoE.Find("site_baseinfo", nil, nil, nil, false, -1, -1)
	for _, l := range *list {
		site := qu.ObjToString(l["site"])
		vByte, _ := json.Marshal(l)
		//siteInfo1 := &SiteInfo{}
		//siteInfo2 := &SiteInfo{}
		siteInfo3 := &SiteInfo{}
		//json.Unmarshal(vByte, siteInfo1)
		//json.Unmarshal(vByte, siteInfo2)
		json.Unmarshal(vByte, siteInfo3)
		//siteInfoMap[site] = siteInfo1
		//siteInfoMap_Back[site] = siteInfo2
		siteInfo3.Comeintime = time.Now().Unix()
		allSpiderMap[site] = siteInfo3
	}
	//stime := util.GetTime(-1)
	//etime := util.GetTime(0)
	//ptime := qu.FormatDateByInt64(&stime, qu.Date_Short_Layout)
	//qu.Debug(stime, etime, ptime)
	// 2. Count per-site volume in spider_highlistdata
	//GetHighListDataNum(stime, etime, ptime, siteInfoMap)
	// 3. Count per-site volume in spider_listdata
	//GetListDataNum(stime, etime, ptime, siteInfoMap)
	// 4. Count data_bak
	//GetDataBakNum(stime, etime, siteInfoMap)
	// 5. Count data_bak separately
	//GetDataBakNum_Back(stime, etime, siteInfoMap_Back)
	// 6. Aggregate the per-spidercode results
	GetAllSpidercodeNum(allSpiderMap)
	// 7. Summarize into the Excel workbook
	//GetSiteInfoExcel(siteInfoMap, siteInfoMap_Back, allSpiderMap)
	GetSiteInfoExcel(allSpiderMap)
}

// GetAllSpidercodeNum aggregates today's per-spidercode counters from the
// luacodeinfo collection into the per-site SiteInfo map.
func GetAllSpidercodeNum(siteInfoMap map[string]*SiteInfo) {
	defer qu.Catch()
	logger.Info("aggregating crawl volume from luacodeinfo...")
	sess := util.MgoE.GetMgoConn()
	defer util.MgoE.DestoryMongoConn(sess)
	query := map[string]interface{}{
		"comeintime": map[string]interface{}{
			"$gte": util.GetTime(0),
		},
	}
	fields := map[string]interface{}{
		"repeatptimesuccessnum":    1,
		"repeatptimesuccessdbnum":  1,
		"repeatdownloadallnum":     1,
		"repeatdownloadsuccessnum": 1,
		"site":                     1,
		"platform":                 1,
	}
	lock := &sync.Mutex{}
	wg := &sync.WaitGroup{}
	ch := make(chan bool, 5) // semaphore: at most 5 concurrent workers
	it := sess.DB(util.MgoE.DbName).C("luacodeinfo").Find(&query).Select(&fields).Iter()
	n := 0
	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
		wg.Add(1)
		ch <- true
		go func(tmp map[string]interface{}) {
			defer func() {
				<-ch
				wg.Done()
			}()
			site := qu.ObjToString(tmp["site"])
			platform := qu.ObjToString(tmp["platform"])
			repeatdownloadallnum := qu.IntAll(tmp["repeatdownloadallnum"])
			repeatdownloadsuccessnum := qu.IntAll(tmp["repeatdownloadsuccessnum"])
			repeatptimesuccessnum := qu.IntAll(tmp["repeatptimesuccessnum"])
			repeatptimesuccessdbnum := qu.IntAll(tmp["repeatptimesuccessdbnum"])
			if platform == "python" {
				site = site + "(python)"
			}
			lock.Lock()
			if info := siteInfoMap[site]; info != nil { // only count the key sites being tracked
				info.ListAllNum += repeatdownloadallnum
				info.ListSuccessNum += repeatdownloadsuccessnum
				info.PTimeSuccessNum += repeatptimesuccessnum
				info.PTimeSuccessDbNum += repeatptimesuccessdbnum
			}
			lock.Unlock()
		}(tmp)
		if n%100 == 0 {
			logger.Debug(n)
		}
		tmp = map[string]interface{}{}
	}
	wg.Wait()
	logger.Debug("finished aggregating crawl volume from luacodeinfo")
}

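// GetAllSpidercodeNum bounds its fan-out with a buffered channel used as a
// counting semaphore. A minimal, self-contained sketch of the same pattern
// (jobs and process are hypothetical, not part of this package):
//
//	sem := make(chan bool, 5) // at most 5 workers in flight
//	wg := &sync.WaitGroup{}
//	for _, job := range jobs {
//		wg.Add(1)
//		sem <- true // acquire a slot; blocks while 5 workers are running
//		go func(j string) {
//			defer func() { <-sem; wg.Done() }() // release the slot
//			process(j)
//		}(job)
//	}
//	wg.Wait() // wait for in-flight workers before returning
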
// GetSiteInfoExcel writes the per-site statistics into res/sitecount.xlsx,
// color-coding the coverage column, saves the rows to site_datacount, and
// forwards the workbook to WeChat Work.
func GetSiteInfoExcel(allSpiderInfo map[string]*SiteInfo) {
	defer qu.Catch()
	file, err := xlsx.OpenFile("res/sitecount.xlsx")
	if err != nil {
		qu.Debug("open file err:", err)
		return
	}
	sheet := file.Sheets[0]
	arr := []map[string]interface{}{}
	for site, info := range allSpiderInfo {
		style := xlsx.NewStyle()
		style.ApplyFill = true
		style.ApplyFont = true
		font := *xlsx.NewFont(10, "Verdana")
		style.Font = font
		row := sheet.AddRow()
		row.AddCell().SetValue(site)
		row.AddCell().SetValue(info.Num)
		row.AddCell().SetValue(info.ListAllNum)
		row.AddCell().SetValue(info.ListSuccessNum)
		row.AddCell().SetValue(info.PTimeSuccessNum)
		row.AddCell().SetValue(info.PTimeSuccessDbNum)
		coverage := 0.0
		if info.Num > 0 { // guard against division by zero for sites with no baseline
			coverage = float64(info.PTimeSuccessNum) / float64(info.Num)
		}
		fill := &xlsx.Fill{
			PatternType: "solid",
		}
		// color the coverage cell by band: red <60%, amber 60-80%, yellow 80-100%, green >=100%
		if coverage < 0.6 {
			fill.FgColor = "00FF0000"
			fill.BgColor = "FF000000"
		} else if coverage >= 0.6 && coverage < 0.8 {
			fill.FgColor = "00FFCC00"
			fill.BgColor = "FF000000"
		} else if coverage >= 0.8 && coverage < 1.0 {
			fill.FgColor = "00FFFF00"
			fill.BgColor = "FF000000"
		} else {
			fill.FgColor = "0066FF33"
			fill.BgColor = "FF000000"
		}
		style.Fill = *fill
		// Sprintf("%.2f") rounds to two decimals; ParseFloat then drops
		// trailing zeros (e.g. 0.8 -> "80%", 0.8567 -> "85.67%").
		value, _ := strconv.ParseFloat(fmt.Sprintf("%.2f", coverage*100), 64)
		result := fmt.Sprint(value) + "%"
		info.Coverage = result
		cell := row.AddCell()
		cell.SetValue(result)
		cell.SetStyle(style)
		row.AddCell().SetValue(info.Modifyuser)
		row.AddCell().SetValue(info.State)
		row.AddCell().SetValue(info.Domain)
		row.AddCell().SetValue(info.Stype)
		row.AddCell().SetValue(info.Platform)
		tmp := map[string]interface{}{}
		infoByte, err := json.Marshal(*info)
		if err == nil {
			if uerr := json.Unmarshal(infoByte, &tmp); uerr == nil {
				arr = append(arr, tmp)
			} else {
				logger.Info("json unmarshal:", uerr)
			}
		} else {
			logger.Info("json marshal:", err)
		}
	}
	util.MgoE.SaveBulk("site_datacount", arr...)
	arr = []map[string]interface{}{}
	//file.Save("res/tmp.xlsx")
	SendSiteInfoToWxWork(file)
}

// SendSiteInfoToWxWork uploads the workbook to the WeChat Work media endpoint
// and then sends it to the group chat as a file message.
func SendSiteInfoToWxWork(file *xlsx.File) {
	defer qu.Catch()
	//file, err := xlsx.OpenFile("res/sitecount.xlsx")
	//if err != nil {
	//	qu.Debug("open file err:", err)
	//	return
	//}
	mw := &util.MyWrite{
		Byte: &bytes.Buffer{},
	}
	file.Write(mw)
	bt := mw.Byte.Bytes()
	client := req.C()
	// attachment name; "重点网站数据量统计" = "key-site data volume statistics"
	filename := Publishtime + "重点网站数据量统计.xlsx"
	resp, err := client.R().
		SetHeader("Content-Type", "multipart/form-data").
		//SetFiles(map[string]string{"file": string(resp4.Bytes())}).
		SetFileReader("file", filename, bytes.NewReader(bt)).
		Post("https://qyapi.weixin.qq.com/cgi-bin/webhook/upload_media?key=97850772-88d0-4544-a2c3-6201aeddff9e&type=file")
	if err != nil { // the original discarded this error and could panic on a nil response
		qu.Debug("upload media err:", err)
		return
	}
	result := map[string]interface{}{}
	mediaId := ""
	if json.Unmarshal(resp.Bytes(), &result) == nil {
		mediaId = qu.ObjToString(result["media_id"])
	} else {
		qu.Debug("unmarshal result err")
	}
	msg := fmt.Sprintf(SiteInfoModel, mediaId)
	resp1, err := http.Post(
		"https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=97850772-88d0-4544-a2c3-6201aeddff9e",
		"application/json",
		bytes.NewBuffer([]byte(msg)),
	)
	if err != nil {
		fmt.Println("request error:", err)
		return
	}
	defer resp1.Body.Close()
}

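// Note: per the WeChat Work webhook docs, upload_media responds with JSON
// like {"errcode":0,"errmsg":"ok","type":"file","media_id":"...","created_at":"..."};
// only media_id is consumed above.
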
// SendLuaPythonAllNum pushes the daily Lua/Python totals to WeChat Work,
// persists them to spider_luapythoncount, and resets the counters.
func SendLuaPythonAllNum() {
	defer qu.Catch()
	luaContent := fmt.Sprintf(NumContentModel, "Lua", LuaListDownloadAllNum, LuaListDownloadSuccessAllNum, LuaBiddingDownloadAllNum)
	pythonContent := fmt.Sprintf(NumContentModel, "python", PythonListDownloadAllNum, PythonListDownloadSuccessAllNum, PythonBiddingDownloadAllNum)
	// The date header is placed inside the markdown content so the webhook
	// body stays valid JSON (the original prepended it to the payload itself,
	// which the endpoint would reject).
	resultContent := fmt.Sprintf(MarkdownModel, Publishtime+" Lua/Python statistics by dimension:\\n"+luaContent+pythonContent)
	qu.Debug(resultContent)
	// persist the record
	util.MgoS.Save("spider_luapythoncount", map[string]interface{}{
		"lualistnum":           LuaListDownloadAllNum,
		"lualistsuccessnum":    LuaListDownloadSuccessAllNum,
		"luabiddingnum":        LuaBiddingDownloadAllNum,
		"pythonlistnum":        PythonListDownloadAllNum,
		"pythonlistsuccessnum": PythonListDownloadSuccessAllNum,
		"pythonbiddingnum":     PythonBiddingDownloadAllNum,
		"comeintime":           time.Now().Unix(),
		"date":                 Publishtime,
	})
	// reset the counters
	LuaListDownloadAllNum = 0
	LuaListDownloadSuccessAllNum = 0
	LuaBiddingDownloadAllNum = 0
	PythonListDownloadAllNum = 0
	PythonListDownloadSuccessAllNum = 0
	PythonBiddingDownloadAllNum = 0
	// send the summary
	resp, err := http.Post(
		"https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=97850772-88d0-4544-a2c3-6201aeddff9e",
		"application/json",
		bytes.NewBuffer([]byte(resultContent)),
	)
	if err != nil {
		fmt.Println("request error:", err)
		return
	}
	defer resp.Body.Close()
}

//func GetHighListDataNum(ctime, etime int64, ptime string, siteInfoMap map[string]*SiteInfo) {
//	defer qu.Catch()
//	sess := util.MgoS.GetMgoConn()
//	defer util.MgoS.DestoryMongoConn(sess)
//	query := map[string]interface{}{
//		"comeintime": map[string]interface{}{
//			"$gte": ctime,
//			"$lt":  etime,
//		},
//		//"publishtime": map[string]interface{}{
//		//	"$gte": ctime,
//		//	"$lt":  1654617600,
//		//},
//		"publishtime": map[string]interface{}{
//			"$regex": ptime,
//		},
//	}
//	fields := map[string]interface{}{
//		"state": 1,
//		"href":  1,
//		"site":  1,
//	}
//	lock := &sync.Mutex{}
//	wg := &sync.WaitGroup{}
//	ch := make(chan bool, 3)
//	qu.Debug("query:", query)
//	it := sess.DB(util.MgoS.DbName).C("spider_highlistdata").Find(&query).Select(&fields).Iter()
//	n := 0
//	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
//		wg.Add(1)
//		ch <- true
//		go func(tmp map[string]interface{}) {
//			defer func() {
//				<-ch
//				wg.Done()
//			}()
//			href := qu.ObjToString(tmp["href"])
//			state := qu.IntAll(tmp["state"])
//			site := qu.ObjToString(tmp["site"])
//			lock.Lock()
//			if info := siteInfoMap[site]; info != nil && AllHref[href] != site { // only count the key sites being tracked
//				info.ListAllNum++
//				if state == 1 {
//					info.ListSuccessNum++
//				}
//				AllHref[href] = site
//			}
//			lock.Unlock()
//		}(tmp)
//		if n%1000 == 0 {
//			logger.Debug(n)
//		}
//		tmp = map[string]interface{}{}
//	}
//	wg.Wait()
//}
//
//func GetListDataNum(ctime, etime int64, ptime string, siteInfoMap map[string]*SiteInfo) {
//	defer qu.Catch()
//	sess := util.MgoS.GetMgoConn()
//	defer util.MgoS.DestoryMongoConn(sess)
//	query := map[string]interface{}{
//		"comeintime": map[string]interface{}{
//			"$gte": ctime,
//			"$lt":  etime,
//		},
//		"publishtime": map[string]interface{}{
//			"$regex": ptime,
//		},
//	}
//	fields := map[string]interface{}{
//		"state": 1,
//		"href":  1,
//		"site":  1,
//	}
//	lock := &sync.Mutex{}
//	wg := &sync.WaitGroup{}
//	ch := make(chan bool, 3)
//	it := sess.DB(util.MgoS.DbName).C("spider_listdata").Find(&query).Select(&fields).Iter()
//	n := 0
//	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
//		wg.Add(1)
//		ch <- true
//		go func(tmp map[string]interface{}) {
//			defer func() {
//				<-ch
//				wg.Done()
//			}()
//			href := qu.ObjToString(tmp["href"])
//			state := qu.IntAll(tmp["state"])
//			site := qu.ObjToString(tmp["site"])
//			lock.Lock()
//			if info := siteInfoMap[site]; info != nil && AllHref[href] != site { // only count the key sites being tracked
//				info.ListAllNum++
//				if state == 1 {
//					info.ListSuccessNum++
//				}
//				AllHref[href] = site
//			}
//			lock.Unlock()
//		}(tmp)
//		if n%1000 == 0 {
//			logger.Debug(n)
//		}
//		tmp = map[string]interface{}{}
//	}
//	wg.Wait()
//}
//
//func GetDataBakNum(stime, etime int64, siteInfoMap map[string]*SiteInfo) {
//	defer qu.Catch()
//	// Sites whose list pages cannot be counted from spider_highlistdata or
//	// spider_listdata are counted via data_bak instead. Known issues:
//	// 1. list entries without a publish time are never counted;
//	// 2. if every site were re-counted from data_bak after the list-based
//	//    pass, entries whose list/detail URLs changed would be counted twice.
//	for site, info := range siteInfoMap {
//		if info.ListAllNum == 0 {
//			query := map[string]interface{}{
//				"comeintime": map[string]interface{}{
//					"$gte": stime,
//					"$lt":  etime,
//				},
//				"publishtime": map[string]interface{}{
//					"$gte": stime,
//					"$lt":  etime,
//				},
//				"site": site,
//			}
//			count := util.MgoS.Count("data_bak", query)
//			info.ListAllNum = count
//			info.ListSuccessNum = count
//		}
//	}
//	AllHref = map[string]string{}
//}
//
//func GetDataBakNum_Back(stime, etime int64, siteInfoMap map[string]*SiteInfo) {
//	defer qu.Catch()
//	logger.Info("separate data_bak count: start...")
//	sess := util.MgoS.GetMgoConn()
//	defer util.MgoS.DestoryMongoConn(sess)
//	query := map[string]interface{}{
//		"comeintime": map[string]interface{}{
//			"$gte": stime,
//			"$lt":  etime,
//		},
//		"publishtime": map[string]interface{}{
//			"$gte": stime,
//			"$lt":  etime,
//		},
//	}
//	fields := map[string]interface{}{
//		"href": 1,
//		"site": 1,
//	}
//	lock := &sync.Mutex{}
//	wg := &sync.WaitGroup{}
//	ch := make(chan bool, 3)
//	it := sess.DB(util.MgoS.DbName).C("data_bak").Find(&query).Select(&fields).Iter()
//	n := 0
//	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
//		wg.Add(1)
//		ch <- true
//		go func(tmp map[string]interface{}) {
//			defer func() {
//				<-ch
//				wg.Done()
//			}()
//			href := qu.ObjToString(tmp["href"])
//			site := qu.ObjToString(tmp["site"])
//			lock.Lock()
//			if info := siteInfoMap[site]; info != nil && AllHref[href] != site { // only count the key sites being tracked
//				info.ListAllNum++
//				info.ListSuccessNum++
//				AllHref[href] = site
//			}
//			lock.Unlock()
//		}(tmp)
//		if n%1000 == 0 {
//			logger.Debug(n)
//		}
//		tmp = map[string]interface{}{}
//	}
//	wg.Wait()
//	logger.Info("separate data_bak count: done")
//}
//func GetSiteInfoExcel(listInfo, backInfo, allSpiderInfo map[string]*SiteInfo) {
//	defer qu.Catch()
//	file, err := xlsx.OpenFile("res/sitecount.xlsx")
//	if err != nil {
//		qu.Debug("open file err:", err)
//		return
//	}
//	infoArr := []map[string]*SiteInfo{listInfo, backInfo, allSpiderInfo}
//	for i, sheet := range []*xlsx.Sheet{file.Sheets[0], file.Sheets[1], file.Sheets[2]} {
//		for site, info := range infoArr[i] {
//			row := sheet.AddRow()
//			row.AddCell().SetValue(site)
//			row.AddCell().SetValue(info.Num)
//			row.AddCell().SetValue(info.ListAllNum)
//			row.AddCell().SetValue(info.ListSuccessNum)
//			if i == 2 {
//				row.AddCell().SetValue(info.PTimeSuccessNum)
//			}
//			row.AddCell().SetValue(0)
//			row.AddCell().SetValue(info.Modifyuser)
//			row.AddCell().SetValue(info.State)
//			row.AddCell().SetValue(info.Domain)
//			row.AddCell().SetValue(info.Stype)
//			row.AddCell().SetValue(info.Platform)
//		}
//	}
//	//SendSiteInfoToWxWork(file)
//	file.Save("res/tmp.xlsx")
//}