|
@@ -16,19 +16,21 @@ import (
|
|
)
|
|
)
|
|
|
|
|
|
type SiteInfo struct {
|
|
type SiteInfo struct {
|
|
- Site string `json:""` //站点
|
|
|
|
- Num int `json:"averagenum"` //每日网站发布平均量
|
|
|
|
- Modifyuser string `json:"modifyuser"` //维护人
|
|
|
|
- State string `json:"state"` //网站状态
|
|
|
|
- Domain string `json:"domain"` //域名
|
|
|
|
- Stype string `json:"stype"` //网站类型
|
|
|
|
- Platform string `json:"platform"` //所属平台
|
|
|
|
- Coverage string `json:"coverage"` //覆盖率
|
|
|
|
- ListAllNum int `json:"listallnum"` //href去重,当天采集数据量
|
|
|
|
- ListSuccessNum int `json:"listsuccessnum"` //href去重,当天采集成功数据量
|
|
|
|
- PTimeSuccessNum int `json:"ptimesuccessnum"` //href去重,当天发布采集成功数据量
|
|
|
|
- PTimeSuccessDbNum int `json:"ptimesuccessdbnum"` //href去重,data_bak当天发布采集成功数据量
|
|
|
|
- Comeintime int64 `json:"comeintime"` //href去重,当天发布采集成功数据量
|
|
|
|
|
|
+ Site string `json:"site"` //站点
|
|
|
|
+ Num int `json:"averagenum"` //每日网站发布平均量
|
|
|
|
+ Modifyuser string `json:"modifyuser"` //维护人
|
|
|
|
+ State string `json:"state"` //网站状态
|
|
|
|
+ Domain string `json:"domain"` //域名
|
|
|
|
+ Stype string `json:"stype"` //网站类型
|
|
|
|
+ Platform string `json:"platform"` //所属平台
|
|
|
|
+ Coverage string `json:"coverage"` //覆盖率
|
|
|
|
+ ListAllNum int `json:"listallnum"` //href去重,当天采集数据量
|
|
|
|
+ ListSuccessNum int `json:"listsuccessnum"` //href去重,当天采集成功数据量
|
|
|
|
+ PTimeSuccessNum int `json:"ptimesuccessnum"` //href去重,当天发布采集成功数据量
|
|
|
|
+ PTimeSuccessDbNum int `json:"ptimesuccessdbnum"` //href去重,data_bak当天发布采集成功数据量
|
|
|
|
+ ThreeDaysAgoNum int `json:"threedaysagonum"` //三天前当天的数据量再次统计(有些站点发布延迟导致当天数据量不准确,再次统计)
|
|
|
|
+ BeforeThreeDaysAgoNum int `json:"beforethreedaysagonum"` //三天前当天的数据量历史统计
|
|
|
|
+ Comeintime int64 `json:"comeintime"` //统计数据入库时间戳(原注释为复制错误)
|
|
}
|
|
}
|
|
|
|
|
|
var SiteInfoModel = `{
|
|
var SiteInfoModel = `{
|
|
@@ -106,8 +108,10 @@ func SendInfoToWxWork_SiteDataCount() {
|
|
GetAllSpidercodeNum(allSpiderMap)
|
|
GetAllSpidercodeNum(allSpiderMap)
|
|
//6、汇总excel
|
|
//6、汇总excel
|
|
//GetSiteInfoExcel(siteInfoMap, siteInfoMap_Back, allSpiderMap)
|
|
//GetSiteInfoExcel(siteInfoMap, siteInfoMap_Back, allSpiderMap)
|
|
- GetSiteInfoExcel(allSpiderMap)
|
|
|
|
|
|
+ day := GetThreeDaysAgoNum(allSpiderMap)
|
|
|
|
+ GetSiteInfoExcel(allSpiderMap, day)
|
|
}
|
|
}
|
|
|
|
+
|
|
func GetAllSpidercodeNum(siteInfoMap map[string]*SiteInfo) {
|
|
func GetAllSpidercodeNum(siteInfoMap map[string]*SiteInfo) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
logger.Info("统计采集量luacodeinfo开始...")
|
|
logger.Info("统计采集量luacodeinfo开始...")
|
|
@@ -166,7 +170,199 @@ func GetAllSpidercodeNum(siteInfoMap map[string]*SiteInfo) {
|
|
logger.Debug("统计采集量luacodeinfo完成...")
|
|
logger.Debug("统计采集量luacodeinfo完成...")
|
|
}
|
|
}
|
|
|
|
|
|
-func GetSiteInfoExcel(allSpiderInfo map[string]*SiteInfo) {
|
|
|
|
|
|
+func GetThreeDaysAgoNum(siteInfoMap map[string]*SiteInfo) (strStime string) {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ //1、获取三个工作日之前的日期
|
|
|
|
+ baseDay := 3
|
|
|
|
+ for i := 1; i <= baseDay; i++ { //除去三天内周六周日
|
|
|
|
+ beforDay := time.Now().AddDate(0, 0, -i)
|
|
|
|
+ if weekDay := beforDay.Weekday().String(); weekDay == "Saturday" || weekDay == "Sunday" {
|
|
|
|
+ baseDay++
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ logger.Info("baseday:", baseDay)
|
|
|
|
+ stime := util.GetTime(-baseDay) //起始时间戳(三个工作日前)
|
|
|
|
+ strStime = qu.FormatDateByInt64(&stime, qu.Date_Short_Layout) //起始日期
|
|
|
|
+ logger.Info("查询天:", stime, strStime)
|
|
|
|
+ //3、统计数据量
|
|
|
|
+ GetSpiderHighListDataNum(stime, strStime, siteInfoMap) //spider_highlistdata
|
|
|
|
+ GetSpiderListDataNum(stime, strStime, siteInfoMap) //spider_listdata
|
|
|
|
+ GetPythonDataNum(stime, strStime, siteInfoMap)
|
|
|
|
+ GetNumByLastTime(stime, baseDay, siteInfoMap)
|
|
|
|
+ return
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func GetSpiderHighListDataNum(stime int64, strStime string, siteInfoMap map[string]*SiteInfo) {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ sess := util.MgoS.GetMgoConn()
|
|
|
|
+ defer util.MgoS.DestoryMongoConn(sess)
|
|
|
|
+ HrefRepeatMap := map[string]string{}
|
|
|
|
+ lock := &sync.Mutex{}
|
|
|
|
+ wg := &sync.WaitGroup{}
|
|
|
|
+ ch := make(chan bool, 5)
|
|
|
|
+ query := map[string]interface{}{
|
|
|
|
+ "comeintime": map[string]interface{}{
|
|
|
|
+ "$gte": stime,
|
|
|
|
+ },
|
|
|
|
+ "publishtime": map[string]interface{}{
|
|
|
|
+ "$regex": strStime,
|
|
|
|
+ },
|
|
|
|
+ }
|
|
|
|
+ fieles := map[string]interface{}{
|
|
|
|
+ "href": 1,
|
|
|
|
+ "site": 1,
|
|
|
|
+ }
|
|
|
|
+ it := sess.DB(util.MgoS.DbName).C("spider_highlistdata").Find(&query).Select(&fieles).Iter()
|
|
|
|
+ n := 0
|
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
|
|
|
|
+ wg.Add(1)
|
|
|
|
+ ch <- true
|
|
|
|
+ go func(tmp map[string]interface{}) {
|
|
|
|
+ defer func() {
|
|
|
|
+ <-ch
|
|
|
|
+ wg.Done()
|
|
|
|
+ }()
|
|
|
|
+ site := qu.ObjToString(tmp["site"])
|
|
|
|
+ lock.Lock()
|
|
|
|
+ if sInfo := siteInfoMap[site]; sInfo != nil { //要统计的重点站点
|
|
|
|
+ href := qu.ObjToString(tmp["href"])
|
|
|
|
+ if tmpSite := HrefRepeatMap[href]; tmpSite != site { //同站点去重
|
|
|
|
+ sInfo.ThreeDaysAgoNum++
|
|
|
|
+ HrefRepeatMap[href] = site
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ lock.Unlock()
|
|
|
|
+ }(tmp)
|
|
|
|
+ if n%1000 == 0 {
|
|
|
|
+ logger.Debug(n)
|
|
|
|
+ }
|
|
|
|
+ tmp = map[string]interface{}{}
|
|
|
|
+ }
|
|
|
|
+ wg.Wait()
|
|
|
|
+ HrefRepeatMap = map[string]string{}
|
|
|
|
+ logger.Debug("三天前发布spider_highlistdata统计完毕...")
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func GetSpiderListDataNum(stime int64, strStime string, siteInfoMap map[string]*SiteInfo) {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ sess := util.MgoS.GetMgoConn()
|
|
|
|
+ defer util.MgoS.DestoryMongoConn(sess)
|
|
|
|
+ lock := &sync.Mutex{}
|
|
|
|
+ wg := &sync.WaitGroup{}
|
|
|
|
+ ch := make(chan bool, 5)
|
|
|
|
+ query := map[string]interface{}{
|
|
|
|
+ "comeintime": map[string]interface{}{
|
|
|
|
+ "$gte": stime,
|
|
|
|
+ },
|
|
|
|
+ "publishtime": map[string]interface{}{
|
|
|
|
+ "$regex": strStime,
|
|
|
|
+ },
|
|
|
|
+ }
|
|
|
|
+ fieles := map[string]interface{}{
|
|
|
|
+ "site": 1,
|
|
|
|
+ "event": 1,
|
|
|
|
+ }
|
|
|
|
+ it := sess.DB(util.MgoS.DbName).C("spider_listdata").Find(&query).Select(&fieles).Iter()
|
|
|
|
+ n := 0
|
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
|
|
|
|
+ wg.Add(1)
|
|
|
|
+ ch <- true
|
|
|
|
+ go func(tmp map[string]interface{}) {
|
|
|
|
+ defer func() {
|
|
|
|
+ <-ch
|
|
|
|
+ wg.Done()
|
|
|
|
+ }()
|
|
|
|
+ if qu.IntAll(tmp["event"]) == 7000 { //排除7000节点
|
|
|
|
+ return
|
|
|
|
+ }
|
|
|
|
+ site := qu.ObjToString(tmp["site"])
|
|
|
|
+ lock.Lock()
|
|
|
|
+ if sInfo := siteInfoMap[site]; sInfo != nil { //要统计的重点站点
|
|
|
|
+ sInfo.ThreeDaysAgoNum++
|
|
|
|
+ }
|
|
|
|
+ lock.Unlock()
|
|
|
|
+ }(tmp)
|
|
|
|
+ if n%1000 == 0 {
|
|
|
|
+ logger.Debug(n)
|
|
|
|
+ }
|
|
|
|
+ tmp = map[string]interface{}{}
|
|
|
|
+ }
|
|
|
|
+ wg.Wait()
|
|
|
|
+ logger.Debug("三天前发布spider_listdata统计完毕...")
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func GetPythonDataNum(stime int64, strStime string, siteInfoMap map[string]*SiteInfo) {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ sess := util.MgoPy.GetMgoConn()
|
|
|
|
+ defer util.MgoPy.DestoryMongoConn(sess)
|
|
|
|
+ lock := &sync.Mutex{}
|
|
|
|
+ wg := &sync.WaitGroup{}
|
|
|
|
+ ch := make(chan bool, 5)
|
|
|
|
+ query := map[string]interface{}{
|
|
|
|
+ "comeintime": map[string]interface{}{
|
|
|
|
+ "$gte": stime,
|
|
|
|
+ },
|
|
|
|
+ "publishtime": map[string]interface{}{
|
|
|
|
+ "$regex": strStime,
|
|
|
|
+ },
|
|
|
|
+ }
|
|
|
|
+ fieles := map[string]interface{}{
|
|
|
|
+ "site": 1,
|
|
|
|
+ }
|
|
|
|
+ qu.Debug(query)
|
|
|
|
+ it := sess.DB(util.MgoPy.DbName).C("data_bak").Find(&query).Select(&fieles).Iter()
|
|
|
|
+ n := 0
|
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
|
|
|
|
+ wg.Add(1)
|
|
|
|
+ ch <- true
|
|
|
|
+ go func(tmp map[string]interface{}) {
|
|
|
|
+ defer func() {
|
|
|
|
+ <-ch
|
|
|
|
+ wg.Done()
|
|
|
|
+ }()
|
|
|
|
+ site := qu.ObjToString(tmp["site"]) + "(python)"
|
|
|
|
+ lock.Lock()
|
|
|
|
+ if sInfo := siteInfoMap[site]; sInfo != nil { //要统计的重点站点
|
|
|
|
+ sInfo.ThreeDaysAgoNum++
|
|
|
|
+ }
|
|
|
|
+ lock.Unlock()
|
|
|
|
+ }(tmp)
|
|
|
|
+ if n%1000 == 0 {
|
|
|
|
+ logger.Debug(n)
|
|
|
|
+ }
|
|
|
|
+ tmp = map[string]interface{}{}
|
|
|
|
+ }
|
|
|
|
+ wg.Wait()
|
|
|
|
+ logger.Debug("三天前发布python统计完毕...")
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func GetNumByLastTime(stime int64, baseDay int, siteInfoMap map[string]*SiteInfo) {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ stimeWeekDay := time.Now().AddDate(0, 0, -baseDay).Weekday().String()
|
|
|
|
+ start := stime + 86400
|
|
|
|
+ end := stime + 86400*2
|
|
|
|
+ if stimeWeekDay == "Friday" { //每周五的数据是每周一统计
|
|
|
|
+ start = stime + 86400*3
|
|
|
|
+ end = stime + 86400*4
|
|
|
|
+ }
|
|
|
|
+ query := map[string]interface{}{
|
|
|
|
+ "comeintime": map[string]interface{}{
|
|
|
|
+ "$gte": start,
|
|
|
|
+ "$lt": end,
|
|
|
|
+ },
|
|
|
|
+ }
|
|
|
|
+ logger.Info("历史站点统计", query)
|
|
|
|
+ list, _ := util.MgoEB.Find("site_datacount", query, nil, map[string]interface{}{"site": 1, "ptimesuccessnum": 1}, false, -1, -1)
|
|
|
|
+ for _, l := range *list {
|
|
|
|
+ site := qu.ObjToString(l["site"])
|
|
|
|
+ pNum := qu.IntAll(l["ptimesuccessnum"])
|
|
|
|
+ if sInfo := siteInfoMap[site]; sInfo != nil {
|
|
|
|
+ sInfo.BeforeThreeDaysAgoNum = pNum
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func GetSiteInfoExcel(allSpiderInfo map[string]*SiteInfo, day string) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
file, err := xlsx.OpenFile("res/sitecount.xlsx")
|
|
file, err := xlsx.OpenFile("res/sitecount.xlsx")
|
|
if err != nil {
|
|
if err != nil {
|
|
@@ -181,6 +377,10 @@ func GetSiteInfoExcel(allSpiderInfo map[string]*SiteInfo) {
|
|
style.ApplyFont = true
|
|
style.ApplyFont = true
|
|
font := *xlsx.NewFont(10, "Verdana")
|
|
font := *xlsx.NewFont(10, "Verdana")
|
|
style.Font = font
|
|
style.Font = font
|
|
|
|
+ title1 := day + "新统计(publishtime)"
|
|
|
|
+ title2 := day + "历史统计(publishtime)"
|
|
|
|
+ sheet.Rows[0].Cells[6].SetValue(title1)
|
|
|
|
+ sheet.Rows[0].Cells[7].SetValue(title2)
|
|
row := sheet.AddRow()
|
|
row := sheet.AddRow()
|
|
row.AddCell().SetValue(site)
|
|
row.AddCell().SetValue(site)
|
|
row.AddCell().SetValue(info.Num)
|
|
row.AddCell().SetValue(info.Num)
|
|
@@ -188,6 +388,8 @@ func GetSiteInfoExcel(allSpiderInfo map[string]*SiteInfo) {
|
|
row.AddCell().SetValue(info.ListSuccessNum)
|
|
row.AddCell().SetValue(info.ListSuccessNum)
|
|
row.AddCell().SetValue(info.PTimeSuccessNum)
|
|
row.AddCell().SetValue(info.PTimeSuccessNum)
|
|
row.AddCell().SetValue(info.PTimeSuccessDbNum)
|
|
row.AddCell().SetValue(info.PTimeSuccessDbNum)
|
|
|
|
+ row.AddCell().SetValue(info.ThreeDaysAgoNum)
|
|
|
|
+ row.AddCell().SetValue(info.BeforeThreeDaysAgoNum)
|
|
coverage := float64(info.PTimeSuccessNum) / float64(info.Num)
|
|
coverage := float64(info.PTimeSuccessNum) / float64(info.Num)
|
|
fill := &xlsx.Fill{
|
|
fill := &xlsx.Fill{
|
|
PatternType: "solid",
|
|
PatternType: "solid",
|