Bladeren bron

任务新增数据量和失败率统计

mxs 8 maanden geleden
bovenliggende
commit
f1bc8976f3
5 gewijzigde bestanden met toevoegingen van 88 en 35 verwijderingen
  1. 33 13
      src/config.json
  2. 30 0
      src/luatask/newtask.go
  3. 1 1
      src/luatask/task.go
  4. 1 1
      src/main.go
  5. 23 20
      src/util/config.go

+ 33 - 13
src/config.json

@@ -49,79 +49,99 @@
 			"server": "comm",
 			"model": 0,
 			"work": 0,
-			"type": 0
+			"type": 0,
+			"listcoll": "spider_historydata"
 		},
 		"7100": {
 			"server": "bid",
 			"model": 1,
 			"work": 0,
-			"type": 1
+			"type": 1,
+			"listcoll": "spider_highlistdata"
 		},
 		"7110": {
 			"server": "bid",
 			"model": 1,
 			"work": 0,
-			"type": 1
+			"type": 1,
+			"listcoll": "spider_highlistdata"
 		},
 		"7200": {
 			"server": "comm",
 			"model": 1,
 			"work": 1,
-			"type": 2
+			"type": 2,
+			"listcoll": "spider_highlistdata"
 		},
 		"7210": {
 			"server": "comm",
 			"model": 1,
 			"work": 1,
-			"type": 2
+			"type": 2,
+			"listcoll": "spider_highlistdata"
 		},
 		"7300": {
 			"server": "comm",
 			"model": 1,
 			"work": 1,
-			"type": 2
+			"type": 2,
+			"listcoll": "spider_highlistdata"
 		},
 		"7310": {
 			"server": "comm",
 			"model": 1,
 			"work": 1,
-			"type": 2
+			"type": 2,
+			"listcoll": "spider_highlistdata"
 		},
 		"7400": {
 			"server": "bid",
 			"model": 1,
 			"work": 0,
-			"type": 1
+			"type": 1,
+			"listcoll": "spider_highlistdata"
+		},
+		"7440": {
+			"server": "bid",
+			"model": 1,
+			"work": 0,
+			"type": 1,
+			"listcoll": "spider_highlistdata"
 		},
 		"7410": {
 			"server": "bid",
 			"model": 0,
 			"work": 0,
-			"type": 1
+			"type": 1,
+			"listcoll": "spider_listdata"
 		},
 		"7500": {
 			"server": "comm",
 			"model": 0,
 			"work": 1,
-			"type": 3
+			"type": 3,
+			"listcoll": "spider_listdata"
 		},
 		"7510": {
 			"server": "comm",
 			"model": 0,
 			"work": 1,
-			"type": 3
+			"type": 3,
+			"listcoll": "spider_listdata"
 		},
 		"7520": {
 			"server": "comm",
 			"model": 0,
 			"work": 1,
-			"type": 4
+			"type": 4,
+			"listcoll": "spider_listdata"
 		},
 		"7700": {
 			"server": "comm",
 			"model": 0,
 			"work": 1,
-			"type": 3
+			"type": 3,
+			"listcoll": "spider_listdata"
 		}
 	},
 	"resetdatastatecron": "0 0 2 ? * MON-FRI",

+ 30 - 0
src/luatask/newtask.go

@@ -5,6 +5,7 @@ import (
 	"github.com/donnie4w/go-logger/logger"
 	"go.mongodb.org/mongo-driver/bson"
 	qu "qfw/util"
+	"strconv"
 	"sync"
 	"time"
 	"util"
@@ -79,6 +80,8 @@ type NewSpider struct {
 	Channel_Status int `bson:"channel_status"` //栏目响应状态
 	//补充信息
 	Comeintime int64 `bson:"comeintime"`
+	//列表页数据存储表
+	ListDataColl string `json:"listdatacoll"`
 	//异常汇总
 	//Error          map[string]*ErrorInfo `json:"error"`
 	ErrType        string       `bson:"errtype"`        //记录权重最高的异常类型
@@ -180,6 +183,7 @@ func getCodeBaseInfo() {
 			}
 			sp.Working = util.CodeEventWorking[sp.Working]
 			sp.Model = util.CodeEventModel[sp.Event]
+			sp.ListDataColl = util.CodeListDataColl[sp.Event]
 			sp.MaxPage = maxPage
 			if sp.Platform == "python" {
 				sp.Model = 1
@@ -709,6 +713,7 @@ func createTask(sp *NewSpider, taskArr *[][]map[string]interface{}, lock *sync.M
 		"i_times":      1,
 		"l_comeintime": 1,
 	}
+	count, failRate := sp.getCodeFailDataCount()
 	list, _ := util.MgoEB.Find("task", query, nil, fields, false, -1, -1)
 	update := []map[string]interface{}{}
 	if list != nil && len(*list) > 0 { //已有任务
@@ -729,6 +734,8 @@ func createTask(sp *NewSpider, taskArr *[][]map[string]interface{}, lock *sync.M
 		comeintime_old := qu.Int64All(task["l_comeintime"]) //历史任务创建时间
 
 		result := map[string]interface{}{
+			"i_count":      count,
+			"f_failrate":   failRate,
 			"i_event":      sp.Event,
 			"l_updatetime": time.Now().Unix(),
 			"i_times":      times_old + 1,
@@ -779,6 +786,8 @@ func createTask(sp *NewSpider, taskArr *[][]map[string]interface{}, lock *sync.M
 			"l_complete":   util.CompleteTime("1"),
 			//"s_urgency":    "1",
 			"s_platform": sp.Platform,
+			"i_count":    count,
+			"f_failrate": failRate,
 		}
 		update = append(update, query)
 		update = append(update, saveMap)
@@ -1147,6 +1156,27 @@ func (sp *NewSpider) getErrHrefs(coll, errType string, query map[string]interfac
 	}
 	return
 }
+// getCodeFailDataCount reports list-page download statistics for this spider.
+//
+// It counts the documents in sp.ListDataColl whose "spidercode" equals sp.Code
+// and whose "comeintime" lies between util.GetTime(-7) and util.GetTime(0)
+// (the original comments describe this window as the last 7 days), and then
+// counts how many of those have "state" == -1.
+//
+// Returns the total count and the failure rate as a percentage rounded to two
+// decimal places. Both values are zero when no list collection is configured
+// for the spider or when the window contains no documents.
+func (sp *NewSpider) getCodeFailDataCount() (int, float64) {
+	// ListDataColl is filled from a map lookup keyed by event, so it is the
+	// empty string for events that have no "listcoll" entry in the config.
+	// There is nothing to count in that case; skip both queries.
+	if sp.ListDataColl == "" {
+		return 0, 0
+	}
+	// Total number of documents in the window.
+	query := map[string]interface{}{
+		"spidercode": sp.Code,
+		"comeintime": map[string]interface{}{
+			"$gte": util.GetTime(-7),
+			"$lte": util.GetTime(0),
+		},
+	}
+	allCount := util.MgoS.Count(sp.ListDataColl, query)
+	if allCount == 0 {
+		// Avoid dividing by zero below.
+		return allCount, 0
+	}
+	// Number of documents in the same window marked as failed (state == -1).
+	query["state"] = -1
+	failCount := util.MgoS.Count(sp.ListDataColl, query)
+	// Express the ratio as a percentage and round it to two decimals by
+	// formatting and re-parsing. The parse error is ignored on purpose: the
+	// input is always the output of %.2f applied to a finite number.
+	rate := float64(failCount) / float64(allCount) * 100
+	value, _ := strconv.ParseFloat(fmt.Sprintf("%.2f", rate), 64)
+	return allCount, value
+}
 
 //更新爬虫
 func updateLuaconfig() {

+ 1 - 1
src/luatask/task.go

@@ -2415,7 +2415,7 @@ func ResetHistoryDataState() {
 			publishtime = DateReg.FindString(publishtime)
 			state := qu.IntAll(tmp["state"])
 			lock.Lock()
-			if state == 1 { //下载成功数据迁移至spider_historydata
+			if state == 1 { //下载成功数据迁移至spider_historydata_back
 				save = append(save, tmp)
 				update = append(update, map[string]interface{}{"$set": map[string]interface{}{"delete": true}})
 			} else if state == -1 {

+ 1 - 1
src/main.go

@@ -60,7 +60,7 @@ func main() {
 	//lua小组周报
 	c.AddFunc("0 30 17 ? * FRI", timetask.SpiderWeeklyReportForLua)
 	//c.AddFunc(CodeSummaryCron, SummaryCode)    //上架爬虫信息汇总
-	//重点网站任务统计
+	//重点爬虫任务统计
 	c.AddFunc(util.ImportSiteTaskCron1, timetask.ImportSiteTask)
 	c.AddFunc(util.ImportSiteTaskCron2, timetask.ImportSiteTask)
 	ch := make(chan bool, 1)

+ 23 - 20
src/util/config.go

@@ -24,26 +24,27 @@ var (
 	RandomDataPushCron         string
 	QyworkRemindModifyuserCron string
 	QyworkRemindAuditorCron    string
-	StartTaskCron              string        //任务开始
-	CodeSummaryCron            string        //每天统计爬虫信息
-	ResetDataStateCron         string        //重置数据状态
-	FileWarnCron               string        //每天统计附件异常数据
-	MoveListDataCron           string        //迁移spider_highlistdata、spider_listdata数据
-	SpiderWeeklyReportCron     string        //周报统计
-	LuamoveCron                string        //每月1日统计要转移节点的爬虫
-	UpdateLuaUserCron          string        //每天更新外包爬虫到内部人员
-	UpdateSiteCron             string        //每天更新站点信息
-	NewStarTaskCron            string        //新版爬虫维护任务
-	ImportSiteTaskCron1        string        //重点网站任务
-	ImportSiteTaskCron2        string        //重点网站任务
-	CloseNum                   int           //关闭几天的任务
-	DayNum                     int           //更新数据天数
-	CodeEventModel             map[int]int   //节点对应的采集模式0:老模式;1:新模式
-	CodeEventWorking           map[int]int   //节点对应的采集模式0:高性能模式;1:队列模式
-	CodeEventType              map[int]int   //节点对应的不同类型的采集频率
-	GMail                      *gm.GmailAuth //邮件信息
-	To                         string        //邮件接收人
-	CreateTaskInfoFormat       map[int]bool  //不创建任务的爬虫infoformat类型
+	StartTaskCron              string         //任务开始
+	CodeSummaryCron            string         //每天统计爬虫信息
+	ResetDataStateCron         string         //重置数据状态
+	FileWarnCron               string         //每天统计附件异常数据
+	MoveListDataCron           string         //迁移spider_highlistdata、spider_listdata数据
+	SpiderWeeklyReportCron     string         //周报统计
+	LuamoveCron                string         //每月1日统计要转移节点的爬虫
+	UpdateLuaUserCron          string         //每天更新外包爬虫到内部人员
+	UpdateSiteCron             string         //每天更新站点信息
+	NewStarTaskCron            string         //新版爬虫维护任务
+	ImportSiteTaskCron1        string         //重点网站任务
+	ImportSiteTaskCron2        string         //重点网站任务
+	CloseNum                   int            //关闭几天的任务
+	DayNum                     int            //更新数据天数
+	CodeEventModel             map[int]int    //节点对应的采集模式0:老模式;1:新模式
+	CodeEventWorking           map[int]int    //节点对应的采集模式0:高性能模式;1:队列模式
+	CodeEventType              map[int]int    //节点对应的不同类型的采集频率
+	CodeListDataColl           map[int]string //collection that stores list-page data for each node (config key "listcoll")
+	GMail                      *gm.GmailAuth  //邮件信息
+	To                         string         //邮件接收人
+	CreateTaskInfoFormat       map[int]bool   //不创建任务的爬虫infoformat类型
 )
 var TimeReg = regexp.MustCompile("[0-9]{4}-[0-9]{2}-[0-9]{2}")
 
@@ -132,6 +133,7 @@ func InitOther() {
 	CodeEventModel = map[int]int{}
 	CodeEventWorking = map[int]int{}
 	CodeEventType = map[int]int{}
+	CodeListDataColl = map[int]string{}
 	for event, info := range eventsinfo {
 		eventTmp := qu.IntAll(event)
 		infoMap := info.(map[string]interface{})
@@ -139,6 +141,7 @@ func InitOther() {
 		CodeEventModel[eventTmp] = qu.IntAll(infoMap["model"])
 		CodeEventWorking[eventTmp] = qu.IntAll(infoMap["work"])
 		CodeEventType[eventTmp] = qu.IntAll(infoMap["type"])
+		CodeListDataColl[eventTmp] = qu.ObjToString(infoMap["listcoll"])
 	}
 	//qu.Debug(UploadEvents, CodeEventModel, CodeEventWorking)
 	//mail