Browse Source

爬虫周报

maxiaoshan 2 years ago
parent
commit
e9727d0302
5 changed files with 335 additions and 5 deletions
  1. 5 4
      src/config.json
  2. 321 0
      src/luatask/report.go
  3. 2 0
      src/luatask/task.go
  4. 2 0
      src/main.go
  5. 5 1
      src/util/config.go

+ 5 - 4
src/config.json

@@ -10,11 +10,11 @@
 		"size": 15
     },
 	"bideditor": {
-		"addr": "192.168.3.207:27001",
+		"addr": "192.168.3.207:27092",
 		"db": "editor",
 		"size": 2,
-		"username": "root",
-		"password": "root"
+		"username": "",
+		"password": ""
 	},
 	"pyspider":{
 		"addr": "192.168.3.207:27092",
@@ -107,7 +107,8 @@
 	"qyworkremindmodifyusercron": "0 0 9 ? * MON-FRI",
 	"qyworkremindauditorcron": "0 30 17 ? * MON-FRI",
 	"filewarncron": "0 55 8 ? * *",
-	"movelistdata": "0 0 0 ? * *",
+	"movelistdatacron": "0 0 0 ? * *",
+	"spiderweeklyreportcron": "0 0 9 ? * SUN",
 	"closenum": 2,
 	"daynum": 6,
 	"mail": {

+ 321 - 0
src/luatask/report.go

@@ -0,0 +1,321 @@
+package luatask
+
+import (
+	"bytes"
+	"fmt"
+	"github.com/tealeg/xlsx"
+	qu "qfw/util"
+	gm "qfw/util/mail"
+	"strconv"
+	"sync"
+	"time"
+	"util"
+)
+
+// SpiderWeeklyReport collects the weekly spider statistics (task verification,
+// newly built lua spiders, lua maintenance tasks and python spiders), writes
+// them into the res/report.xlsx template and mails the workbook as an
+// attachment.
+//
+// Every completion rate is done/(pending+done). When both counts are zero the
+// rate is reported as 0 instead of NaN, so the sheet never shows "NaN%".
+func SpiderWeeklyReport() {
+	defer qu.Catch()
+	// NOTE(review): util.GetTime presumably returns the timestamp of the day at
+	// the given offset from today; confirm against its definition.
+	sTime := util.GetTime(-7)
+	eTime := util.GetTime(0)
+	ssTime := util.GetTime(-14)
+	qu.Debug(ssTime, sTime, eTime)
+
+	// thisWeek and lastWeek build a fresh half-open range filter on every call
+	// so that no map instance is shared between queries.
+	thisWeek := func() map[string]interface{} {
+		return map[string]interface{}{"$gte": sTime, "$lt": eTime}
+	}
+	lastWeek := func() map[string]interface{} {
+		return map[string]interface{}{"$gte": ssTime, "$lt": sTime}
+	}
+	// rate returns done/(pending+done) rounded to four decimals, or 0 when
+	// there is nothing to measure (avoids 0/0 = NaN).
+	rate := func(done, pending float64) float64 {
+		total := done + pending
+		if total == 0 {
+			return 0
+		}
+		// The formatted text is always a valid decimal, so the parse error
+		// can be ignored safely.
+		rounded, _ := strconv.ParseFloat(fmt.Sprintf("%.4f", done/total), 64)
+		return rounded
+	}
+	// percent renders a rate as a percentage with at most two decimals.
+	// Rounding after the multiplication keeps binary floating point noise
+	// (e.g. 7.000000000000001) out of the report.
+	percent := func(r float64) string {
+		p, _ := strconv.ParseFloat(fmt.Sprintf("%.2f", r*100), 64)
+		return fmt.Sprint(p) + "%"
+	}
+
+	// 1. Tasks
+	// Tasks created this week.
+	query := map[string]interface{}{
+		"l_comeintime": thisWeek(),
+	}
+	thisWeekCreateTaskAllNum := util.MgoE.Count("task", query)
+	qu.Debug("本周新建任务数量:", thisWeekCreateTaskAllNum)
+	// Tasks created last week that are still unverified.
+	query = map[string]interface{}{
+		"l_comeintime": lastWeek(),
+		"i_state": map[string]interface{}{
+			"$lte": 1, // task state: to be confirmed, to be handled
+		},
+	}
+	lastWeekNoCheckTaskAllNum := util.MgoE.Count("task", query)
+	qu.Debug("上周未核实任务数量:", lastWeekNoCheckTaskAllNum)
+	// Tasks verified this week.
+	query = map[string]interface{}{
+		"l_checktime": thisWeek(),
+		"i_state": map[string]interface{}{
+			"$gte": 2, // task state: in progress, pending review, approved, rejected, closed
+		},
+	}
+	thisWeekCheckTaskAllNum := util.MgoE.Count("task", query)
+	qu.Debug("本周核实任务数量:", thisWeekCheckTaskAllNum)
+	// Progress: verified this week / (backlog awaiting verification + verified this week).
+	query = map[string]interface{}{
+		"i_state": map[string]interface{}{
+			"$lte": 1, // task state: to be confirmed, to be handled
+		},
+	}
+	noCheckTaskAllNum := util.MgoE.Count("task", query)
+	qu.Debug("存量待核实任务数量:", noCheckTaskAllNum)
+	resultCheckTaskCompleteSchedule := rate(float64(thisWeekCheckTaskAllNum), float64(noCheckTaskAllNum))
+	qu.Debug("任务审核完成进度:", resultCheckTaskCompleteSchedule)
+
+	// 2. Newly built lua spiders
+	// Spiders created this week.
+	query = map[string]interface{}{
+		"comeintime": thisWeek(),
+		"platform":   "golua平台",
+	}
+	thisWeekCreateLuaAllNum := util.MgoEB.Count("luaconfig", query)
+	qu.Debug("本周lua新建爬虫数量:", thisWeekCreateLuaAllNum)
+	// Spiders created last week that are still unfinished.
+	query = map[string]interface{}{
+		"comeintime": lastWeek(),
+		"platform":   "golua平台",
+		"event":      7000,
+		"state": map[string]interface{}{
+			"$lte": 2, // to be finished, pending review, rejected
+		},
+	}
+	lastWeekNoFinishLuaAllNum := CountLastWeekNoFinishNewLuaAllNum(query)
+	qu.Debug("上周lua新建爬虫未完成数量:", lastWeekNoFinishLuaAllNum)
+	// New spiders finished this week.
+	query = map[string]interface{}{
+		"comeintime": thisWeek(),
+	}
+	thisWeekFinishLuaAllNum := util.MgoE.Count("lua_logs_auditor_new", query)
+	qu.Debug("本周lua已完成新建爬虫数量:", thisWeekFinishLuaAllNum)
+	// Progress: finished this week / (backlog to finish + finished this week).
+	query = map[string]interface{}{
+		"event":    7000,
+		"platform": "golua平台",
+		"state": map[string]interface{}{
+			"$lte": 2, // to be finished, pending review, rejected
+		},
+	}
+	noFinishLuaAllNum := CountNoFinishLuaAllNum(query)
+	qu.Debug("存量待完成新建爬虫数量:", noFinishLuaAllNum)
+	resultLuaCompleteSchedule := rate(float64(thisWeekFinishLuaAllNum), float64(noFinishLuaAllNum))
+	qu.Debug("lua新建爬虫完成进度:", resultLuaCompleteSchedule)
+
+	// 3. Lua maintenance of existing spiders
+	// Maintenance tasks added this week.
+	query = map[string]interface{}{
+		"l_checktime": thisWeek(),
+		"i_state": map[string]interface{}{ // in progress, pending review, approved, rejected
+			"$gte": 2,
+			"$lte": 5,
+		},
+	}
+	thisWeekHistoryLuaAllNum := util.MgoE.Count("task", query)
+	qu.Debug("本周新增待维护任务数量:", thisWeekHistoryLuaAllNum)
+	// Tasks dispatched last week that are still unfinished.
+	query = map[string]interface{}{
+		"l_checktime": lastWeek(),
+		"i_state": map[string]interface{}{ // in progress, pending review, rejected
+			"$in": []int{2, 3, 5},
+		},
+	}
+	lastWeekHistoryNoFinishLuaAllNum := util.MgoE.Count("task", query)
+	qu.Debug("上周分发未完成任务数量:", lastWeekHistoryNoFinishLuaAllNum)
+	// Maintenance tasks completed this week.
+	query = map[string]interface{}{
+		"l_uploadtime": thisWeek(),
+		"i_state":      4, // approved
+	}
+	thisWeekHistoryFinishLuaAllNum := util.MgoE.Count("task", query)
+	qu.Debug("本周完成待维护任务数量:", thisWeekHistoryFinishLuaAllNum)
+	// Progress: maintained this week / (backlog to maintain + maintained this week).
+	query = map[string]interface{}{
+		"i_state": map[string]interface{}{
+			"$in": []int{2, 3, 5}, // in progress, pending review, rejected
+		},
+	}
+	historyNoFinishLuaAllNum := util.MgoE.Count("task", query)
+	resultLuaHistoryCompleteSchedule := rate(float64(thisWeekHistoryFinishLuaAllNum), float64(historyNoFinishLuaAllNum))
+	qu.Debug("任务完成进度:", resultLuaHistoryCompleteSchedule)
+
+	// 4. Python spiders
+	// Spiders created this week.
+	query = map[string]interface{}{
+		"comeintime": thisWeek(),
+		"platform":   "python",
+	}
+	thisWeekCreatePythonAllNum := util.MgoEB.Count("luaconfig", query)
+	qu.Debug("本周python新建爬虫数量:", thisWeekCreatePythonAllNum)
+	// Spiders created last week that are still unfinished.
+	query = map[string]interface{}{
+		"comeintime": lastWeek(),
+		"platform":   "python",
+		"state": map[string]interface{}{
+			"$ne": 11,
+		},
+	}
+	lastWeekNoFinishPythonAllNum := util.MgoEB.Count("luaconfig", query)
+	qu.Debug("上周python新建爬虫未完成数量:", lastWeekNoFinishPythonAllNum)
+	// Spiders finished this week. There is no audit log for python spiders, so
+	// this figure cannot be computed precisely for now.
+	query = map[string]interface{}{
+		"platform":   "python",
+		"state":      11,
+		"modifytime": thisWeek(),
+	}
+	thisWeekFinishPythonAllNum := util.MgoEB.Count("luaconfig", query)
+	qu.Debug("本周python已完成爬虫数量:", thisWeekFinishPythonAllNum)
+	// Progress: finished this week / (backlog to finish + finished this week).
+	query = map[string]interface{}{
+		"platform": "python",
+		"state": map[string]interface{}{
+			"$ne": 11,
+		},
+	}
+	historyNoFinishPythonAllNum := util.MgoEB.Count("luaconfig", query)
+	resultPythonHistoryCompleteSchedule := rate(float64(thisWeekFinishPythonAllNum), float64(historyNoFinishPythonAllNum))
+	qu.Debug("python新建爬虫完成进度:", resultPythonHistoryCompleteSchedule)
+
+	// Build the workbook from the template.
+	file, err := xlsx.OpenFile("res/report.xlsx")
+	if err != nil {
+		qu.Debug("Open Report File Error:", err)
+		return
+	}
+	sheet := file.Sheets[0]
+	// fillRow writes one statistics line into columns 1-4 of the given template
+	// row. A template that lacks the row or cells panics here; the deferred
+	// qu.Catch above is then the only handler, and no mail is sent.
+	fillRow := func(rowIndex int, created, leftover, finished interface{}, r float64) {
+		cells := sheet.Rows[rowIndex].Cells
+		cells[1].SetValue(created)
+		cells[2].SetValue(leftover)
+		cells[3].SetValue(finished)
+		cells[4].SetValue(percent(r))
+	}
+	fillRow(1, thisWeekCreateTaskAllNum, lastWeekNoCheckTaskAllNum, thisWeekCheckTaskAllNum, resultCheckTaskCompleteSchedule)
+	fillRow(2, thisWeekCreateLuaAllNum, lastWeekNoFinishLuaAllNum, thisWeekFinishLuaAllNum, resultLuaCompleteSchedule)
+	fillRow(3, thisWeekHistoryLuaAllNum, lastWeekHistoryNoFinishLuaAllNum, thisWeekHistoryFinishLuaAllNum, resultLuaHistoryCompleteSchedule)
+	fillRow(4, thisWeekCreatePythonAllNum, lastWeekNoFinishPythonAllNum, thisWeekFinishPythonAllNum, resultPythonHistoryCompleteSchedule)
+
+	mw := &util.MyWrite{
+		Byte: &bytes.Buffer{},
+	}
+	// Do not mail an empty or truncated attachment when serialization fails.
+	if err := file.Write(mw); err != nil {
+		qu.Debug("Write Report File Error:", err)
+		return
+	}
+	bt := mw.Byte.Bytes()
+	now := time.Now()
+	name := qu.FormatDate(&now, qu.Date_Short_Layout) + "爬虫统计周报.xlsx"
+	gm.GSendMail_Bq("jy@jianyu360.cn", "maxiaoshan@topnet.net.cn", "", "", "爬虫统计周报", "", name, bt, util.GMail)
+}
+
+// CountLastWeekNoFinishNewLuaAllNum returns how many luaconfig documents
+// matching query have no record at all in lua_logs_auditor_new for their code.
+// SpiderWeeklyReport uses it for the number of new spiders created last week
+// that are still unfinished.
+//
+// The per-code lookups run concurrently, at most two at a time. An iteration
+// error is logged and the count gathered so far is returned.
+func CountLastWeekNoFinishNewLuaAllNum(query map[string]interface{}) (result int) {
+	defer qu.Catch()
+	sess := util.MgoEB.GetMgoConn()
+	defer util.MgoEB.DestoryMongoConn(sess)
+	ch := make(chan bool, 2) // limits the number of lookups in flight
+	wg := &sync.WaitGroup{}
+	lock := &sync.Mutex{} // guards result
+	field := map[string]interface{}{
+		"code": 1,
+	}
+	it := sess.DB(util.MgoEB.DbName).C("luaconfig").Find(&query).Select(&field).Iter()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
+		ch <- true
+		wg.Add(1)
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			code := qu.ObjToString(tmp["code"])
+			count := util.MgoE.Count("lua_logs_auditor_new", map[string]interface{}{"code": code})
+			// A record in the new-spider audit table means the spider has
+			// already been approved and published once; whatever its current
+			// state, it counts as a historical spider and is skipped.
+			if count == 0 {
+				lock.Lock()
+				result++
+				lock.Unlock()
+			}
+		}(tmp)
+		if n%10 == 0 {
+			qu.Debug("current:", n)
+		}
+		// Hand every goroutine its own map; the next document is decoded into a fresh one.
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	// Close releases the cursor and reports any error that ended the
+	// iteration early, which would otherwise silently shrink the count.
+	if err := it.Close(); err != nil {
+		qu.Debug("Iterate luaconfig Error:", err)
+	}
+	return
+}
+
+// CountNoFinishLuaAllNum returns how many luaconfig documents matching query
+// have no record of type "审核" in lua_logs_auditor for their code.
+// SpiderWeeklyReport uses it for the backlog of new spiders still to be finished.
+//
+// The per-code lookups run concurrently, at most two at a time. An iteration
+// error is logged and the count gathered so far is returned.
+func CountNoFinishLuaAllNum(query map[string]interface{}) (result int) {
+	defer qu.Catch()
+	sess := util.MgoEB.GetMgoConn()
+	defer util.MgoEB.DestoryMongoConn(sess)
+	ch := make(chan bool, 2) // limits the number of lookups in flight
+	wg := &sync.WaitGroup{}
+	lock := &sync.Mutex{} // guards result
+	field := map[string]interface{}{
+		"code": 1,
+	}
+	it := sess.DB(util.MgoEB.DbName).C("luaconfig").Find(&query).Select(&field).Iter()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
+		ch <- true
+		wg.Add(1)
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			code := qu.ObjToString(tmp["code"])
+			count := util.MgoE.Count("lua_logs_auditor", map[string]interface{}{"code": code, "types": "审核"})
+			if count == 0 { // no audit record means the spider is a new one
+				lock.Lock()
+				result++
+				lock.Unlock()
+			}
+		}(tmp)
+		if n%100 == 0 {
+			qu.Debug("current:", n)
+		}
+		// Hand every goroutine its own map; the next document is decoded into a fresh one.
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	// Close releases the cursor and reports any error that ended the
+	// iteration early, which would otherwise silently shrink the count.
+	if err := it.Close(); err != nil {
+		qu.Debug("Iterate luaconfig Error:", err)
+	}
+	return
+}

+ 2 - 0
src/luatask/task.go

@@ -1786,6 +1786,7 @@ func CreateTask(t *Task, sp *Spider, upsertBulk *[][]map[string]interface{}, loc
 		urgency_old := qu.ObjToString(task["s_urgency"])   //历史任务紧急度
 		descript_old := qu.ObjToString(task["s_descript"]) //历史任务描述
 		result := map[string]interface{}{
+			"i_event":             sp.Event,
 			"i_frequencyerrtimes": sp.FrequencyErrTimes,
 			"i_num":               sp.DownloadSuccessNum, //下载量(目前按下载成功量)
 			"l_updatetime":        time.Now().Unix(),
@@ -1849,6 +1850,7 @@ func CreateTask(t *Task, sp *Spider, upsertBulk *[][]map[string]interface{}, loc
 			"s_urgency":           "1",
 			"i_frequencyerrtimes": sp.FrequencyErrTimes,
 			"i_pendstate":         sp.PendState, //爬虫挂起状态
+			"s_platform":          sp.Platform,
 		}
 		update = append(update, query)
 		update = append(update, saveMap)

+ 2 - 0
src/main.go

@@ -51,6 +51,8 @@ func main() {
 	//编辑器任务
 	c.AddFunc(util.ResetDataStateCron, luatask.ResetDataState) //重置数据
 	c.AddFunc(util.StartTaskCron, luatask.StartTask)           //开始任务
+	//爬虫相关周报统计
+	//c.AddFunc(util.SpiderWeeklyReportCron, luatask.SpiderWeeklyReport)
 	//c.AddFunc(CodeSummaryCron, SummaryCode)    //上架爬虫信息汇总
 	ch := make(chan bool, 1)
 	<-ch

+ 5 - 1
src/util/config.go

@@ -25,6 +25,7 @@ var (
 	ResetDataStateCron         string        //重置数据状态
 	FileWarnCron               string        //每天统计附件异常数据
 	MoveListDataCron           string        //迁移spider_highlistdata、spider_listdata数据
+	SpiderWeeklyReportCron     string        //周报统计
 	CloseNum                   int           //关闭几天的任务
 	DayNum                     int           //更新数据天数
 	CodeEventModel             map[int]int   //节点对应的采集模式0:老模式;1:新模式
@@ -78,6 +79,7 @@ func InitMgo() {
 }
 
 func InitOther() {
+	//cron
 	StartTaskCron = qu.ObjToString(Config["startaskcron"])
 	CodeSummaryCron = qu.ObjToString(Config["codesummarycron"])
 	ResetDataStateCron = qu.ObjToString(Config["resetdatastatecron"])
@@ -85,7 +87,9 @@ func InitOther() {
 	QyworkRemindModifyuserCron = qu.ObjToString(Config["qyworkremindmodifyusercron"])
 	QyworkRemindAuditorCron = qu.ObjToString(Config["qyworkremindauditorcron"])
 	FileWarnCron = qu.ObjToString(Config["filewarncron"])
-	MoveListDataCron = qu.ObjToString(Config["movelistdata"])
+	MoveListDataCron = qu.ObjToString(Config["movelistdatacron"])
+	SpiderWeeklyReportCron = qu.ObjToString(Config["spiderweeklyreportcron"])
+
 	CloseNum = qu.IntAll(Config["closenum"])
 	DayNum = qu.IntAll(Config["daynum"])
 	MsgServers := Config["msgservers"].(map[string]interface{})