
Spider tasks: add statistics for generic-platform ("通用平台") spiders

mxs 4 months ago
parent
commit
89e4fd471a
3 changed files with 984 additions and 1769 deletions
  1. src/config.json (+5 -5)
  2. src/luatask/newtask.go (+132 -37)
  3. src/luatask/task.go (+847 -1727)

+ 5 - 5
src/config.json

@@ -1,28 +1,28 @@
 {
 	"spider":{
-		"addr": "192.168.3.207:29099",
+		"addr": "172.20.45.130:27017",
 		"db": "spider",
 		"size": 15
     },
     "editor": {
-		"addr": "192.168.3.71:29099",
+		"addr": "172.20.45.130:27017",
 		"db": "editor",
 		"size": 15
     },
 	"bideditor": {
-		"addr": "192.168.3.71:29099",
+		"addr": "172.20.45.130:27017",
 		"db": "editor",
 		"size": 2,
 		"username": "",
 		"password": ""
 	},
 	"pyspider":{
-		"addr": "192.168.3.71:29099",
+		"addr": "172.20.45.130:27017",
 		"db": "py_spider",
 		"size": 5
 	},
 	"bidding": {
-		"addr": "192.168.3.71:29099",
+		"addr": "172.20.45.130:27017",
 		"db": "qfw",
 		"size": 2,
 		"username": "",

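Every section in this hunk now points at the same MongoDB instance (172.20.45.130:27017). For reference, a minimal sketch of loading a config.json with this shape in Go follows; the struct and field names are assumptions inferred from the keys visible in the diff, not the project's actual config loader.

```go
package main

import (
	"encoding/json"
	"log"
	"os"
)

// MongoSection mirrors one block of config.json (addr/db/size plus optional auth).
// Field names here are assumptions based on the diff, not the project's real types.
type MongoSection struct {
	Addr     string `json:"addr"`
	Db       string `json:"db"`
	Size     int    `json:"size"`
	Username string `json:"username,omitempty"`
	Password string `json:"password,omitempty"`
}

// Config covers the sections visible in this hunk.
type Config struct {
	Spider    MongoSection `json:"spider"`
	Editor    MongoSection `json:"editor"`
	BidEditor MongoSection `json:"bideditor"`
	PySpider  MongoSection `json:"pyspider"`
	Bidding   MongoSection `json:"bidding"`
}

func main() {
	data, err := os.ReadFile("src/config.json")
	if err != nil {
		log.Fatal(err)
	}
	var cfg Config
	if err := json.Unmarshal(data, &cfg); err != nil {
		log.Fatal(err)
	}
	log.Println("spider db:", cfg.Spider.Addr, cfg.Spider.Db)
}
```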
+ 132 - 37
src/luatask/newtask.go

@@ -6,6 +6,7 @@ import (
 	"go.mongodb.org/mongo-driver/bson"
 	qu "qfw/util"
 	"strconv"
+	"strings"
 	"sync"
 	"time"
 	"timetask"
@@ -13,6 +14,7 @@ import (
 )
 
 const NEWTASK_LISTERR, NEWTASK_DATAINFOERR, NEWTASK_PAGEFLIPERR, NEWTASK_RATEERR, NEWTASK_DOWNLOADERR, NEWTASK_DATAINFOWARN = "1", "2", "3", "4", "5", "6"
+const PLATFORM_LUA, PLATFORM_CHROME, PLATFORM_COMM, PLATFORM_PYTHON = "golua平台", "chrome", "通用平台", "python"
 
 var NewCodeInfoMap = map[string]*NewSpider{}
 var LuaErrTypeInfo = map[string]string{
@@ -23,6 +25,7 @@ var LuaErrTypeInfo = map[string]string{
 	NEWTASK_DOWNLOADERR:  "下载异常",
 	NEWTASK_DATAINFOWARN: "数据异常警告",
 }
+var CodesAuditorLog = map[string]string{}
 var DataInfoErrMap = map[int]string{ //需要建数据异常错误的类型
 	1:  "Save Coll Error",
 	4:  "Field Value Is Null",
@@ -41,6 +44,12 @@ var DataInfoWarnMap = map[int]string{ //需要建数据异常警告的类型
 
 var UpdateLuaconfig [][]map[string]interface{}
 
+var (
+	StartTime   int64  //上一个工作日的起始时间
+	EndTime     int64  //上一个工作日的结束时间
+	Publishtime string //发布时间
+)
+
 type NewSpider struct {
 	//爬虫基本信息
 	Code         string                 `bson:"code"`
@@ -90,6 +99,23 @@ type NewSpider struct {
 	ErrDescription string       `bson:"errdescription"` //异常描述
 }
 
+type Task struct {
+	Platform          string //平台
+	Code              string //爬虫代码
+	Site              string //站点
+	Channel           string //栏目
+	ModifyUser        string //维护人员
+	ModifyId          string //维护人员id
+	ErrType           int    //异常类型:8:采集频率异常;7:列表页异常;5:下载异常;4:运行异常;3:发布时间异常;2:数据异常;1:数据量异常
+	Description       string //描述
+	State             int    //状态
+	Event             int    //节点
+	Num               int    //下载量
+	FrequencyErrTimes int    //爬虫采集频率异常次数
+	DescribeMap       map[int]string
+	//ErrInfo     map[string]map[string]interface{} //异常集合
+}
+
 type WarnInfo struct {
 	Info   string            `bson:"info"`
 	Num    int               `bson:"num"`
@@ -100,8 +126,8 @@ type WarnInfo struct {
 func NewStartTask() {
 	InitInfo() //初始化时间
 	logger.Info(StartTime, EndTime, Publishtime)
-	getCodeBaseInfo()      //获取爬虫基本信息
-	getCodeStatus()        //获取爬虫响应状态信息
+	getCodeBaseInfo() //获取爬虫基本信息
+	//getCodeStatus()        //获取爬虫响应状态信息
 	getPythonSummaryInfo() //获取python汇总信息
 	getLuaSummaryInfo()    //获取lua汇总信息
 	getSpiderWarnInfo()    //获取异常数据
@@ -111,6 +137,63 @@ func NewStartTask() {
 	timetask.CountLuaPythonNumEveryDay() //每日采集量统计
 }
 
+// 初始化
+func InitInfo() {
+	defer qu.Catch()
+	//CodeInfoMap = map[string]*Spider{} //初始化
+	//AllHref = map[string]string{}
+	//SameDayHref = map[string]string{}
+	//DataBakAllHref = map[string]string{}
+	//UserTaskNum = map[string]map[string]int{}
+	//StartTime, EndTime = util.GetWorkDayTimeUnix()
+	UpdateLuaconfig = [][]map[string]interface{}{}
+	StartTime = util.GetTime(-1)
+	EndTime = util.GetTime(0)
+	Publishtime = qu.FormatDateByInt64(&StartTime, qu.Date_Short_Layout)
+}
+
+func getCodeAuditorLog() {
+	defer qu.Catch()
+	sess := util.MgoEB.GetMgoConn()
+	defer util.MgoEB.DestoryMongoConn(sess)
+	lock := &sync.Mutex{}
+	wg := &sync.WaitGroup{}
+	ch := make(chan bool, 5)
+	query := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$gte": StartTime,
+			"$lt":  EndTime,
+		},
+	}
+	fields := map[string]interface{}{
+		"code":  1,
+		"types": 1,
+	}
+	it := sess.DB(util.MgoEB.DbName).C("lua_logs_auditor").Find(&query).Select(&fields).Sort("_id").Iter()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
+		wg.Add(1)
+		ch <- true
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			code := qu.ObjToString(tmp["code"])
+			types := qu.ObjToString(tmp["types"])
+			lock.Lock()
+			CodesAuditorLog[code] = types
+			lock.Unlock()
+		}(tmp)
+		if n%1000 == 0 {
+			logger.Info(n)
+		}
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	logger.Info("审核记录信息准备完成...", len(CodesAuditorLog))
+}
+
 func getCodeBaseInfo() {
 	defer qu.Catch()
 	sess := util.MgoEB.GetMgoConn()
@@ -119,30 +202,35 @@ func getCodeBaseInfo() {
 	wg := &sync.WaitGroup{}
 	ch := make(chan bool, 5)
 	query := map[string]interface{}{
-		"$or": []interface{}{
-			//lua、python上线爬虫
-			map[string]interface{}{
-				"state": map[string]interface{}{
-					"$in": []int{5, 11}, //上架、上线爬虫
-				},
-				//"platform": map[string]interface{}{
-				//	"$in": []string{"golua平台", "chrome", "python"},
-				//},
-			},
-			//lua正在被维护的爬虫和上架爬虫
-			map[string]interface{}{
-				"platform": map[string]interface{}{
-					"$in": []string{"golua平台", "chrome"},
-				},
-				"state": map[string]interface{}{
-					"$in": []int{0, 1, 2}, //待完成、待审核、未通过
-				},
-				"event": map[string]interface{}{
-					"$ne": 7000,
-				},
-			},
+		"state": map[string]interface{}{ //所有平台在线爬虫
+			"$in": []int{5, 11}, //上架、上线爬虫
 		},
 	}
+	//query := map[string]interface{}{
+	//	"$or": []interface{}{
+	//		//lua、python上线爬虫
+	//		map[string]interface{}{
+	//			"state": map[string]interface{}{
+	//				"$in": []int{5, 11}, //上架、上线爬虫
+	//			},
+	//			//"platform": map[string]interface{}{
+	//			//	"$in": []string{"golua平台", "chrome", "python"},
+	//			//},
+	//		},
+	//		//lua正在被维护的爬虫和上架爬虫
+	//		map[string]interface{}{
+	//			"platform": map[string]interface{}{
+	//				"$in": []string{"golua平台", "chrome"},
+	//			},
+	//			"state": map[string]interface{}{
+	//				"$in": []int{0, 1, 2}, //待完成、待审核、未通过
+	//			},
+	//			"event": map[string]interface{}{
+	//				"$ne": 7000,
+	//			},
+	//		},
+	//	},
+	//}
 	fields := map[string]interface{}{
 		"code":         1,
 		"site":         1,
@@ -159,7 +247,7 @@ func getCodeBaseInfo() {
 		"infoformat":   1,
 		"param_common": 1,
 	}
-	it := sess.DB(util.MgoEB.DbName).C("luaconfig").Find(&query).Select(&fields).Iter()
+	it := sess.DB(util.MgoEB.DbName).C("luaconfig_online").Find(&query).Select(&fields).Iter()
 	n := 0
 	for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
 		wg.Add(1)
@@ -190,8 +278,10 @@ func getCodeBaseInfo() {
 			sp.Model = util.CodeEventModel[sp.Event]
 			sp.ListDataColl = util.CodeListDataColl[sp.Event]
 			sp.MaxPage = maxPage
-			if sp.Platform == "python" {
+			if sp.Platform == PLATFORM_PYTHON {
 				sp.Model = 1
+			} else if sp.Platform == PLATFORM_COMM {
+				sp.Model = 0
 			}
 			lock.Lock()
 			NewCodeInfoMap[sp.Code] = sp
@@ -476,7 +566,7 @@ func getSpiderHighListDownloadNum() { //竞品数据暂未统计(延迟采集
 			state := qu.IntAll(tmp["state"])
 			times := tmp["times"]
 			lock.Lock()
-			if sp := NewCodeInfoMap[code]; sp != nil {
+			if sp := NewCodeInfoMap[code]; sp != nil && (sp.Platform == PLATFORM_LUA || sp.Platform == PLATFORM_CHROME) {
 				if state == 1 {
 					sp.Detail_DownloadSuccessNum++
 				} else if state == -1 {
@@ -532,7 +622,7 @@ func getSpiderListDownloadNum() {
 			href := qu.ObjToString(tmp["href"])
 			lock.Lock()
 			defer lock.Unlock()
-			if sp := NewCodeInfoMap[code]; sp != nil {
+			if sp := NewCodeInfoMap[code]; sp != nil && (sp.Platform == PLATFORM_LUA || sp.Platform == PLATFORM_CHROME) {
 				tmpState := repeatHrefMap[href]
 				if tmpState == 1 { //该href已记录下载成功,后续不做任务记录
 					return
@@ -614,7 +704,7 @@ func getSpiderDownloadRateData() {
 				sp.Page_FlipOk = !(uplimit > 0)
 				sp.UpLimit = uplimit
 				//判断第一页采集是否异常
-				sp.Page_OneOk = !(page_onefail == alltimes && page_onefail > 0)
+				sp.Page_OneOk = !(page_onefail > 0 && page_onefail == alltimes)
 				if sp.Page_OneOk {
 					percent := float64(page_onefail) / float64(alltimes)
 					if page_onefail <= 5 && (percent > 0.75 && percent < 1) {
@@ -625,6 +715,10 @@ func getSpiderDownloadRateData() {
 				}
 				//判断第二页采集是否异常
 				sp.Page_TwoOk = !(page_fail == alltimes && page_fail > 0)
+				//列表页
+				if !sp.List_IsGetData {
+					sp.List_IsGetData = sp.List_AllInTimes == sp.List_NoDataTimes
+				}
 			}
 			lock.Unlock()
 		}(tmp)
@@ -812,7 +906,7 @@ func getAllErr(sp *NewSpider) {
 }
 func listErr(sp *NewSpider) {
 	defer qu.Catch()
-	if sp.Platform == "python" && !sp.Py_IsValid {
+	if sp.Platform == PLATFORM_PYTHON && !sp.Py_IsValid {
 		return
 	}
 	//if !sp.List_IsGetData || sp.List_RunTimes == 0 {
@@ -829,6 +923,9 @@ func listErr(sp *NewSpider) {
 			errFlag = true
 		}
 		if errFlag {
+			if sp.Platform == PLATFORM_COMM && strings.Contains(CodesAuditorLog[sp.Code], "审核") { //通用平台前一天审核的爬虫不建列表页异常任务
+				return
+			}
 			//sp.Error[NEWTASK_LISTERR] = &ErrorInfo{
 			//	ErrInfo: map[string]bool{LuaErrTypeInfo[NEWTASK_LISTERR]: true},
 			//}
@@ -884,10 +981,9 @@ func dataInfoErr(sp *NewSpider) {
 	}
 
 }
-
 func pageFlipErr(sp *NewSpider) {
 	defer qu.Catch()
-	if sp.Platform == "python" {
+	if sp.Platform == PLATFORM_PYTHON {
 		return
 	}
 	errFlag := false
@@ -928,10 +1024,9 @@ func pageFlipErr(sp *NewSpider) {
 		}
 	}
 }
-
 func downloadRateErr(sp *NewSpider) {
 	defer qu.Catch()
-	if sp.Platform == "python" {
+	if sp.Platform == PLATFORM_PYTHON {
 		if !sp.Py_IsValid { //无效爬虫
 			return
 		} else {
@@ -988,7 +1083,7 @@ func downloadRateErr(sp *NewSpider) {
 }
 func downloadFailedErr(sp *NewSpider) {
 	defer qu.Catch()
-	if sp.Platform == "python" && !sp.Py_IsValid {
+	if sp.Platform == PLATFORM_PYTHON && !sp.Py_IsValid {
 		return
 	}
 	flagTime := util.GetTime(-7)
@@ -1183,7 +1278,7 @@ func (sp *NewSpider) getCodeFailDataCount() (int, float64) {
 	return allCount, value
 }
 
-//更新爬虫
+// 更新爬虫
 func updateLuaconfig() {
 	if len(UpdateLuaconfig) > 0 {
 		util.MgoEB.UpdateBulk("luaconfig", UpdateLuaconfig...)
@@ -1191,7 +1286,7 @@ func updateLuaconfig() {
 	}
 }
 
-//关闭任务
+// 关闭任务
 func closeTask() {
 	defer qu.Catch()
 	query := map[string]interface{}{ //关闭7天未转为待处理的下载异常,数据异常警告类型的任务

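The new getCodeAuditorLog above walks the lua_logs_auditor cursor and hands each row to a goroutine, using a buffered channel as a concurrency limiter and a mutex around the shared CodesAuditorLog map. A minimal, Mongo-free sketch of that bounded fan-out pattern (illustrative names, not project code) is:

```go
package main

import (
	"fmt"
	"sync"
)

// collect fans items out to at most `limit` concurrent workers and gathers
// results into a shared map guarded by a mutex -- the same pattern
// getCodeAuditorLog applies over its Mongo iterator.
func collect(items []string, limit int) map[string]bool {
	out := map[string]bool{}
	lock := &sync.Mutex{}
	wg := &sync.WaitGroup{}
	ch := make(chan bool, limit) // buffered channel acts as a semaphore

	for _, it := range items {
		wg.Add(1)
		ch <- true // blocks while `limit` workers are already running
		go func(item string) {
			defer func() {
				<-ch
				wg.Done()
			}()
			lock.Lock()
			out[item] = true
			lock.Unlock()
		}(it)
	}
	wg.Wait()
	return out
}

func main() {
	fmt.Println(len(collect([]string{"a", "b", "c"}, 5)))
}
```

With the channel capacity set to 5, at most five workers run at once, matching the `ch := make(chan bool, 5)` limit used in the diff.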
+ 847 - 1727
src/luatask/task.go

The file diff has been suppressed because it is too large


Some files were not shown because too many files changed in this diff