
Modify the generic platform crawler task workflow

mxs, 4 months ago
parent
commit
0d6f452498
1 changed file with 56 additions and 51 deletions

+ 56 - 51
src/luatask/newtask.go

@@ -25,7 +25,6 @@ var LuaErrTypeInfo = map[string]string{
 	NEWTASK_DOWNLOADERR:  "下载异常",
 	NEWTASK_DATAINFOWARN: "数据异常警告",
 }
-var CodesAuditorLog = map[string]string{}
 var DataInfoErrMap = map[int]string{ //需要建数据异常错误的类型
 	1:  "Save Coll Error",
 	4:  "Field Value Is Null",
@@ -52,6 +51,7 @@ var (
 
 type NewSpider struct {
 	//爬虫基本信息
+	//AuditTime    int64                  `bson:"l_uploadtime"`
 	Code         string                 `bson:"code"`
 	Site         string                 `bson:"site"`
 	Channel      string                 `bson:"channel"`
@@ -64,7 +64,7 @@ type NewSpider struct {
 	ModifyTime   int64                  `bson:"modifytime"`
 	Model        int                    `bson:"model"`
 	Working      int                    `bson:"working"`
-	AuditTime    int64                  `bson:"l_uploadtime"`
+	Audit        bool                   `bson:"audit"`
 	ListIsFilter bool                   `bson:"listisfilter"`
 	UpLimit      int                    `bson:"uplimit"`
 	MaxPage      int                    `bson:"maxpage"`
@@ -126,7 +126,8 @@ type WarnInfo struct {
 func NewStartTask() {
 	InitInfo() //初始化时间
 	logger.Info(StartTime, EndTime, Publishtime)
-	getCodeBaseInfo() //获取爬虫基本信息
+	getCodeBaseInfo()   //获取爬虫基本信息
+	getCodeAuditorLog() //获取爬虫前一天审核记录(写入sp.Audit,依赖getCodeBaseInfo先填充NewCodeInfoMap,故在其后调用)
 	//getCodeStatus()        //获取爬虫响应状态信息
 	getPythonSummaryInfo() //获取python汇总信息
 	getLuaSummaryInfo()    //获取lua汇总信息
@@ -152,48 +153,6 @@ func InitInfo() {
 	Publishtime = qu.FormatDateByInt64(&StartTime, qu.Date_Short_Layout)
 }
 
-func getCodeAuditorLog() {
-	defer qu.Catch()
-	sess := util.MgoEB.GetMgoConn()
-	defer util.MgoEB.DestoryMongoConn(sess)
-	lock := &sync.Mutex{}
-	wg := &sync.WaitGroup{}
-	ch := make(chan bool, 5)
-	query := map[string]interface{}{
-		"comeintime": map[string]interface{}{
-			"$gte": StartTime,
-			"$lt":  EndTime,
-		},
-	}
-	fields := map[string]interface{}{
-		"code":  1,
-		"types": 1,
-	}
-	it := sess.DB(util.MgoEB.DbName).C("lua_logs_auditor").Find(&query).Select(&fields).Sort("_id").Iter()
-	n := 0
-	for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
-		wg.Add(1)
-		ch <- true
-		go func(tmp map[string]interface{}) {
-			defer func() {
-				<-ch
-				wg.Done()
-			}()
-			code := qu.ObjToString(tmp["code"])
-			types := qu.ObjToString(tmp["types"])
-			lock.Lock()
-			CodesAuditorLog[code] = types
-			lock.Unlock()
-		}(tmp)
-		if n%1000 == 0 {
-			logger.Info(n)
-		}
-		tmp = map[string]interface{}{}
-	}
-	wg.Wait()
-	logger.Info("审核记录信息准备完成...", len(CodesAuditorLog))
-}
-
 func getCodeBaseInfo() {
 	defer qu.Catch()
 	sess := util.MgoEB.GetMgoConn()
@@ -296,6 +255,50 @@ func getCodeBaseInfo() {
 	logger.Info("爬虫基本信息准备完成...", len(NewCodeInfoMap))
 }
 
+func getCodeAuditorLog() {
+	defer qu.Catch()
+	sess := util.MgoEB.GetMgoConn()
+	defer util.MgoEB.DestoryMongoConn(sess)
+	lock := &sync.Mutex{}
+	wg := &sync.WaitGroup{}
+	ch := make(chan bool, 5)
+	query := map[string]interface{}{ //查询前一天的审核记录
+		"comeintime": map[string]interface{}{
+			"$gte": StartTime,
+			"$lt":  EndTime,
+		},
+	}
+	fields := map[string]interface{}{
+		"code":  1,
+		"types": 1,
+	}
+	it := sess.DB(util.MgoEB.DbName).C("lua_logs_auditor").Find(&query).Select(&fields).Iter()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
+		wg.Add(1)
+		ch <- true
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			code := qu.ObjToString(tmp["code"])
+			types := qu.ObjToString(tmp["types"])
+			lock.Lock()
+			if sp := NewCodeInfoMap[code]; sp != nil {
+				sp.Audit = strings.Contains(types, "审核")
+			}
+			lock.Unlock()
+		}(tmp)
+		if n%1000 == 0 {
+			logger.Info(n)
+		}
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	logger.Info("审核记录信息准备完成...")
+}
+
 func getCodeStatus() {
 	defer qu.Catch()
 	sess := util.MgoEB.GetMgoConn()
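
Note on the relocated getCodeAuditorLog above: instead of filling the removed global CodesAuditorLog map, it now stamps a per-spider Audit flag onto the entries of NewCodeInfoMap, which is why it must run after getCodeBaseInfo. Below is a minimal sketch of the flag derivation, assuming only the code/types fields the query selects; auditorLog, markAudited and the sample code value are hypothetical names used for illustration only.

```go
package main

import (
	"fmt"
	"strings"
)

// auditorLog mirrors the two fields selected from lua_logs_auditor.
type auditorLog struct {
	Code  string
	Types string
}

// markAudited is a stand-in for the update loop inside getCodeAuditorLog:
// for each previous-day log entry whose code is already known (present in the
// base-info map), the Audit flag is set to whether the types field contains
// "审核"; with several entries for one code, the last one processed wins.
func markAudited(audit map[string]bool, logs []auditorLog) {
	for _, l := range logs {
		if _, ok := audit[l.Code]; ok {
			audit[l.Code] = strings.Contains(l.Types, "审核")
		}
	}
}

func main() {
	audit := map[string]bool{"a_site_channel": false} // hypothetical spider code
	markAudited(audit, []auditorLog{{Code: "a_site_channel", Types: "审核通过"}})
	fmt.Println(audit["a_site_channel"]) // true
}
```
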
@@ -717,7 +720,7 @@ func getSpiderDownloadRateData() {
 				sp.Page_TwoOk = !(page_fail == alltimes && page_fail > 0)
 				//列表页
 				if !sp.List_IsGetData {
-					sp.List_IsGetData = sp.List_AllInTimes == sp.List_NoDataTimes
+					sp.List_IsGetData = sp.List_RunTimes == sp.List_NoDataTimes
 				}
 			}
 			lock.Unlock()
@@ -923,7 +926,7 @@ func listErr(sp *NewSpider) {
 			errFlag = true
 		}
 		if errFlag {
-			if sp.Platform == PLATFORM_COMM && strings.Contains(CodesAuditorLog[sp.Code], "审核") { //通用平台前一天审核的爬虫不建列表页异常任务
+			if sp.Platform == PLATFORM_COMM && sp.Audit { //通用平台前一天审核的爬虫不建列表页异常任务(审核上线后,当天并未执行采集)
 				return
 			}
 			//sp.Error[NEWTASK_LISTERR] = &ErrorInfo{
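
The gate added to listErr above boils down to a two-argument predicate. A minimal sketch follows, assuming the PLATFORM_COMM constant already defined in this package; skipListErrTask is a hypothetical name, and the real check stays inlined in listErr.

```go
// skipListErrTask reports whether the list-page error task should be skipped:
// a generic-platform spider that was audited the previous day did not actually
// collect that day, so no list-page error task is created for it.
func skipListErrTask(platform string, audited bool) bool {
	return platform == PLATFORM_COMM && audited
}

// usage inside listErr would be: if skipListErrTask(sp.Platform, sp.Audit) { return }
```
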
@@ -1041,7 +1044,7 @@ func downloadRateErr(sp *NewSpider) {
 			} else { //无标记,记录采集频率异常
 				errFlag = true
 			}
-			if errFlag && sp.List_AllInTimes > 0 && sp.AuditTime > 24 {
+			if errFlag && sp.List_AllInTimes > 0 && !sp.Audit {
 				sp.ErrTypeMap[qu.IntAll(NEWTASK_RATEERR)] = true
 				sp.ErrDescription += "采集频率异常:\n列表页共采集" + fmt.Sprint(sp.List_RunTimes) + "轮,其中有" + fmt.Sprint(sp.List_AllInTimes) + "轮数据全采\n"
 				if sp.ErrType < "0" {
@@ -1053,11 +1056,13 @@ func downloadRateErr(sp *NewSpider) {
 		if sp.List_AllInTimes > 0 {
 			errFlag := false
 			if sp.Model == 1 { //列表页、详情页分开采集模式
-				if sp.AuditTime > 24 && (sp.MaxPage == 1 || sp.MaxPage > 100) { //分开采集且不是无限翻页,爬虫审核时间超过24小时,记录异常
+				if !sp.Audit && (sp.MaxPage == 1 || sp.MaxPage > 100) { //分开采集且不是无限翻页,且爬虫前一天未审核,记录异常
 					errFlag = true
 				}
-			} else if sp.Event != 7410 { //列表页、详情页顺序采集模式(排除7410节点)
-				if sp.CodeTags != nil {
+			} else if sp.Event != 7410 || sp.Platform != PLATFORM_LUA { //列表页、详情页顺序采集模式(仅排除lua平台7410节点)
+				if sp.Platform == PLATFORM_COMM && sp.Audit { //通用平台前一天审核的爬虫不记采集频率异常
+					errFlag = false
+				} else if sp.CodeTags != nil {
 					tagTime, _ := sp.CodeTags[NEWTASK_RATEERR].(int64)
 					if tagTime == 0 { //无频率异常标记
 						errFlag = true
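
For the split-collection branch of downloadRateErr, the frequency-error condition now depends on the previous-day audit flag rather than the old AuditTime threshold. A minimal runnable sketch follows, with rateErrCandidate as a hypothetical helper mirroring the condition shown in the diff.

```go
package main

import "fmt"

// rateErrCandidate sketches the split-collection (Model == 1) gate in
// downloadRateErr: a collection-frequency error is recorded only when the
// spider was NOT audited the previous day and MaxPage is 1 or greater than
// 100, the paging condition kept unchanged from the original code.
func rateErrCandidate(audited bool, maxPage int) bool {
	return !audited && (maxPage == 1 || maxPage > 100)
}

func main() {
	fmt.Println(rateErrCandidate(true, 1))    // false: audited yesterday, no error recorded
	fmt.Println(rateErrCandidate(false, 1))   // true: not audited, single-page list
	fmt.Println(rateErrCandidate(false, 50))  // false: bounded multi-page crawl
	fmt.Println(rateErrCandidate(false, 200)) // true: not audited, more than 100 pages
}
```
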