|
@@ -25,7 +25,6 @@ var LuaErrTypeInfo = map[string]string{
|
|
|
NEWTASK_DOWNLOADERR: "下载异常",
|
|
|
NEWTASK_DATAINFOWARN: "数据异常警告",
|
|
|
}
|
|
|
-var CodesAuditorLog = map[string]string{}
|
|
|
var DataInfoErrMap = map[int]string{ //需要建数据异常错误的类型
|
|
|
1: "Save Coll Error",
|
|
|
4: "Field Value Is Null",
|
|
@@ -52,6 +51,7 @@ var (
|
|
|
|
|
|
type NewSpider struct {
|
|
|
//爬虫基本信息
|
|
|
+ //AuditTime int64 `bson:"l_uploadtime"`
|
|
|
Code string `bson:"code"`
|
|
|
Site string `bson:"site"`
|
|
|
Channel string `bson:"channel"`
|
|
@@ -64,7 +64,7 @@ type NewSpider struct {
|
|
|
ModifyTime int64 `bson:"modifytime"`
|
|
|
Model int `bson:"model"`
|
|
|
Working int `bson:"working"`
|
|
|
- AuditTime int64 `bson:"l_uploadtime"`
|
|
|
+ Audit bool `bson:"audit"`
|
|
|
ListIsFilter bool `bson:"listisfilter"`
|
|
|
UpLimit int `bson:"uplimit"`
|
|
|
MaxPage int `bson:"maxpage"`
|
|
@@ -126,7 +126,8 @@ type WarnInfo struct {
|
|
|
func NewStartTask() {
|
|
|
InitInfo() //初始化时间
|
|
|
logger.Info(StartTime, EndTime, Publishtime)
|
|
|
- getCodeBaseInfo() //获取爬虫基本信息
|
|
|
+ getCodeAuditorLog() //获取爬虫上线时间
|
|
|
+ getCodeBaseInfo() //获取爬虫基本信息
|
|
|
//getCodeStatus() //获取爬虫响应状态信息
|
|
|
getPythonSummaryInfo() //获取python汇总信息
|
|
|
getLuaSummaryInfo() //获取lua汇总信息
|
|
@@ -152,48 +153,6 @@ func InitInfo() {
|
|
|
Publishtime = qu.FormatDateByInt64(&StartTime, qu.Date_Short_Layout)
|
|
|
}
|
|
|
|
|
|
-func getCodeAuditorLog() {
|
|
|
- defer qu.Catch()
|
|
|
- sess := util.MgoEB.GetMgoConn()
|
|
|
- defer util.MgoEB.DestoryMongoConn(sess)
|
|
|
- lock := &sync.Mutex{}
|
|
|
- wg := &sync.WaitGroup{}
|
|
|
- ch := make(chan bool, 5)
|
|
|
- query := map[string]interface{}{
|
|
|
- "comeintime": map[string]interface{}{
|
|
|
- "$gte": StartTime,
|
|
|
- "$lt": EndTime,
|
|
|
- },
|
|
|
- }
|
|
|
- fields := map[string]interface{}{
|
|
|
- "code": 1,
|
|
|
- "types": 1,
|
|
|
- }
|
|
|
- it := sess.DB(util.MgoEB.DbName).C("lua_logs_auditor").Find(&query).Select(&fields).Sort("_id").Iter()
|
|
|
- n := 0
|
|
|
- for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
|
|
|
- wg.Add(1)
|
|
|
- ch <- true
|
|
|
- go func(tmp map[string]interface{}) {
|
|
|
- defer func() {
|
|
|
- <-ch
|
|
|
- wg.Done()
|
|
|
- }()
|
|
|
- code := qu.ObjToString(tmp["code"])
|
|
|
- types := qu.ObjToString(tmp["types"])
|
|
|
- lock.Lock()
|
|
|
- CodesAuditorLog[code] = types
|
|
|
- lock.Unlock()
|
|
|
- }(tmp)
|
|
|
- if n%1000 == 0 {
|
|
|
- logger.Info(n)
|
|
|
- }
|
|
|
- tmp = map[string]interface{}{}
|
|
|
- }
|
|
|
- wg.Wait()
|
|
|
- logger.Info("审核记录信息准备完成...", len(CodesAuditorLog))
|
|
|
-}
|
|
|
-
|
|
|
func getCodeBaseInfo() {
|
|
|
defer qu.Catch()
|
|
|
sess := util.MgoEB.GetMgoConn()
|
|
@@ -296,6 +255,50 @@ func getCodeBaseInfo() {
|
|
|
logger.Info("爬虫基本信息准备完成...", len(NewCodeInfoMap))
|
|
|
}
|
|
|
|
|
|
+func getCodeAuditorLog() {
|
|
|
+ defer qu.Catch()
|
|
|
+ sess := util.MgoEB.GetMgoConn()
|
|
|
+ defer util.MgoEB.DestoryMongoConn(sess)
|
|
|
+ lock := &sync.Mutex{}
|
|
|
+ wg := &sync.WaitGroup{}
|
|
|
+ ch := make(chan bool, 5)
|
|
|
+ query := map[string]interface{}{ //查询前一天的审核记录
|
|
|
+ "comeintime": map[string]interface{}{
|
|
|
+ "$gte": StartTime,
|
|
|
+ "$lt": EndTime,
|
|
|
+ },
|
|
|
+ }
|
|
|
+ fields := map[string]interface{}{
|
|
|
+ "code": 1,
|
|
|
+ "types": 1,
|
|
|
+ }
|
|
|
+ it := sess.DB(util.MgoEB.DbName).C("lua_logs_auditor").Find(&query).Select(&fields).Iter()
|
|
|
+ n := 0
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
|
|
|
+ wg.Add(1)
|
|
|
+ ch <- true
|
|
|
+ go func(tmp map[string]interface{}) {
|
|
|
+ defer func() {
|
|
|
+ <-ch
|
|
|
+ wg.Done()
|
|
|
+ }()
|
|
|
+ code := qu.ObjToString(tmp["code"])
|
|
|
+ types := qu.ObjToString(tmp["types"])
|
|
|
+ lock.Lock()
|
|
|
+ if sp := NewCodeInfoMap[code]; sp != nil {
|
|
|
+ sp.Audit = strings.Contains(types, "审核")
|
|
|
+ }
|
|
|
+ lock.Unlock()
|
|
|
+ }(tmp)
|
|
|
+ if n%1000 == 0 {
|
|
|
+ logger.Info(n)
|
|
|
+ }
|
|
|
+ tmp = map[string]interface{}{}
|
|
|
+ }
|
|
|
+ wg.Wait()
|
|
|
+ logger.Info("审核记录信息准备完成...")
|
|
|
+}
|
|
|
+
|
|
|
func getCodeStatus() {
|
|
|
defer qu.Catch()
|
|
|
sess := util.MgoEB.GetMgoConn()
|
|
@@ -717,7 +720,7 @@ func getSpiderDownloadRateData() {
|
|
|
sp.Page_TwoOk = !(page_fail == alltimes && page_fail > 0)
|
|
|
//列表页
|
|
|
if !sp.List_IsGetData {
|
|
|
- sp.List_IsGetData = sp.List_AllInTimes == sp.List_NoDataTimes
|
|
|
+ sp.List_IsGetData = sp.List_RunTimes == sp.List_NoDataTimes
|
|
|
}
|
|
|
}
|
|
|
lock.Unlock()
|
|
@@ -923,7 +926,7 @@ func listErr(sp *NewSpider) {
|
|
|
errFlag = true
|
|
|
}
|
|
|
if errFlag {
|
|
|
- if sp.Platform == PLATFORM_COMM && strings.Contains(CodesAuditorLog[sp.Code], "审核") { //通用平台前一天审核的爬虫不建列表页异常任务
|
|
|
+ if sp.Platform == PLATFORM_COMM && sp.Audit { //通用平台前一天审核的爬虫不建列表页异常任务(审核上线后,当天并未执行采集)
|
|
|
return
|
|
|
}
|
|
|
//sp.Error[NEWTASK_LISTERR] = &ErrorInfo{
|
|
@@ -1041,7 +1044,7 @@ func downloadRateErr(sp *NewSpider) {
|
|
|
} else { //无标记,记录采集频率异常
|
|
|
errFlag = true
|
|
|
}
|
|
|
- if errFlag && sp.List_AllInTimes > 0 && sp.AuditTime > 24 {
|
|
|
+ if errFlag && sp.List_AllInTimes > 0 && !sp.Audit {
|
|
|
sp.ErrTypeMap[qu.IntAll(NEWTASK_RATEERR)] = true
|
|
|
sp.ErrDescription += "采集频率异常:\n列表页共采集" + fmt.Sprint(sp.List_RunTimes) + "轮,其中有" + fmt.Sprint(sp.List_AllInTimes) + "轮数据全采\n"
|
|
|
if sp.ErrType < "0" {
|
|
@@ -1053,11 +1056,13 @@ func downloadRateErr(sp *NewSpider) {
|
|
|
if sp.List_AllInTimes > 0 {
|
|
|
errFlag := false
|
|
|
if sp.Model == 1 { //列表页、详情页分开采集模式
|
|
|
- if sp.AuditTime > 24 && (sp.MaxPage == 1 || sp.MaxPage > 100) { //分开采集且不是无限翻页,爬虫审核时间超过24小时,记录异常
|
|
|
+ if !sp.Audit && (sp.MaxPage == 1 || sp.MaxPage > 100) { //分开采集且不是无限翻页,爬虫审核时间超过24小时,记录异常
|
|
|
errFlag = true
|
|
|
}
|
|
|
- } else if sp.Event != 7410 { //列表页、详情页顺序采集模式(排除7410节点)
|
|
|
- if sp.CodeTags != nil {
|
|
|
+ } else if sp.Event != 7410 || sp.Platform != PLATFORM_LUA { //列表页、详情页顺序采集模式(排除7410节点)
|
|
|
+ if sp.Platform == PLATFORM_COMM && sp.Audit { //通用平台前一天审核的爬虫不建列表页异常任务
|
|
|
+ errFlag = false
|
|
|
+ } else if sp.CodeTags != nil {
|
|
|
tagTime, _ := sp.CodeTags[NEWTASK_RATEERR].(int64)
|
|
|
if tagTime == 0 { //无频率异常标记
|
|
|
errFlag = true
|