|
@@ -6,6 +6,7 @@ import (
|
|
"go.mongodb.org/mongo-driver/bson"
|
|
"go.mongodb.org/mongo-driver/bson"
|
|
qu "qfw/util"
|
|
qu "qfw/util"
|
|
"strconv"
|
|
"strconv"
|
|
|
|
+ "strings"
|
|
"sync"
|
|
"sync"
|
|
"time"
|
|
"time"
|
|
"timetask"
|
|
"timetask"
|
|
@@ -13,6 +14,7 @@ import (
|
|
)
|
|
)
|
|
|
|
|
|
const NEWTASK_LISTERR, NEWTASK_DATAINFOERR, NEWTASK_PAGEFLIPERR, NEWTASK_RATEERR, NEWTASK_DOWNLOADERR, NEWTASK_DATAINFOWARN = "1", "2", "3", "4", "5", "6"
|
|
const NEWTASK_LISTERR, NEWTASK_DATAINFOERR, NEWTASK_PAGEFLIPERR, NEWTASK_RATEERR, NEWTASK_DOWNLOADERR, NEWTASK_DATAINFOWARN = "1", "2", "3", "4", "5", "6"
|
|
|
|
+const PLATFORM_LUA, PLATFORM_CHROME, PLATFORM_COMM, PLATFORM_PYTHON = "golua平台", "chrome", "通用平台", "python" // platform names compared against NewSpider.Platform
|
|
|
|
|
|
var NewCodeInfoMap = map[string]*NewSpider{}
|
|
var NewCodeInfoMap = map[string]*NewSpider{}
|
|
var LuaErrTypeInfo = map[string]string{
|
|
var LuaErrTypeInfo = map[string]string{
|
|
@@ -23,6 +25,7 @@ var LuaErrTypeInfo = map[string]string{
|
|
NEWTASK_DOWNLOADERR: "下载异常",
|
|
NEWTASK_DOWNLOADERR: "下载异常",
|
|
NEWTASK_DATAINFOWARN: "数据异常警告",
|
|
NEWTASK_DATAINFOWARN: "数据异常警告",
|
|
}
|
|
}
|
|
|
|
+var CodesAuditorLog = map[string]string{} // spider code -> "types" value of its lua_logs_auditor record; filled by getCodeAuditorLog
|
|
var DataInfoErrMap = map[int]string{ //需要建数据异常错误的类型
|
|
var DataInfoErrMap = map[int]string{ //需要建数据异常错误的类型
|
|
1: "Save Coll Error",
|
|
1: "Save Coll Error",
|
|
4: "Field Value Is Null",
|
|
4: "Field Value Is Null",
|
|
@@ -41,6 +44,12 @@ var DataInfoWarnMap = map[int]string{ //需要建数据异常警告的类型
|
|
|
|
|
|
var UpdateLuaconfig [][]map[string]interface{}
|
|
var UpdateLuaconfig [][]map[string]interface{}
|
|
|
|
|
|
|
|
+var (
|
|
|
|
+ StartTime int64 // start time of the previous working day; InitInfo assigns util.GetTime(-1)
|
|
|
|
+ EndTime int64 // end time of the previous working day; InitInfo assigns util.GetTime(0)
|
|
|
|
+ Publishtime string // publish time; InitInfo formats StartTime with qu.Date_Short_Layout
|
|
|
|
+)
|
|
|
|
+
|
|
type NewSpider struct {
|
|
type NewSpider struct {
|
|
//爬虫基本信息
|
|
//爬虫基本信息
|
|
Code string `bson:"code"`
|
|
Code string `bson:"code"`
|
|
@@ -90,6 +99,23 @@ type NewSpider struct {
|
|
ErrDescription string `bson:"errdescription"` //异常描述
|
|
ErrDescription string `bson:"errdescription"` //异常描述
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+type Task struct { // Task groups the task/exception details recorded for one spider
|
|
|
|
+ Platform string // platform
|
|
|
|
+ Code string // spider code
|
|
|
|
+ Site string // site
|
|
|
|
+ Channel string // channel (column) of the site
|
|
|
|
+ ModifyUser string // maintainer
|
|
|
|
+ ModifyId string // maintainer id
|
|
|
|
+ ErrType int // error type: 8 = collection-frequency error; 7 = list-page error; 5 = download error; 4 = runtime error; 3 = publish-time error; 2 = data error; 1 = data-volume error
|
|
|
|
+ Description string // description
|
|
|
|
+ State int // state
|
|
|
|
+ Event int // node (event)
|
|
|
|
+ Num int // download count
|
|
|
|
+ FrequencyErrTimes int // number of collection-frequency errors for the spider
|
|
|
|
+ DescribeMap map[int]string // NOTE(review): presumably keyed by ErrType with a description per type — confirm against the code that fills it
|
|
|
|
+ //ErrInfo map[string]map[string]interface{} // set of errors (disabled)
|
|
|
|
+}
|
|
|
|
+
|
|
type WarnInfo struct {
|
|
type WarnInfo struct {
|
|
Info string `bson:"info"`
|
|
Info string `bson:"info"`
|
|
Num int `bson:"num"`
|
|
Num int `bson:"num"`
|
|
@@ -100,8 +126,8 @@ type WarnInfo struct {
|
|
func NewStartTask() {
|
|
func NewStartTask() {
|
|
InitInfo() //初始化时间
|
|
InitInfo() //初始化时间
|
|
logger.Info(StartTime, EndTime, Publishtime)
|
|
logger.Info(StartTime, EndTime, Publishtime)
|
|
- getCodeBaseInfo() //获取爬虫基本信息
|
|
|
|
- getCodeStatus() //获取爬虫响应状态信息
|
|
|
|
|
|
+ getCodeBaseInfo() //获取爬虫基本信息
|
|
|
|
+ //getCodeStatus() //获取爬虫响应状态信息
|
|
getPythonSummaryInfo() //获取python汇总信息
|
|
getPythonSummaryInfo() //获取python汇总信息
|
|
getLuaSummaryInfo() //获取lua汇总信息
|
|
getLuaSummaryInfo() //获取lua汇总信息
|
|
getSpiderWarnInfo() //获取异常数据
|
|
getSpiderWarnInfo() //获取异常数据
|
|
@@ -111,6 +137,63 @@ func NewStartTask() {
|
|
timetask.CountLuaPythonNumEveryDay() //每日采集量统计
|
|
timetask.CountLuaPythonNumEveryDay() //每日采集量统计
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+// InitInfo resets UpdateLuaconfig and sets the StartTime, EndTime and Publishtime globals for a run.
|
|
|
|
+func InitInfo() {
|
|
|
|
+ defer qu.Catch() // NOTE(review): presumably recovers from panics — confirm qu.Catch
|
|
|
|
+ //CodeInfoMap = map[string]*Spider{} // initialize
|
|
|
|
+ //AllHref = map[string]string{}
|
|
|
|
+ //SameDayHref = map[string]string{}
|
|
|
|
+ //DataBakAllHref = map[string]string{}
|
|
|
|
+ //UserTaskNum = map[string]map[string]int{}
|
|
|
|
+ //StartTime, EndTime = util.GetWorkDayTimeUnix()
|
|
|
|
+ UpdateLuaconfig = [][]map[string]interface{}{} // drop any updates queued by an earlier run
|
|
|
|
+ StartTime = util.GetTime(-1) // NOTE(review): presumably a day-offset timestamp (-1 = previous day) — confirm util.GetTime
|
|
|
|
+ EndTime = util.GetTime(0)
|
|
|
|
+ Publishtime = qu.FormatDateByInt64(&StartTime, qu.Date_Short_Layout) // StartTime rendered as a date string
|
|
|
|
+} // NOTE(review): CodesAuditorLog is not cleared here — confirm it is reset elsewhere between runs
|
|
|
|
+// getCodeAuditorLog loads lua_logs_auditor records with comeintime in [StartTime, EndTime) into CodesAuditorLog (code -> types).
|
|
|
|
+func getCodeAuditorLog() {
|
|
|
|
+ defer qu.Catch()
|
|
|
|
+ sess := util.MgoEB.GetMgoConn()
|
|
|
|
+ defer util.MgoEB.DestoryMongoConn(sess)
|
|
|
|
+ lock := &sync.Mutex{} // guards writes to the shared CodesAuditorLog map
|
|
|
|
+ wg := &sync.WaitGroup{}
|
|
|
|
+ ch := make(chan bool, 5) // semaphore: at most 5 worker goroutines in flight
|
|
|
|
+ query := map[string]interface{}{
|
|
|
|
+ "comeintime": map[string]interface{}{
|
|
|
|
+ "$gte": StartTime,
|
|
|
|
+ "$lt": EndTime,
|
|
|
|
+ },
|
|
|
|
+ }
|
|
|
|
+ fields := map[string]interface{}{
|
|
|
|
+ "code": 1,
|
|
|
|
+ "types": 1,
|
|
|
|
+ }
|
|
|
|
+ it := sess.DB(util.MgoEB.DbName).C("lua_logs_auditor").Find(&query).Select(&fields).Sort("_id").Iter() // NOTE(review): no it.Close()/it.Err() call appears in this function, so iteration errors go unnoticed
|
|
|
|
+ n := 0
|
|
|
|
+ for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
|
|
|
|
+ wg.Add(1)
|
|
|
|
+ ch <- true // blocks while 5 workers are already running
|
|
|
|
+ go func(tmp map[string]interface{}) {
|
|
|
|
+ defer func() {
|
|
|
|
+ <-ch
|
|
|
|
+ wg.Done()
|
|
|
|
+ }()
|
|
|
|
+ code := qu.ObjToString(tmp["code"])
|
|
|
|
+ types := qu.ObjToString(tmp["types"])
|
|
|
|
+ lock.Lock()
|
|
|
|
+ CodesAuditorLog[code] = types // NOTE(review): workers may finish out of order, so for a repeated code the surviving value is not guaranteed to be the last one by _id
|
|
|
|
+ lock.Unlock()
|
|
|
|
+ }(tmp)
|
|
|
|
+ if n%1000 == 0 { // progress log every 1000 records (also fires at n == 0)
|
|
|
|
+ logger.Info(n)
|
|
|
|
+ }
|
|
|
|
+ tmp = map[string]interface{}{} // fresh map so the next record does not overwrite the one handed to the goroutine
|
|
|
|
+ }
|
|
|
|
+ wg.Wait() // all workers must finish before the map size is reported
|
|
|
|
+ logger.Info("审核记录信息准备完成...", len(CodesAuditorLog)) // log text means "audit record info ready"
|
|
|
|
+}
|
|
|
|
+
|
|
func getCodeBaseInfo() {
|
|
func getCodeBaseInfo() {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
sess := util.MgoEB.GetMgoConn()
|
|
sess := util.MgoEB.GetMgoConn()
|
|
@@ -119,30 +202,35 @@ func getCodeBaseInfo() {
|
|
wg := &sync.WaitGroup{}
|
|
wg := &sync.WaitGroup{}
|
|
ch := make(chan bool, 5)
|
|
ch := make(chan bool, 5)
|
|
query := map[string]interface{}{
|
|
query := map[string]interface{}{
|
|
- "$or": []interface{}{
|
|
|
|
- //lua、python上线爬虫
|
|
|
|
- map[string]interface{}{
|
|
|
|
- "state": map[string]interface{}{
|
|
|
|
- "$in": []int{5, 11}, //上架、上线爬虫
|
|
|
|
- },
|
|
|
|
- //"platform": map[string]interface{}{
|
|
|
|
- // "$in": []string{"golua平台", "chrome", "python"},
|
|
|
|
- //},
|
|
|
|
- },
|
|
|
|
- //lua正在被维护的爬虫和上架爬虫
|
|
|
|
- map[string]interface{}{
|
|
|
|
- "platform": map[string]interface{}{
|
|
|
|
- "$in": []string{"golua平台", "chrome"},
|
|
|
|
- },
|
|
|
|
- "state": map[string]interface{}{
|
|
|
|
- "$in": []int{0, 1, 2}, //待完成、待审核、未通过
|
|
|
|
- },
|
|
|
|
- "event": map[string]interface{}{
|
|
|
|
- "$ne": 7000,
|
|
|
|
- },
|
|
|
|
- },
|
|
|
|
|
|
+ "state": map[string]interface{}{ //所有平台在线爬虫
|
|
|
|
+ "$in": []int{5, 11}, //上架、上线爬虫
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
+ //query := map[string]interface{}{
|
|
|
|
+ // "$or": []interface{}{
|
|
|
|
+ // //lua、python上线爬虫
|
|
|
|
+ // map[string]interface{}{
|
|
|
|
+ // "state": map[string]interface{}{
|
|
|
|
+ // "$in": []int{5, 11}, //上架、上线爬虫
|
|
|
|
+ // },
|
|
|
|
+ // //"platform": map[string]interface{}{
|
|
|
|
+ // // "$in": []string{"golua平台", "chrome", "python"},
|
|
|
|
+ // //},
|
|
|
|
+ // },
|
|
|
|
+ // //lua正在被维护的爬虫和上架爬虫
|
|
|
|
+ // map[string]interface{}{
|
|
|
|
+ // "platform": map[string]interface{}{
|
|
|
|
+ // "$in": []string{"golua平台", "chrome"},
|
|
|
|
+ // },
|
|
|
|
+ // "state": map[string]interface{}{
|
|
|
|
+ // "$in": []int{0, 1, 2}, //待完成、待审核、未通过
|
|
|
|
+ // },
|
|
|
|
+ // "event": map[string]interface{}{
|
|
|
|
+ // "$ne": 7000,
|
|
|
|
+ // },
|
|
|
|
+ // },
|
|
|
|
+ // },
|
|
|
|
+ //}
|
|
fields := map[string]interface{}{
|
|
fields := map[string]interface{}{
|
|
"code": 1,
|
|
"code": 1,
|
|
"site": 1,
|
|
"site": 1,
|
|
@@ -159,7 +247,7 @@ func getCodeBaseInfo() {
|
|
"infoformat": 1,
|
|
"infoformat": 1,
|
|
"param_common": 1,
|
|
"param_common": 1,
|
|
}
|
|
}
|
|
- it := sess.DB(util.MgoEB.DbName).C("luaconfig").Find(&query).Select(&fields).Iter()
|
|
|
|
|
|
+ it := sess.DB(util.MgoEB.DbName).C("luaconfig_online").Find(&query).Select(&fields).Iter()
|
|
n := 0
|
|
n := 0
|
|
for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
|
|
for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
|
|
wg.Add(1)
|
|
wg.Add(1)
|
|
@@ -190,8 +278,10 @@ func getCodeBaseInfo() {
|
|
sp.Model = util.CodeEventModel[sp.Event]
|
|
sp.Model = util.CodeEventModel[sp.Event]
|
|
sp.ListDataColl = util.CodeListDataColl[sp.Event]
|
|
sp.ListDataColl = util.CodeListDataColl[sp.Event]
|
|
sp.MaxPage = maxPage
|
|
sp.MaxPage = maxPage
|
|
- if sp.Platform == "python" {
|
|
|
|
|
|
+ if sp.Platform == PLATFORM_PYTHON {
|
|
sp.Model = 1
|
|
sp.Model = 1
|
|
|
|
+ } else if sp.Platform == PLATFORM_COMM {
|
|
|
|
+ sp.Model = 0
|
|
}
|
|
}
|
|
lock.Lock()
|
|
lock.Lock()
|
|
NewCodeInfoMap[sp.Code] = sp
|
|
NewCodeInfoMap[sp.Code] = sp
|
|
@@ -476,7 +566,7 @@ func getSpiderHighListDownloadNum() { //竞品数据暂未统计(延迟采集
|
|
state := qu.IntAll(tmp["state"])
|
|
state := qu.IntAll(tmp["state"])
|
|
times := tmp["times"]
|
|
times := tmp["times"]
|
|
lock.Lock()
|
|
lock.Lock()
|
|
- if sp := NewCodeInfoMap[code]; sp != nil {
|
|
|
|
|
|
+ if sp := NewCodeInfoMap[code]; sp != nil && (sp.Platform == PLATFORM_LUA || sp.Platform == PLATFORM_CHROME) {
|
|
if state == 1 {
|
|
if state == 1 {
|
|
sp.Detail_DownloadSuccessNum++
|
|
sp.Detail_DownloadSuccessNum++
|
|
} else if state == -1 {
|
|
} else if state == -1 {
|
|
@@ -532,7 +622,7 @@ func getSpiderListDownloadNum() {
|
|
href := qu.ObjToString(tmp["href"])
|
|
href := qu.ObjToString(tmp["href"])
|
|
lock.Lock()
|
|
lock.Lock()
|
|
defer lock.Unlock()
|
|
defer lock.Unlock()
|
|
- if sp := NewCodeInfoMap[code]; sp != nil {
|
|
|
|
|
|
+ if sp := NewCodeInfoMap[code]; sp != nil && (sp.Platform == PLATFORM_LUA || sp.Platform == PLATFORM_CHROME) {
|
|
tmpState := repeatHrefMap[href]
|
|
tmpState := repeatHrefMap[href]
|
|
if tmpState == 1 { //该href已记录下载成功,后续不做任务记录
|
|
if tmpState == 1 { //该href已记录下载成功,后续不做任务记录
|
|
return
|
|
return
|
|
@@ -614,7 +704,7 @@ func getSpiderDownloadRateData() {
|
|
sp.Page_FlipOk = !(uplimit > 0)
|
|
sp.Page_FlipOk = !(uplimit > 0)
|
|
sp.UpLimit = uplimit
|
|
sp.UpLimit = uplimit
|
|
//判断第一页采集是否异常
|
|
//判断第一页采集是否异常
|
|
- sp.Page_OneOk = !(page_onefail == alltimes && page_onefail > 0)
|
|
|
|
|
|
+ sp.Page_OneOk = !(page_onefail > 0 && page_onefail == alltimes)
|
|
if sp.Page_OneOk {
|
|
if sp.Page_OneOk {
|
|
percent := float64(page_onefail) / float64(alltimes)
|
|
percent := float64(page_onefail) / float64(alltimes)
|
|
if page_onefail <= 5 && (percent > 0.75 && percent < 1) {
|
|
if page_onefail <= 5 && (percent > 0.75 && percent < 1) {
|
|
@@ -625,6 +715,10 @@ func getSpiderDownloadRateData() {
|
|
}
|
|
}
|
|
//判断第二页采集是否异常
|
|
//判断第二页采集是否异常
|
|
sp.Page_TwoOk = !(page_fail == alltimes && page_fail > 0)
|
|
sp.Page_TwoOk = !(page_fail == alltimes && page_fail > 0)
|
|
|
|
+ //列表页
|
|
|
|
+ if !sp.List_IsGetData {
|
|
|
|
+ sp.List_IsGetData = sp.List_AllInTimes == sp.List_NoDataTimes
|
|
|
|
+ }
|
|
}
|
|
}
|
|
lock.Unlock()
|
|
lock.Unlock()
|
|
}(tmp)
|
|
}(tmp)
|
|
@@ -812,7 +906,7 @@ func getAllErr(sp *NewSpider) {
|
|
}
|
|
}
|
|
func listErr(sp *NewSpider) {
|
|
func listErr(sp *NewSpider) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
- if sp.Platform == "python" && !sp.Py_IsValid {
|
|
|
|
|
|
+ if sp.Platform == PLATFORM_PYTHON && !sp.Py_IsValid {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
//if !sp.List_IsGetData || sp.List_RunTimes == 0 {
|
|
//if !sp.List_IsGetData || sp.List_RunTimes == 0 {
|
|
@@ -829,6 +923,9 @@ func listErr(sp *NewSpider) {
|
|
errFlag = true
|
|
errFlag = true
|
|
}
|
|
}
|
|
if errFlag {
|
|
if errFlag {
|
|
|
|
+ if sp.Platform == PLATFORM_COMM && strings.Contains(CodesAuditorLog[sp.Code], "审核") { //通用平台前一天审核的爬虫不建列表页异常任务
|
|
|
|
+ return
|
|
|
|
+ }
|
|
//sp.Error[NEWTASK_LISTERR] = &ErrorInfo{
|
|
//sp.Error[NEWTASK_LISTERR] = &ErrorInfo{
|
|
// ErrInfo: map[string]bool{LuaErrTypeInfo[NEWTASK_LISTERR]: true},
|
|
// ErrInfo: map[string]bool{LuaErrTypeInfo[NEWTASK_LISTERR]: true},
|
|
//}
|
|
//}
|
|
@@ -884,10 +981,9 @@ func dataInfoErr(sp *NewSpider) {
|
|
}
|
|
}
|
|
|
|
|
|
}
|
|
}
|
|
-
|
|
|
|
func pageFlipErr(sp *NewSpider) {
|
|
func pageFlipErr(sp *NewSpider) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
- if sp.Platform == "python" {
|
|
|
|
|
|
+ if sp.Platform == PLATFORM_PYTHON {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
errFlag := false
|
|
errFlag := false
|
|
@@ -928,10 +1024,9 @@ func pageFlipErr(sp *NewSpider) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
-
|
|
|
|
func downloadRateErr(sp *NewSpider) {
|
|
func downloadRateErr(sp *NewSpider) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
- if sp.Platform == "python" {
|
|
|
|
|
|
+ if sp.Platform == PLATFORM_PYTHON {
|
|
if !sp.Py_IsValid { //无效爬虫
|
|
if !sp.Py_IsValid { //无效爬虫
|
|
return
|
|
return
|
|
} else {
|
|
} else {
|
|
@@ -988,7 +1083,7 @@ func downloadRateErr(sp *NewSpider) {
|
|
}
|
|
}
|
|
func downloadFailedErr(sp *NewSpider) {
|
|
func downloadFailedErr(sp *NewSpider) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
- if sp.Platform == "python" && !sp.Py_IsValid {
|
|
|
|
|
|
+ if sp.Platform == PLATFORM_PYTHON && !sp.Py_IsValid {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
flagTime := util.GetTime(-7)
|
|
flagTime := util.GetTime(-7)
|
|
@@ -1183,7 +1278,7 @@ func (sp *NewSpider) getCodeFailDataCount() (int, float64) {
|
|
return allCount, value
|
|
return allCount, value
|
|
}
|
|
}
|
|
|
|
|
|
-//更新爬虫
|
|
|
|
|
|
+// 更新爬虫
|
|
func updateLuaconfig() {
|
|
func updateLuaconfig() {
|
|
if len(UpdateLuaconfig) > 0 {
|
|
if len(UpdateLuaconfig) > 0 {
|
|
util.MgoEB.UpdateBulk("luaconfig", UpdateLuaconfig...)
|
|
util.MgoEB.UpdateBulk("luaconfig", UpdateLuaconfig...)
|
|
@@ -1191,7 +1286,7 @@ func updateLuaconfig() {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
-//关闭任务
|
|
|
|
|
|
+// 关闭任务
|
|
func closeTask() {
|
|
func closeTask() {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
query := map[string]interface{}{ //关闭7天未转为待处理的下载异常,数据异常警告类型的任务
|
|
query := map[string]interface{}{ //关闭7天未转为待处理的下载异常,数据异常警告类型的任务
|