@@ -94,7 +94,6 @@ func DownloadHighDetail(code string) {
 	for {
 		//logger.Info("爬虫代码:", s.Code, "已下架:", s.Stop)
 		//if !s.Stop { //the spider is still running
-		//TODO delayed collection has not been added yet
 		/*
 			1. At the start of each round, first query the data downloaded on the current day.
			2. If that query returns no data, step back one day at a time and query again (tentatively 50 records per query).
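
A minimal sketch of the per-round selection strategy described in the comment block in the hunk above. The 50-record batch size comes from the patch; the findByDay helper, its signature, and the maxBackDays bound are illustrative assumptions, not part of the code.

	// nextBatch prefers records downloaded today and, when a day yields nothing,
	// steps back one day per attempt, fetching at most 50 records per query.
	// Assumes: import "time".
	func nextBatch(findByDay func(day time.Time, limit int) []map[string]interface{}, maxBackDays int) []map[string]interface{} {
		day := time.Now()
		for i := 0; i <= maxBackDays; i++ {
			if batch := findByDay(day, 50); len(batch) > 0 {
				return batch // found data for this day
			}
			day = day.AddDate(0, 0, -1) // no data: look one day earlier
		}
		return nil
	}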
@@ -239,6 +238,18 @@ func DownloadHighDetail(code string) {
 				return
 			}
 		}
+		//analyze the detail text and attachments; abnormal downloads are retried
+		if AnalysisProjectInfo(data) {
+			times++
+			ss := map[string]interface{}{"times": times, "updatetime": time.Now().Unix()}
+			if times >= 3 { //after 3 failed downloads, stop downloading for today and set state to -1
+				ss["state"] = -1
+				ss["detailfilerr"] = true
+			}
+			set := map[string]interface{}{"$set": ss}
+			MgoS.Update("spider_highlistdata", query, set, false, false)
+			return
+		}
 		t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
 		if t1 > time.Now().Unix() { //guard against publish times in the future
 			data["publishtime"] = time.Now().Unix()
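
For readers unfamiliar with the MgoS wrapper used above, the call issues a standard MongoDB partial update: only the fields under $set change, the rest of the record is untouched. A rough equivalent with the plain mgo driver is sketched below; the dial address, database name, and error handling are assumptions, and no meaning is inferred for MgoS.Update's two boolean flags.

	// Illustrative partial update with gopkg.in/mgo.v2 and gopkg.in/mgo.v2/bson.
	sess, err := mgo.Dial("mongodb://127.0.0.1:27017") // placeholder address
	if err != nil {
		log.Fatal(err)
	}
	defer sess.Close()
	set := bson.M{"$set": bson.M{"times": times, "updatetime": time.Now().Unix(), "state": -1, "detailfilerr": true}}
	err = sess.DB("spider").C("spider_highlistdata").Update(query, set) // "spider" database name is assumed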
@@ -247,15 +258,15 @@ func DownloadHighDetail(code string) {
 		delete(data, "checkpublishtime")
 		data["comeintime"] = time.Now().Unix()
 		//counters
-		tmpsp1, b := Allspiders.Load(sp.Code)
-		if b {
-			sp1, ok := tmpsp1.(*Spider)
-			if ok {
-				atomic.AddInt32(&sp1.LastDowncount, 1)
-				atomic.AddInt32(&sp1.TodayDowncount, 1)
-				atomic.AddInt32(&sp1.TotalDowncount, 1)
-			}
-		}
+		//tmpsp1, b := Allspiders.Load(sp.Code)
+		//if b {
+		//	sp1, ok := tmpsp1.(*Spider)
+		//	if ok {
+		//		atomic.AddInt32(&sp1.LastDowncount, 1)
+		//		atomic.AddInt32(&sp1.TodayDowncount, 1)
+		//		atomic.AddInt32(&sp1.TotalDowncount, 1)
+		//	}
+		//}
 		data["spidercode"] = sp.Code
 		data["dataging"] = 0
 		data["iscompete"] = sp.IsCompete //spiders added after 2021-11-01 no longer show the original link (checked by the save service)
@@ -272,6 +283,35 @@ func DownloadHighDetail(code string) {
 		}
 	}
 }
+
+//records whose detail is only the placeholder "详情请访问原网页!" and whose attachments failed to download are not counted as successful downloads
+func AnalysisProjectInfo(data map[string]interface{}) bool {
+	defer qu.Catch()
+	detail := qu.ObjToString(data["detail"])
+	if detail == "详情请访问原网页!" || detail == "<br/>详情请访问原网页!" { //exact match, not Contains: some records are stitched together from JSON with missing fields and also carry the "详情请访问原网页" text
+		if projectinfo, ok := data["projectinfo"].(map[string]interface{}); ok && len(projectinfo) > 0 {
+			if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok && len(attachments) > 0 {
+				fileOk := false
+				for _, data := range attachments {
+					if d, ok := data.(map[string]interface{}); ok {
+						fid := qu.ObjToString(d["fid"])
+						if fid != "" { //the attachment was uploaded successfully
+							fileOk = true
+							break
+						}
+					}
+				}
+				return !fileOk
+			} else {
+				return true
+			}
+		} else {
+			return true
+		}
+	}
+	return false
+}
+
 func FilterByDetail(href string, query, data map[string]interface{}) bool {
 	if data["delete"] != nil {
 		//incremental
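
A hypothetical usage sketch for the AnalysisProjectInfo function added above, showing which record shapes it flags. The placeholder detail text and the fid check come from the patch; the concrete field values are invented for illustration (assumes import "fmt").

	// A record whose detail is only the placeholder text and whose attachments
	// never produced a fid is reported as an abnormal download (true).
	bad := map[string]interface{}{
		"detail": "详情请访问原网页!",
		"projectinfo": map[string]interface{}{
			"attachments": map[string]interface{}{
				"1": map[string]interface{}{"fid": ""}, // upload failed, no fid
			},
		},
	}
	// A record with real detail text is never flagged, regardless of attachments.
	good := map[string]interface{}{"detail": "<p>项目公告正文</p>"}
	fmt.Println(AnalysisProjectInfo(bad))  // true  -> retried, not counted as a success
	fmt.Println(AnalysisProjectInfo(good)) // false -> normal record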