|
@@ -38,89 +38,89 @@ func (s *Spider) StartSpider() {
|
|
|
}
|
|
|
|
|
|
//Load the data that is due for collection and collect it
|
|
|
-func (s *Spider) DownloadHistoryDetail() { // DownloadHistoryDetail queries spider_historydata for this spider's state-0 records, downloads each record's detail page, passes the result to Store and writes the outcome back into the record's state field.
|
|
|
- defer qu.Catch()
|
|
|
- q := map[string]interface{}{"spidercode": s.Code, "state": 0}
|
|
|
- o := map[string]interface{}{"_id": 1}
|
|
|
- f := map[string]interface{}{
|
|
|
- "state": 0,
|
|
|
- "comeintime": 0,
|
|
|
- "event": 0,
|
|
|
- }
|
|
|
- //UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail") //record the heartbeat for detail (third-level) page collection
|
|
|
- list, _ := MgoS.Find("spider_historydata", q, o, f, false, 0, 200) //NOTE(review): the second result of Find is discarded and list is dereferenced below without a nil check; presumably o is a sort, f a projection and 200 a limit - confirm against MgoS.Find
|
|
|
- if len(*list) == 0 { //zero records means there is nothing to download; the spider is retired
|
|
|
- s.Stop = true
|
|
|
- return
|
|
|
- }
|
|
|
- //collect (not multi-threaded for now)
|
|
|
- for _, tmp := range *list {
|
|
|
- id := tmp["_id"]
|
|
|
- href := qu.ObjToString(tmp["href"])
|
|
|
- hashHref := sputil.HexText(href)
|
|
|
- isExist := sputil.RedisClusterExists(hashHref) //dedupe against the full href set in redis
|
|
|
- if isExist {
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "href", "updatetime": time.Now().Unix()}} //already exists: set state to 1
|
|
|
- MgoS.UpdateById("spider_historydata", id, set)
|
|
|
- return //NOTE(review): this return (like the later ones inside this loop) exits the whole function, so the remaining records and the final LoadScript are skipped - confirm continue was not intended
|
|
|
- }
|
|
|
- success := true //flag: whether the data was downloaded successfully
|
|
|
- delete(tmp, "_id") //drop the _id field, which the list-page info does not need
|
|
|
- data := map[string]interface{}{}
|
|
|
- for k, v := range tmp {
|
|
|
- data[k] = v
|
|
|
- }
|
|
|
- //download, parse, store
|
|
|
- data, err := s.DownloadDetailPage(tmp, data)
|
|
|
- //UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //heartbeat for data download
|
|
|
- if err != nil || data == nil {
|
|
|
- success = false
|
|
|
- if err != nil {
|
|
|
- logger.Error(s.Code, err, tmp)
|
|
|
- //if len(tmp) > 0 {
|
|
|
- // SaveErrorData(s.MUserName, tmp, err) //save the error information
|
|
|
- //}
|
|
|
- } /*else if data == nil && times >= 3 { //download problem: create an editor task
|
|
|
- DownloadErrorData(s.Code, tmp)
|
|
|
- }*/
|
|
|
- } else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //the detail page replaced href, so it differs from the original one
|
|
|
- sputil.RedisClusterSet(hashHref, "", -1)
|
|
|
- }
|
|
|
- if !success { //download failed
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": -1, "updatetime": time.Now().Unix()}}
|
|
|
- MgoS.UpdateById("spider_historydata", id, set)
|
|
|
- return
|
|
|
- } else if data["delete"] != nil { //filtered out by the detail page
|
|
|
- sputil.RedisClusterSet(hashHref, "", -1) //filtered-out data is recorded in the full-set redis
|
|
|
- //update mgo: data to be deleted gets state=1 in spider_historydata so it is not downloaded again; update redis
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true, "updatetime": time.Now().Unix()}}
|
|
|
- MgoS.UpdateById("spider_historydata", id, set)
|
|
|
- return
|
|
|
- }
|
|
|
- //analyse body text and attachments; abnormally downloaded data is downloaded again
|
|
|
- if AnalysisProjectInfo(data) {
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": -1, "detailfilerr": true, "updatetime": time.Now().Unix()}}
|
|
|
- MgoS.UpdateById("spider_historydata", id, set)
|
|
|
- return
|
|
|
- }
|
|
|
- t1 := sputil.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
|
|
|
- if t1 > time.Now().Unix() { //guard against a publish time in the future
|
|
|
- data["publishtime"] = time.Now().Unix()
|
|
|
- }
|
|
|
- delete(data, "exit")
|
|
|
- delete(data, "checkpublishtime")
|
|
|
- data["comeintime"] = time.Now().Unix()
|
|
|
- data["spidercode"] = s.Code
|
|
|
- data["dataging"] = 0
|
|
|
- data["iscompete"] = s.IsCompete //spiders added after 2021-11-01 no longer show the original link (decided by the save service)
|
|
|
- //send to the save service
|
|
|
- Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
|
|
|
- set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}} //download succeeded: set state to 1
|
|
|
- MgoS.UpdateById("spider_historydata", id, set)
|
|
|
- }
|
|
|
- //LoadScript once collection is finished
|
|
|
- s.LoadScript(&s.Name, &s.Channel, &s.MUserName, s.Code, s.ScriptFile, true, false)
|
|
|
-}
|
|
|
+//func (s *Spider) DownloadHistoryDetail() {
|
|
|
+// defer qu.Catch()
|
|
|
+// q := map[string]interface{}{"spidercode": s.Code, "state": 0}
|
|
|
+// o := map[string]interface{}{"_id": 1}
|
|
|
+// f := map[string]interface{}{
|
|
|
+// "state": 0,
|
|
|
+// "comeintime": 0,
|
|
|
+// "event": 0,
|
|
|
+// }
|
|
|
+// //UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail") //记录采集三级页心跳
|
|
|
+// list, _ := MgoS.Find("spider_historydata", q, o, f, false, 0, 200)
|
|
|
+// if len(*list) == 0 { //数据量为0,表示无可下载数据,爬虫作废
|
|
|
+// s.Stop = true
|
|
|
+// return
|
|
|
+// }
|
|
|
+// //采集(目前未开多线程)
|
|
|
+// for _, tmp := range *list {
|
|
|
+// id := tmp["_id"]
|
|
|
+// href := qu.ObjToString(tmp["href"])
|
|
|
+// hashHref := sputil.HexText(href)
|
|
|
+// isExist := sputil.RedisClusterExists(hashHref) //全量href redis判重
|
|
|
+// if isExist {
|
|
|
+// set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": "href", "updatetime": time.Now().Unix()}} //已存在state置为1
|
|
|
+// MgoS.UpdateById("spider_historydata", id, set)
|
|
|
+// return
|
|
|
+// }
|
|
|
+// success := true //数据是否下载成功的标志
|
|
|
+// delete(tmp, "_id") //删除列表页信息无用字段_id
|
|
|
+// data := map[string]interface{}{}
|
|
|
+// for k, v := range tmp {
|
|
|
+// data[k] = v
|
|
|
+// }
|
|
|
+// //下载、解析、入库
|
|
|
+// data, err := s.DownloadDetailPage(tmp, data)
|
|
|
+// //UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //下载数据心跳
|
|
|
+// if err != nil || data == nil {
|
|
|
+// success = false
|
|
|
+// if err != nil {
|
|
|
+// logger.Error(s.Code, err, tmp)
|
|
|
+// //if len(tmp) > 0 {
|
|
|
+// // SaveErrorData(s.MUserName, tmp, err) //保存错误信息
|
|
|
+// //}
|
|
|
+// } /*else if data == nil && times >= 3 { //下载问题,建editor任务
|
|
|
+// DownloadErrorData(s.Code, tmp)
|
|
|
+// }*/
|
|
|
+// } else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
|
|
|
+// sputil.RedisClusterSet(hashHref, "", -1)
|
|
|
+// }
|
|
|
+// if !success { //下载失败
|
|
|
+// set := map[string]interface{}{"$set": map[string]interface{}{"state": -1, "updatetime": time.Now().Unix()}}
|
|
|
+// MgoS.UpdateById("spider_historydata", id, set)
|
|
|
+// return
|
|
|
+// } else if data["delete"] != nil { //三级页过滤
|
|
|
+// sputil.RedisClusterSet(hashHref, "", -1) //过滤掉的数据存值全量redis
|
|
|
+// //更新mgo 要删除的数据更新spider_historydata state=1不再下载,更新redis
|
|
|
+// set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true, "updatetime": time.Now().Unix()}}
|
|
|
+// MgoS.UpdateById("spider_historydata", id, set)
|
|
|
+// return
|
|
|
+// }
|
|
|
+// //正文、附件分析,下载异常数据重新下载
|
|
|
+// if AnalysisProjectInfo(data) {
|
|
|
+// set := map[string]interface{}{"$set": map[string]interface{}{"state": -1, "detailfilerr": true, "updatetime": time.Now().Unix()}}
|
|
|
+// MgoS.UpdateById("spider_historydata", id, set)
|
|
|
+// return
|
|
|
+// }
|
|
|
+// t1 := sputil.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
|
|
|
+// if t1 > time.Now().Unix() { //防止发布时间超前
|
|
|
+// data["publishtime"] = time.Now().Unix()
|
|
|
+// }
|
|
|
+// delete(data, "exit")
|
|
|
+// delete(data, "checkpublishtime")
|
|
|
+// data["comeintime"] = time.Now().Unix()
|
|
|
+// data["spidercode"] = s.Code
|
|
|
+// data["dataging"] = 0
|
|
|
+// data["iscompete"] = s.IsCompete //2021-11-01以后新增的爬虫不在展示原文链接(保存服务判断)
|
|
|
+// //发送保存服务
|
|
|
+// Store(s.StoreMode, s.StoreToMsgEvent, s.Collection, s.CoverAttr, data, true)
|
|
|
+// set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}} //下载成功state置为1
|
|
|
+// MgoS.UpdateById("spider_historydata", id, set)
|
|
|
+// }
|
|
|
+// //采集完LoadScript
|
|
|
+// s.LoadScript(&s.Name, &s.Channel, &s.MUserName, s.Code, s.ScriptFile, true, false)
|
|
|
+//}
|
|
|
|
|
|
//定时检测数据集汇总爬虫
|
|
|
func GetHistoryDownloadSpider() {
|