Sfoglia il codice sorgente

更新listdata中列表页采集是否成功标记

maxiaoshan 3 anni fa
parent
commit
27eee019c4
3 file modificati con 37 aggiunte e 4 eliminazioni
  1. 7 0
      src/spider/script.go
  2. 27 2
      src/spider/spider.go
  3. 3 2
      src/spider/store.go

+ 7 - 0
src/spider/script.go

@@ -734,6 +734,13 @@ func (s *Script) LoadScript(code, script_file string, newstate bool) string {
 		S.Push(lua.LString(result))
 		return 1
 	}))
+	//base64加密
+	s.L.SetGlobal("encodeBase64", s.L.NewFunction(func(S *lua.LState) int {
+		text := S.ToString(-1)
+		base64Text := base64.StdEncoding.EncodeToString([]byte(text))
+		S.Push(lua.LString(base64Text))
+		return 1
+	}))
 	//长度
 	s.L.SetGlobal("stringLen", s.L.NewFunction(func(S *lua.LState) int {
 		text := S.ToString(-1)

+ 27 - 2
src/spider/spider.go

@@ -420,12 +420,17 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
 	}
 	db := HexToBigIntMod(href)
 	hashHref := HexText(href)
-	SaveListPageData(paramdata)                                               //存储采集记录
+	id := ""
+	SaveListPageData(paramdata, &id)                                          //存储采集记录
 	isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref) //取全量redis
 	//log.Println("full href:", href, " isExist:", isExist)
 	logger.Debug("full href:", href, " isExist:", isExist)
 	if !s.IsMustDownload && isExist { //非强制下载redis中存在,结束
 		//qu.Debug("非强制下载redis中存在,结束")
+		//更新spider_listdata中数据下载成功标记
+		if id != "" {
+			Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1}})
+		}
 		return
 	}
 	//qu.Debug("----------------下载、解析、入库--------------------")
@@ -438,8 +443,16 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
 			// 	SaveErrorData(paramdata) //保存错误信息
 			// }
 		}
+		//更新spider_listdata中数据下载失败标记
+		if id != "" {
+			Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": -1}})
+		}
 		return
 	}
+	//更新spider_listdata中数据下载成功标记
+	if id != "" {
+		Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1}})
+	}
 	flag := true
 	t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"])) //publishtime
 	if s.IsMustDownload {                                           //强制下载
@@ -462,6 +475,7 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
 	if t1 > time.Now().Unix() { //防止发布时间超前
 		data["publishtime"] = time.Now().Unix()
 	}
+	delete(data, "state")
 	delete(data, "exit")
 	delete(data, "checkpublishtime")
 	data["comeintime"] = time.Now().Unix()
@@ -500,6 +514,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 		}
 		log.Println("href had++:", isExist, href)
 	*/
+	id := ""                    //记录spider_listdata中保存的数据id,便于下载成功后更新状态
 	if util.Config.Modal == 1 { //除7000、7500、7700节点外所有节点只采集列表页信息
 		isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
 		if isExist { //更新redis生命周期
@@ -517,7 +532,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 			*num++ //已采集
 			return
 		}
-		SaveListPageData(paramdata) //保存7000、7500、7700节点列表页采集的信息
+		SaveListPageData(paramdata, &id) //保存7000、7410、7500、7700节点列表页采集的信息
 	}
 	//下载、解析、入库
 	data, err = s.DownloadDetailPage(paramdata, data)
@@ -528,6 +543,10 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 				SaveErrorData(s.MUserName, paramdata, err) //保存错误信息
 			}
 		}
+		//更新spider_listdata中数据下载失败标记
+		if id != "" {
+			Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": -1}})
+		}
 		return
 	} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
 		log.Println("beforeHref:", href, "afterHref:", tmphref)
@@ -541,11 +560,17 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 			util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
 		}
 	}
+	//更新spider_listdata中数据下载成功标记
+	if id != "" {
+		Mgo.UpdateById("spider_listdata", id, map[string]interface{}{"$set": map[string]interface{}{"state": 1}})
+	}
+
 	t1 := util.ParseDate2Int64(qu.ObjToString(data["publishtime"]))
 	if t1 > time.Now().Unix() { //防止发布时间超前
 		data["publishtime"] = time.Now().Unix()
 	}
 	UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detailexcute") //记录modal=0老模式采集到数据心跳
+	delete(data, "state")
 	delete(data, "exit")
 	delete(data, "checkpublishtime")
 	data["comeintime"] = time.Now().Unix()

+ 3 - 2
src/spider/store.go

@@ -256,10 +256,11 @@ func SaveHighListPageData(tmp map[string]interface{}, href string, num *int) {
 }
 
 //保存7000、7500、7700采集的列表页信息
-func SaveListPageData(tmp map[string]interface{}) {
+func SaveListPageData(tmp map[string]interface{}, id *string) {
 	tmp["event"] = lu.Config.Uploadevent
 	tmp["comeintime"] = time.Now().Unix()
-	Mgo.Save("spider_listdata", tmp)
+	tmp["state"] = 0
+	*id = Mgo.Save("spider_listdata", tmp)
 }
 
 //定时任务