maxiaoshan 3 yıl önce
ebeveyn
işleme
9f8e962a11
3 değiştirilmiş dosya ile 27 ekleme ve 15 silme
  1. +6 -5
      src/spider/download.go
  2. +16 -5
      src/spider/script.go
  3. +5 -5
      src/spider/spider.go

+ 6 - 5
src/spider/download.go

@@ -82,7 +82,7 @@ func Download(downloaderid, url, method string, head map[string]interface{}, enc
 }
 
 //下载页面,发送消息,等待下载
-func DownloadAdv(downloaderid, url, method string, reqparam, head map[string]interface{}, mycookie []*http.Cookie, encoding string, useproxy, ishttps bool, code string, timeout int64) (string, []*http.Cookie) {
+func DownloadAdv(downloaderid, url, method string, reqparam, head map[string]interface{}, mycookie []*http.Cookie, encoding string, useproxy, ishttps bool, code string, timeout int64) (string, []*http.Cookie, map[string]interface{}) {
 	defer mu.Catch()
 	msgid := mu.UUID(8)
 	if len(head) < 1 {
@@ -117,7 +117,7 @@ func DownloadAdv(downloaderid, url, method string, reqparam, head map[string]int
 				"ishttps":  ishttps,
 			}, timeout)
 		} else {
-			return "", nil
+			return "", nil, nil
 		}
 	}
 	if err != nil {
@@ -127,16 +127,17 @@ func DownloadAdv(downloaderid, url, method string, reqparam, head map[string]int
 	tmp := map[string]interface{}{}
 	json.Unmarshal(ret, &tmp)
 	cooks := lu.ParseHttpCookie(tmp["cookie"])
+	headers, _ := tmp["header"].(map[string]interface{})
 	if v, ok := tmp["code"].(string); ok && v == "200" {
 		if isImg {
 			bs, _ := tmp["content"].(string)
-			return string(bs), cooks
+			return string(bs), cooks, headers
 		} else {
 			bs, _ := base64.StdEncoding.DecodeString(tmp["content"].(string))
-			return string(bs), cooks
+			return string(bs), cooks, headers
 		}
 	} else {
-		return "", nil
+		return "", nil, nil
 	}
 }
 

+ 16 - 5
src/spider/script.go

@@ -153,15 +153,18 @@ func (s *Script) LoadScript(code, script_file string, newstate bool) string {
 		json.Unmarshal([]byte(cookie), &mycookie)
 		var ret string
 		var retcookie []*http.Cookie
+		var headers = map[string]interface{}{}
 		if param == nil {
 			ptext := map[string]interface{}{"text": S.ToString(-3)}
-			ret, retcookie = DownloadAdv(s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
+			ret, retcookie, headers = DownloadAdv(s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
 		} else {
-			ret, retcookie = DownloadAdv(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
+			ret, retcookie, headers = DownloadAdv(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
 		}
 		S.Push(lua.LString(ret))
 		scookie, _ := json.Marshal(retcookie)
 		S.Push(lua.LString(scookie))
+		hTable := util.MapToLuaTable(S, headers)
+		S.Push(hTable)
 		atomic.AddInt32(&s.ToDayRequestNum, 1)
 		atomic.AddInt32(&s.TotalRequestNum, 1)
 		end := time.Since(start)
@@ -169,7 +172,7 @@ func (s *Script) LoadScript(code, script_file string, newstate bool) string {
 			s.LastThreeTimes = s.LastThreeTimes[1:]
 		}
 		s.LastThreeTimes = append(s.LastThreeTimes, end)
-		return 2
+		return 3
 	}))
 	//保存验证错误日志
 	s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
@@ -784,8 +787,16 @@ func (s *Script) LoadScript(code, script_file string, newstate bool) string {
 	}))
 	//获取验证码
 	s.L.SetGlobal("getCodeByPath", s.L.NewFunction(func(S *lua.LState) int {
-		path := S.ToString(-1)
-		code := codegrpc.GetCodeByPath(path)
+		head := S.ToTable(-1)
+		path := S.ToString(-2)
+		headMap := util.GetTable(head)
+		qu.Debug(headMap)
+		headJsonStr := ""
+		headByte, err := json.Marshal(headMap)
+		if err == nil {
+			headJsonStr = string(headByte)
+		}
+		code := codegrpc.GetCodeByPath(path, headJsonStr)
 		S.Push(lua.LString(code))
 		return 1
 	}))

+ 5 - 5
src/spider/spider.go

@@ -159,7 +159,7 @@ func DownloadHighDetail(code string) {
 					//为了避免重复下载,进行增量redis判重
 					isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
 					if isExist {
-						set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
+						set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
 						MgoS.Update("spider_highlistdata", query, set, false, false)
 						return
 					}
@@ -170,7 +170,7 @@ func DownloadHighDetail(code string) {
 						esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
 						count := Es.Count(EsIndex, EsType, esQuery)
 						if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
-							set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
+							set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
 							MgoS.Update("spider_highlistdata", query, set, false, false)
 							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
 							return
@@ -181,7 +181,7 @@ func DownloadHighDetail(code string) {
 						title := qu.ObjToString(tmp["title"])
 						one, _ := MgoS.FindOne("data_bak", map[string]interface{}{"title": title})
 						if one != nil && len(*one) > 0 { //剑鱼已采集,舍弃此条信息
-							set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true}} //已存在state置为1
+							set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
 							MgoS.Update("spider_highlistdata", query, set, false, false)
 							return
 						}
@@ -260,7 +260,7 @@ func DownloadHighDetail(code string) {
 					data["dataging"] = 0
 					data["iscompete"] = sp.IsCompete //2021-11-01以后新增的爬虫不在展示原文链接(保存服务判断)
 					Store(sp.StoreMode, sp.StoreToMsgEvent, sp.Collection, sp.CoverAttr, data, true)
-					set := map[string]interface{}{"$set": map[string]interface{}{"state": 1}} //下载成功state置为1
+					set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}} //下载成功state置为1
 					MgoS.Update("spider_highlistdata", query, set, false, false)
 				}(l, spTmp)
 			}
@@ -281,7 +281,7 @@ func FilterByDetail(href string, query, data map[string]interface{}) bool {
 		hashHref := HexText(href)
 		util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
 		//更新mgo 要删除的数据更新spider_highlistdata state=1不再下载,更新redis
-		set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true}}
+		set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "delete": true, "updatetime": time.Now().Unix()}}
 		MgoS.Update("spider_highlistdata", query, set, false, false)
 		return true
 	}