Ver código fonte

全量、增量redis val调整

maxiaoshan 3 anos atrás
pai
commit
b82a54ec8b
4 arquivos alterados com 114 adições e 10 exclusões
  1. 55 0
      src/spider/download.go
  2. 2 2
      src/spider/msgservice.go
  3. 54 5
      src/spider/script.go
  4. 3 3
      src/spider/spider.go

+ 55 - 0
src/spider/download.go

@@ -80,6 +80,61 @@ func Download(downloaderid, url, method string, head map[string]interface{}, enc
 		return ""
 	}
 }
+// NewDownloadFile asks the MsclientFile download service to fetch url and returns the
+// base64-decoded "content" of a reply whose "code" is "200"; any failure yields nil. An empty
+// downloaderid sends with SENDTO_TYPE_RAND_RECIVER, otherwise SENDTO_TYPE_P2P to that id.
+func NewDownloadFile(downloaderid, url, method string, reqparam, head map[string]interface{}, mycookie []*http.Cookie, encoding string, useproxy, ishttps bool, code string, timeout int64, noredirect bool) []byte {
+	defer mu.Catch()
+	timeout = timeout * 10
+	msgid := mu.UUID(8)
+	if len(head) < 1 {
+		if head == nil { // assigning into a nil map panics
+			head = map[string]interface{}{}
+		}
+		if uas := agent.UserAgents["common"]; len(uas) > 0 { // Intn(0) panics
+			r := rand.New(rand.NewSource(time.Now().UnixNano()))
+			head["User-Agent"] = uas[r.Intn(len(uas))]
+		}
+	}
+	param := map[string]interface{}{ // identical payload for both routing modes
+		"url":        url,
+		"method":     method,
+		"head":       head,
+		"reqparam":   reqparam,
+		"cookie":     mycookie,
+		"encoding":   encoding,
+		"useproxy":   useproxy,
+		"ishttps":    ishttps,
+		"new":        true,
+		"noredirect": noredirect,
+	}
+	var ret []byte
+	var err error
+	if downloaderid == "" {
+		ret, err = MsclientFile.Call("", msgid, mu.SERVICE_DOWNLOAD, mu.SENDTO_TYPE_RAND_RECIVER, param, timeout)
+	} else if isAvailableFile(downloaderid) {
+		ret, err = MsclientFile.Call(downloaderid, msgid, mu.SERVICE_DOWNLOAD, mu.SENDTO_TYPE_P2P, param, timeout)
+	} else {
+		return nil
+	}
+	if err != nil {
+		logger.Error(code+"方法DownloadFile,url:"+url+",err:"+err.Error(), timeout)
+		return nil // a reply returned alongside an error is not trusted
+	}
+	tmp := map[string]interface{}{}
+	if err = json.Unmarshal(ret, &tmp); err != nil {
+		return nil
+	}
+	content, ok := tmp["content"].(string) // checked assertion: non-string content must not panic
+	if v, _ := tmp["code"].(string); !ok || v != "200" {
+		return nil
+	}
+	bs, err := base64.StdEncoding.DecodeString(content)
+	if err != nil { // never hand back partially decoded bytes
+		return nil
+	}
+	return bs
+}
 
 //下载页面,发送消息,等待下载
 func DownloadAdv(downloaderid, url, method string, reqparam, head map[string]interface{}, mycookie []*http.Cookie, encoding string, useproxy, ishttps bool, code string, timeout int64) (string, []*http.Cookie, map[string]interface{}) {

+ 2 - 2
src/spider/msgservice.go

@@ -290,12 +290,12 @@ func SaveObj(event int, checkAtrr string, data map[string]interface{}, saveredis
 			}
 			//保存服务未接收成功的数据会存入data_bak中,确保数据不丢失依赖补发程序
 			if id != "" {
-				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 				if !flag { //保存服务发送成功
 					//全量(判断是否已存在防止覆盖id)
 					isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
 					if !isExist {
-						util.PutRedis("title_repeat_fulljudgement", db, hashHref, "lua_"+id, -1)
+						util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
 					}
 				}
 			}

+ 54 - 5
src/spider/script.go

@@ -241,6 +241,12 @@ func (s *Script) LoadScript(code, script_file string, newstate bool) string {
 		S.Push(ret)
 		return 1
 	}))
+	// Push the amount of data downloaded from a list page (currently a stub: the body is commented out).
+	s.L.SetGlobal("sendListNum", s.L.NewFunction(func(S *lua.LState) int {
+		//table := S.ToTable(-1)
+		//list := util.TableToMap(table)
+		return 1 // NOTE(review): reports one return value although nothing is pushed here — confirm intended
+	}))
 	// s.L.SetGlobal("findMgoData", s.L.NewFunction(func(S *lua.LState) int {
 	// 	update := [][]map[string]interface{}{}
 	// 	query := map[string]interface{}{"state": 0}
@@ -329,6 +335,7 @@ func (s *Script) LoadScript(code, script_file string, newstate bool) string {
 		str := S.CheckString(-1)
 		switch codeType {
 		case "unicode":
+			str = strings.Replace(str, "%u", "\\u", -1)
 			str = transUnic(str)
 		case "urlencode_gbk":
 			data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
@@ -787,19 +794,61 @@ func (s *Script) LoadScript(code, script_file string, newstate bool) string {
 	}))
 	//获取验证码
 	s.L.SetGlobal("getCodeByPath", s.L.NewFunction(func(S *lua.LState) int {
-		head := S.ToTable(-1)
-		path := S.ToString(-2)
+		cookie := S.ToString(-1)
+		head := S.ToTable(-2)
+		stype := S.ToString(-3)
+		path := S.ToString(-4)
 		headMap := util.GetTable(head)
-		qu.Debug(headMap)
+		//qu.Debug("cookie----------", cookie)
+		//qu.Debug("headMap----------", headMap)
 		headJsonStr := ""
 		headByte, err := json.Marshal(headMap)
 		if err == nil {
 			headJsonStr = string(headByte)
 		}
-		code := codegrpc.GetCodeByPath(path, headJsonStr)
+		code, respHead, respCookie := codegrpc.GetCodeByPath(path, stype, headJsonStr, cookie)
+		//qu.Debug("code====", code)
+		//qu.Debug("respHead====", respHead)
+		//qu.Debug("respCookie====", respCookie)
 		S.Push(lua.LString(code))
-		return 1
+		respHeadMap := map[string]interface{}{}
+		json.Unmarshal([]byte(respHead), &respHeadMap)
+		hTable := util.MapToLuaTable(S, respHeadMap)
+		S.Push(hTable)
+		S.Push(lua.LString(respCookie))
+		return 3
 	}))
+	s.L.SetGlobal("newDownloadFile", s.L.NewFunction(func(S *lua.LState) int { // Lua global: download a file, hand it to util.UploadFile, return five strings
+		cookie := S.ToString(-1) // arguments are read from the top of the Lua stack downwards
+		head := S.ToTable(-2)
+		param := S.ToTable(-3)
+		method := S.ToString(-4)
+		url := S.ToString(-5)
+		fileName := S.ToString(-6) // so a six-argument call is (fileName, url, method, param, head, cookie)
+		ishttps := strings.Contains(url, "https") // true when "https" occurs anywhere in url, not only as the scheme
+		var mycookie []*http.Cookie
+		if cookie != "{}" { // anything but the literal "{}" is parsed as a JSON cookie array
+			json.Unmarshal([]byte(cookie), &mycookie) // decode error is ignored
+		} else {
+			mycookie = make([]*http.Cookie, 0)
+		}
+		fileName = strings.TrimSpace(fileName)
+		url = strings.TrimSpace(url)
+		ret := NewDownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, false, ishttps, "", s.Timeout, false) // useproxy=false, code="", noredirect=false
+		url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret) // NOTE(review): ret is nil when the download failed — confirm UploadFile copes with that
+		if strings.TrimSpace(ftype) == "" { // no type reported: fall back to the extension of name
+			if len(path.Ext(name)) > 0 {
+				ftype = path.Ext(name)[1:] // drop the leading dot
+			}
+		}
+		S.Push(lua.LString(url)) // values returned to Lua, in order: url, name, size, ftype, fid
+		S.Push(lua.LString(name))
+		S.Push(lua.LString(size))
+		S.Push(lua.LString(ftype))
+		S.Push(lua.LString(fid))
+		return 5
+	}))
+
 	return ""
 }
 func dealHref(pageListUrl, href string) string {

+ 3 - 3
src/spider/spider.go

@@ -172,7 +172,7 @@ func DownloadHighDetail(code string) {
 						if count > 0 { //es中含本title数据,不再采集,更新list表数据状态
 							set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
 							MgoS.Update("spider_highlistdata", query, set, false, false)
-							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 							return
 						}
 					}
@@ -216,7 +216,7 @@ func DownloadHighDetail(code string) {
 					} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
 						log.Println("beforeHref:", href, "afterHref:", href)
 						//增量
-						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 						//全量
 						db := HexToBigIntMod(tmphref)
 						hashHref := HexText(href)
@@ -275,7 +275,7 @@ func DownloadHighDetail(code string) {
 func FilterByDetail(href string, query, data map[string]interface{}) bool {
 	if data["delete"] != nil {
 		//增量
-		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 		//全量
 		db := HexToBigIntMod(href)
 		hashHref := HexText(href)