Browse Source

DownloadFile方法更新

maxiaoshan 2 years ago
parent
commit
16bb5fc0dc
1 changed file with 108 additions and 108 deletions
  1. 108 108
      src/spider/script.go

+ 108 - 108
src/spider/script.go

@@ -227,6 +227,82 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 		return 3
 	}))
 	//下载附件downloadFile(url,method,param,head,cookie,fileName)
+	//s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
+	//	if s.FileLastThreeTimes == nil {
+	//		s.FileLastThreeTimes = make([]time.Duration, 4)
+	//	}
+	//	if util.Config.IsDelay {
+	//		SleepTime(3, s.FileLastThreeTimes) //睡眠时间
+	//	}
+	//	start := time.Now() //起始时间
+	//	cookie := S.ToString(-1)
+	//	head := S.ToTable(-2)
+	//	param := S.ToTable(-3)
+	//	method := S.ToString(-4)
+	//	url := S.ToString(-5)
+	//	fileName := S.ToString(-6)
+	//	ishttps := strings.Contains(url, "https")
+	//	//base64匹配
+	//	base64UrlReg := regexp.MustCompile("data:image")
+	//	indexArr := base64UrlReg.FindStringIndex(url)
+	//	name, size, ftype, fid := "", "", "", ""
+	//	var ret []byte
+	//	var err error
+	//	//base64 url
+	//	if len(indexArr) == 2 { //base64 http://www.mmjyjt.com/
+	//		//截取base64
+	//		start := indexArr[0]
+	//		url = url[start:]
+	//		fileName = "文件下载.jpg"
+	//		index := strings.Index(url, ",")
+	//		dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(url[index+1:]))
+	//		ret, err = io.ReadAll(dec)
+	//		if err == nil && len(ret) > 0 {
+	//			url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, "", ret)
+	//		}
+	//	} else {
+	//		var mycookie []*http.Cookie
+	//		if cookie != "{}" {
+	//			json.Unmarshal([]byte(cookie), &mycookie)
+	//		} else {
+	//			mycookie = make([]*http.Cookie, 0)
+	//		}
+	//		fileName = strings.TrimSpace(fileName)
+	//		url = strings.TrimSpace(url)
+	//		ret = DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
+	//		url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
+	//		if strings.TrimSpace(ftype) == "" {
+	//			if len(path.Ext(name)) > 0 {
+	//				ftype = path.Ext(name)[1:]
+	//			}
+	//		}
+	//	}
+	//	//特殊处理中国招标投标公共服务平台异常附件过滤
+	//	if *site == "中国招标投标公共服务平台" {
+	//		if fid != "" && strings.Contains(fid, ErrFid) { //限制访问的附件
+	//			size, ftype, fid = "", "", "" //信息置空,AnalysisProjectInfo方法将判断数据下载失败重新下载
+	//		} else if bttype := qu.GetFileType(ret); bttype != "pdf" { //由字节流解析的附件类型不是pdf
+	//			logger.Info("Error File Type:", bttype, url)
+	//			size, ftype, fid = "", "", ""
+	//		}
+	//	}
+	//	S.Push(lua.LString(url))
+	//	S.Push(lua.LString(name))
+	//	S.Push(lua.LString(size))
+	//	S.Push(lua.LString(ftype))
+	//	S.Push(lua.LString(fid))
+	//	atomic.AddInt32(&s.ToDayRequestNum, 1)
+	//	atomic.AddInt32(&s.TotalRequestNum, 1)
+	//
+	//	end := time.Since(start)
+	//	if len(s.FileLastThreeTimes) >= 4 {
+	//		s.FileLastThreeTimes = s.FileLastThreeTimes[1:]
+	//	}
+	//	s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
+	//	return 5
+	//}))
+
+	//附件大小限制3KB时,解决中国政府采购网附件采集问题
 	s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
 		if s.FileLastThreeTimes == nil {
 			s.FileLastThreeTimes = make([]time.Duration, 4)
@@ -246,8 +322,16 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 		base64UrlReg := regexp.MustCompile("data:image")
 		indexArr := base64UrlReg.FindStringIndex(url)
 		name, size, ftype, fid := "", "", "", ""
+		tmpUrl := ""
 		var ret []byte
 		var err error
+		var mycookie []*http.Cookie
+		if cookie != "{}" {
+			json.Unmarshal([]byte(cookie), &mycookie)
+		} else {
+			mycookie = make([]*http.Cookie, 0)
+		}
+
 		//base64 url
 		if len(indexArr) == 2 { //base64 http://www.mmjyjt.com/
 			//截取base64
@@ -261,14 +345,9 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 				url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, "", ret)
 			}
 		} else {
-			var mycookie []*http.Cookie
-			if cookie != "{}" {
-				json.Unmarshal([]byte(cookie), &mycookie)
-			} else {
-				mycookie = make([]*http.Cookie, 0)
-			}
 			fileName = strings.TrimSpace(fileName)
 			url = strings.TrimSpace(url)
+			tmpUrl = url
 			ret = DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
 			url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
 			if strings.TrimSpace(ftype) == "" {
@@ -285,6 +364,29 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 				logger.Info("Error File Type:", bttype, url)
 				size, ftype, fid = "", "", ""
 			}
+		} else if *site == "中国政府采购网" && tmpUrl != "" { //中国政府采购网附件大小异常,限制IP所致
+			if size == "4.1 KB" || size == "4.2 KB" {
+				times := 1
+				for { //重试三次
+					if times > 3 {
+						break
+					}
+					//http://www.ccgp.gov.cn/cggg/dfgg/jzxcs/202302/t20230210_19437644.htm
+					ret = DownloadFile(s.Downloader, tmpUrl, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
+					bs := bytes.NewReader(ret)
+					bsLen := qu.ConvertFileSize(bs.Len())
+					if bsLen != "4.1 KB" && bsLen != "4.2 KB" && bsLen != "0 B" {
+						url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, tmpUrl, ret)
+						break
+					}
+					times++
+				}
+				if size == "4.1 KB" || size == "4.2 KB" { //重试后异常
+					fid = ""
+					ftype = ""
+					name = ""
+				}
+			}
 		}
 		S.Push(lua.LString(url))
 		S.Push(lua.LString(name))
@@ -301,109 +403,7 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 		s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
 		return 5
 	}))
-	/*
-		//附件大小限制3KB时,解决中国政府采购网附件采集问题
-		s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
-			if s.FileLastThreeTimes == nil {
-				s.FileLastThreeTimes = make([]time.Duration, 4)
-			}
-			if util.Config.IsDelay {
-				SleepTime(3, s.FileLastThreeTimes) //睡眠时间
-			}
-			start := time.Now() //起始时间
-			cookie := S.ToString(-1)
-			head := S.ToTable(-2)
-			param := S.ToTable(-3)
-			method := S.ToString(-4)
-			url := S.ToString(-5)
-			fileName := S.ToString(-6)
-			ishttps := strings.Contains(url, "https")
-			//base64匹配
-			base64UrlReg := regexp.MustCompile("data:image")
-			indexArr := base64UrlReg.FindStringIndex(url)
-			name, size, ftype, fid := "", "", "", ""
-			tmpUrl := ""
-			var ret []byte
-			var err error
-			var mycookie []*http.Cookie
-			if cookie != "{}" {
-				json.Unmarshal([]byte(cookie), &mycookie)
-			} else {
-				mycookie = make([]*http.Cookie, 0)
-			}
 
-			//base64 url
-			if len(indexArr) == 2 { //base64 http://www.mmjyjt.com/
-				//截取base64
-				start := indexArr[0]
-				url = url[start:]
-				fileName = "文件下载.jpg"
-				index := strings.Index(url, ",")
-				dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(url[index+1:]))
-				ret, err = io.ReadAll(dec)
-				if err == nil && len(ret) > 0 {
-					url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, "", ret)
-				}
-			} else {
-				fileName = strings.TrimSpace(fileName)
-				url = strings.TrimSpace(url)
-				tmpUrl = url
-				ret = DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
-				url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
-				if strings.TrimSpace(ftype) == "" {
-					if len(path.Ext(name)) > 0 {
-						ftype = path.Ext(name)[1:]
-					}
-				}
-			}
-			//特殊处理中国招标投标公共服务平台异常附件过滤
-			if *site == "中国招标投标公共服务平台" {
-				if fid != "" && strings.Contains(fid, ErrFid) { //限制访问的附件
-					size, ftype, fid = "", "", "" //信息置空,AnalysisProjectInfo方法将判断数据下载失败重新下载
-				} else if bttype := qu.GetFileType(ret); bttype != "pdf" { //由字节流解析的附件类型不是pdf
-					logger.Info("Error File Type:", bttype, url)
-					size, ftype, fid = "", "", ""
-				}
-			} else if *site == "中国政府采购网" && tmpUrl != "" { //中国政府采购网附件大小异常,限制IP所致
-				if size == "4.1 KB" || size == "4.2 KB" {
-					times := 1
-					for { //重试三次
-						if times > 3 {
-							break
-						}
-						//http://www.ccgp.gov.cn/cggg/dfgg/jzxcs/202302/t20230210_19437644.htm
-						ret = DownloadFile(s.Downloader, tmpUrl, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
-						bs := bytes.NewReader(ret)
-						bsLen := qu.ConvertFileSize(bs.Len())
-						if bsLen != "4.1 KB" && bsLen != "4.2 KB" && bsLen != "0 B" {
-							url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, tmpUrl, ret)
-							break
-						}
-						times++
-					}
-					if size == "4.1 KB" || size == "4.2 KB" { //重试后异常
-						fid = ""
-						ftype = ""
-						name = ""
-					}
-				}
-			}
-			S.Push(lua.LString(url))
-			S.Push(lua.LString(name))
-			S.Push(lua.LString(size))
-			S.Push(lua.LString(ftype))
-			S.Push(lua.LString(fid))
-			atomic.AddInt32(&s.ToDayRequestNum, 1)
-			atomic.AddInt32(&s.TotalRequestNum, 1)
-
-			end := time.Since(start)
-			if len(s.FileLastThreeTimes) >= 4 {
-				s.FileLastThreeTimes = s.FileLastThreeTimes[1:]
-			}
-			s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
-			return 5
-		}))
-	*/
 	//下载、上传base64图片
 	s.L.SetGlobal("downloadBase64File", s.L.NewFunction(func(S *lua.LState) int {
 		url := S.ToString(-3)