소스 검색

DownloadFile方法更新

maxiaoshan 2 년 전
부모
커밋
a9430bdb8d
2개의 변경된 파일: 105개의 추가 및 25개의 삭제
  1. 1 1
      src/spider/download.go
  2. 104 24
      src/spider/script.go

+ 1 - 1
src/spider/download.go

@@ -198,7 +198,7 @@ func DownloadFile_bak(downloaderid, url, method string, reqparam, head map[strin
 	}
 }
 
-func DownloadFile(retLen *int64, downloaderid, url, method string, reqparam, head map[string]interface{}, mycookie []*http.Cookie, encoding string, useproxy, ishttps bool, code string, timeout int64) []byte {
+func DownloadFile(downloaderid, url, method string, reqparam, head map[string]interface{}, mycookie []*http.Cookie, encoding string, useproxy, ishttps bool, code string, timeout int64) []byte {
 	defer mu.Catch()
 	timeout = timeout * 2
 	msgid := mu.UUID(8)

+ 104 - 24
src/spider/script.go

@@ -269,30 +269,7 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 			}
 			fileName = strings.TrimSpace(fileName)
 			url = strings.TrimSpace(url)
-			var retLen int64
-			ret = DownloadFile(&retLen, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
-			//流量统计
-			//if retLen > 0 {
-			//	key := Today + "+" + code
-			//	if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
-			//		if sfMap, ok := sf.(*SpiderFlow); ok {
-			//			sfMap.Flow += retLen
-			//			//sfMap.Site = *site
-			//			//sfMap.Channel = *channel
-			//			//sfMap.ModifyUser = *user
-			//			SpiderFlowMap.Store(key, sfMap)
-			//		}
-			//	} else {
-			//		SpiderFlowMap.Store(key, &SpiderFlow{
-			//			//Code:       code,
-			//			Site:       *site,
-			//			Channel:    *channel,
-			//			Flow:       retLen,
-			//			ModifyUser: *user,
-			//		})
-			//	}
-			//}
-
+			ret = DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
 			url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
 			if strings.TrimSpace(ftype) == "" {
 				if len(path.Ext(name)) > 0 {
@@ -324,6 +301,109 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 		s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
 		return 5
 	}))
+	/*
+		//附件大小限制3KB时,解决中国政府采购网附件采集问题
+		s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
+			if s.FileLastThreeTimes == nil {
+				s.FileLastThreeTimes = make([]time.Duration, 4)
+			}
+			if util.Config.IsDelay {
+				SleepTime(3, s.FileLastThreeTimes) //睡眠时间
+			}
+			start := time.Now() //起始时间
+			cookie := S.ToString(-1)
+			head := S.ToTable(-2)
+			param := S.ToTable(-3)
+			method := S.ToString(-4)
+			url := S.ToString(-5)
+			fileName := S.ToString(-6)
+			ishttps := strings.Contains(url, "https")
+			//base64匹配
+			base64UrlReg := regexp.MustCompile("data:image")
+			indexArr := base64UrlReg.FindStringIndex(url)
+			name, size, ftype, fid := "", "", "", ""
+			tmpUrl := ""
+			var ret []byte
+			var err error
+			var mycookie []*http.Cookie
+			if cookie != "{}" {
+				json.Unmarshal([]byte(cookie), &mycookie)
+			} else {
+				mycookie = make([]*http.Cookie, 0)
+			}
+
+			//base64 url
+			if len(indexArr) == 2 { //base64 http://www.mmjyjt.com/data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAqAAAAOwCAYAAAD
+				//截取base64
+				start := indexArr[0]
+				url = url[start:]
+				fileName = "文件下载.jpg"
+				index := strings.Index(url, ",")
+				dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(url[index+1:]))
+				ret, err = io.ReadAll(dec)
+				if err == nil && len(ret) > 0 {
+					url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, "", ret)
+				}
+			} else {
+				fileName = strings.TrimSpace(fileName)
+				url = strings.TrimSpace(url)
+				tmpUrl = url
+				ret = DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
+				url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
+				if strings.TrimSpace(ftype) == "" {
+					if len(path.Ext(name)) > 0 {
+						ftype = path.Ext(name)[1:]
+					}
+				}
+			}
+			//特殊处理中国招标投标公共服务平台异常附件过滤
+			if *site == "中国招标投标公共服务平台" {
+				if fid != "" && strings.Contains(fid, ErrFid) { //限制访问的附件
+					size, ftype, fid = "", "", "" //信息置空,AnalysisProjectInfo方法将判断数据下载失败重新下载
+				} else if bttype := qu.GetFileType(ret); bttype != "pdf" { //由字节流解析的附件类型不是pdf
+					logger.Info("Error File Type:", bttype, url)
+					size, ftype, fid = "", "", ""
+				}
+			} else if *site == "中国政府采购网" && tmpUrl != "" { //中国政府采购网附件大小异常,限制IP所致
+				if size == "4.1 KB" || size == "4.2 KB" {
+					times := 1
+					for { //重试三次
+						if times > 3 {
+							break
+						}
+						//http://www.ccgp.gov.cn/cggg/dfgg/jzxcs/202302/t20230210_19437644.htm
+						ret = DownloadFile(s.Downloader, tmpUrl, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
+						bs := bytes.NewReader(ret)
+						bsLen := qu.ConvertFileSize(bs.Len())
+						if bsLen != "4.1 KB" && bsLen != "4.2 KB" && bsLen != "0 B" {
+							url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, tmpUrl, ret)
+							break
+						}
+						times++
+					}
+					if size == "4.1 KB" || size == "4.2 KB" { //重试后异常
+						fid = ""
+						ftype = ""
+						name = ""
+					}
+				}
+			}
+			S.Push(lua.LString(url))
+			S.Push(lua.LString(name))
+			S.Push(lua.LString(size))
+			S.Push(lua.LString(ftype))
+			S.Push(lua.LString(fid))
+			atomic.AddInt32(&s.ToDayRequestNum, 1)
+			atomic.AddInt32(&s.TotalRequestNum, 1)
+
+			end := time.Since(start)
+			if len(s.FileLastThreeTimes) >= 4 {
+				s.FileLastThreeTimes = s.FileLastThreeTimes[1:]
+			}
+			s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
+			return 5
+		}))
+	*/
 	//下载、上传base64图片
 	s.L.SetGlobal("downloadBase64File", s.L.NewFunction(func(S *lua.LState) int {
 		url := S.ToString(-3)