|
@@ -269,30 +269,7 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
|
|
}
|
|
}
|
|
fileName = strings.TrimSpace(fileName)
|
|
fileName = strings.TrimSpace(fileName)
|
|
url = strings.TrimSpace(url)
|
|
url = strings.TrimSpace(url)
|
|
- var retLen int64
|
|
|
|
- ret = DownloadFile(&retLen, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
|
|
|
|
- //流量统计
|
|
|
|
- //if retLen > 0 {
|
|
|
|
- // key := Today + "+" + code
|
|
|
|
- // if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
|
|
|
|
- // if sfMap, ok := sf.(*SpiderFlow); ok {
|
|
|
|
- // sfMap.Flow += retLen
|
|
|
|
- // //sfMap.Site = *site
|
|
|
|
- // //sfMap.Channel = *channel
|
|
|
|
- // //sfMap.ModifyUser = *user
|
|
|
|
- // SpiderFlowMap.Store(key, sfMap)
|
|
|
|
- // }
|
|
|
|
- // } else {
|
|
|
|
- // SpiderFlowMap.Store(key, &SpiderFlow{
|
|
|
|
- // //Code: code,
|
|
|
|
- // Site: *site,
|
|
|
|
- // Channel: *channel,
|
|
|
|
- // Flow: retLen,
|
|
|
|
- // ModifyUser: *user,
|
|
|
|
- // })
|
|
|
|
- // }
|
|
|
|
- //}
|
|
|
|
-
|
|
|
|
|
|
+ ret = DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
|
|
url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
|
|
url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
|
|
if strings.TrimSpace(ftype) == "" {
|
|
if strings.TrimSpace(ftype) == "" {
|
|
if len(path.Ext(name)) > 0 {
|
|
if len(path.Ext(name)) > 0 {
|
|
@@ -324,6 +301,109 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
|
|
s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
|
|
s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
|
|
return 5
|
|
return 5
|
|
}))
|
|
}))
|
|
|
|
+ /*
|
|
|
|
+ //附件大小限制3KB时,解决中国政府采购网附件采集问题
|
|
|
|
+ s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
|
|
|
|
+ if s.FileLastThreeTimes == nil {
|
|
|
|
+ s.FileLastThreeTimes = make([]time.Duration, 4)
|
|
|
|
+ }
|
|
|
|
+ if util.Config.IsDelay {
|
|
|
|
+ SleepTime(3, s.FileLastThreeTimes) //睡眠时间
|
|
|
|
+ }
|
|
|
|
+ start := time.Now() //起始时间
|
|
|
|
+ cookie := S.ToString(-1)
|
|
|
|
+ head := S.ToTable(-2)
|
|
|
|
+ param := S.ToTable(-3)
|
|
|
|
+ method := S.ToString(-4)
|
|
|
|
+ url := S.ToString(-5)
|
|
|
|
+ fileName := S.ToString(-6)
|
|
|
|
+ ishttps := strings.Contains(url, "https")
|
|
|
|
+ //base64匹配
|
|
|
|
+ base64UrlReg := regexp.MustCompile("data:image")
|
|
|
|
+ indexArr := base64UrlReg.FindStringIndex(url)
|
|
|
|
+ name, size, ftype, fid := "", "", "", ""
|
|
|
|
+ tmpUrl := ""
|
|
|
|
+ var ret []byte
|
|
|
|
+ var err error
|
|
|
|
+ var mycookie []*http.Cookie
|
|
|
|
+ if cookie != "{}" {
|
|
|
|
+ json.Unmarshal([]byte(cookie), &mycookie)
|
|
|
|
+ } else {
|
|
|
|
+ mycookie = make([]*http.Cookie, 0)
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ //base64 url
|
|
|
|
+ if len(indexArr) == 2 { //base64 http://www.mmjyjt.com/
|
|
|
|
+ //截取base64
|
|
|
|
+ start := indexArr[0]
|
|
|
|
+ url = url[start:]
|
|
|
|
+ fileName = "文件下载.jpg"
|
|
|
|
+ index := strings.Index(url, ",")
|
|
|
|
+ dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(url[index+1:]))
|
|
|
|
+ ret, err = io.ReadAll(dec)
|
|
|
|
+ if err == nil && len(ret) > 0 {
|
|
|
|
+ url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, "", ret)
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ fileName = strings.TrimSpace(fileName)
|
|
|
|
+ url = strings.TrimSpace(url)
|
|
|
|
+ tmpUrl = url
|
|
|
|
+ ret = DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
|
|
|
|
+ url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
|
|
|
|
+ if strings.TrimSpace(ftype) == "" {
|
|
|
|
+ if len(path.Ext(name)) > 0 {
|
|
|
|
+ ftype = path.Ext(name)[1:]
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ //特殊处理中国招标投标公共服务平台异常附件过滤
|
|
|
|
+ if *site == "中国招标投标公共服务平台" {
|
|
|
|
+ if fid != "" && strings.Contains(fid, ErrFid) { //限制访问的附件
|
|
|
|
+ size, ftype, fid = "", "", "" //信息置空,AnalysisProjectInfo方法将判断数据下载失败重新下载
|
|
|
|
+ } else if bttype := qu.GetFileType(ret); bttype != "pdf" { //由字节流解析的附件类型不是pdf
|
|
|
|
+ logger.Info("Error File Type:", bttype, url)
|
|
|
|
+ size, ftype, fid = "", "", ""
|
|
|
|
+ }
|
|
|
|
+ } else if *site == "中国政府采购网" && tmpUrl != "" { //中国政府采购网附件大小异常,限制IP所致
|
|
|
|
+ if size == "4.1 KB" || size == "4.2 KB" {
|
|
|
|
+ times := 1
|
|
|
|
+ for { //重试三次
|
|
|
|
+ if times > 3 {
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ //http://www.ccgp.gov.cn/cggg/dfgg/jzxcs/202302/t20230210_19437644.htm
|
|
|
|
+ ret = DownloadFile(s.Downloader, tmpUrl, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
|
|
|
|
+ bs := bytes.NewReader(ret)
|
|
|
|
+ bsLen := qu.ConvertFileSize(bs.Len())
|
|
|
|
+ if bsLen != "4.1 KB" && bsLen != "4.2 KB" && bsLen != "0 B" {
|
|
|
|
+ url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, tmpUrl, ret)
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ times++
|
|
|
|
+ }
|
|
|
|
+ if size == "4.1 KB" || size == "4.2 KB" { //重试后异常
|
|
|
|
+ fid = ""
|
|
|
|
+ ftype = ""
|
|
|
|
+ name = ""
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ S.Push(lua.LString(url))
|
|
|
|
+ S.Push(lua.LString(name))
|
|
|
|
+ S.Push(lua.LString(size))
|
|
|
|
+ S.Push(lua.LString(ftype))
|
|
|
|
+ S.Push(lua.LString(fid))
|
|
|
|
+ atomic.AddInt32(&s.ToDayRequestNum, 1)
|
|
|
|
+ atomic.AddInt32(&s.TotalRequestNum, 1)
|
|
|
|
+
|
|
|
|
+ end := time.Since(start)
|
|
|
|
+ if len(s.FileLastThreeTimes) >= 4 {
|
|
|
|
+ s.FileLastThreeTimes = s.FileLastThreeTimes[1:]
|
|
|
|
+ }
|
|
|
|
+ s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
|
|
|
|
+ return 5
|
|
|
|
+ }))
|
|
|
|
+ */
|
|
//下载、上传base64图片
|
|
//下载、上传base64图片
|
|
s.L.SetGlobal("downloadBase64File", s.L.NewFunction(func(S *lua.LState) int {
|
|
s.L.SetGlobal("downloadBase64File", s.L.NewFunction(func(S *lua.LState) int {
|
|
url := S.ToString(-3)
|
|
url := S.ToString(-3)
|