Преглед на файлове

新增处理二维码功能

mxs преди 7 месеца
родител
ревизия
a04d1859c7
променени са 3 файла, в които са добавени 208 реда и са изтрити 22 реда
  1. 2 2
      backend/browser.go
  2. 200 7
      backend/script/script.go
  3. 6 13
      main.go

+ 2 - 2
backend/browser.go

@@ -159,7 +159,7 @@ func NewBrowser(headless bool, showImage bool, proxyServe bool, baseUrl string)
 
 func GetProxyAddr() string {
 	proxyAddr := "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
-	roxyAuthor := "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
+	proxyAuthor := "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
 	//获取代理
 	req, err := http.NewRequest(http.MethodGet, proxyAddr, nil)
 	if err != nil {
@@ -167,7 +167,7 @@ func GetProxyAddr() string {
 		return ""
 	}
 	//添加请求头
-	req.Header.Add("Authorization", roxyAuthor)
+	req.Header.Add("Authorization", proxyAuthor)
 	client := http.Client{}
 	//发送请求
 	resp, err := client.Do(req)

+ 200 - 7
backend/script/script.go

@@ -1,25 +1,29 @@
 package script
 
 import (
+	"bytes"
 	"context"
+	"crypto/tls"
+	"encoding/json"
 	"errors"
 	"fmt"
+	"github.com/chromedp/cdproto/browser"
+	"github.com/chromedp/cdproto/network"
+	"github.com/chromedp/cdproto/page"
+	"github.com/chromedp/chromedp"
+	"github.com/imroc/req/v3"
+	"github.com/yuin/gopher-lua"
 	"github.com/yuin/gopher-lua/parse"
 	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
+	"net/http"
 	"net/url"
 	"os"
 	"path/filepath"
 	"spider_creator/backend"
+	be "spider_creator/backend"
 	"strconv"
 	"strings"
 	"time"
-
-	"github.com/chromedp/cdproto/browser"
-	"github.com/chromedp/cdproto/network"
-	"github.com/chromedp/cdproto/page"
-	"github.com/chromedp/chromedp"
-	"github.com/yuin/gopher-lua"
-	be "spider_creator/backend"
 )
 
 const (
@@ -423,6 +427,166 @@ func (b *GLBrowser) DownloadFile(tabTitle, tabUrl string, timeout int64, selecto
 		act)
 }
 
+// AnalysisCode recognises a captcha and returns the recognised text together
+// with the header and cookies of the response that delivered the captcha image.
+//
+// The free in-house service is tried first. The paid service is used only when
+// the free attempt did not succeed and stype converts to a positive integer.
+func (b *GLBrowser) AnalysisCode(path, stype, head, cookie string, proxy bool) (code string, rh http.Header, rc []*http.Cookie) {
+	freeCode, freeHeader, freeCookies, _, recognised := getCodeByFree(path, stype, head, cookie, proxy)
+	qu.Debug("Get Code By Free Result:", path, recognised, freeCode)
+	if recognised || qu.IntAll(stype) <= 0 {
+		// Either the free service answered, or this type has no paid fallback.
+		return freeCode, freeHeader, freeCookies
+	}
+	return getCodeByPay(path, stype, head, cookie, proxy)
+}
+
+// getCodeByFree recognises a captcha with the in-house (free) recognition service.
+//
+// Parameters:
+//   - path: URL of the captcha image; when stype is "-1" the string itself is
+//     uploaded as the image payload (the caller passes a base64 image).
+//   - stype: "6001" posts to the arithmetic-captcha endpoint, "-1" skips the
+//     download step, anything else posts to the OCR endpoint.
+//   - head / cookie: optional JSON (object of header name to value / array of
+//     http.Cookie) applied to the image download request.
+//   - proxy: use a proxy from the first attempt instead of only on later retries.
+//
+// Returns the recognised code, the header and cookies of the image download
+// response (nil when nothing was downloaded), that response itself, and ok=true
+// only when a plausible code was obtained.
+func getCodeByFree(path, stype, head, cookie string, proxy bool) (code string, respheader http.Header, respcookie []*http.Cookie, getCodeResp *req.Response, ok bool) {
+	defer qu.Catch()
+	client := req.C().
+		SetTimeout(time.Duration(be.Cfg.ServerCodeTimeOut) * time.Second).
+		SetTLSClientConfig(&tls.Config{
+			Renegotiation:      tls.RenegotiateOnceAsClient,
+			InsecureSkipVerify: true,
+		}) // certificate verification is deliberately skipped
+	headers := map[string]string{}
+	if head != "" {
+		// Best effort: a malformed header document is logged, not fatal.
+		if err := json.Unmarshal([]byte(head), &headers); err != nil {
+			qu.Debug("parse head json err: ", path, err)
+		}
+	}
+	cookies := []*http.Cookie{}
+	if cookie != "" {
+		if err := json.Unmarshal([]byte(cookie), &cookies); err != nil {
+			qu.Debug("parse cookie json err: ", path, err)
+		}
+	}
+	for times := 1; times <= 6; times++ { // up to six attempts
+		if times > 2 || proxy { // from the third attempt on (or at once when proxy is set) go through a proxy
+			if stype == "-1" {
+				// An inline image needs no download, so proxy retries are pointless.
+				// NOTE(review): with proxy=true this returns before any recognition
+				// attempt is made for stype "-1" - confirm that is intended.
+				return
+			}
+			proxyIp := be.GetProxyAddr()
+			qu.Debug("proxy:", path, proxyIp)
+			client.SetProxyURL(proxyIp)
+		}
+		address := be.Cfg.ServerCodeFreeAddressOcr
+		var resultByte []byte
+		if stype == "-1" {
+			resultByte = []byte(path)
+		} else {
+			if stype == "6001" {
+				address = be.Cfg.ServerCodeFreeAddressArithmetic
+			}
+			// Download the captcha image.
+			request := client.R()
+			if len(headers) > 0 {
+				request.SetHeaders(headers)
+			}
+			if len(cookies) > 0 {
+				request.SetCookies(cookies...)
+			}
+			imgResp, err := request.Get(path)
+			getCodeResp = imgResp
+			if err != nil {
+				qu.Debug("Get Code By Path Error: ", path, err)
+				continue
+			}
+			// Read the body only after the error check.
+			resultByte = imgResp.Bytes()
+		}
+		// Submit the image for recognition.
+		codeResp, err := client.R().
+			SetHeader("accept", "application/json").
+			SetFileReader("file", "1", bytes.NewReader(resultByte)).
+			Post(address)
+		if err != nil {
+			qu.Debug("analysis code by path err: ", path, err)
+			continue
+		}
+		yzmResult := map[string]interface{}{}
+		if err := json.Unmarshal(codeResp.Bytes(), &yzmResult); err != nil {
+			qu.Debug("parse code result err: ", path, err)
+			continue
+		}
+		qu.Debug(path, yzmResult)
+		// An unexpected reply shape must trigger a retry, not a panic.
+		result, isMap := yzmResult["r"].(map[string]interface{})
+		if !isMap {
+			continue
+		}
+		// A missing or null "code" would otherwise be formatted as "<nil>",
+		// which is long enough to be mistaken for a valid captcha.
+		rawCode, present := result["code"]
+		if !present || rawCode == nil {
+			continue
+		}
+		yzm := fmt.Sprint(rawCode)
+		if yzm == "" {
+			continue
+		}
+		if stype != "6001" && len(yzm) < 4 {
+			// Results shorter than 4 characters are treated as misrecognised.
+			continue
+		}
+		code = yzm
+		if getCodeResp != nil {
+			respheader = getCodeResp.Header
+			respcookie = getCodeResp.Cookies()
+		}
+		ok = true
+		return
+	}
+	return
+}
+
+// getCodeByPay recognises a captcha with the paid recognition service whose
+// endpoint is be.Cfg.ServerCodeAddress followed by stype.
+//
+// Parameters:
+//   - path: URL of the captcha image to download.
+//   - stype: captcha type, appended to the service address.
+//   - head / cookie: optional JSON (object of header name to value / array of
+//     http.Cookie) applied to the image download request.
+//   - proxy: use a proxy from the first attempt instead of only on the retry.
+//
+// Returns the recognised code and the header and cookies of the image download
+// response; all results are zero values when recognition fails.
+func getCodeByPay(path, stype, head, cookie string, proxy bool) (code string, respheader http.Header, respcookie []*http.Cookie) {
+	defer qu.Catch()
+	client := req.C().
+		SetTimeout(time.Duration(be.Cfg.ServerCodeTimeOut) * time.Second).
+		SetTLSClientConfig(&tls.Config{
+			Renegotiation:      tls.RenegotiateOnceAsClient,
+			InsecureSkipVerify: true,
+		}) // certificate verification is deliberately skipped
+	headers := map[string]string{}
+	if head != "" {
+		// Best effort: a malformed header document is logged, not fatal.
+		if err := json.Unmarshal([]byte(head), &headers); err != nil {
+			qu.Debug("parse head json err: ", path, err)
+		}
+	}
+	cookies := []*http.Cookie{}
+	if cookie != "" {
+		if err := json.Unmarshal([]byte(cookie), &cookies); err != nil {
+			qu.Debug("parse cookie json err: ", path, err)
+		}
+	}
+	// Form fields sent with every recognition request.
+	// NOTE(review): the account name and password are hard-coded here; they
+	// should come from configuration rather than source control.
+	data := map[string]string{
+		"grant_type":    "",
+		"username":      "jianyu001",
+		"password":      "123qwe!A",
+		"scope":         "",
+		"client_id":     "",
+		"client_secret": "", // key previously carried a stray trailing space
+	}
+	for times := 1; times <= 2; times++ { // at most two attempts
+		if times > 1 || proxy { // the retry (or every attempt when proxy is set) goes through a proxy
+			proxyIp := be.GetProxyAddr()
+			qu.Debug("proxy:", path, proxyIp)
+			client.SetProxyURL(proxyIp)
+		}
+		request := client.R()
+		if len(headers) > 0 {
+			request.SetHeaders(headers)
+		}
+		if len(cookies) > 0 {
+			request.SetCookies(cookies...)
+		}
+		// Download the captcha image.
+		getCodeResp, err := request.Get(path)
+		if err != nil {
+			qu.Debug("Get Code By Path Error: ", path, err)
+			continue
+		}
+		// Submit the image for recognition.
+		codeResp, err := client.R().
+			SetHeader("accept", "application/json").
+			SetFileReader("file", "1", bytes.NewReader(getCodeResp.Bytes())).
+			SetFormData(data).
+			Post(be.Cfg.ServerCodeAddress + stype)
+		if err != nil {
+			qu.Debug("analysis code by path err: ", path, err)
+			continue
+		}
+		codeResult := map[string]interface{}{}
+		if err := json.Unmarshal(codeResp.Bytes(), &codeResult); err != nil {
+			qu.Debug("parse code result err: ", path, err)
+			continue
+		}
+		qu.Debug("codeResult:", codeResult)
+		// An unexpected reply shape must trigger a retry, not a panic.
+		result, isMap := codeResult["r"].(map[string]interface{})
+		if !isMap {
+			continue
+		}
+		// Results shorter than 4 characters are treated as misrecognised.
+		if yzm, isStr := result["pic_str"].(string); isStr && len(yzm) >= 4 {
+			code = yzm
+			respheader = getCodeResp.Header
+			respcookie = getCodeResp.Cookies()
+			return
+		}
+	}
+	return
+}
+
 // BindLuaState
 func (b *GLBrowser) BindLuaState(s *lua.LState, recordId string) {
 	//执行暂停
@@ -591,6 +755,35 @@ func (b *GLBrowser) BindLuaState(s *lua.LState, recordId string) {
 		}
 		return 1
 	}))
+	// browser_analysiscode exposes GLBrowser.AnalysisCode to Lua scripts.
+	// Arguments are read relative to the top of the stack, so a call with
+	// exactly five arguments is (proxy, path, stype, head, cookie).
+	// It pushes three results: the recognised code, a table built from the
+	// response header, and the value stored under "cookie" built from the
+	// response cookies.
+	s.SetGlobal("browser_analysiscode", s.NewFunction(func(S *lua.LState) int {
+		cookie := S.ToString(-1)
+		head := S.ToTable(-2)
+		stype := S.ToString(-3)
+		path := S.ToString(-4)
+		proxy := S.ToBool(-5)
+		headMap := TableToMap(head)
+		//qu.Debug("cookie----------", cookie)
+		//qu.Debug("headMap----------", headMap)
+		// AnalysisCode takes the request headers as a JSON string; if the
+		// marshal fails an empty string is passed instead.
+		headJsonStr := ""
+		headByte, err := json.Marshal(headMap)
+		if err == nil {
+			headJsonStr = string(headByte)
+		}
+		code, respHead, respCookie := b.AnalysisCode(path, stype, headJsonStr, cookie, proxy)
+		// Round-trip the response header through JSON to obtain a generic map
+		// for MapToTable (presumably it only accepts map[string]interface{} - confirm).
+		rhead, _ := json.Marshal(respHead)
+		respHeadMap := map[string]interface{}{}
+		json.Unmarshal(rhead, &respHeadMap)
+		hTable := MapToTable(respHeadMap)
+
+		// Same round-trip for the cookies; the slice is wrapped under a
+		// "cookie" key for conversion and unwrapped again before being pushed.
+		rcookie, _ := json.Marshal(respCookie)
+		respCookieMap := []map[string]interface{}{}
+		json.Unmarshal(rcookie, &respCookieMap)
+		cTable := MapToTable(map[string]interface{}{"cookie": respCookieMap})
+		S.Push(lua.LString(code))
+		S.Push(hTable)
+		S.Push(cTable.RawGetString("cookie"))
+		return 3
+	}))
 	//发布时间格式化
 	s.SetGlobal("browser_publishtime", s.NewFunction(func(l *lua.LState) int {
 		text := l.ToString(-1)

+ 6 - 13
main.go

@@ -3,17 +3,15 @@ package main
 import (
 	"container/list"
 	"embed"
+	"github.com/wailsapp/wails/v2"
+	"github.com/wailsapp/wails/v2/pkg/options"
+	"github.com/wailsapp/wails/v2/pkg/options/assetserver"
 	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	be "spider_creator/backend"
 	bdb "spider_creator/backend/db"
 	"spider_creator/backend/script"
 	bvm "spider_creator/backend/vm"
 	bws "spider_creator/backend/webservice"
-	"strconv"
-
-	"github.com/wailsapp/wails/v2"
-	"github.com/wailsapp/wails/v2/pkg/options"
-	"github.com/wailsapp/wails/v2/pkg/options/assetserver"
 )
 
 var (
@@ -29,23 +27,18 @@ var (
 	vm                   *bvm.VM
 	glvm                 *script.GLVm
 	ws                   *bws.WebService
-	//重点网站和正式环境
-	isOnly4MainSite            string = "false"
-	BrowserLoadResourceTimeout        = "5"
-	serverAddress                     = "http://visualizeld.spdata.jianyu360.com/%s" //正式环境
-	//serverAddress = "http://127.0.0.1:8091/%s" //正式环境
+	//serverAddress        = "http://visualizeld.spdata.jianyu360.com/%s" //正式环境
+	serverAddress = "http://127.0.0.1:8091/%s" //local debug server (NOTE: not the production address)
 )
 
 //build
 // wails build -ldflags="-X 'main.isOnly4MainSite=false'" -o="剑鱼可视化爬虫开发工具_正式.exe"
 
 func init() {
-	//be.LoadConfig("./config.yaml")
-	be.Cfg.IsOnly4MainSite = isOnly4MainSite == "true"
+	be.LoadConfig("backend/config.yaml")
 	if be.Cfg.IsOnly4MainSite {
 		serverAddress = "http://visualize.spdata.jianyu360.com/%s" //重点网站
 	}
-	be.Cfg.BrowserLoadResourceTimeout, _ = strconv.ParseInt(BrowserLoadResourceTimeout, 10, 64)
 	qu.Debug("重点网站:", be.Cfg.IsOnly4MainSite, serverAddress)
 }