|
@@ -1,25 +1,30 @@
|
|
|
package script
|
|
|
|
|
|
import (
|
|
|
+ "bytes"
|
|
|
"context"
|
|
|
+ "crypto/tls"
|
|
|
+ "encoding/json"
|
|
|
"errors"
|
|
|
"fmt"
|
|
|
+ "github.com/chromedp/cdproto/browser"
|
|
|
+ "github.com/chromedp/cdproto/network"
|
|
|
+ "github.com/chromedp/cdproto/page"
|
|
|
+ "github.com/chromedp/chromedp"
|
|
|
+ "github.com/imroc/req/v3"
|
|
|
+ "github.com/yuin/gopher-lua"
|
|
|
"github.com/yuin/gopher-lua/parse"
|
|
|
+ "io/ioutil"
|
|
|
qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
|
+ "net/http"
|
|
|
"net/url"
|
|
|
"os"
|
|
|
"path/filepath"
|
|
|
"spider_creator/backend"
|
|
|
+ be "spider_creator/backend"
|
|
|
"strconv"
|
|
|
"strings"
|
|
|
"time"
|
|
|
-
|
|
|
- "github.com/chromedp/cdproto/browser"
|
|
|
- "github.com/chromedp/cdproto/network"
|
|
|
- "github.com/chromedp/cdproto/page"
|
|
|
- "github.com/chromedp/chromedp"
|
|
|
- "github.com/yuin/gopher-lua"
|
|
|
- be "spider_creator/backend"
|
|
|
)
|
|
|
|
|
|
const (
|
|
@@ -423,6 +428,267 @@ func (b *GLBrowser) DownloadFile(tabTitle, tabUrl string, timeout int64, selecto
|
|
|
act)
|
|
|
}
|
|
|
|
|
|
+func (b *GLBrowser) AnalyzeCodeByPath(path, stype, head, cookie string, proxy bool) (code string, rh http.Header, rc []*http.Cookie) {
|
|
|
+ //先用免费,为识别再用收费
|
|
|
+ ok := false
|
|
|
+ code, rh, rc, _, ok = getCodeByFree(path, stype, head, cookie, proxy) //自己的服务
|
|
|
+ qu.Debug("Get Code By Free Result:", path, ok, code)
|
|
|
+ if qu.IntAll(stype) > 0 && !ok {
|
|
|
+ code, rh, rc = getCodeByPay(path, stype, head, cookie, proxy) //超级鹰收费
|
|
|
+ }
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
+func getCodeByFree(path, stype, head, cookie string, proxy bool) (code string, respheader http.Header, respcookie []*http.Cookie, getCodeResp *req.Response, ok bool) {
|
|
|
+ defer qu.Catch()
|
|
|
+ client := req.C().
|
|
|
+ SetTimeout(time.Duration(be.Cfg.ServerCodeTimeOut) * time.Second).
|
|
|
+ SetTLSClientConfig(&tls.Config{
|
|
|
+ Renegotiation: tls.RenegotiateOnceAsClient,
|
|
|
+ InsecureSkipVerify: true,
|
|
|
+ }) //忽略证书验证
|
|
|
+ headers := map[string]string{}
|
|
|
+ if head != "" {
|
|
|
+ json.Unmarshal([]byte(head), &headers)
|
|
|
+ }
|
|
|
+ cookies := []*http.Cookie{}
|
|
|
+ if cookie != "" {
|
|
|
+ json.Unmarshal([]byte(cookie), &cookies)
|
|
|
+ }
|
|
|
+ for times := 1; times <= 6; times++ { //重试三次
|
|
|
+ if times > 2 || proxy { //重试第4次开始,使用代理ip
|
|
|
+ if stype == "-1" {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ proxyIp := be.GetProxyAddr() //获取代理地址
|
|
|
+ qu.Debug("proxy:", path, proxyIp)
|
|
|
+ client.SetProxyURL(proxyIp) //设置代理IP
|
|
|
+ }
|
|
|
+ request := client.R()
|
|
|
+ if len(headers) > 0 {
|
|
|
+ request.SetHeaders(headers)
|
|
|
+ }
|
|
|
+ if len(cookies) > 0 {
|
|
|
+ request.SetCookies(cookies...)
|
|
|
+ }
|
|
|
+ //下载验证码图片
|
|
|
+ var err error
|
|
|
+ var resultByte []byte
|
|
|
+ //address := be.Cfg.ServerCodeFreeAddressOcr
|
|
|
+ if stype == "-1" { //传base64的图片
|
|
|
+ resultByte = []byte(path)
|
|
|
+ } else {
|
|
|
+ if stype == "6001" { //计算类验证码解析接口地址
|
|
|
+ //address = be.Cfg.ServerCodeFreeAddressArithmetic
|
|
|
+ }
|
|
|
+ getCodeResp, err = request.Get(path) //通过请求图片地址返回的byte
|
|
|
+ resultByte = getCodeResp.Bytes()
|
|
|
+ }
|
|
|
+ if err != nil {
|
|
|
+ qu.Debug("Get Code By Path Error: ", path, err)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ code, err = getCode(resultByte, stype, true)
|
|
|
+ if err == nil && code != "" {
|
|
|
+ if getCodeResp != nil {
|
|
|
+ respheader = getCodeResp.Header
|
|
|
+ respcookie = getCodeResp.Cookies()
|
|
|
+ }
|
|
|
+ ok = true
|
|
|
+ return
|
|
|
+ }
|
|
|
+ //解析验证码
|
|
|
+ //codeResp, err := client.R().
|
|
|
+ // SetHeader("accept", "application/json").
|
|
|
+ // SetFileReader("file", "1", bytes.NewReader(resultByte)).
|
|
|
+ // Post(address)
|
|
|
+ //if err != nil {
|
|
|
+ // qu.Debug("analysis code by path err: ", path, err)
|
|
|
+ // continue
|
|
|
+ //}
|
|
|
+ //yzmResult := map[string]interface{}{}
|
|
|
+ //json.Unmarshal(codeResp.Bytes(), &yzmResult)
|
|
|
+ //qu.Debug(path, yzmResult)
|
|
|
+ //if err != nil || yzmResult == nil {
|
|
|
+ // continue
|
|
|
+ //}
|
|
|
+ //result := yzmResult["r"].(map[string]interface{})
|
|
|
+ //yzm := fmt.Sprint(result["code"])
|
|
|
+ //if yzm != "" {
|
|
|
+ // if stype == "6001" || len(yzm) >= 4 {
|
|
|
+ // code = yzm //长度小于4的视为识别错误
|
|
|
+ // if getCodeResp != nil {
|
|
|
+ // respheader = getCodeResp.Header
|
|
|
+ // respcookie = getCodeResp.Cookies()
|
|
|
+ // }
|
|
|
+ // ok = true
|
|
|
+ // return
|
|
|
+ // }
|
|
|
+ //}
|
|
|
+ }
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
+func getCodeByPay(path, stype, head, cookie string, proxy bool) (code string, respheader http.Header, respcookie []*http.Cookie) {
|
|
|
+ defer qu.Catch()
|
|
|
+ client := req.C().
|
|
|
+ SetTimeout(time.Duration(be.Cfg.ServerCodeTimeOut) * time.Second).
|
|
|
+ SetTLSClientConfig(&tls.Config{
|
|
|
+ Renegotiation: tls.RenegotiateOnceAsClient,
|
|
|
+ InsecureSkipVerify: true,
|
|
|
+ }) //忽略证书验证
|
|
|
+ headers := map[string]string{}
|
|
|
+ if head != "" {
|
|
|
+ json.Unmarshal([]byte(head), &headers)
|
|
|
+ }
|
|
|
+ cookies := []*http.Cookie{}
|
|
|
+ if cookie != "" {
|
|
|
+ json.Unmarshal([]byte(cookie), &cookies)
|
|
|
+ }
|
|
|
+ for times := 1; times <= 2; times++ { //重试三次
|
|
|
+ //atomic.AddInt64(&PyTimes, 1)
|
|
|
+ if times > 1 || proxy { //重试第2次开始,使用代理ip
|
|
|
+ proxyIp := be.GetProxyAddr() //获取代理地址
|
|
|
+ qu.Debug("proxy:", path, proxyIp)
|
|
|
+ client.SetProxyURL(proxyIp) //设置代理IP
|
|
|
+ }
|
|
|
+ request := client.R()
|
|
|
+ if len(headers) > 0 {
|
|
|
+ request.SetHeaders(headers)
|
|
|
+ }
|
|
|
+ if len(cookies) > 0 {
|
|
|
+ request.SetCookies(cookies...)
|
|
|
+ }
|
|
|
+ //下载验证码图片
|
|
|
+ getCodeResp, err := request.Get(path)
|
|
|
+ //log.Println("respHeader---", getCodeResp.Header)
|
|
|
+ //log.Println("respCookie---", getCodeResp.Cookies())
|
|
|
+ if err != nil {
|
|
|
+ qu.Debug("Get Code By Path Error: ", path, err)
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ code, err = getCode(getCodeResp.Bytes(), stype, false)
|
|
|
+ if err == nil && code != "" {
|
|
|
+ respheader = getCodeResp.Header
|
|
|
+ respcookie = getCodeResp.Cookies()
|
|
|
+ return
|
|
|
+ }
|
|
|
+ //解析验证码
|
|
|
+ //data := map[string]string{
|
|
|
+ // "grant_type": "",
|
|
|
+ // "username": "jianyu001",
|
|
|
+ // "password": "123qwe!A",
|
|
|
+ // "scope": "",
|
|
|
+ // "client_id": "",
|
|
|
+ // "client_secret ": "",
|
|
|
+ //}
|
|
|
+ //codeResp, err := client.R().
|
|
|
+ // SetHeader("accept", "application/json").
|
|
|
+ // SetFileReader("file", "1", bytes.NewReader(getCodeResp.Bytes())).
|
|
|
+ // SetFormData(data).
|
|
|
+ // Post(be.Cfg.ServerCodeAddress + stype)
|
|
|
+ //if err != nil {
|
|
|
+ // qu.Debug("analysis code by path err: ", path, err)
|
|
|
+ // continue
|
|
|
+ //}
|
|
|
+ //codeResult := map[string]interface{}{}
|
|
|
+ //json.Unmarshal(codeResp.Bytes(), &codeResult)
|
|
|
+ //qu.Debug("codeResult:", codeResult)
|
|
|
+ //qu.Debug("codeResult:", result)
|
|
|
+ //if err != nil || result == nil {
|
|
|
+ // continue
|
|
|
+ //}
|
|
|
+ //if yzm, ok := result["r"].(map[string]interface{})["pic_str"].(string); ok && yzm != "" && len(yzm) >= 4 {
|
|
|
+ // code = yzm
|
|
|
+ // respheader = getCodeResp.Header
|
|
|
+ // respcookie = getCodeResp.Cookies()
|
|
|
+ // return
|
|
|
+ //}
|
|
|
+ }
|
|
|
+ return
|
|
|
+}
|
|
|
+func getCode(b []byte, stype string, free bool) (code string, err error) {
|
|
|
+ qu.Debug("验证码类型:", stype, ",是否免费:", free)
|
|
|
+ //解析验证码
|
|
|
+ request := req.C().R().
|
|
|
+ SetHeader("accept", "application/json").
|
|
|
+ SetFileReader("file", "1", bytes.NewReader(b))
|
|
|
+ address := be.Cfg.ServerCodeFreeAddressOcr
|
|
|
+ if !free {
|
|
|
+ data := map[string]string{
|
|
|
+ "grant_type": "",
|
|
|
+ "username": be.Cfg.Username,
|
|
|
+ "password": be.Cfg.Password,
|
|
|
+ "scope": "",
|
|
|
+ "client_id": "",
|
|
|
+ "client_secret ": "",
|
|
|
+ }
|
|
|
+ request.SetFormData(data)
|
|
|
+ address = be.Cfg.ServerCodeAddress + stype
|
|
|
+ } else if stype == "6001" { //计算类验证码解析接口地址
|
|
|
+ address = be.Cfg.ServerCodeFreeAddressArithmetic
|
|
|
+ }
|
|
|
+ qu.Debug("address:", address)
|
|
|
+ var resp *req.Response
|
|
|
+ resp, err = request.Post(address)
|
|
|
+ if err != nil {
|
|
|
+ qu.Debug("analysis code by path err: ", err)
|
|
|
+ return
|
|
|
+ }
|
|
|
+ var result map[string]interface{}
|
|
|
+ err = json.Unmarshal(resp.Bytes(), &result)
|
|
|
+ qu.Debug("验证码解析结果:", free, result)
|
|
|
+ if err == nil && result != nil {
|
|
|
+ if free {
|
|
|
+ r, _ := result["r"].(map[string]interface{})
|
|
|
+ codeTmp := qu.ObjToString(r["code"])
|
|
|
+ if len(codeTmp) >= 4 || stype == "6001" && codeTmp != "" {
|
|
|
+ return codeTmp, nil
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ if codeTmp, ok := result["r"].(map[string]interface{})["pic_str"].(string); ok && codeTmp != "" {
|
|
|
+ if stype == "6001" || len(codeTmp) >= 4 {
|
|
|
+ return codeTmp, nil
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
+// AnalyzeCodeScreenShot 截屏解析验证码
|
|
|
+func (b *GLBrowser) AnalyzeCodeScreenShot(tabTitle, tabUrl, selector string, selectorType int, timeout int64, stype string) (code string, err error) {
|
|
|
+ ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
|
|
|
+ if err != nil {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ var act chromedp.QueryAction
|
|
|
+ var bt []byte
|
|
|
+ switch selectorType {
|
|
|
+ case selector_type_id:
|
|
|
+ act = chromedp.Screenshot(selector, &bt, chromedp.ByID)
|
|
|
+ case selector_type_query:
|
|
|
+ act = chromedp.Screenshot(selector, &bt, chromedp.ByQuery)
|
|
|
+ //case selector_type_search:
|
|
|
+ //case selector_type_jspath:
|
|
|
+ default:
|
|
|
+ //option = chromedp.ByQueryAll
|
|
|
+ chromedp.Screenshot(selector, &bt, chromedp.ByQueryAll)
|
|
|
+ }
|
|
|
+ err = chromedp.Run(ctx,
|
|
|
+ act,
|
|
|
+ )
|
|
|
+ //保存
|
|
|
+ if err = ioutil.WriteFile("code.png", bt, 0755); err != nil {
|
|
|
+ qu.Debug(err)
|
|
|
+ }
|
|
|
+ code, err = getCode(bt, stype, true) //免费
|
|
|
+ if err != nil || code == "" {
|
|
|
+ code, err = getCode(bt, stype, false) //收费
|
|
|
+ }
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
// BindLuaState
|
|
|
func (b *GLBrowser) BindLuaState(s *lua.LState, recordId string) {
|
|
|
//执行暂停
|
|
@@ -591,6 +857,35 @@ func (b *GLBrowser) BindLuaState(s *lua.LState, recordId string) {
|
|
|
}
|
|
|
return 1
|
|
|
}))
|
|
|
+ //s.SetGlobal("browser_analyzecode_bypath", s.NewFunction(func(S *lua.LState) int {
|
|
|
+ // proxy := S.ToBool(-5)
|
|
|
+ // url := S.ToString(-4)
|
|
|
+ // stype := S.ToString(-3)
|
|
|
+ // head := S.ToTable(-2)
|
|
|
+ // cookie := S.ToString(-1)
|
|
|
+ // headMap := TableToMap(head)
|
|
|
+ // //qu.Debug("cookie----------", cookie)
|
|
|
+ // //qu.Debug("headMap----------", headMap)
|
|
|
+ // headJsonStr := ""
|
|
|
+ // headByte, err := json.Marshal(headMap)
|
|
|
+ // if err == nil {
|
|
|
+ // headJsonStr = string(headByte)
|
|
|
+ // }
|
|
|
+ // code, respHead, respCookie := b.AnalyzeCodeByPath(url, stype, headJsonStr, cookie, proxy)
|
|
|
+ // rhead, _ := json.Marshal(respHead)
|
|
|
+ // respHeadMap := map[string]interface{}{}
|
|
|
+ // json.Unmarshal(rhead, &respHeadMap)
|
|
|
+ // hTable := MapToTable(respHeadMap)
|
|
|
+ //
|
|
|
+ // rcookie, _ := json.Marshal(respCookie)
|
|
|
+ // respCookieMap := []map[string]interface{}{}
|
|
|
+ // json.Unmarshal(rcookie, &respCookieMap)
|
|
|
+ // cTable := MapToTable(map[string]interface{}{"cookie": respCookieMap})
|
|
|
+ // S.Push(lua.LString(code))
|
|
|
+ // S.Push(hTable)
|
|
|
+ // S.Push(cTable.RawGetString("cookie"))
|
|
|
+ // return 3
|
|
|
+ //}))
|
|
|
//发布时间格式化
|
|
|
s.SetGlobal("browser_publishtime", s.NewFunction(func(l *lua.LState) int {
|
|
|
text := l.ToString(-1)
|
|
@@ -598,17 +893,29 @@ func (b *GLBrowser) BindLuaState(s *lua.LState, recordId string) {
|
|
|
l.Push(lua.LString(publishtime))
|
|
|
return 1
|
|
|
}))
|
|
|
+ //截屏功能
|
|
|
+ s.SetGlobal("browser_analyzecode_screenshot", s.NewFunction(func(l *lua.LState) int {
|
|
|
+ tabTitle := l.ToString(-6)
|
|
|
+ tabUrl := l.ToString(-5)
|
|
|
+ stype := l.ToString(-4)
|
|
|
+ timeout := l.ToInt64(-3)
|
|
|
+ selectorType := l.ToInt(-2)
|
|
|
+ selector := l.ToString(-1)
|
|
|
+ code, _ := b.AnalyzeCodeScreenShot(tabTitle, tabUrl, selector, selectorType, timeout, stype)
|
|
|
+ l.Push(lua.LString(code))
|
|
|
+ return 1
|
|
|
+ }))
|
|
|
//保存数据
|
|
|
s.SetGlobal("browser_savedata", s.NewFunction(func(l *lua.LState) int {
|
|
|
//fmt.Println("---browser_savedata---")
|
|
|
- page := l.ToString(-2)
|
|
|
+ pageType := l.ToString(-2)
|
|
|
data := l.ToTable(-1)
|
|
|
result := TableToMap(data)
|
|
|
- if page == "list" {
|
|
|
+ if pageType == "list" {
|
|
|
result["recordid"] = recordId
|
|
|
}
|
|
|
DataCache <- result
|
|
|
- return 1
|
|
|
+ return 0
|
|
|
}))
|
|
|
//获取数据
|
|
|
s.SetGlobal("browser_getdata", s.NewFunction(func(l *lua.LState) int {
|