package script

import (
	"context"
	"errors"
	"fmt"
	"net/url"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/chromedp/cdproto/browser"
	"github.com/chromedp/cdproto/network"
	"github.com/chromedp/cdproto/page"
	"github.com/chromedp/chromedp"
	"github.com/yuin/gopher-lua"
	"github.com/yuin/gopher-lua/parse"
	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
	"spider_creator/backend"
	be "spider_creator/backend"
)

const (
	// Selector types accepted by the browser_* Lua functions. Each one maps
	// to the chromedp query option used to resolve the selector string.
	selector_type_id        = 0 // chromedp.ByID
	selector_type_query     = 1 // chromedp.ByQuery
	selector_type_search    = 2 // chromedp.BySearch
	selector_type_jspath    = 3 // chromedp.ByJSPath
	selector_type_query_all = 4 // chromedp.ByQueryAll (also the fallback for unknown values)

	// Return types understood by the browser_executejs Lua function.
	execute_return_type_string = 0
	execute_return_type_list   = 1
	execute_return_type_table  = 2

	// Script locations, appended to GLVm.attachesDir.
	qlm_list_lua   = "/script/qlm_list.lua"
	qlm_detail_lua = "/script/qlm_detail.lua"

	// defaultTabTimeoutMS is the timeout, in milliseconds, applied to tab
	// operations when the caller passes 0.
	defaultTabTimeoutMS = 5000
)

// GLVm runs Lua scripts and exposes browser automation functions to them.
type GLVm struct {
	attachesDir string // directory the script paths are resolved against
	dnf         backend.EventNotifyFace
	Headless    bool
	ShowImage   bool
	ProxyServer bool
	ProxyAddr   string
	B           *GLBrowser // browser shared by all functions bound to the Lua state
	//WsAddr string
	//RunMode int
	//S Storage
}

// GLBrowser holds the contexts and cancel functions returned by
// backend.NewBrowser.
type GLBrowser struct {
	// BaseCancelFn is the second value returned by backend.NewBrowser;
	// presumably it releases the underlying browser/allocator — TODO confirm.
	BaseCancelFn context.CancelFunc
	// Ctx is the context every chromedp action of this browser derives from.
	Ctx context.Context
	// CancelFn cancels Ctx.
	CancelFn context.CancelFunc
}

// NewGLVM creates a GLVm that loads scripts from attachesDir and keeps dnf as
// its event notifier. The browser is created lazily by ResetBrowser.
func NewGLVM(attachesDir string, dnf be.EventNotifyFace) *GLVm {
	return &GLVm{
		attachesDir: attachesDir,
		dnf:         dnf,
	}
}

// LoadScript reads the Lua script for the given page kind ("list" or
// "detail") and returns its content. Any other kind, or a read failure, is
// logged and yields an empty string.
func (vm *GLVm) LoadScript(page string) string {
	var path string
	switch page {
	case "list":
		path = vm.attachesDir + qlm_list_lua
	case "detail":
		path = vm.attachesDir + qlm_detail_lua
	}
	bs, err := os.ReadFile(path)
	if err != nil {
		qu.Debug(path, "脚本加载失败...")
	}
	return string(bs)
}

// RunScript executes the given Lua source in a fresh Lua state backed by a
// fresh browser. It returns parse, compile and Lua runtime errors. The browser
// is torn down before the function returns.
func (vm *GLVm) RunScript(script string) error {
	defer Catch()
	state := lua.NewState()
	defer state.Close()

	// The browser must exist before binding: the bound closures capture vm.B.
	vm.ResetBrowser()
	vm.BindLuaState(state)
	vm.B.BindLuaState(state)
	defer func() {
		b := vm.B
		if b == nil {
			return
		}
		if b.CancelFn != nil {
			b.CancelFn()
		}
		if b.BaseCancelFn != nil {
			b.BaseCancelFn()
		}
		b.Ctx, b.CancelFn, b.BaseCancelFn = nil, nil, nil
	}()

	chunk, err := parse.Parse(strings.NewReader(script), "code")
	if err != nil {
		return err
	}
	// The second argument is the chunk name used in error messages, not the
	// source text.
	proto, err := lua.Compile(chunk, "code")
	if err != nil {
		return err
	}
	state.Push(state.NewFunctionFromProto(proto))
	// PCall reports Lua runtime errors as a returned error instead of a panic.
	return state.PCall(0, 0, nil)
}

// ResetBrowser tears down the current browser (if any) and starts a new one.
// An existing *GLBrowser is reused in place so that Lua functions already
// bound to it keep working with the new contexts.
func (vm *GLVm) ResetBrowser() {
	if old := vm.B; old != nil {
		if old.CancelFn != nil {
			old.CancelFn()
		}
		if old.BaseCancelFn != nil {
			old.BaseCancelFn()
		}
		old.Ctx, old.CancelFn, old.BaseCancelFn = nil, nil, nil
	}

	_, baseCancelFn, _, _, ctx, incCancelFn := backend.NewBrowser(vm.Headless, vm.ShowImage, vm.ProxyServer, "https://")
	if vm.B == nil {
		vm.B = &GLBrowser{}
	}
	// All three fields are stored. Dropping baseCancelFn here would leave a
	// nil BaseCancelFn behind and leak the base context.
	vm.B.BaseCancelFn = baseCancelFn
	vm.B.Ctx = ctx
	vm.B.CancelFn = incCancelFn
}

// BindLuaState registers the VM-level functions as Lua globals.
func (vm *GLVm) BindLuaState(state *lua.LState) {
	// browser_reset(): restart the browser.
	state.SetGlobal("browser_reset", state.NewFunction(func(l *lua.LState) int {
		vm.ResetBrowser()
		return 0
	}))
	// browser_save(...): currently a no-op; the storage call it used to make
	// is disabled.
	state.SetGlobal("browser_save", state.NewFunction(func(l *lua.LState) int {
		//spiderCode := l.ToString(-5)
		//siteName := l.ToString(-4)
		//siteChannelName := l.ToString(-3)
		//siteChannelUrl := l.ToString(-2)
		/*table := l.ToTable(-1)
		data := TableToMap(table)*/
		//vm.S.Save(spiderCode, siteName, siteChannelName, siteChannelUrl, data)
		return 0
	}))
}

// tabMatches reports whether a target with the given title and URL matches
// the requested title or URL substring. Empty filters never match.
func tabMatches(title, targetURL, wantTitle, wantURL string) bool {
	if wantTitle != "" && strings.Contains(title, wantTitle) {
		return true
	}
	return wantURL != "" && strings.Contains(targetURL, wantURL)
}

// selectorOption maps a selector_type_* value to the chromedp query option.
// Unknown values fall back to chromedp.ByQueryAll.
func selectorOption(selectorType int) chromedp.QueryOption {
	switch selectorType {
	case selector_type_id:
		return chromedp.ByID
	case selector_type_query:
		return chromedp.ByQuery
	case selector_type_search:
		return chromedp.BySearch
	case selector_type_jspath:
		return chromedp.ByJSPath
	default:
		return chromedp.ByQueryAll
	}
}

// luaPushStatus pushes err's message, or "ok" when err is nil, onto the Lua
// stack and returns the number of pushed values (always 1).
func luaPushStatus(l *lua.LState, err error) int {
	if err != nil {
		l.Push(lua.LString(err.Error()))
	} else {
		l.Push(lua.LString("ok"))
	}
	return 1
}

// findTabContext returns a context, limited to timeoutInt64 milliseconds
// (5000 when 0), for the tab whose title contains tabTitle or whose URL
// contains tabUrl. When both filters are empty the browser's own context is
// used. It returns an error if the browser is not initialized, the target
// list cannot be read, or no tab matches.
func (b *GLBrowser) findTabContext(tabTitle, tabUrl string, timeoutInt64 int64) (ctx context.Context, err error) {
	if b.Ctx == nil {
		return nil, errors.New("browser context is not initialized")
	}
	if timeoutInt64 == 0 {
		timeoutInt64 = defaultTabTimeoutMS
	}
	timeout := time.Duration(timeoutInt64) * time.Millisecond

	if tabTitle == "" && tabUrl == "" {
		// The cancel func is discarded because the signature cannot hand it to
		// the caller; the timer is released when the deadline passes.
		ctx, _ = context.WithTimeout(b.Ctx, timeout)
		return ctx, nil
	}

	ts, err := chromedp.Targets(b.Ctx)
	if err != nil {
		return nil, err
	}
	for _, t := range ts {
		if !tabMatches(t.Title, t.URL, tabTitle, tabUrl) {
			continue
		}
		// The tab context's cancel func is deliberately not called: per the
		// chromedp documentation, cancelling a tab context closes the tab.
		tabCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
		ctx, _ = context.WithTimeout(tabCtx, timeout)
		return ctx, nil
	}
	return nil, errors.New("can't find tab")
}

// CloseTabs closes every tab whose title contains tabTitle or whose URL
// contains tabUrl. timeoutInt64 is the per-tab timeout in milliseconds (5000
// when 0). All matching tabs are attempted; the first close error is returned.
func (b *GLBrowser) CloseTabs(tabTitle, tabUrl string, timeoutInt64 int64) (err error) {
	if b.Ctx == nil {
		return errors.New("browser context is not initialized")
	}
	if timeoutInt64 == 0 {
		timeoutInt64 = defaultTabTimeoutMS
	}
	timeout := time.Duration(timeoutInt64) * time.Millisecond

	ts, err := chromedp.Targets(b.Ctx)
	if err != nil {
		return err
	}
	var firstErr error
	for _, t := range ts {
		if !tabMatches(t.Title, t.URL, tabTitle, tabUrl) {
			continue
		}
		tabCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
		runCtx, cancel := context.WithTimeout(tabCtx, timeout)
		runErr := chromedp.Run(runCtx, page.Close())
		cancel()
		if runErr != nil && firstErr == nil {
			firstErr = runErr
		}
	}
	return firstErr
}

// Navigate opens targetUrl in the tab selected by tabTitle/tabUrl, or in a
// new tab when isNewTab is set. timeout is in milliseconds (5000 when 0).
func (b *GLBrowser) Navigate(tabTitle string, tabUrl string, isNewTab bool, targetUrl string, timeout int64) (err error) {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	if isNewTab {
		// NOTE(review): the new tab's context derives from the timeout
		// context, so the tab is presumably closed once the timeout expires —
		// confirm this is intended.
		ctx, _ = chromedp.NewContext(ctx)
	}
	return chromedp.Run(ctx, chromedp.Navigate(targetUrl))
}

// NavigateAndSaveRes opens targetUrl like Navigate and additionally saves
// every response whose content type contains one of the space-separated
// entries of saveFileTypeList into save2dir, named after the last path
// element of the request URL. Response bodies are fetched and written in the
// background, so files may finish saving shortly after this function returns.
func (b *GLBrowser) NavigateAndSaveRes(tabTitle string, tabUrl string, timeout int64, isNewTab bool, targetUrl string, saveFileTypeList, save2dir string) (err error) {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	if isNewTab {
		ctx, _ = chromedp.NewContext(ctx)
	}

	saveFileType := strings.Split(saveFileTypeList, " ")
	// isNeedRes reports whether the content type matches a requested type.
	isNeedRes := func(fileType string) bool {
		for _, v := range saveFileType {
			if strings.Contains(fileType, v) {
				return true
			}
		}
		return false
	}
	// fnURL2FileName extracts the file name from a request URL ("" on failure).
	fnURL2FileName := func(requestURL string) string {
		u, err := url.Parse(requestURL)
		if err != nil {
			return ""
		}
		_, filename := filepath.Split(u.Path)
		return filename
	}

	// cache maps in-flight request IDs to their URLs. It is only touched from
	// the listener callback, which chromedp invokes synchronously.
	cache := map[network.RequestID]string{}
	chromedp.ListenTarget(ctx, func(v interface{}) {
		switch ev := v.(type) {
		case *network.EventRequestWillBeSent:
			// Request is about to be sent: remember its URL.
			cache[ev.RequestID] = ev.Request.URL
		case *network.EventResponseReceived:
			// Check the response content type and drop unwanted requests.
			contentType, _ := ev.Response.Headers["Content-Type"].(string)
			if contentType == "" {
				// HTTP/2 header names are lower-case.
				contentType, _ = ev.Response.Headers["content-type"].(string)
			}
			if contentType == "" {
				contentType = ev.Response.MimeType
			}
			fmt.Println(contentType)
			if !isNeedRes(contentType) {
				delete(cache, ev.RequestID)
			}
		case *network.EventLoadingFinished:
			// Download finished: persist the body.
			uri, ok := cache[ev.RequestID]
			if !ok {
				return
			}
			delete(cache, ev.RequestID)
			filename := fnURL2FileName(uri)
			fmt.Println("save2file", filename)
			if filename == "" {
				return
			}
			filePath := filepath.Join(save2dir, filename)
			requestID := ev.RequestID
			// Listener callbacks must not block: chromedp requires actions
			// started from a listener to run in a separate goroutine, or the
			// event loop can deadlock.
			go func() {
				var buf []byte
				runErr := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
					var err error
					buf, err = network.GetResponseBody(requestID).Do(ctx)
					return err
				}))
				if runErr != nil {
					fmt.Println(runErr.Error())
					return
				}
				if writeErr := os.WriteFile(filePath, buf, 0644); writeErr != nil {
					fmt.Println(writeErr.Error())
				}
			}()
		}
	})

	return chromedp.Run(ctx, chromedp.Navigate(targetUrl))
}

// ExecuteJS evaluates script in the selected tab and decodes the result into
// ret, which must be a pointer. timeout is in milliseconds (5000 when 0).
func (b *GLBrowser) ExecuteJS(tabTitle, tabUrl, script string, ret interface{}, timeout int64) (err error) {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	return chromedp.Run(ctx, chromedp.Evaluate(script, ret))
}

// Click clicks the element matched by selector (interpreted according to
// selectorType) in the selected tab.
func (b *GLBrowser) Click(tabTitle, tabUrl, selector string, selectorType int, timeout int64) (err error) {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	return chromedp.Run(ctx, chromedp.Click(selector, selectorOption(selectorType)))
}

// KeySend types sendStr into the element matched by selector (interpreted
// according to selectorType) in the selected tab.
func (b *GLBrowser) KeySend(tabTitle, tabUrl, selector, sendStr string, selectorType int, timeout int64) (err error) {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	return chromedp.Run(ctx, chromedp.SendKeys(selector, sendStr, selectorOption(selectorType)))
}

// WaitVisible blocks until the element matched by selector (interpreted
// according to selectorType) is visible in the selected tab, or the timeout
// expires.
func (b *GLBrowser) WaitVisible(tabTitle, tabUrl, selector string, selectorType int, timeout int64) error {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	return chromedp.Run(ctx, chromedp.WaitVisible(selector, selectorOption(selectorType)))
}

// Reset is a placeholder and currently does nothing; GLVm.ResetBrowser
// performs the actual browser reset.
func (b *GLBrowser) Reset() {
}

// DownloadFile sets the browser's download directory to save2dir and clicks
// the element matched by selector, i.e. it is Click plus a download-behavior
// setup. According to the original author it only works in non-headless mode.
func (b *GLBrowser) DownloadFile(tabTitle, tabUrl string, timeout int64, selector string, selectorType int, save2dir string) error {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	return chromedp.Run(ctx,
		browser.SetDownloadBehavior(browser.SetDownloadBehaviorBehaviorAllowAndName).
			WithDownloadPath(save2dir).
			WithEventsEnabled(true),
		chromedp.Click(selector, selectorOption(selectorType)),
	)
}

// BindLuaState registers the browser functions as Lua globals. Arguments are
// read from the top of the Lua stack, so the comments below list them in call
// order. Unless noted otherwise, each function returns "ok" or an error
// message.
func (b *GLBrowser) BindLuaState(s *lua.LState) {
	// browser_sleep(ms): pause the script; 0 sleeps for 5 ms.
	s.SetGlobal("browser_sleep", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_sleep---")
		timeout := l.ToInt64(-1)
		if timeout == 0 {
			timeout = 5
		}
		time.Sleep(time.Duration(timeout) * time.Millisecond)
		return 0
	}))

	// browser_closetabs(timeout, tabTitle, tabUrl): close matching tabs.
	// Returns nothing; close errors are not reported to the script.
	s.SetGlobal("browser_closetabs", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_closetabs---")
		timeout := l.ToInt64(-3)
		tabTitle := l.ToString(-2)
		tabUrl := l.ToString(-1)
		// A zero timeout is replaced by CloseTabs' default.
		b.CloseTabs(tabTitle, tabUrl, timeout)
		return 0
	}))

	// browser_navagite(tabTitle, tabUrl, isNewTab, timeout, targetUrl).
	s.SetGlobal("browser_navagite", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_navagite---")
		tabTitle := l.ToString(-5)  // title of the tab to use
		tabUrl := l.ToString(-4)    // URL of the tab to use
		isNewTab := l.ToBool(-3)    // open in a new tab
		timeout := l.ToInt64(-2)    // page load timeout
		targetUrl := l.ToString(-1) // URL to open
		return luaPushStatus(l, b.Navigate(tabTitle, tabUrl, isNewTab, targetUrl, timeout))
	}))

	// browser_executejs(tabTitle, tabUrl, timeout, returnType, script):
	// returns ("ok", result) or ("err", message).
	s.SetGlobal("browser_executejs", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_executejs---")
		tabTitle := l.ToString(-5)
		tabUrl := l.ToString(-4)
		timeout := l.ToInt64(-3)
		returnType := l.ToInt(-2) // expected result type
		script := l.ToString(-1)  // JavaScript to evaluate

		var (
			result lua.LValue
			err    error
		)
		switch returnType {
		case execute_return_type_string:
			var ret string
			if err = b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
				result = lua.LString(ret)
			}
		case execute_return_type_list:
			var ret []interface{}
			if err = b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
				// Lists are handed to Lua as a table keyed by the decimal index.
				tmp := make(map[string]interface{}, len(ret))
				for i, v := range ret {
					tmp[strconv.Itoa(i)] = v
				}
				result = MapToTable(tmp)
			}
		case execute_return_type_table:
			ret := make(map[string]interface{})
			if err = b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
				result = MapToTable(ret)
			}
		default:
			// Without this branch nothing would be pushed while 2 results are
			// declared, handing the caller's own arguments back to the script.
			err = fmt.Errorf("unsupported return type %d", returnType)
		}

		if err != nil {
			l.Push(lua.LString("err"))
			l.Push(lua.LString(err.Error()))
		} else {
			l.Push(lua.LString("ok"))
			l.Push(result)
		}
		return 2
	}))

	// browser_keysend(tabTitle, tabUrl, timeout, words, selectorType, selector).
	s.SetGlobal("browser_keysend", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_keysend---")
		tabTitle := l.ToString(-6)
		tabUrl := l.ToString(-5)
		timeout := l.ToInt64(-4)
		words := l.ToString(-3)
		selectorType := l.ToInt(-2)
		selector := l.ToString(-1)
		fmt.Println(selector, words, selectorType, timeout)
		return luaPushStatus(l, b.KeySend(tabTitle, tabUrl, selector, words, selectorType, timeout))
	}))

	// browser_click(tabTitle, tabUrl, timeout, selectorType, selector).
	s.SetGlobal("browser_click", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_click---")
		tabTitle := l.ToString(-5)
		tabUrl := l.ToString(-4)
		timeout := l.ToInt64(-3)
		selectorType := l.ToInt(-2)
		selector := l.ToString(-1)
		return luaPushStatus(l, b.Click(tabTitle, tabUrl, selector, selectorType, timeout))
	}))

	// browser_waitvisible(tabTitle, tabUrl, timeout, selectorType, selector).
	s.SetGlobal("browser_waitvisible", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_waitvisible---")
		tabTitle := l.ToString(-5)
		tabUrl := l.ToString(-4)
		timeout := l.ToInt64(-3)
		selectorType := l.ToInt(-2) // selector type
		selector := l.ToString(-1)  // selector
		return luaPushStatus(l, b.WaitVisible(tabTitle, tabUrl, selector, selectorType, timeout))
	}))

	// browser_downloadfile(tabTitle, tabUrl, timeout, selectorType, selector, save2dir).
	s.SetGlobal("browser_downloadfile", s.NewFunction(func(l *lua.LState) int {
		tabTitle := l.ToString(-6)
		tabUrl := l.ToString(-5)
		timeout := l.ToInt64(-4)
		selectorType := l.ToInt(-3)
		selector := l.ToString(-2)
		save2dir := l.ToString(-1)
		return luaPushStatus(l, b.DownloadFile(tabTitle, tabUrl, timeout, selector, selectorType, save2dir))
	}))

	// browser_navagite_download_res(tabTitle, tabUrl, timeout, isNewTab,
	// targetUrl, saveFileTypeList, savedir).
	s.SetGlobal("browser_navagite_download_res", s.NewFunction(func(l *lua.LState) int {
		tabTitle := l.ToString(-7)
		tabUrl := l.ToString(-6)
		timeout := l.ToInt64(-5)
		isNewTab := l.ToBool(-4)
		targetUrl := l.ToString(-3)
		saveFileTypeList := l.ToString(-2)
		savedir := l.ToString(-1)
		return luaPushStatus(l, b.NavigateAndSaveRes(tabTitle, tabUrl, timeout, isNewTab, targetUrl, saveFileTypeList, savedir))
	}))
}