// Package script runs Lua spider scripts against a chromedp-controlled
// browser and exposes the browser operations to Lua as global functions.
package script

import (
	"context"
	"errors"
	"fmt"
	"net/url"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/chromedp/cdproto/browser"
	"github.com/chromedp/cdproto/network"
	"github.com/chromedp/cdproto/page"
	"github.com/chromedp/chromedp"
	"github.com/yuin/gopher-lua"
	"github.com/yuin/gopher-lua/parse"
	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
	"spider_creator/backend"
	be "spider_creator/backend"
)

const (
	// Selector kinds accepted by the Lua bindings; any other value falls
	// back to chromedp.ByQueryAll.
	selector_type_id        = 0
	selector_type_query     = 1
	selector_type_search    = 2
	selector_type_jspath    = 3
	selector_type_query_all = 4

	// Result shapes that browser_executejs can hand back to Lua.
	execute_return_type_string = 0
	execute_return_type_list   = 1
	execute_return_type_table  = 2

	// Script file names, appended verbatim to GLVm.ScriptDir.
	qlm_list_lua   = "/qlm_list.lua"
	qlm_detail_lua = "/qlm_detail.lua"
)

var (
	// DataCache receives every record passed to the browser_savedata binding.
	DataCache = make(chan map[string]interface{}, 2000)
	// Datas is the pool of records handed out by the browser_getdata binding.
	// It is not synchronised here.
	Datas []map[string]interface{}
)

// GLVm holds the configuration and runtime state used to execute Lua scripts.
type GLVm struct {
	ScriptDir     string
	LogsDir       string
	LogsFile      *os.File
	Dnf           backend.EventNotifyFace
	Headless      bool
	ShowImage     bool
	ProxyServer   bool
	ProxyAddr     string
	B             *GLBrowser
	ScriptRunning bool // guards against running more than one script at a time
	DataSaveOver  chan bool
}

// GLBrowser wraps the chromedp browser context and its cancel function.
type GLBrowser struct {
	Ctx      context.Context
	CancelFn context.CancelFunc
}

// NewGLVM creates a GLVm for the given script and log directories.
func NewGLVM(scriptDir, logsDir string, dnf be.EventNotifyFace) *GLVm {
	return &GLVm{
		ScriptDir:    scriptDir,
		LogsDir:      logsDir,
		Dnf:          dnf,
		DataSaveOver: make(chan bool, 1),
	}
}

// LoadScript reads the Lua script for page ("list" or "detail") from
// ScriptDir and returns its contents. A read failure is logged and an empty
// string is returned; any other page value leaves the path empty, so the read
// fails the same way.
func (glvm *GLVm) LoadScript(page string) string {
	var path string
	switch page {
	case "list":
		path = glvm.ScriptDir + qlm_list_lua
	case "detail":
		path = glvm.ScriptDir + qlm_detail_lua
	}
	bs, err := os.ReadFile(path)
	if err != nil {
		qu.Debug(path, "脚本加载失败...")
	}
	return string(bs)
}

// RunScript compiles and executes the given Lua source in a fresh Lua state
// with all browser bindings installed. recordId is attached to "list" records
// saved by the script. It returns an error when the log file cannot be
// opened, when the script fails to parse or compile, or when it raises a
// runtime error.
func (glvm *GLVm) RunScript(script, recordId string) error {
	defer qu.Catch()
	s := lua.NewState()
	defer s.Close()

	// Log file, one per day.
	now := time.Now()
	path := glvm.LogsDir + fmt.Sprintf("/%s.log", qu.FormatDate(&now, qu.Date_Short_Layout))
	qu.Debug("log path:", path)
	file, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0666)
	if err != nil {
		qu.Debug("日志创建失败:", err)
		return err
	}
	glvm.LogsFile = file
	defer file.Close()

	// Bind functions: the browser object must exist before it is bound.
	glvm.ResetBrowser()
	glvm.BindLuaState(s)
	glvm.B.BindLuaState(s, recordId)
	defer func() {
		// Release the browser once the script is done. glvm.B itself is kept
		// so ResetBrowser can reuse the same GLBrowser value.
		if b := glvm.B; b != nil {
			if b.CancelFn != nil {
				b.CancelFn()
			}
			b.Ctx = nil
			b.CancelFn = nil
		}
	}()

	chunk, err := parse.Parse(strings.NewReader(script), "code")
	if err != nil {
		return err
	}
	// The chunk name is only a label used in error messages, so use the same
	// short name as the parser instead of the whole script text.
	proto, err := lua.Compile(chunk, "code")
	if err != nil {
		return err
	}
	s.Push(s.NewFunctionFromProto(proto))
	// PCall reports a runtime failure as an error instead of panicking, so
	// callers actually see that the script failed.
	return s.PCall(0, 0, nil)
}

// ResetBrowser cancels the current browser context, if any, and starts a new
// browser. The existing GLBrowser value is updated in place so that Lua
// bindings holding a pointer to it keep working.
func (glvm *GLVm) ResetBrowser() {
	if glvm.B != nil && glvm.B.CancelFn != nil {
		glvm.B.CancelFn()
		glvm.B.Ctx = nil
		glvm.B.CancelFn = nil
	}
	// NOTE(review): the last argument is the literal "http://" and
	// glvm.ProxyAddr is never passed — confirm against backend.NewBrowser.
	_, _, _, _, ctx, incCancelFn := backend.NewBrowser(glvm.Headless, glvm.ShowImage, glvm.ProxyServer, "http://")
	if glvm.B == nil {
		glvm.B = &GLBrowser{Ctx: ctx, CancelFn: incCancelFn}
		return
	}
	glvm.B.Ctx, glvm.B.CancelFn = ctx, incCancelFn
}

// BindLuaState registers the VM-level Lua globals browser_reset and
// browser_savelog on s.
func (glvm *GLVm) BindLuaState(s *lua.LState) {
	s.SetGlobal("browser_reset", s.NewFunction(func(l *lua.LState) int {
		glvm.ResetBrowser()
		return 0
	}))
	s.SetGlobal("browser_savelog", s.NewFunction(func(l *lua.LState) int {
		text := l.ToString(-1)
		qu.Debug("log:", text)
		now := time.Now()
		fmt.Fprintf(glvm.LogsFile, "%s---%s\n", qu.FormatDate(&now, qu.Date_Full_Layout), text)
		return 0
	}))
}

// CloseTabs cancels the browser context and drops the GLBrowser.
func (glvm *GLVm) CloseTabs() {
	if glvm.B != nil && glvm.B.CancelFn != nil {
		glvm.B.CancelFn()
		glvm.B.Ctx = nil
		glvm.B.CancelFn = nil
		glvm.B = nil
	}
}

// findTabContext returns a context with a timeout for the tab whose title
// contains tabTitle or whose URL contains tabUrl. When both are empty the
// context is derived from the browser context itself. timeoutInt64 is in
// milliseconds; 0 means 5000.
func (b *GLBrowser) findTabContext(tabTitle, tabUrl string, timeoutInt64 int64) (ctx context.Context, err error) {
	if b.Ctx == nil {
		return nil, errors.New("context is error")
	}
	if timeoutInt64 == 0 {
		timeoutInt64 = 5000
	}
	timeout := time.Duration(timeoutInt64) * time.Millisecond

	// The cancel functions below are discarded because the signature has no
	// way to return them; each timeout context ends at its deadline. The tab
	// context must not be cancelled here, since cancelling a chromedp context
	// closes its tab.
	if tabTitle == "" && tabUrl == "" {
		ctx, _ = context.WithTimeout(b.Ctx, timeout)
		return ctx, nil
	}
	ts, err := chromedp.Targets(b.Ctx)
	if err != nil {
		return nil, err
	}
	for _, t := range ts {
		if (tabTitle != "" && strings.Contains(t.Title, tabTitle)) || (tabUrl != "" && strings.Contains(t.URL, tabUrl)) {
			newCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
			ctx, _ = context.WithTimeout(newCtx, timeout)
			return ctx, nil
		}
	}
	return nil, errors.New("can't find tab")
}

// CloseTabs closes every tab whose title contains tabTitle or whose URL
// contains tabUrl. All matching tabs are attempted; the first error is
// returned. timeoutInt64 is in milliseconds.
func (b *GLBrowser) CloseTabs(tabTitle, tabUrl string, timeoutInt64 int64) (err error) {
	if b.Ctx == nil {
		return errors.New("context is error")
	}
	// NOTE(review): this default is 5 ms, while findTabContext defaults to
	// 5000 ms — confirm which is intended.
	if timeoutInt64 == 0 {
		timeoutInt64 = 5
	}
	timeout := time.Duration(timeoutInt64) * time.Millisecond
	ts, err := chromedp.Targets(b.Ctx)
	if err != nil {
		return err
	}
	var firstErr error
	for _, t := range ts {
		if (tabTitle != "" && strings.Contains(t.Title, tabTitle)) || (tabUrl != "" && strings.Contains(t.URL, tabUrl)) {
			newCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
			ctx, cancel := context.WithTimeout(newCtx, timeout)
			runErr := chromedp.Run(ctx, page.Close())
			cancel() // release the timer as soon as the close attempt is over
			if runErr != nil && firstErr == nil {
				firstErr = runErr
			}
		}
	}
	return firstErr
}

// Navigate opens targetUrl in the tab selected by tabTitle/tabUrl, or in a
// new tab when isNewTab is true. timeout is in milliseconds.
func (b *GLBrowser) Navigate(tabTitle string, tabUrl string, isNewTab bool, targetUrl string, timeout int64) (err error) {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	if isNewTab {
		// NOTE(review): the new tab context is derived from the timeout
		// context, so it presumably ends when the timeout fires — confirm
		// that this is intended.
		ctx, _ = chromedp.NewContext(ctx)
	}
	return chromedp.Run(ctx, chromedp.Navigate(targetUrl))
}

// responseContentType returns the Content-Type header of resp, matching the
// header name case-insensitively because HTTP/2 header names are lowercase.
func responseContentType(resp *network.Response) string {
	if resp == nil {
		return ""
	}
	for name, value := range resp.Headers {
		if strings.EqualFold(name, "Content-Type") {
			if s, ok := value.(string); ok {
				return s
			}
		}
	}
	return ""
}

// NavigateAndSaveRes opens targetUrl like Navigate and also saves every
// response whose Content-Type contains one of the space-separated entries in
// saveFileTypeList into save2dir, named after the last element of the URL
// path. timeout is in milliseconds.
func (b *GLBrowser) NavigateAndSaveRes(tabTitle string, tabUrl string, timeout int64, isNewTab bool, targetUrl string, saveFileTypeList, save2dir string) (err error) {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	if isNewTab {
		ctx, _ = chromedp.NewContext(ctx)
	}

	saveFileType := strings.Split(saveFileTypeList, " ")
	isNeedRes := func(fileType string) bool {
		for _, v := range saveFileType {
			if strings.Contains(fileType, v) {
				return true
			}
		}
		return false
	}
	fnURL2FileName := func(requestURL string) string {
		u, err := url.Parse(requestURL)
		if err != nil {
			return ""
		}
		_, filename := filepath.Split(u.Path)
		return filename
	}

	// cache maps in-flight request IDs to their URLs. It is only touched
	// inside the listener callback.
	cache := map[network.RequestID]string{}
	chromedp.ListenTarget(ctx, func(v interface{}) {
		switch ev := v.(type) {
		case *network.EventRequestWillBeSent: // request is about to be sent
			cache[ev.RequestID] = ev.Request.URL
		case *network.EventResponseReceived: // filter on the response content type
			contentType := responseContentType(ev.Response)
			fmt.Println(contentType)
			if !isNeedRes(contentType) {
				delete(cache, ev.RequestID)
			}
		case *network.EventLoadingFinished: // download finished
			uri, ok := cache[ev.RequestID]
			if !ok {
				return
			}
			delete(cache, ev.RequestID)
			filename := fnURL2FileName(uri)
			fmt.Println("save2file", filename)
			if filename == "" {
				return
			}
			filePath := filepath.Join(save2dir, filename)
			requestID := ev.RequestID
			// Listener callbacks run synchronously on the event loop and must
			// not block, so the body is fetched in a separate goroutine.
			go func() {
				var buf []byte
				if err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
					var err error
					buf, err = network.GetResponseBody(requestID).Do(ctx)
					return err
				})); err != nil {
					fmt.Println(err.Error())
					return
				}
				if err := os.WriteFile(filePath, buf, 0777); err != nil {
					fmt.Println(err.Error())
				}
			}()
		}
	})

	return chromedp.Run(ctx, chromedp.Navigate(targetUrl))
}

// ExecuteJS evaluates script in the selected tab and stores the result in
// ret. timeout is in milliseconds.
func (b *GLBrowser) ExecuteJS(tabTitle, tabUrl, script string, ret interface{}, timeout int64) (err error) {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	return chromedp.Run(ctx, chromedp.Evaluate(script, ret))
}

// glSelectorOption maps a selector_type_* constant to the matching chromedp
// query option; unknown values fall back to chromedp.ByQueryAll.
func glSelectorOption(selectorType int) chromedp.QueryOption {
	switch selectorType {
	case selector_type_id:
		return chromedp.ByID
	case selector_type_query:
		return chromedp.ByQuery
	case selector_type_search:
		return chromedp.BySearch
	case selector_type_jspath:
		return chromedp.ByJSPath
	default:
		return chromedp.ByQueryAll
	}
}

// Click clicks the element matched by selector in the selected tab.
func (b *GLBrowser) Click(tabTitle, tabUrl, selector string, selectorType int, timeout int64) (err error) {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	return chromedp.Run(ctx, chromedp.Click(selector, glSelectorOption(selectorType)))
}

// KeySend types sendStr into the element matched by selector.
func (b *GLBrowser) KeySend(tabTitle, tabUrl, selector, sendStr string, selectorType int, timeout int64) (err error) {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	return chromedp.Run(ctx, chromedp.SendKeys(selector, sendStr, glSelectorOption(selectorType)))
}

// WaitVisible blocks until the element matched by selector is visible or the
// timeout expires.
func (b *GLBrowser) WaitVisible(tabTitle, tabUrl, selector string, selectorType int, timeout int64) error {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	return chromedp.Run(ctx, chromedp.WaitVisible(selector, glSelectorOption(selectorType)))
}

// Reset is a placeholder and currently does nothing.
func (b *GLBrowser) Reset() {
}

// DownloadFile sets the download directory to save2dir and then clicks the
// element matched by selector; apart from the download setup it is the same
// as Click. According to the original author it only works in non-headless
// mode.
func (b *GLBrowser) DownloadFile(tabTitle, tabUrl string, timeout int64, selector string, selectorType int, save2dir string) error {
	ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
	if err != nil {
		return err
	}
	return chromedp.Run(ctx,
		browser.SetDownloadBehavior(browser.SetDownloadBehaviorBehaviorAllowAndName).
			WithDownloadPath(save2dir).
			WithEventsEnabled(true),
		chromedp.Click(selector, glSelectorOption(selectorType)),
	)
}

// BindLuaState registers the browser_* Lua globals on s. Arguments are read
// by negative stack index, so each function expects exactly the listed
// arguments, in the order shown by the index comments. recordId is added as
// "recordid" to every record saved for the "list" page.
func (b *GLBrowser) BindLuaState(s *lua.LState, recordId string) {
	// pushStatus pushes "ok", or the error text, as the single return value.
	pushStatus := func(l *lua.LState, err error) int {
		if err != nil {
			l.Push(lua.LString(err.Error()))
		} else {
			l.Push(lua.LString("ok"))
		}
		return 1
	}
	// pushFailure pushes the ("err", message) pair used by two-value bindings.
	pushFailure := func(l *lua.LState, msg string) {
		l.Push(lua.LString("err"))
		l.Push(lua.LString(msg))
	}

	// Pause execution (milliseconds, default 5).
	s.SetGlobal("browser_sleep", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_sleep---")
		timeout := l.ToInt64(-1)
		if timeout == 0 {
			timeout = 5
		}
		time.Sleep(time.Duration(timeout) * time.Millisecond)
		return 0
	}))

	// Close tabs: (timeout, tabTitle, tabUrl). Failures are not reported to
	// the script.
	s.SetGlobal("browser_closetabs", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_closetabs---")
		timeout := l.ToInt64(-3)
		tabTitle := l.ToString(-2)
		tabUrl := l.ToString(-1)
		if timeout == 0 {
			timeout = 5
		}
		_ = b.CloseTabs(tabTitle, tabUrl, timeout) // best effort
		return 0
	}))

	// Open a URL: (tabTitle, tabUrl, isNewTab, timeout, targetUrl).
	s.SetGlobal("browser_navagite", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_navagite---")
		tabTitle := l.ToString(-5)  // title of the tab to use
		tabUrl := l.ToString(-4)    // URL of the tab to use
		isNewTab := l.ToBool(-3)    // whether to open a new tab
		timeout := l.ToInt64(-2)    // page-open timeout
		targetUrl := l.ToString(-1) // URL to open
		return pushStatus(l, b.Navigate(tabTitle, tabUrl, isNewTab, targetUrl, timeout))
	}))

	// Run JavaScript in the page: (tabTitle, tabUrl, timeout, returnType, script).
	// Returns ("ok", value) or ("err", message).
	s.SetGlobal("browser_executejs", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_executejs---")
		tabTitle := l.ToString(-5)
		tabUrl := l.ToString(-4)
		timeout := l.ToInt64(-3)
		returnType := l.ToInt(-2) // shape of the returned data
		script := l.ToString(-1)  // JavaScript to run
		switch returnType {
		case execute_return_type_string: // string result
			var ret string
			if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err != nil {
				pushFailure(l, err.Error())
				break
			}
			l.Push(lua.LString("ok"))
			l.Push(lua.LString(ret))
		case execute_return_type_list: // list result, keyed by decimal index
			var ret []interface{}
			if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err != nil {
				pushFailure(l, err.Error())
				break
			}
			tmp := make(map[string]interface{}, len(ret))
			for i, v := range ret {
				tmp[strconv.Itoa(i)] = v
			}
			l.Push(lua.LString("ok"))
			l.Push(MapToTable(tmp))
		case execute_return_type_table: // table result
			ret := make(map[string]interface{})
			if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err != nil {
				pushFailure(l, err.Error())
				break
			}
			l.Push(lua.LString("ok"))
			l.Push(MapToTable(ret))
		default:
			// Two values must always be pushed, because 2 is returned below.
			pushFailure(l, fmt.Sprintf("unsupported return type: %d", returnType))
		}
		return 2
	}))

	// Keyboard input: (tabTitle, tabUrl, timeout, words, selectorType, selector).
	s.SetGlobal("browser_keysend", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_keysend---")
		tabTitle := l.ToString(-6)
		tabUrl := l.ToString(-5)
		timeout := l.ToInt64(-4)
		words := l.ToString(-3)
		selectorType := l.ToInt(-2)
		selector := l.ToString(-1)
		fmt.Println(selector, words, selectorType, timeout)
		return pushStatus(l, b.KeySend(tabTitle, tabUrl, selector, words, selectorType, timeout))
	}))

	// Click: (tabTitle, tabUrl, timeout, selectorType, selector).
	s.SetGlobal("browser_click", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_click---")
		tabTitle := l.ToString(-5)
		tabUrl := l.ToString(-4)
		timeout := l.ToInt64(-3)
		selectorType := l.ToInt(-2)
		selector := l.ToString(-1)
		return pushStatus(l, b.Click(tabTitle, tabUrl, selector, selectorType, timeout))
	}))

	// Wait for an element: (tabTitle, tabUrl, timeout, selectorType, selector).
	s.SetGlobal("browser_waitvisible", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_waitvisible---")
		tabTitle := l.ToString(-5)
		tabUrl := l.ToString(-4)
		timeout := l.ToInt64(-3)
		selectorType := l.ToInt(-2) // selector kind
		selector := l.ToString(-1)  // selector
		return pushStatus(l, b.WaitVisible(tabTitle, tabUrl, selector, selectorType, timeout))
	}))

	// Download an attachment: (tabTitle, tabUrl, timeout, selectorType, selector, save2dir).
	s.SetGlobal("browser_downloadfile", s.NewFunction(func(l *lua.LState) int {
		tabTitle := l.ToString(-6)
		tabUrl := l.ToString(-5)
		timeout := l.ToInt64(-4)
		selectorType := l.ToInt(-3)
		selector := l.ToString(-2)
		save2dir := l.ToString(-1)
		return pushStatus(l, b.DownloadFile(tabTitle, tabUrl, timeout, selector, selectorType, save2dir))
	}))

	// Open a URL and save matching resources:
	// (tabTitle, tabUrl, timeout, isNewTab, targetUrl, saveFileTypeList, savedir).
	s.SetGlobal("browser_navagite_download_res", s.NewFunction(func(l *lua.LState) int {
		tabTitle := l.ToString(-7)
		tabUrl := l.ToString(-6)
		timeout := l.ToInt64(-5)
		isNewTab := l.ToBool(-4)
		targetUrl := l.ToString(-3)
		saveFileTypeList := l.ToString(-2)
		savedir := l.ToString(-1)
		return pushStatus(l, b.NavigateAndSaveRes(tabTitle, tabUrl, timeout, isNewTab, targetUrl, saveFileTypeList, savedir))
	}))

	// Format a publish time.
	s.SetGlobal("browser_publishtime", s.NewFunction(func(l *lua.LState) int {
		text := l.ToString(-1)
		l.Push(lua.LString(getPublitime(text)))
		return 1
	}))

	// Save a record: (page, data). The record is sent to DataCache.
	s.SetGlobal("browser_savedata", s.NewFunction(func(l *lua.LState) int {
		page := l.ToString(-2)
		data := l.ToTable(-1)
		result := TableToMap(data)
		if page == "list" {
			result["recordid"] = recordId
		}
		DataCache <- result
		// NOTE(review): nothing is pushed but 1 is returned, so the script
		// presumably receives the value on top of the stack (the data
		// argument). Kept as is — confirm whether 0 was intended.
		return 1
	}))

	// Fetch up to num records from Datas: returns ("ok", list) or ("err", message).
	s.SetGlobal("browser_getdata", s.NewFunction(func(l *lua.LState) int {
		fmt.Println("---browser_getdata---")
		num := l.ToInt(-1) // number of records requested
		count := len(Datas)
		if count == 0 {
			pushFailure(l, "当前可下载量为0")
			return 2
		}
		if num < 0 {
			num = 0 // a negative count would panic when slicing
		}
		if count < num {
			num = count
		}
		data := Datas[:num]
		Datas = Datas[num:]
		tMap := MapToTable(map[string]interface{}{"data": data})
		l.Push(lua.LString("ok"))
		l.Push(tMap.RawGetString("data"))
		return 2
	}))
}