123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621 |
- package script
- import (
- "context"
- "errors"
- "fmt"
- "github.com/yuin/gopher-lua/parse"
- qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "net/url"
- "os"
- "path/filepath"
- "spider_creator/backend"
- "strconv"
- "strings"
- "time"
- "github.com/chromedp/cdproto/browser"
- "github.com/chromedp/cdproto/network"
- "github.com/chromedp/cdproto/page"
- "github.com/chromedp/chromedp"
- "github.com/yuin/gopher-lua"
- be "spider_creator/backend"
- )
// Selector kinds accepted by the element-oriented browser methods (Click,
// KeySend, WaitVisible, DownloadFile). Each value picks the chromedp query
// option used to locate the element; any other value falls back to the
// "query all" behaviour.
const (
	selector_type_id = iota
	selector_type_query
	selector_type_search
	selector_type_jspath
	selector_type_query_all
)

// Result shapes that the browser_executejs Lua binding can decode a
// JavaScript result into.
const (
	execute_return_type_string = iota
	execute_return_type_list
	execute_return_type_table
)

// Lua script locations, appended to GLVm.AttachesDir by LoadScript.
const (
	qlm_list_lua   = "/script/qlm_list.lua"
	qlm_detail_lua = "/script/qlm_detail.lua"
)
// DataCache is the buffered channel (capacity 2000) that the browser_savedata
// Lua binding sends converted rows into.
var DataCache = make(chan map[string]interface{}, 2000)

// Datas is the package-level row list that the browser_getdata Lua binding
// hands out from the front of and then trims.
var Datas []map[string]interface{}
- type GLVm struct {
- AttachesDir string
- Dnf backend.EventNotifyFace
- Headless bool
- ShowImage bool
- ProxyServer bool
- ProxyAddr string
- B *GLBrowser
- ScriptRunning bool //控制一次只能执行一个脚本
- DataSaveOver chan bool
- }
// GLBrowser holds the chromedp browser context that every tab operation
// derives from, together with the function that tears that browser down.
type GLBrowser struct {
	// Ctx is the browser-level context; nil once the browser has been closed.
	Ctx context.Context
	// CancelFn cancels Ctx; nil once the browser has been closed.
	CancelFn context.CancelFunc
}
- func NewGLVM(attachesDir string, dnf be.EventNotifyFace) *GLVm {
- return &GLVm{
- AttachesDir: attachesDir,
- Dnf: dnf,
- DataSaveOver: make(chan bool, 1),
- }
- }
- // LoadScript 加载脚本
- func (glvm *GLVm) LoadScript(page string) string {
- var path string
- if page == "list" {
- path = glvm.AttachesDir + qlm_list_lua
- } else if page == "detail" {
- path = glvm.AttachesDir + qlm_detail_lua
- }
- bs, err := os.ReadFile(path)
- if err != nil {
- qu.Debug(path, "脚本加载失败...")
- }
- return string(bs)
- }
- // RunScript 执行lua代码
- func (glvm *GLVm) RunScript(script, recordId string) error {
- defer Catch()
- var s *lua.LState = lua.NewState()
- defer s.Close()
- //方法绑定
- glvm.ResetBrowser() //先创建浏览器对象
- glvm.BindLuaState(s) //绑定虚拟机函数
- glvm.B.BindLuaState(s, recordId)
- defer func() {
- if b := glvm.B; b != nil {
- b.CancelFn()
- b.Ctx = nil
- b.CancelFn = nil
- }
- }()
- reader := strings.NewReader(script)
- chunk, err := parse.Parse(reader, "code")
- if err != nil {
- return err
- }
- proto, err := lua.Compile(chunk, script)
- if err != nil {
- return err
- }
- lfunc := s.NewFunctionFromProto(proto)
- s.Push(lfunc)
- s.Call(0, 0)
- return nil
- }
- // ResetBrowser 重置浏览器
- func (glvm *GLVm) ResetBrowser() {
- if glvm.B != nil && glvm.B.CancelFn != nil {
- glvm.B.CancelFn()
- glvm.B.Ctx = nil
- glvm.B.CancelFn = nil
- }
- _, _, _, _, ctx, incCancelFn := backend.NewBrowser(glvm.Headless, glvm.ShowImage, glvm.ProxyServer, "http://")
- b := &GLBrowser{
- Ctx: ctx,
- CancelFn: incCancelFn,
- }
- if glvm.B == nil {
- glvm.B = b
- } else {
- glvm.B.Ctx, glvm.B.CancelFn = b.Ctx, b.CancelFn
- }
- }
- // BindLuaState 绑定虚拟机函数
- func (glvm *GLVm) BindLuaState(state *lua.LState) {
- state.SetGlobal("browser_reset", state.NewFunction(func(l *lua.LState) int {
- glvm.ResetBrowser()
- return 0
- }))
- //
- state.SetGlobal("browser_save", state.NewFunction(func(l *lua.LState) int {
- //spiderCode := l.ToString(-5)
- //siteName := l.ToString(-4)
- //siteChannelName := l.ToString(-3)
- //siteChannelUrl := l.ToString(-2)
- /*table := l.ToTable(-1)
- data := TableToMap(table)*/
- //vm.S.Save(spiderCode, siteName, siteChannelName, siteChannelUrl, data)
- return 0
- }))
- }
- func (glvm *GLVm) CloseTabs() {
- if b := glvm.B; b != nil {
- b.CancelFn()
- b.Ctx = nil
- b.CancelFn = nil
- }
- }
- // findTab 根据标题、url找tab
- func (b *GLBrowser) findTabContext(tabTitle, tabUrl string, timeoutInt64 int64) (ctx context.Context, err error) {
- if b.Ctx != nil {
- if timeoutInt64 == 0 {
- timeoutInt64 = 5000
- }
- timeout := time.Duration(timeoutInt64) * time.Millisecond
- if tabTitle == "" && tabUrl == "" {
- ctx, _ = context.WithTimeout(b.Ctx, timeout)
- return ctx, nil
- } else {
- ts, err := chromedp.Targets(b.Ctx)
- if err != nil {
- return nil, err
- }
- for _, t := range ts {
- if (tabTitle != "" && strings.Contains(t.Title, tabTitle)) || (tabUrl != "" && strings.Contains(t.URL, tabUrl)) {
- // log.Printf("find tab param<title,url>: %s %s found %s %s", tabTitle, tabUrl,
- // t.Title, t.URL)
- newCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
- ctx, _ = context.WithTimeout(newCtx, timeout)
- return ctx, nil
- }
- }
- }
- return nil, errors.New("can't find tab")
- }
- return nil, errors.New("context is error")
- }
- // CloseTabs 关闭页面
- func (b *GLBrowser) CloseTabs(tabTitle, tabUrl string, timeoutInt64 int64) (err error) {
- if timeoutInt64 == 0 {
- timeoutInt64 = 5
- }
- timeout := time.Duration(timeoutInt64) * time.Millisecond
- ts, err := chromedp.Targets(b.Ctx)
- if err != nil {
- return err
- }
- for _, t := range ts {
- if (tabTitle != "" && strings.Contains(t.Title, tabTitle)) || (tabUrl != "" && strings.Contains(t.URL, tabUrl)) {
- newCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
- ctx, _ := context.WithTimeout(newCtx, timeout)
- chromedp.Run(
- ctx,
- page.Close(),
- )
- }
- }
- return nil
- }
- // Navigate 导航到指定网址
- func (b *GLBrowser) Navigate(tabTitle string, tabUrl string, isNewTab bool, targetUrl string, timeout int64) (err error) {
- ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
- if err != nil {
- return err
- }
- //新标签页
- if isNewTab {
- ctx, _ = chromedp.NewContext(ctx)
- }
- //
- return chromedp.Run(ctx,
- chromedp.Navigate(targetUrl))
- }
- // Navigate 导航到指定网址,并保存请求资源,如图片等
- func (b *GLBrowser) NavigateAndSaveRes(tabTitle string, tabUrl string, timeout int64, isNewTab bool, targetUrl string, saveFileTypeList, save2dir string) (err error) {
- ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
- if err != nil {
- return err
- }
- //新标签页
- if isNewTab {
- ctx, _ = chromedp.NewContext(ctx)
- }
- //
- saveFileType := strings.Split(saveFileTypeList, " ")
- isNeedRes := func(fileType string) bool {
- for _, v := range saveFileType {
- if strings.Contains(fileType, v) {
- return true
- }
- }
- return false
- }
- fnURL2FileName := func(requestURL string) string {
- u, err := url.Parse(requestURL)
- if err != nil {
- return ""
- }
- _, filename := filepath.Split(u.Path)
- return filename
- }
- var cache = map[network.RequestID]string{}
- chromedp.ListenTarget(ctx, func(v interface{}) {
- switch ev := v.(type) {
- case *network.EventRequestWillBeSent: //准备下载
- cache[ev.RequestID] = ev.Request.URL
- case *network.EventResponseReceived: //检查回应头的contenttype
- contentType, _ := ev.Response.Headers["Content-Type"].(string)
- fmt.Println(contentType)
- if !isNeedRes(contentType) {
- delete(cache, ev.RequestID)
- }
- case *network.EventLoadingFinished: //下载完成
- if uri, ok := cache[ev.RequestID]; ok {
- filename := fnURL2FileName(uri)
- fmt.Println("save2file", filename)
- if filename != "" {
- filePath := filepath.Join(save2dir, filename)
- var buf []byte
- if err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
- var err error
- buf, err = network.GetResponseBody(ev.RequestID).Do(ctx)
- return err
- })); err == nil {
- os.WriteFile(filePath, buf, 0777)
- } else {
- fmt.Println(err.Error())
- }
- }
- }
- }
- })
- //
- err = chromedp.Run(ctx,
- chromedp.Navigate(targetUrl))
- //下载存储
- return err
- }
- // ExecuteJS 执行脚本
- func (b *GLBrowser) ExecuteJS(tabTitle, tabUrl, script string, ret interface{}, timeout int64) (err error) {
- ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
- if err != nil {
- return err
- }
- return chromedp.Run(ctx,
- chromedp.Evaluate(script, ret))
- }
- // Click 点击
- func (b *GLBrowser) Click(tabTitle, tabUrl, selector string, selectorType int, timeout int64) (err error) {
- ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
- if err != nil {
- return err
- }
- var act chromedp.QueryAction
- switch selectorType {
- case selector_type_id:
- act = chromedp.Click(selector, chromedp.ByID)
- case selector_type_query:
- act = chromedp.Click(selector, chromedp.ByQuery)
- case selector_type_search:
- act = chromedp.Click(selector, chromedp.BySearch)
- case selector_type_jspath:
- act = chromedp.Click(selector, chromedp.ByJSPath)
- default:
- act = chromedp.Click(selector, chromedp.ByQueryAll)
- }
- err = chromedp.Run(ctx,
- act)
- return err
- }
- // KeySend 键盘输入
- func (b *GLBrowser) KeySend(tabTitle, tabUrl, selector, sendStr string, selectorType int, timeout int64) (err error) {
- ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
- if err != nil {
- return err
- }
- var act chromedp.QueryAction
- switch selectorType {
- case selector_type_id:
- act = chromedp.SendKeys(selector, sendStr, chromedp.ByID)
- case selector_type_query:
- act = chromedp.SendKeys(selector, sendStr, chromedp.ByQuery)
- case selector_type_search:
- act = chromedp.SendKeys(selector, sendStr, chromedp.BySearch)
- case selector_type_jspath:
- act = chromedp.SendKeys(selector, sendStr, chromedp.ByJSPath)
- default:
- act = chromedp.SendKeys(selector, sendStr, chromedp.ByQueryAll)
- }
- return chromedp.Run(ctx,
- act)
- }
- // WaitVisible 等待元素可见
- func (b *GLBrowser) WaitVisible(tabTitle, tabUrl, selector string, selectorType int, timeout int64) error {
- ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
- if err != nil {
- return err
- }
- var act chromedp.QueryAction
- switch selectorType {
- case selector_type_id:
- act = chromedp.WaitVisible(selector, chromedp.ByID)
- case selector_type_query:
- act = chromedp.WaitVisible(selector, chromedp.ByQuery)
- case selector_type_search:
- act = chromedp.WaitVisible(selector, chromedp.BySearch)
- case selector_type_jspath:
- act = chromedp.WaitVisible(selector, chromedp.ByJSPath)
- default:
- act = chromedp.WaitVisible(selector, chromedp.ByQueryAll)
- }
- return chromedp.Run(ctx,
- act)
- }
- // 重置浏览器
- func (b *GLBrowser) Reset() {
- }
- // DownloadFile 只有在非headless模式下有效,与click方法其实是一致的
- func (b *GLBrowser) DownloadFile(tabTitle, tabUrl string, timeout int64, selector string, selectorType int, save2dir string) error {
- ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
- if err != nil {
- return err
- }
- var act chromedp.QueryAction
- switch selectorType {
- case selector_type_id:
- act = chromedp.Click(selector, chromedp.ByID)
- case selector_type_query:
- act = chromedp.Click(selector, chromedp.ByQuery)
- case selector_type_search:
- act = chromedp.Click(selector, chromedp.BySearch)
- case selector_type_jspath:
- act = chromedp.Click(selector, chromedp.ByJSPath)
- default:
- act = chromedp.Click(selector, chromedp.ByQueryAll)
- }
- return chromedp.Run(ctx,
- browser.SetDownloadBehavior(browser.SetDownloadBehaviorBehaviorAllowAndName).WithDownloadPath(save2dir).WithEventsEnabled(true),
- act)
- }
- // BindLuaState
- func (b *GLBrowser) BindLuaState(s *lua.LState, recordId string) {
- //执行暂停
- s.SetGlobal("browser_sleep", s.NewFunction(func(l *lua.LState) int {
- fmt.Println("---browser_sleep---")
- timeout := l.ToInt64(-1)
- if timeout == 0 {
- timeout = 5
- }
- time.Sleep(time.Duration(timeout) * time.Millisecond)
- return 0
- }))
- //关闭tabl页
- s.SetGlobal("browser_closetabs", s.NewFunction(func(l *lua.LState) int {
- fmt.Println("---browser_closetabs---")
- timeout := l.ToInt64(-3)
- tabTitle := l.ToString(-2)
- tabUrl := l.ToString(-1)
- if timeout == 0 {
- timeout = 5
- }
- b.CloseTabs(tabTitle, tabUrl, timeout)
- return 0
- }))
- //注册打开地址
- s.SetGlobal("browser_navagite", s.NewFunction(func(l *lua.LState) int {
- fmt.Println("---browser_navagite---")
- tabTitle := l.ToString(-5) //指定标签页title
- tabUrl := l.ToString(-4) //指定标签页url
- isNewTab := l.ToBool(-3) //是否打开新的标签页
- timeout := l.ToInt64(-2) //网页打开的超时时间
- targetUrl := l.ToString(-1) //打开网页的链接
- if err := b.Navigate(tabTitle, tabUrl, isNewTab, targetUrl, timeout); err != nil {
- l.Push(lua.LString(err.Error()))
- } else {
- l.Push(lua.LString("ok"))
- }
- return 1
- }))
- //执行浏览器端js
- s.SetGlobal("browser_executejs", s.NewFunction(func(l *lua.LState) int {
- fmt.Println("---browser_executejs---")
- tabTitle := l.ToString(-5)
- tabUrl := l.ToString(-4)
- timeout := l.ToInt64(-3)
- returnType := l.ToInt(-2) //返回数据类型
- script := l.ToString(-1) //执行的js
- switch returnType {
- case execute_return_type_string: //返回string
- var ret string
- if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
- l.Push(lua.LString("ok"))
- l.Push(lua.LString(ret))
- } else {
- l.Push(lua.LString("err"))
- l.Push(lua.LString(err.Error()))
- }
- case execute_return_type_list: //返回list
- var ret = make([]interface{}, 0, 0)
- var tmp = make(map[string]interface{})
- if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
- for i, v := range ret {
- tmp[strconv.Itoa(i)] = v
- }
- l.Push(lua.LString("ok"))
- l.Push(MapToTable(tmp))
- } else {
- l.Push(lua.LString("err"))
- l.Push(lua.LString(err.Error()))
- }
- case execute_return_type_table: //返回table
- var ret = make(map[string]interface{})
- if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
- l.Push(lua.LString("ok"))
- l.Push(MapToTable(ret))
- } else {
- l.Push(lua.LString("err"))
- l.Push(lua.LString(err.Error()))
- }
- }
- return 2
- }))
- //按键
- s.SetGlobal("browser_keysend", s.NewFunction(func(l *lua.LState) int {
- fmt.Println("---browser_keysend---")
- tabTitle := l.ToString(-6)
- tabUrl := l.ToString(-5)
- timeout := l.ToInt64(-4)
- words := l.ToString(-3)
- selectorType := l.ToInt(-2)
- selector := l.ToString(-1)
- fmt.Println(selector, words, selectorType, timeout)
- err := b.KeySend(tabTitle, tabUrl, selector, words, selectorType, timeout)
- if err != nil {
- l.Push(lua.LString(err.Error()))
- } else {
- l.Push(lua.LString("ok"))
- }
- return 1
- }))
- //点击
- s.SetGlobal("browser_click", s.NewFunction(func(l *lua.LState) int {
- fmt.Println("---browser_click---")
- tabTitle := l.ToString(-5)
- tabUrl := l.ToString(-4)
- timeout := l.ToInt64(-3)
- selectorType := l.ToInt(-2)
- selector := l.ToString(-1)
- err := b.Click(tabTitle, tabUrl, selector, selectorType, timeout)
- if err != nil {
- l.Push(lua.LString(err.Error()))
- } else {
- l.Push(lua.LString("ok"))
- }
- return 1
- }))
- //等待元素加载
- s.SetGlobal("browser_waitvisible", s.NewFunction(func(l *lua.LState) int {
- fmt.Println("---browser_waitvisible---")
- tabTitle := l.ToString(-5)
- tabUrl := l.ToString(-4)
- timeout := l.ToInt64(-3)
- selectorType := l.ToInt(-2) //选择器类型
- selector := l.ToString(-1) //选择器
- err := b.WaitVisible(tabTitle, tabUrl, selector, selectorType, timeout)
- if err != nil {
- l.Push(lua.LString(err.Error()))
- } else {
- l.Push(lua.LString("ok"))
- }
- return 1
- }))
- //下载附件
- s.SetGlobal("browser_downloadfile", s.NewFunction(func(l *lua.LState) int {
- tabTitle := l.ToString(-6)
- tabUrl := l.ToString(-5)
- timeout := l.ToInt64(-4)
- selectorType := l.ToInt(-3)
- selector := l.ToString(-2)
- save2dir := l.ToString(-1)
- err := b.DownloadFile(tabTitle, tabUrl, timeout, selector, selectorType, save2dir)
- if err != nil {
- l.Push(lua.LString(err.Error()))
- } else {
- l.Push(lua.LString("ok"))
- }
- return 1
- }))
- //注册打开地址
- s.SetGlobal("browser_navagite_download_res", s.NewFunction(func(l *lua.LState) int {
- tabTitle := l.ToString(-7)
- tabUrl := l.ToString(-6)
- timeout := l.ToInt64(-5)
- isNewTab := l.ToBool(-4)
- targetUrl := l.ToString(-3)
- saveFileTypeList := l.ToString(-2)
- savedir := l.ToString(-1)
- if err := b.NavigateAndSaveRes(tabTitle, tabUrl, timeout, isNewTab, targetUrl, saveFileTypeList, savedir); err != nil {
- l.Push(lua.LString(err.Error()))
- } else {
- l.Push(lua.LString("ok"))
- }
- return 1
- }))
- //发布时间格式化
- s.SetGlobal("browser_publishtime", s.NewFunction(func(l *lua.LState) int {
- text := l.ToString(-1)
- publishtime := getPublitime(text)
- l.Push(lua.LString(publishtime))
- return 1
- }))
- //保存数据
- s.SetGlobal("browser_savedata", s.NewFunction(func(l *lua.LState) int {
- fmt.Println("---browser_upsertdata---")
- page := l.ToString(-2)
- data := l.ToTable(-1)
- result := TableToMap(data)
- if page == "list" {
- result["recordid"] = recordId
- }
- DataCache <- result
- return 1
- }))
- //获取数据
- s.SetGlobal("browser_getdata", s.NewFunction(func(l *lua.LState) int {
- fmt.Println("---browser_getdata---")
- num := l.ToInt(-1) //获取多少条数据
- count := len(Datas)
- if count == 0 {
- l.Push(lua.LString("err"))
- l.Push(lua.LString("当前可下载量为0"))
- } else {
- if count < num {
- num = count
- }
- data := Datas[:num]
- Datas = Datas[num:]
- tMap := MapToTable(map[string]interface{}{"data": data})
- l.Push(lua.LString("ok"))
- l.Push(tMap.RawGetString("data"))
- }
- return 2
- }))
- }
|