script.go 15 KB


  1. package script
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "github.com/yuin/gopher-lua/parse"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "net/url"
  9. "os"
  10. "path/filepath"
  11. "spider_creator/backend"
  12. "strconv"
  13. "strings"
  14. "time"
  15. "github.com/chromedp/cdproto/browser"
  16. "github.com/chromedp/cdproto/network"
  17. "github.com/chromedp/cdproto/page"
  18. "github.com/chromedp/chromedp"
  19. "github.com/yuin/gopher-lua"
  20. be "spider_creator/backend"
  21. )
const (
	// Selector kinds accepted by the browser actions in this file. Each one
	// picks the chromedp query option used to locate the element: ByID,
	// ByQuery, BySearch or ByJSPath. Any other value, including
	// selector_type_query_all, falls through to ByQueryAll.
	selector_type_id        = 0
	selector_type_query     = 1
	selector_type_search    = 2
	selector_type_jspath    = 3
	selector_type_query_all = 4

	// Result shapes that browser_executejs can decode a JavaScript result
	// into: a string, a list, or a table (map).
	execute_return_type_string = 0
	execute_return_type_list   = 1
	execute_return_type_table  = 2

	// Script locations that LoadScript appends to the attaches directory.
	qlm_list_lua   = "/script/qlm_list.lua"
	qlm_detail_lua = "/script/qlm_detail.lua"
)
// GLVm carries the settings and the browser handle used to run Lua scripts.
type GLVm struct {
	// attachesDir is the base directory LoadScript prepends to the script paths.
	attachesDir string
	// dnf is the notifier handed to NewGLVM; it is not used in this part of the file.
	dnf backend.EventNotifyFace
	// Headless, ShowImage and ProxyServer are forwarded to backend.NewBrowser
	// by ResetBrowser.
	Headless    bool
	ShowImage   bool
	ProxyServer bool
	// ProxyAddr is not read in this part of the file.
	ProxyAddr string
	// B is the current browser handle; ResetBrowser creates and refreshes it.
	B *GLBrowser
	//WsAddr string
	//RunMode int
	//S Storage
}
// GLBrowser holds the context and cancel functions of one browser instance as
// returned by backend.NewBrowser.
type GLBrowser struct {
	// BaseCancelFn is the first cancel function returned by backend.NewBrowser
	// (presumably the allocator-level cancel — confirm against backend).
	BaseCancelFn context.CancelFunc
	// Ctx is the context every browser action in this file is derived from.
	Ctx context.Context
	// CancelFn is the cancel function returned alongside Ctx.
	CancelFn context.CancelFunc
}
  51. func NewGLVM(attachesDir string, dnf be.EventNotifyFace) *GLVm {
  52. return &GLVm{
  53. attachesDir: attachesDir,
  54. dnf: dnf,
  55. }
  56. }
  57. // LoadScript 加载脚本
  58. func (glvm *GLVm) LoadScript(page string) string {
  59. var path string
  60. if page == "list" {
  61. path = glvm.attachesDir + qlm_list_lua
  62. } else if page == "detail" {
  63. path = glvm.attachesDir + qlm_detail_lua
  64. }
  65. bs, err := os.ReadFile(path)
  66. if err != nil {
  67. qu.Debug(path, "脚本加载失败...")
  68. }
  69. return string(bs)
  70. }
  71. // RunScript 执行lua代码
  72. func (glvm *GLVm) RunScript(script string) error {
  73. defer Catch()
  74. var state *lua.LState = lua.NewState()
  75. defer state.Close()
  76. //方法绑定
  77. glvm.ResetBrowser() //先创建浏览器对象
  78. glvm.BindLuaState(state) //绑定虚拟机函数
  79. glvm.B.BindLuaState(state)
  80. defer func() {
  81. if b := glvm.B; b != nil {
  82. b.CancelFn()
  83. b.Ctx = nil
  84. b.CancelFn = nil
  85. b.BaseCancelFn()
  86. b.BaseCancelFn = nil
  87. }
  88. }()
  89. reader := strings.NewReader(script)
  90. chunk, err := parse.Parse(reader, "code")
  91. if err != nil {
  92. return err
  93. }
  94. proto, err := lua.Compile(chunk, script)
  95. if err != nil {
  96. return err
  97. }
  98. lfunc := state.NewFunctionFromProto(proto)
  99. state.Push(lfunc)
  100. state.Call(0, 0)
  101. return nil
  102. }
  103. // ResetBrowser 重置浏览器
  104. func (vm *GLVm) ResetBrowser() {
  105. if vm.B != nil && vm.B.CancelFn != nil && vm.B.BaseCancelFn != nil {
  106. vm.B.CancelFn()
  107. vm.B.BaseCancelFn()
  108. vm.B.Ctx = nil
  109. vm.B.CancelFn = nil
  110. vm.B.BaseCancelFn = nil
  111. }
  112. _, baseCancelFn, _, _, ctx, incCancelFn := backend.NewBrowser(vm.Headless, vm.ShowImage, vm.ProxyServer, "https://")
  113. b := &GLBrowser{
  114. BaseCancelFn: baseCancelFn,
  115. Ctx: ctx,
  116. CancelFn: incCancelFn,
  117. }
  118. if vm.B == nil {
  119. vm.B = b
  120. } else {
  121. vm.B.Ctx, vm.B.CancelFn = b.Ctx, b.CancelFn
  122. }
  123. }
  124. // BindLuaState 绑定虚拟机函数
  125. func (vm *GLVm) BindLuaState(state *lua.LState) {
  126. state.SetGlobal("browser_reset", state.NewFunction(func(l *lua.LState) int {
  127. vm.ResetBrowser()
  128. return 0
  129. }))
  130. //
  131. state.SetGlobal("browser_save", state.NewFunction(func(l *lua.LState) int {
  132. //spiderCode := l.ToString(-5)
  133. //siteName := l.ToString(-4)
  134. //siteChannelName := l.ToString(-3)
  135. //siteChannelUrl := l.ToString(-2)
  136. /*table := l.ToTable(-1)
  137. data := TableToMap(table)*/
  138. //vm.S.Save(spiderCode, siteName, siteChannelName, siteChannelUrl, data)
  139. return 0
  140. }))
  141. }
  142. // findTab 根据标题、url找tab
  143. func (b *GLBrowser) findTabContext(tabTitle, tabUrl string, timeoutInt64 int64) (ctx context.Context, err error) {
  144. if timeoutInt64 == 0 {
  145. timeoutInt64 = 5000
  146. }
  147. timeout := time.Duration(timeoutInt64) * time.Millisecond
  148. if tabTitle == "" && tabUrl == "" {
  149. ctx, _ = context.WithTimeout(b.Ctx, timeout)
  150. return ctx, nil
  151. } else {
  152. ts, err := chromedp.Targets(b.Ctx)
  153. if err != nil {
  154. return nil, err
  155. }
  156. for _, t := range ts {
  157. if (tabTitle != "" && strings.Contains(t.Title, tabTitle)) || (tabUrl != "" && strings.Contains(t.URL, tabUrl)) {
  158. // log.Printf("find tab param<title,url>: %s %s found %s %s", tabTitle, tabUrl,
  159. // t.Title, t.URL)
  160. newCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
  161. ctx, _ = context.WithTimeout(newCtx, timeout)
  162. return ctx, nil
  163. }
  164. }
  165. }
  166. return nil, errors.New("can't find tab")
  167. }
  168. // CloseTabs关闭页面
  169. func (b *GLBrowser) CloseTabs(tabTitle, tabUrl string, timeoutInt64 int64) (err error) {
  170. if timeoutInt64 == 0 {
  171. timeoutInt64 = 5
  172. }
  173. timeout := time.Duration(timeoutInt64) * time.Millisecond
  174. ts, err := chromedp.Targets(b.Ctx)
  175. if err != nil {
  176. return err
  177. }
  178. for _, t := range ts {
  179. if (tabTitle != "" && strings.Contains(t.Title, tabTitle)) || (tabUrl != "" && strings.Contains(t.URL, tabUrl)) {
  180. newCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
  181. ctx, _ := context.WithTimeout(newCtx, timeout)
  182. chromedp.Run(
  183. ctx,
  184. page.Close(),
  185. )
  186. }
  187. }
  188. return nil
  189. }
  190. // Navigate 导航到指定网址
  191. func (b *GLBrowser) Navigate(tabTitle string, tabUrl string, isNewTab bool, targetUrl string, timeout int64) (err error) {
  192. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  193. if err != nil {
  194. return err
  195. }
  196. //新标签页
  197. if isNewTab {
  198. ctx, _ = chromedp.NewContext(ctx)
  199. }
  200. //
  201. return chromedp.Run(ctx,
  202. chromedp.Navigate(targetUrl))
  203. }
  204. // Navigate 导航到指定网址,并保存请求资源,如图片等
  205. func (b *GLBrowser) NavigateAndSaveRes(tabTitle string, tabUrl string, timeout int64, isNewTab bool, targetUrl string, saveFileTypeList, save2dir string) (err error) {
  206. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  207. if err != nil {
  208. return err
  209. }
  210. //新标签页
  211. if isNewTab {
  212. ctx, _ = chromedp.NewContext(ctx)
  213. }
  214. //
  215. saveFileType := strings.Split(saveFileTypeList, " ")
  216. isNeedRes := func(fileType string) bool {
  217. for _, v := range saveFileType {
  218. if strings.Contains(fileType, v) {
  219. return true
  220. }
  221. }
  222. return false
  223. }
  224. fnURL2FileName := func(requestURL string) string {
  225. u, err := url.Parse(requestURL)
  226. if err != nil {
  227. return ""
  228. }
  229. _, filename := filepath.Split(u.Path)
  230. return filename
  231. }
  232. var cache = map[network.RequestID]string{}
  233. chromedp.ListenTarget(ctx, func(v interface{}) {
  234. switch ev := v.(type) {
  235. case *network.EventRequestWillBeSent: //准备下载
  236. cache[ev.RequestID] = ev.Request.URL
  237. case *network.EventResponseReceived: //检查回应头的contenttype
  238. contentType, _ := ev.Response.Headers["Content-Type"].(string)
  239. fmt.Println(contentType)
  240. if !isNeedRes(contentType) {
  241. delete(cache, ev.RequestID)
  242. }
  243. case *network.EventLoadingFinished: //下载完成
  244. if uri, ok := cache[ev.RequestID]; ok {
  245. filename := fnURL2FileName(uri)
  246. fmt.Println("save2file", filename)
  247. if filename != "" {
  248. filePath := filepath.Join(save2dir, filename)
  249. var buf []byte
  250. if err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
  251. var err error
  252. buf, err = network.GetResponseBody(ev.RequestID).Do(ctx)
  253. return err
  254. })); err == nil {
  255. os.WriteFile(filePath, buf, 0777)
  256. } else {
  257. fmt.Println(err.Error())
  258. }
  259. }
  260. }
  261. }
  262. })
  263. //
  264. err = chromedp.Run(ctx,
  265. chromedp.Navigate(targetUrl))
  266. //下载存储
  267. return err
  268. }
  269. // ExecuteJS 执行脚本
  270. func (b *GLBrowser) ExecuteJS(tabTitle, tabUrl, script string, ret interface{}, timeout int64) (err error) {
  271. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  272. if err != nil {
  273. return err
  274. }
  275. return chromedp.Run(ctx,
  276. chromedp.Evaluate(script, ret))
  277. }
  278. // Click 点击
  279. func (b *GLBrowser) Click(tabTitle, tabUrl, selector string, selectorType int, timeout int64) (err error) {
  280. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  281. if err != nil {
  282. return err
  283. }
  284. var act chromedp.QueryAction
  285. switch selectorType {
  286. case selector_type_id:
  287. act = chromedp.Click(selector, chromedp.ByID)
  288. case selector_type_query:
  289. act = chromedp.Click(selector, chromedp.ByQuery)
  290. case selector_type_search:
  291. act = chromedp.Click(selector, chromedp.BySearch)
  292. case selector_type_jspath:
  293. act = chromedp.Click(selector, chromedp.ByJSPath)
  294. default:
  295. act = chromedp.Click(selector, chromedp.ByQueryAll)
  296. }
  297. err = chromedp.Run(ctx,
  298. act)
  299. return err
  300. }
  301. // KeySend 键盘输入
  302. func (b *GLBrowser) KeySend(tabTitle, tabUrl, selector, sendStr string, selectorType int, timeout int64) (err error) {
  303. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  304. if err != nil {
  305. return err
  306. }
  307. var act chromedp.QueryAction
  308. switch selectorType {
  309. case selector_type_id:
  310. act = chromedp.SendKeys(selector, sendStr, chromedp.ByID)
  311. case selector_type_query:
  312. act = chromedp.SendKeys(selector, sendStr, chromedp.ByQuery)
  313. case selector_type_search:
  314. act = chromedp.SendKeys(selector, sendStr, chromedp.BySearch)
  315. case selector_type_jspath:
  316. act = chromedp.SendKeys(selector, sendStr, chromedp.ByJSPath)
  317. default:
  318. act = chromedp.SendKeys(selector, sendStr, chromedp.ByQueryAll)
  319. }
  320. return chromedp.Run(ctx,
  321. act)
  322. }
  323. // WaitVisible 等待元素可见
  324. func (b *GLBrowser) WaitVisible(tabTitle, tabUrl, selector string, selectorType int, timeout int64) error {
  325. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  326. if err != nil {
  327. return err
  328. }
  329. var act chromedp.QueryAction
  330. switch selectorType {
  331. case selector_type_id:
  332. act = chromedp.WaitVisible(selector, chromedp.ByID)
  333. case selector_type_query:
  334. act = chromedp.WaitVisible(selector, chromedp.ByQuery)
  335. case selector_type_search:
  336. act = chromedp.WaitVisible(selector, chromedp.BySearch)
  337. case selector_type_jspath:
  338. act = chromedp.WaitVisible(selector, chromedp.ByJSPath)
  339. default:
  340. act = chromedp.WaitVisible(selector, chromedp.ByQueryAll)
  341. }
  342. return chromedp.Run(ctx,
  343. act)
  344. }
// Reset is meant to reset the browser. It is currently an empty placeholder
// and does nothing.
func (b *GLBrowser) Reset() {
}
  348. // DownloadFile 只有在非headless模式下有效,与click方法其实是一致的
  349. func (b *GLBrowser) DownloadFile(tabTitle, tabUrl string, timeout int64, selector string, selectorType int, save2dir string) error {
  350. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  351. if err != nil {
  352. return err
  353. }
  354. var act chromedp.QueryAction
  355. switch selectorType {
  356. case selector_type_id:
  357. act = chromedp.Click(selector, chromedp.ByID)
  358. case selector_type_query:
  359. act = chromedp.Click(selector, chromedp.ByQuery)
  360. case selector_type_search:
  361. act = chromedp.Click(selector, chromedp.BySearch)
  362. case selector_type_jspath:
  363. act = chromedp.Click(selector, chromedp.ByJSPath)
  364. default:
  365. act = chromedp.Click(selector, chromedp.ByQueryAll)
  366. }
  367. return chromedp.Run(ctx,
  368. browser.SetDownloadBehavior(browser.SetDownloadBehaviorBehaviorAllowAndName).WithDownloadPath(save2dir).WithEventsEnabled(true),
  369. act)
  370. }
  371. // BindLuaState
  372. func (b *GLBrowser) BindLuaState(s *lua.LState) {
  373. //执行暂停
  374. s.SetGlobal("browser_sleep", s.NewFunction(func(l *lua.LState) int {
  375. fmt.Println("---browser_sleep---")
  376. timeout := l.ToInt64(-1)
  377. if timeout == 0 {
  378. timeout = 5
  379. }
  380. time.Sleep(time.Duration(timeout) * time.Millisecond)
  381. return 0
  382. }))
  383. //关闭tabl页
  384. s.SetGlobal("browser_closetabs", s.NewFunction(func(l *lua.LState) int {
  385. fmt.Println("---browser_closetabs---")
  386. timeout := l.ToInt64(-3)
  387. tabTitle := l.ToString(-2)
  388. tabUrl := l.ToString(-1)
  389. if timeout == 0 {
  390. timeout = 5
  391. }
  392. b.CloseTabs(tabTitle, tabUrl, timeout)
  393. return 0
  394. }))
  395. //注册打开地址
  396. s.SetGlobal("browser_navagite", s.NewFunction(func(l *lua.LState) int {
  397. fmt.Println("---browser_navagite---")
  398. tabTitle := l.ToString(-5) //指定标签页title
  399. tabUrl := l.ToString(-4) //指定标签页url
  400. isNewTab := l.ToBool(-3) //是否打开新的标签页
  401. timeout := l.ToInt64(-2) //网页打开的超时时间
  402. targetUrl := l.ToString(-1) //打开网页的链接
  403. if err := b.Navigate(tabTitle, tabUrl, isNewTab, targetUrl, timeout); err != nil {
  404. l.Push(lua.LString(err.Error()))
  405. } else {
  406. l.Push(lua.LString("ok"))
  407. }
  408. return 1
  409. }))
  410. //执行浏览器端js
  411. s.SetGlobal("browser_executejs", s.NewFunction(func(l *lua.LState) int {
  412. fmt.Println("---browser_executejs---")
  413. tabTitle := l.ToString(-5)
  414. tabUrl := l.ToString(-4)
  415. timeout := l.ToInt64(-3)
  416. returnType := l.ToInt(-2) //返回数据类型
  417. script := l.ToString(-1) //执行的js
  418. switch returnType {
  419. case execute_return_type_string: //返回string
  420. var ret string
  421. if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
  422. l.Push(lua.LString("ok"))
  423. l.Push(lua.LString(ret))
  424. } else {
  425. l.Push(lua.LString("err"))
  426. l.Push(lua.LString(err.Error()))
  427. }
  428. case execute_return_type_list: //返回list
  429. var ret = make([]interface{}, 0, 0)
  430. var tmp = make(map[string]interface{})
  431. if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
  432. for i, v := range ret {
  433. tmp[strconv.Itoa(i)] = v
  434. }
  435. l.Push(lua.LString("ok"))
  436. l.Push(MapToTable(tmp))
  437. } else {
  438. l.Push(lua.LString("err"))
  439. l.Push(lua.LString(err.Error()))
  440. }
  441. case execute_return_type_table: //返回table
  442. var ret = make(map[string]interface{})
  443. if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
  444. l.Push(lua.LString("ok"))
  445. l.Push(MapToTable(ret))
  446. } else {
  447. l.Push(lua.LString("err"))
  448. l.Push(lua.LString(err.Error()))
  449. }
  450. }
  451. return 2
  452. }))
  453. //按键
  454. s.SetGlobal("browser_keysend", s.NewFunction(func(l *lua.LState) int {
  455. fmt.Println("---browser_keysend---")
  456. tabTitle := l.ToString(-6)
  457. tabUrl := l.ToString(-5)
  458. timeout := l.ToInt64(-4)
  459. words := l.ToString(-3)
  460. selectorType := l.ToInt(-2)
  461. selector := l.ToString(-1)
  462. fmt.Println(selector, words, selectorType, timeout)
  463. err := b.KeySend(tabTitle, tabUrl, selector, words, selectorType, timeout)
  464. if err != nil {
  465. l.Push(lua.LString(err.Error()))
  466. } else {
  467. l.Push(lua.LString("ok"))
  468. }
  469. return 1
  470. }))
  471. //点击
  472. s.SetGlobal("browser_click", s.NewFunction(func(l *lua.LState) int {
  473. fmt.Println("---browser_click---")
  474. tabTitle := l.ToString(-5)
  475. tabUrl := l.ToString(-4)
  476. timeout := l.ToInt64(-3)
  477. selectorType := l.ToInt(-2)
  478. selector := l.ToString(-1)
  479. err := b.Click(tabTitle, tabUrl, selector, selectorType, timeout)
  480. if err != nil {
  481. l.Push(lua.LString(err.Error()))
  482. } else {
  483. l.Push(lua.LString("ok"))
  484. }
  485. return 1
  486. }))
  487. s.SetGlobal("browser_waitvisible", s.NewFunction(func(l *lua.LState) int {
  488. fmt.Println("---browser_waitvisible---")
  489. tabTitle := l.ToString(-5)
  490. tabUrl := l.ToString(-4)
  491. timeout := l.ToInt64(-3)
  492. selectorType := l.ToInt(-2) //选择器类型
  493. selector := l.ToString(-1) //选择器
  494. err := b.WaitVisible(tabTitle, tabUrl, selector, selectorType, timeout)
  495. if err != nil {
  496. l.Push(lua.LString(err.Error()))
  497. } else {
  498. l.Push(lua.LString("ok"))
  499. }
  500. return 1
  501. }))
  502. //点击
  503. s.SetGlobal("browser_downloadfile", s.NewFunction(func(l *lua.LState) int {
  504. tabTitle := l.ToString(-6)
  505. tabUrl := l.ToString(-5)
  506. timeout := l.ToInt64(-4)
  507. selectorType := l.ToInt(-3)
  508. selector := l.ToString(-2)
  509. save2dir := l.ToString(-1)
  510. err := b.DownloadFile(tabTitle, tabUrl, timeout, selector, selectorType, save2dir)
  511. if err != nil {
  512. l.Push(lua.LString(err.Error()))
  513. } else {
  514. l.Push(lua.LString("ok"))
  515. }
  516. return 1
  517. }))
  518. //注册打开地址
  519. s.SetGlobal("browser_navagite_download_res", s.NewFunction(func(l *lua.LState) int {
  520. tabTitle := l.ToString(-7)
  521. tabUrl := l.ToString(-6)
  522. timeout := l.ToInt64(-5)
  523. isNewTab := l.ToBool(-4)
  524. targetUrl := l.ToString(-3)
  525. saveFileTypeList := l.ToString(-2)
  526. savedir := l.ToString(-1)
  527. if err := b.NavigateAndSaveRes(tabTitle, tabUrl, timeout, isNewTab, targetUrl, saveFileTypeList, savedir); err != nil {
  528. l.Push(lua.LString(err.Error()))
  529. } else {
  530. l.Push(lua.LString("ok"))
  531. }
  532. return 1
  533. }))
  534. }