// script.go — Lua-scriptable browser automation built on chromedp and gopher-lua.

  1. package script
import (
	"context"
	"errors"
	"fmt"
	"net/url"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/chromedp/cdproto/browser"
	"github.com/chromedp/cdproto/network"
	"github.com/chromedp/cdproto/page"
	"github.com/chromedp/chromedp"
	"github.com/yuin/gopher-lua"
	"github.com/yuin/gopher-lua/parse"
	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"

	"spider_creator/backend"
	be "spider_creator/backend"
)
  22. const (
  23. selector_type_id = 0
  24. selector_type_query = 1
  25. selector_type_search = 2
  26. selector_type_jspath = 3
  27. selector_type_query_all = 4
  28. execute_return_type_string = 0
  29. execute_return_type_list = 1
  30. execute_return_type_table = 2
  31. qlm_list_lua = "/script/qlm_list.lua"
  32. qlm_detail_lua = "/script/qlm_detail.lua"
  33. )
  34. type GLVm struct {
  35. attachesDir string
  36. dnf backend.EventNotifyFace
  37. Headless bool
  38. ShowImage bool
  39. ProxyServer bool
  40. ProxyAddr string
  41. B *GLBrowser
  42. }
  43. type GLTask struct {
  44. glvm *GLVm
  45. recordid string //记录id
  46. }
  47. type GLBrowser struct {
  48. BaseCancelFn context.CancelFunc
  49. Ctx context.Context
  50. CancelFn context.CancelFunc
  51. }
  52. func NewGLVM(attachesDir string, dnf be.EventNotifyFace) *GLVm {
  53. return &GLVm{
  54. attachesDir: attachesDir,
  55. dnf: dnf,
  56. }
  57. }
  58. // LoadScript 加载脚本
  59. func (glvm *GLVm) LoadScript(page string) string {
  60. var path string
  61. if page == "list" {
  62. path = glvm.attachesDir + qlm_list_lua
  63. } else if page == "detail" {
  64. path = glvm.attachesDir + qlm_detail_lua
  65. }
  66. bs, err := os.ReadFile(path)
  67. if err != nil {
  68. qu.Debug(path, "脚本加载失败...")
  69. }
  70. return string(bs)
  71. }
  72. // RunScript 执行lua代码
  73. func (glvm *GLVm) RunScript(script string) error {
  74. defer Catch()
  75. var state *lua.LState = lua.NewState()
  76. defer state.Close()
  77. //方法绑定
  78. glvm.ResetBrowser() //先创建浏览器对象
  79. glvm.BindLuaState(state) //绑定虚拟机函数
  80. glvm.B.BindLuaState(state)
  81. defer func() {
  82. if b := glvm.B; b != nil {
  83. b.CancelFn()
  84. b.Ctx = nil
  85. b.CancelFn = nil
  86. b.BaseCancelFn()
  87. b.BaseCancelFn = nil
  88. }
  89. }()
  90. reader := strings.NewReader(script)
  91. chunk, err := parse.Parse(reader, "code")
  92. if err != nil {
  93. return err
  94. }
  95. proto, err := lua.Compile(chunk, script)
  96. if err != nil {
  97. return err
  98. }
  99. lfunc := state.NewFunctionFromProto(proto)
  100. state.Push(lfunc)
  101. state.Call(0, 0)
  102. return nil
  103. }
  104. // ResetBrowser 重置浏览器
  105. func (glvm *GLVm) ResetBrowser() {
  106. if glvm.B != nil && glvm.B.CancelFn != nil && glvm.B.BaseCancelFn != nil {
  107. glvm.B.CancelFn()
  108. glvm.B.BaseCancelFn()
  109. glvm.B.Ctx = nil
  110. glvm.B.CancelFn = nil
  111. glvm.B.BaseCancelFn = nil
  112. }
  113. _, baseCancelFn, _, _, ctx, incCancelFn := backend.NewBrowser(glvm.Headless, glvm.ShowImage, glvm.ProxyServer, "https://")
  114. b := &GLBrowser{
  115. BaseCancelFn: baseCancelFn,
  116. Ctx: ctx,
  117. CancelFn: incCancelFn,
  118. }
  119. if glvm.B == nil {
  120. glvm.B = b
  121. } else {
  122. glvm.B.Ctx, glvm.B.CancelFn = b.Ctx, b.CancelFn
  123. }
  124. }
  125. // BindLuaState 绑定虚拟机函数
  126. func (glvm *GLVm) BindLuaState(state *lua.LState) {
  127. state.SetGlobal("browser_reset", state.NewFunction(func(l *lua.LState) int {
  128. glvm.ResetBrowser()
  129. return 0
  130. }))
  131. //
  132. state.SetGlobal("browser_save", state.NewFunction(func(l *lua.LState) int {
  133. //spiderCode := l.ToString(-5)
  134. //siteName := l.ToString(-4)
  135. //siteChannelName := l.ToString(-3)
  136. //siteChannelUrl := l.ToString(-2)
  137. /*table := l.ToTable(-1)
  138. data := TableToMap(table)*/
  139. //vm.S.Save(spiderCode, siteName, siteChannelName, siteChannelUrl, data)
  140. return 0
  141. }))
  142. }
  143. // findTab 根据标题、url找tab
  144. func (b *GLBrowser) findTabContext(tabTitle, tabUrl string, timeoutInt64 int64) (ctx context.Context, err error) {
  145. if timeoutInt64 == 0 {
  146. timeoutInt64 = 5000
  147. }
  148. timeout := time.Duration(timeoutInt64) * time.Millisecond
  149. if tabTitle == "" && tabUrl == "" {
  150. ctx, _ = context.WithTimeout(b.Ctx, timeout)
  151. return ctx, nil
  152. } else {
  153. ts, err := chromedp.Targets(b.Ctx)
  154. if err != nil {
  155. return nil, err
  156. }
  157. for _, t := range ts {
  158. if (tabTitle != "" && strings.Contains(t.Title, tabTitle)) || (tabUrl != "" && strings.Contains(t.URL, tabUrl)) {
  159. // log.Printf("find tab param<title,url>: %s %s found %s %s", tabTitle, tabUrl,
  160. // t.Title, t.URL)
  161. newCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
  162. ctx, _ = context.WithTimeout(newCtx, timeout)
  163. return ctx, nil
  164. }
  165. }
  166. }
  167. return nil, errors.New("can't find tab")
  168. }
  169. // CloseTabs 关闭页面
  170. func (b *GLBrowser) CloseTabs(tabTitle, tabUrl string, timeoutInt64 int64) (err error) {
  171. if timeoutInt64 == 0 {
  172. timeoutInt64 = 5
  173. }
  174. timeout := time.Duration(timeoutInt64) * time.Millisecond
  175. ts, err := chromedp.Targets(b.Ctx)
  176. if err != nil {
  177. return err
  178. }
  179. for _, t := range ts {
  180. if (tabTitle != "" && strings.Contains(t.Title, tabTitle)) || (tabUrl != "" && strings.Contains(t.URL, tabUrl)) {
  181. newCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
  182. ctx, _ := context.WithTimeout(newCtx, timeout)
  183. chromedp.Run(
  184. ctx,
  185. page.Close(),
  186. )
  187. }
  188. }
  189. return nil
  190. }
  191. // Navigate 导航到指定网址
  192. func (b *GLBrowser) Navigate(tabTitle string, tabUrl string, isNewTab bool, targetUrl string, timeout int64) (err error) {
  193. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  194. if err != nil {
  195. return err
  196. }
  197. //新标签页
  198. if isNewTab {
  199. ctx, _ = chromedp.NewContext(ctx)
  200. }
  201. //
  202. return chromedp.Run(ctx,
  203. chromedp.Navigate(targetUrl))
  204. }
  205. // Navigate 导航到指定网址,并保存请求资源,如图片等
  206. func (b *GLBrowser) NavigateAndSaveRes(tabTitle string, tabUrl string, timeout int64, isNewTab bool, targetUrl string, saveFileTypeList, save2dir string) (err error) {
  207. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  208. if err != nil {
  209. return err
  210. }
  211. //新标签页
  212. if isNewTab {
  213. ctx, _ = chromedp.NewContext(ctx)
  214. }
  215. //
  216. saveFileType := strings.Split(saveFileTypeList, " ")
  217. isNeedRes := func(fileType string) bool {
  218. for _, v := range saveFileType {
  219. if strings.Contains(fileType, v) {
  220. return true
  221. }
  222. }
  223. return false
  224. }
  225. fnURL2FileName := func(requestURL string) string {
  226. u, err := url.Parse(requestURL)
  227. if err != nil {
  228. return ""
  229. }
  230. _, filename := filepath.Split(u.Path)
  231. return filename
  232. }
  233. var cache = map[network.RequestID]string{}
  234. chromedp.ListenTarget(ctx, func(v interface{}) {
  235. switch ev := v.(type) {
  236. case *network.EventRequestWillBeSent: //准备下载
  237. cache[ev.RequestID] = ev.Request.URL
  238. case *network.EventResponseReceived: //检查回应头的contenttype
  239. contentType, _ := ev.Response.Headers["Content-Type"].(string)
  240. fmt.Println(contentType)
  241. if !isNeedRes(contentType) {
  242. delete(cache, ev.RequestID)
  243. }
  244. case *network.EventLoadingFinished: //下载完成
  245. if uri, ok := cache[ev.RequestID]; ok {
  246. filename := fnURL2FileName(uri)
  247. fmt.Println("save2file", filename)
  248. if filename != "" {
  249. filePath := filepath.Join(save2dir, filename)
  250. var buf []byte
  251. if err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
  252. var err error
  253. buf, err = network.GetResponseBody(ev.RequestID).Do(ctx)
  254. return err
  255. })); err == nil {
  256. os.WriteFile(filePath, buf, 0777)
  257. } else {
  258. fmt.Println(err.Error())
  259. }
  260. }
  261. }
  262. }
  263. })
  264. //
  265. err = chromedp.Run(ctx,
  266. chromedp.Navigate(targetUrl))
  267. //下载存储
  268. return err
  269. }
  270. // ExecuteJS 执行脚本
  271. func (b *GLBrowser) ExecuteJS(tabTitle, tabUrl, script string, ret interface{}, timeout int64) (err error) {
  272. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  273. if err != nil {
  274. return err
  275. }
  276. return chromedp.Run(ctx,
  277. chromedp.Evaluate(script, ret))
  278. }
  279. // Click 点击
  280. func (b *GLBrowser) Click(tabTitle, tabUrl, selector string, selectorType int, timeout int64) (err error) {
  281. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  282. if err != nil {
  283. return err
  284. }
  285. var act chromedp.QueryAction
  286. switch selectorType {
  287. case selector_type_id:
  288. act = chromedp.Click(selector, chromedp.ByID)
  289. case selector_type_query:
  290. act = chromedp.Click(selector, chromedp.ByQuery)
  291. case selector_type_search:
  292. act = chromedp.Click(selector, chromedp.BySearch)
  293. case selector_type_jspath:
  294. act = chromedp.Click(selector, chromedp.ByJSPath)
  295. default:
  296. act = chromedp.Click(selector, chromedp.ByQueryAll)
  297. }
  298. err = chromedp.Run(ctx,
  299. act)
  300. return err
  301. }
  302. // KeySend 键盘输入
  303. func (b *GLBrowser) KeySend(tabTitle, tabUrl, selector, sendStr string, selectorType int, timeout int64) (err error) {
  304. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  305. if err != nil {
  306. return err
  307. }
  308. var act chromedp.QueryAction
  309. switch selectorType {
  310. case selector_type_id:
  311. act = chromedp.SendKeys(selector, sendStr, chromedp.ByID)
  312. case selector_type_query:
  313. act = chromedp.SendKeys(selector, sendStr, chromedp.ByQuery)
  314. case selector_type_search:
  315. act = chromedp.SendKeys(selector, sendStr, chromedp.BySearch)
  316. case selector_type_jspath:
  317. act = chromedp.SendKeys(selector, sendStr, chromedp.ByJSPath)
  318. default:
  319. act = chromedp.SendKeys(selector, sendStr, chromedp.ByQueryAll)
  320. }
  321. return chromedp.Run(ctx,
  322. act)
  323. }
  324. // WaitVisible 等待元素可见
  325. func (b *GLBrowser) WaitVisible(tabTitle, tabUrl, selector string, selectorType int, timeout int64) error {
  326. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  327. if err != nil {
  328. return err
  329. }
  330. var act chromedp.QueryAction
  331. switch selectorType {
  332. case selector_type_id:
  333. act = chromedp.WaitVisible(selector, chromedp.ByID)
  334. case selector_type_query:
  335. act = chromedp.WaitVisible(selector, chromedp.ByQuery)
  336. case selector_type_search:
  337. act = chromedp.WaitVisible(selector, chromedp.BySearch)
  338. case selector_type_jspath:
  339. act = chromedp.WaitVisible(selector, chromedp.ByJSPath)
  340. default:
  341. act = chromedp.WaitVisible(selector, chromedp.ByQueryAll)
  342. }
  343. return chromedp.Run(ctx,
  344. act)
  345. }
  346. // 重置浏览器
  347. func (b *GLBrowser) Reset() {
  348. }
  349. // DownloadFile 只有在非headless模式下有效,与click方法其实是一致的
  350. func (b *GLBrowser) DownloadFile(tabTitle, tabUrl string, timeout int64, selector string, selectorType int, save2dir string) error {
  351. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  352. if err != nil {
  353. return err
  354. }
  355. var act chromedp.QueryAction
  356. switch selectorType {
  357. case selector_type_id:
  358. act = chromedp.Click(selector, chromedp.ByID)
  359. case selector_type_query:
  360. act = chromedp.Click(selector, chromedp.ByQuery)
  361. case selector_type_search:
  362. act = chromedp.Click(selector, chromedp.BySearch)
  363. case selector_type_jspath:
  364. act = chromedp.Click(selector, chromedp.ByJSPath)
  365. default:
  366. act = chromedp.Click(selector, chromedp.ByQueryAll)
  367. }
  368. return chromedp.Run(ctx,
  369. browser.SetDownloadBehavior(browser.SetDownloadBehaviorBehaviorAllowAndName).WithDownloadPath(save2dir).WithEventsEnabled(true),
  370. act)
  371. }
  372. // BindLuaState
  373. func (b *GLBrowser) BindLuaState(s *lua.LState) {
  374. //执行暂停
  375. s.SetGlobal("browser_sleep", s.NewFunction(func(l *lua.LState) int {
  376. fmt.Println("---browser_sleep---")
  377. timeout := l.ToInt64(-1)
  378. if timeout == 0 {
  379. timeout = 5
  380. }
  381. time.Sleep(time.Duration(timeout) * time.Millisecond)
  382. return 0
  383. }))
  384. //关闭tabl页
  385. s.SetGlobal("browser_closetabs", s.NewFunction(func(l *lua.LState) int {
  386. fmt.Println("---browser_closetabs---")
  387. timeout := l.ToInt64(-3)
  388. tabTitle := l.ToString(-2)
  389. tabUrl := l.ToString(-1)
  390. if timeout == 0 {
  391. timeout = 5
  392. }
  393. b.CloseTabs(tabTitle, tabUrl, timeout)
  394. return 0
  395. }))
  396. //注册打开地址
  397. s.SetGlobal("browser_navagite", s.NewFunction(func(l *lua.LState) int {
  398. fmt.Println("---browser_navagite---")
  399. tabTitle := l.ToString(-5) //指定标签页title
  400. tabUrl := l.ToString(-4) //指定标签页url
  401. isNewTab := l.ToBool(-3) //是否打开新的标签页
  402. timeout := l.ToInt64(-2) //网页打开的超时时间
  403. targetUrl := l.ToString(-1) //打开网页的链接
  404. if err := b.Navigate(tabTitle, tabUrl, isNewTab, targetUrl, timeout); err != nil {
  405. l.Push(lua.LString(err.Error()))
  406. } else {
  407. l.Push(lua.LString("ok"))
  408. }
  409. return 1
  410. }))
  411. //执行浏览器端js
  412. s.SetGlobal("browser_executejs", s.NewFunction(func(l *lua.LState) int {
  413. fmt.Println("---browser_executejs---")
  414. tabTitle := l.ToString(-5)
  415. tabUrl := l.ToString(-4)
  416. timeout := l.ToInt64(-3)
  417. returnType := l.ToInt(-2) //返回数据类型
  418. script := l.ToString(-1) //执行的js
  419. switch returnType {
  420. case execute_return_type_string: //返回string
  421. var ret string
  422. if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
  423. l.Push(lua.LString("ok"))
  424. l.Push(lua.LString(ret))
  425. } else {
  426. l.Push(lua.LString("err"))
  427. l.Push(lua.LString(err.Error()))
  428. }
  429. case execute_return_type_list: //返回list
  430. var ret = make([]interface{}, 0, 0)
  431. var tmp = make(map[string]interface{})
  432. if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
  433. for i, v := range ret {
  434. tmp[strconv.Itoa(i)] = v
  435. }
  436. l.Push(lua.LString("ok"))
  437. l.Push(MapToTable(tmp))
  438. } else {
  439. l.Push(lua.LString("err"))
  440. l.Push(lua.LString(err.Error()))
  441. }
  442. case execute_return_type_table: //返回table
  443. var ret = make(map[string]interface{})
  444. if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
  445. l.Push(lua.LString("ok"))
  446. l.Push(MapToTable(ret))
  447. } else {
  448. l.Push(lua.LString("err"))
  449. l.Push(lua.LString(err.Error()))
  450. }
  451. }
  452. return 2
  453. }))
  454. //按键
  455. s.SetGlobal("browser_keysend", s.NewFunction(func(l *lua.LState) int {
  456. fmt.Println("---browser_keysend---")
  457. tabTitle := l.ToString(-6)
  458. tabUrl := l.ToString(-5)
  459. timeout := l.ToInt64(-4)
  460. words := l.ToString(-3)
  461. selectorType := l.ToInt(-2)
  462. selector := l.ToString(-1)
  463. fmt.Println(selector, words, selectorType, timeout)
  464. err := b.KeySend(tabTitle, tabUrl, selector, words, selectorType, timeout)
  465. if err != nil {
  466. l.Push(lua.LString(err.Error()))
  467. } else {
  468. l.Push(lua.LString("ok"))
  469. }
  470. return 1
  471. }))
  472. //点击
  473. s.SetGlobal("browser_click", s.NewFunction(func(l *lua.LState) int {
  474. fmt.Println("---browser_click---")
  475. tabTitle := l.ToString(-5)
  476. tabUrl := l.ToString(-4)
  477. timeout := l.ToInt64(-3)
  478. selectorType := l.ToInt(-2)
  479. selector := l.ToString(-1)
  480. err := b.Click(tabTitle, tabUrl, selector, selectorType, timeout)
  481. if err != nil {
  482. l.Push(lua.LString(err.Error()))
  483. } else {
  484. l.Push(lua.LString("ok"))
  485. }
  486. return 1
  487. }))
  488. s.SetGlobal("browser_waitvisible", s.NewFunction(func(l *lua.LState) int {
  489. fmt.Println("---browser_waitvisible---")
  490. tabTitle := l.ToString(-5)
  491. tabUrl := l.ToString(-4)
  492. timeout := l.ToInt64(-3)
  493. selectorType := l.ToInt(-2) //选择器类型
  494. selector := l.ToString(-1) //选择器
  495. err := b.WaitVisible(tabTitle, tabUrl, selector, selectorType, timeout)
  496. if err != nil {
  497. l.Push(lua.LString(err.Error()))
  498. } else {
  499. l.Push(lua.LString("ok"))
  500. }
  501. return 1
  502. }))
  503. //点击
  504. s.SetGlobal("browser_downloadfile", s.NewFunction(func(l *lua.LState) int {
  505. tabTitle := l.ToString(-6)
  506. tabUrl := l.ToString(-5)
  507. timeout := l.ToInt64(-4)
  508. selectorType := l.ToInt(-3)
  509. selector := l.ToString(-2)
  510. save2dir := l.ToString(-1)
  511. err := b.DownloadFile(tabTitle, tabUrl, timeout, selector, selectorType, save2dir)
  512. if err != nil {
  513. l.Push(lua.LString(err.Error()))
  514. } else {
  515. l.Push(lua.LString("ok"))
  516. }
  517. return 1
  518. }))
  519. //注册打开地址
  520. s.SetGlobal("browser_navagite_download_res", s.NewFunction(func(l *lua.LState) int {
  521. tabTitle := l.ToString(-7)
  522. tabUrl := l.ToString(-6)
  523. timeout := l.ToInt64(-5)
  524. isNewTab := l.ToBool(-4)
  525. targetUrl := l.ToString(-3)
  526. saveFileTypeList := l.ToString(-2)
  527. savedir := l.ToString(-1)
  528. if err := b.NavigateAndSaveRes(tabTitle, tabUrl, timeout, isNewTab, targetUrl, saveFileTypeList, savedir); err != nil {
  529. l.Push(lua.LString(err.Error()))
  530. } else {
  531. l.Push(lua.LString("ok"))
  532. }
  533. return 1
  534. }))
  535. //保存
  536. s.SetGlobal("browser_savedata", s.NewFunction(func(l *lua.LState) int {
  537. //fmt.Println("---browser_upsertdata---")
  538. //param := l.ToTable(-1)
  539. //upset := TableToMap(param)
  540. return 1
  541. }))
  542. }