script.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603
  1. package script
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "github.com/yuin/gopher-lua/parse"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "net/url"
  9. "os"
  10. "path/filepath"
  11. "spider_creator/backend"
  12. "strconv"
  13. "strings"
  14. "time"
  15. "github.com/chromedp/cdproto/browser"
  16. "github.com/chromedp/cdproto/network"
  17. "github.com/chromedp/cdproto/page"
  18. "github.com/chromedp/chromedp"
  19. "github.com/yuin/gopher-lua"
  20. be "spider_creator/backend"
  21. )
  // Selector kinds accepted by the element helpers (Click, KeySend,
  // WaitVisible, DownloadFile). Each value picks the chromedp query option
  // used to resolve the selector string; any value that is not listed in a
  // helper's switch is treated like selector_type_query_all.
  const (
  	selector_type_id        = iota // resolved with chromedp.ByID
  	selector_type_query            // resolved with chromedp.ByQuery
  	selector_type_search           // resolved with chromedp.BySearch
  	selector_type_jspath           // resolved with chromedp.ByJSPath
  	selector_type_query_all        // resolved with chromedp.ByQueryAll
  )

  // Result shapes that the browser_executejs Lua binding can decode a
  // JavaScript evaluation into.
  const (
  	execute_return_type_string = iota // decoded into a Go string
  	execute_return_type_list          // decoded into a []interface{}
  	execute_return_type_table         // decoded into a map[string]interface{}
  )

  // Lua script locations; LoadScript appends them to GLVm.AttachesDir.
  const (
  	qlm_list_lua   = "/script/qlm_list.lua"
  	qlm_detail_lua = "/script/qlm_detail.lua"
  )
  // DataCache is the buffered queue (capacity 2000) that the browser_savedata
  // Lua binding pushes converted rows onto.
  var DataCache = make(chan map[string]interface{}, 2000)

  // Datas holds the rows that the browser_getdata Lua binding hands back to
  // scripts. Nothing in this part of the file writes to it.
  var Datas []map[string]interface{}
  38. type GLVm struct {
  39. AttachesDir string
  40. Dnf backend.EventNotifyFace
  41. Headless bool
  42. ShowImage bool
  43. ProxyServer bool
  44. ProxyAddr string
  45. B *GLBrowser
  46. ScriptRunning bool //控制一次只能执行一个脚本
  47. DataSaveOver chan bool
  48. }
  // GLBrowser is a handle on one browser session: the chromedp context that
  // every browser operation is derived from, plus the function that tears
  // that context down.
  type GLBrowser struct {
  	// Ctx is the parent context for all tab lookups and actions.
  	Ctx context.Context
  	// CancelFn releases Ctx; it is set to nil once the session is torn down.
  	CancelFn context.CancelFunc
  }
  53. func NewGLVM(attachesDir string, dnf be.EventNotifyFace) *GLVm {
  54. return &GLVm{
  55. AttachesDir: attachesDir,
  56. Dnf: dnf,
  57. DataSaveOver: make(chan bool, 1),
  58. }
  59. }
  60. // LoadScript 加载脚本
  61. func (glvm *GLVm) LoadScript(page string) string {
  62. var path string
  63. if page == "list" {
  64. path = glvm.AttachesDir + qlm_list_lua
  65. } else if page == "detail" {
  66. path = glvm.AttachesDir + qlm_detail_lua
  67. }
  68. bs, err := os.ReadFile(path)
  69. if err != nil {
  70. qu.Debug(path, "脚本加载失败...")
  71. }
  72. return string(bs)
  73. }
  74. // RunScript 执行lua代码
  75. func (glvm *GLVm) RunScript(script, recordId string) error {
  76. defer Catch()
  77. var s *lua.LState = lua.NewState()
  78. defer s.Close()
  79. //方法绑定
  80. glvm.ResetBrowser() //先创建浏览器对象
  81. glvm.BindLuaState(s) //绑定虚拟机函数
  82. glvm.B.BindLuaState(s, recordId)
  83. defer func() {
  84. if b := glvm.B; b != nil {
  85. b.CancelFn()
  86. b.Ctx = nil
  87. b.CancelFn = nil
  88. }
  89. }()
  90. reader := strings.NewReader(script)
  91. chunk, err := parse.Parse(reader, "code")
  92. if err != nil {
  93. return err
  94. }
  95. proto, err := lua.Compile(chunk, script)
  96. if err != nil {
  97. return err
  98. }
  99. lfunc := s.NewFunctionFromProto(proto)
  100. s.Push(lfunc)
  101. s.Call(0, 0)
  102. return nil
  103. }
  104. // ResetBrowser 重置浏览器
  105. func (glvm *GLVm) ResetBrowser() {
  106. if glvm.B != nil && glvm.B.CancelFn != nil {
  107. glvm.B.CancelFn()
  108. glvm.B.Ctx = nil
  109. glvm.B.CancelFn = nil
  110. }
  111. _, _, _, _, ctx, incCancelFn := backend.NewBrowser(glvm.Headless, glvm.ShowImage, glvm.ProxyServer, "http://")
  112. b := &GLBrowser{
  113. Ctx: ctx,
  114. CancelFn: incCancelFn,
  115. }
  116. if glvm.B == nil {
  117. glvm.B = b
  118. } else {
  119. glvm.B.Ctx, glvm.B.CancelFn = b.Ctx, b.CancelFn
  120. }
  121. }
  122. // BindLuaState 绑定虚拟机函数
  123. func (glvm *GLVm) BindLuaState(state *lua.LState) {
  124. state.SetGlobal("browser_reset", state.NewFunction(func(l *lua.LState) int {
  125. glvm.ResetBrowser()
  126. return 0
  127. }))
  128. //
  129. state.SetGlobal("browser_save", state.NewFunction(func(l *lua.LState) int {
  130. //spiderCode := l.ToString(-5)
  131. //siteName := l.ToString(-4)
  132. //siteChannelName := l.ToString(-3)
  133. //siteChannelUrl := l.ToString(-2)
  134. /*table := l.ToTable(-1)
  135. data := TableToMap(table)*/
  136. //vm.S.Save(spiderCode, siteName, siteChannelName, siteChannelUrl, data)
  137. return 0
  138. }))
  139. }
  140. // findTab 根据标题、url找tab
  141. func (b *GLBrowser) findTabContext(tabTitle, tabUrl string, timeoutInt64 int64) (ctx context.Context, err error) {
  142. if timeoutInt64 == 0 {
  143. timeoutInt64 = 5000
  144. }
  145. timeout := time.Duration(timeoutInt64) * time.Millisecond
  146. if tabTitle == "" && tabUrl == "" {
  147. ctx, _ = context.WithTimeout(b.Ctx, timeout)
  148. return ctx, nil
  149. } else {
  150. ts, err := chromedp.Targets(b.Ctx)
  151. if err != nil {
  152. return nil, err
  153. }
  154. for _, t := range ts {
  155. if (tabTitle != "" && strings.Contains(t.Title, tabTitle)) || (tabUrl != "" && strings.Contains(t.URL, tabUrl)) {
  156. // log.Printf("find tab param<title,url>: %s %s found %s %s", tabTitle, tabUrl,
  157. // t.Title, t.URL)
  158. newCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
  159. ctx, _ = context.WithTimeout(newCtx, timeout)
  160. return ctx, nil
  161. }
  162. }
  163. }
  164. return nil, errors.New("can't find tab")
  165. }
  166. // CloseTabs 关闭页面
  167. func (b *GLBrowser) CloseTabs(tabTitle, tabUrl string, timeoutInt64 int64) (err error) {
  168. if timeoutInt64 == 0 {
  169. timeoutInt64 = 5
  170. }
  171. timeout := time.Duration(timeoutInt64) * time.Millisecond
  172. ts, err := chromedp.Targets(b.Ctx)
  173. if err != nil {
  174. return err
  175. }
  176. for _, t := range ts {
  177. if (tabTitle != "" && strings.Contains(t.Title, tabTitle)) || (tabUrl != "" && strings.Contains(t.URL, tabUrl)) {
  178. newCtx, _ := chromedp.NewContext(b.Ctx, chromedp.WithTargetID(t.TargetID))
  179. ctx, _ := context.WithTimeout(newCtx, timeout)
  180. chromedp.Run(
  181. ctx,
  182. page.Close(),
  183. )
  184. }
  185. }
  186. return nil
  187. }
  188. // Navigate 导航到指定网址
  189. func (b *GLBrowser) Navigate(tabTitle string, tabUrl string, isNewTab bool, targetUrl string, timeout int64) (err error) {
  190. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  191. if err != nil {
  192. return err
  193. }
  194. //新标签页
  195. if isNewTab {
  196. ctx, _ = chromedp.NewContext(ctx)
  197. }
  198. //
  199. return chromedp.Run(ctx,
  200. chromedp.Navigate(targetUrl))
  201. }
  202. // Navigate 导航到指定网址,并保存请求资源,如图片等
  203. func (b *GLBrowser) NavigateAndSaveRes(tabTitle string, tabUrl string, timeout int64, isNewTab bool, targetUrl string, saveFileTypeList, save2dir string) (err error) {
  204. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  205. if err != nil {
  206. return err
  207. }
  208. //新标签页
  209. if isNewTab {
  210. ctx, _ = chromedp.NewContext(ctx)
  211. }
  212. //
  213. saveFileType := strings.Split(saveFileTypeList, " ")
  214. isNeedRes := func(fileType string) bool {
  215. for _, v := range saveFileType {
  216. if strings.Contains(fileType, v) {
  217. return true
  218. }
  219. }
  220. return false
  221. }
  222. fnURL2FileName := func(requestURL string) string {
  223. u, err := url.Parse(requestURL)
  224. if err != nil {
  225. return ""
  226. }
  227. _, filename := filepath.Split(u.Path)
  228. return filename
  229. }
  230. var cache = map[network.RequestID]string{}
  231. chromedp.ListenTarget(ctx, func(v interface{}) {
  232. switch ev := v.(type) {
  233. case *network.EventRequestWillBeSent: //准备下载
  234. cache[ev.RequestID] = ev.Request.URL
  235. case *network.EventResponseReceived: //检查回应头的contenttype
  236. contentType, _ := ev.Response.Headers["Content-Type"].(string)
  237. fmt.Println(contentType)
  238. if !isNeedRes(contentType) {
  239. delete(cache, ev.RequestID)
  240. }
  241. case *network.EventLoadingFinished: //下载完成
  242. if uri, ok := cache[ev.RequestID]; ok {
  243. filename := fnURL2FileName(uri)
  244. fmt.Println("save2file", filename)
  245. if filename != "" {
  246. filePath := filepath.Join(save2dir, filename)
  247. var buf []byte
  248. if err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
  249. var err error
  250. buf, err = network.GetResponseBody(ev.RequestID).Do(ctx)
  251. return err
  252. })); err == nil {
  253. os.WriteFile(filePath, buf, 0777)
  254. } else {
  255. fmt.Println(err.Error())
  256. }
  257. }
  258. }
  259. }
  260. })
  261. //
  262. err = chromedp.Run(ctx,
  263. chromedp.Navigate(targetUrl))
  264. //下载存储
  265. return err
  266. }
  267. // ExecuteJS 执行脚本
  268. func (b *GLBrowser) ExecuteJS(tabTitle, tabUrl, script string, ret interface{}, timeout int64) (err error) {
  269. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  270. if err != nil {
  271. return err
  272. }
  273. return chromedp.Run(ctx,
  274. chromedp.Evaluate(script, ret))
  275. }
  276. // Click 点击
  277. func (b *GLBrowser) Click(tabTitle, tabUrl, selector string, selectorType int, timeout int64) (err error) {
  278. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  279. if err != nil {
  280. return err
  281. }
  282. var act chromedp.QueryAction
  283. switch selectorType {
  284. case selector_type_id:
  285. act = chromedp.Click(selector, chromedp.ByID)
  286. case selector_type_query:
  287. act = chromedp.Click(selector, chromedp.ByQuery)
  288. case selector_type_search:
  289. act = chromedp.Click(selector, chromedp.BySearch)
  290. case selector_type_jspath:
  291. act = chromedp.Click(selector, chromedp.ByJSPath)
  292. default:
  293. act = chromedp.Click(selector, chromedp.ByQueryAll)
  294. }
  295. err = chromedp.Run(ctx,
  296. act)
  297. return err
  298. }
  299. // KeySend 键盘输入
  300. func (b *GLBrowser) KeySend(tabTitle, tabUrl, selector, sendStr string, selectorType int, timeout int64) (err error) {
  301. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  302. if err != nil {
  303. return err
  304. }
  305. var act chromedp.QueryAction
  306. switch selectorType {
  307. case selector_type_id:
  308. act = chromedp.SendKeys(selector, sendStr, chromedp.ByID)
  309. case selector_type_query:
  310. act = chromedp.SendKeys(selector, sendStr, chromedp.ByQuery)
  311. case selector_type_search:
  312. act = chromedp.SendKeys(selector, sendStr, chromedp.BySearch)
  313. case selector_type_jspath:
  314. act = chromedp.SendKeys(selector, sendStr, chromedp.ByJSPath)
  315. default:
  316. act = chromedp.SendKeys(selector, sendStr, chromedp.ByQueryAll)
  317. }
  318. return chromedp.Run(ctx,
  319. act)
  320. }
  321. // WaitVisible 等待元素可见
  322. func (b *GLBrowser) WaitVisible(tabTitle, tabUrl, selector string, selectorType int, timeout int64) error {
  323. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  324. if err != nil {
  325. return err
  326. }
  327. var act chromedp.QueryAction
  328. switch selectorType {
  329. case selector_type_id:
  330. act = chromedp.WaitVisible(selector, chromedp.ByID)
  331. case selector_type_query:
  332. act = chromedp.WaitVisible(selector, chromedp.ByQuery)
  333. case selector_type_search:
  334. act = chromedp.WaitVisible(selector, chromedp.BySearch)
  335. case selector_type_jspath:
  336. act = chromedp.WaitVisible(selector, chromedp.ByJSPath)
  337. default:
  338. act = chromedp.WaitVisible(selector, chromedp.ByQueryAll)
  339. }
  340. return chromedp.Run(ctx,
  341. act)
  342. }
  343. // 重置浏览器
  344. func (b *GLBrowser) Reset() {
  345. }
  346. // DownloadFile 只有在非headless模式下有效,与click方法其实是一致的
  347. func (b *GLBrowser) DownloadFile(tabTitle, tabUrl string, timeout int64, selector string, selectorType int, save2dir string) error {
  348. ctx, err := b.findTabContext(tabTitle, tabUrl, timeout)
  349. if err != nil {
  350. return err
  351. }
  352. var act chromedp.QueryAction
  353. switch selectorType {
  354. case selector_type_id:
  355. act = chromedp.Click(selector, chromedp.ByID)
  356. case selector_type_query:
  357. act = chromedp.Click(selector, chromedp.ByQuery)
  358. case selector_type_search:
  359. act = chromedp.Click(selector, chromedp.BySearch)
  360. case selector_type_jspath:
  361. act = chromedp.Click(selector, chromedp.ByJSPath)
  362. default:
  363. act = chromedp.Click(selector, chromedp.ByQueryAll)
  364. }
  365. return chromedp.Run(ctx,
  366. browser.SetDownloadBehavior(browser.SetDownloadBehaviorBehaviorAllowAndName).WithDownloadPath(save2dir).WithEventsEnabled(true),
  367. act)
  368. }
  369. // BindLuaState
  370. func (b *GLBrowser) BindLuaState(s *lua.LState, recordId string) {
  371. //执行暂停
  372. s.SetGlobal("browser_sleep", s.NewFunction(func(l *lua.LState) int {
  373. fmt.Println("---browser_sleep---")
  374. timeout := l.ToInt64(-1)
  375. if timeout == 0 {
  376. timeout = 5
  377. }
  378. time.Sleep(time.Duration(timeout) * time.Millisecond)
  379. return 0
  380. }))
  381. //关闭tabl页
  382. s.SetGlobal("browser_closetabs", s.NewFunction(func(l *lua.LState) int {
  383. fmt.Println("---browser_closetabs---")
  384. timeout := l.ToInt64(-3)
  385. tabTitle := l.ToString(-2)
  386. tabUrl := l.ToString(-1)
  387. if timeout == 0 {
  388. timeout = 5
  389. }
  390. b.CloseTabs(tabTitle, tabUrl, timeout)
  391. return 0
  392. }))
  393. //注册打开地址
  394. s.SetGlobal("browser_navagite", s.NewFunction(func(l *lua.LState) int {
  395. fmt.Println("---browser_navagite---")
  396. tabTitle := l.ToString(-5) //指定标签页title
  397. tabUrl := l.ToString(-4) //指定标签页url
  398. isNewTab := l.ToBool(-3) //是否打开新的标签页
  399. timeout := l.ToInt64(-2) //网页打开的超时时间
  400. targetUrl := l.ToString(-1) //打开网页的链接
  401. if err := b.Navigate(tabTitle, tabUrl, isNewTab, targetUrl, timeout); err != nil {
  402. l.Push(lua.LString(err.Error()))
  403. } else {
  404. l.Push(lua.LString("ok"))
  405. }
  406. return 1
  407. }))
  408. //执行浏览器端js
  409. s.SetGlobal("browser_executejs", s.NewFunction(func(l *lua.LState) int {
  410. fmt.Println("---browser_executejs---")
  411. tabTitle := l.ToString(-5)
  412. tabUrl := l.ToString(-4)
  413. timeout := l.ToInt64(-3)
  414. returnType := l.ToInt(-2) //返回数据类型
  415. script := l.ToString(-1) //执行的js
  416. switch returnType {
  417. case execute_return_type_string: //返回string
  418. var ret string
  419. if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
  420. l.Push(lua.LString("ok"))
  421. l.Push(lua.LString(ret))
  422. } else {
  423. l.Push(lua.LString("err"))
  424. l.Push(lua.LString(err.Error()))
  425. }
  426. case execute_return_type_list: //返回list
  427. var ret = make([]interface{}, 0, 0)
  428. var tmp = make(map[string]interface{})
  429. if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
  430. for i, v := range ret {
  431. tmp[strconv.Itoa(i)] = v
  432. }
  433. l.Push(lua.LString("ok"))
  434. l.Push(MapToTable(tmp))
  435. } else {
  436. l.Push(lua.LString("err"))
  437. l.Push(lua.LString(err.Error()))
  438. }
  439. case execute_return_type_table: //返回table
  440. var ret = make(map[string]interface{})
  441. if err := b.ExecuteJS(tabTitle, tabUrl, script, &ret, timeout); err == nil {
  442. l.Push(lua.LString("ok"))
  443. l.Push(MapToTable(ret))
  444. } else {
  445. l.Push(lua.LString("err"))
  446. l.Push(lua.LString(err.Error()))
  447. }
  448. }
  449. return 2
  450. }))
  451. //按键
  452. s.SetGlobal("browser_keysend", s.NewFunction(func(l *lua.LState) int {
  453. fmt.Println("---browser_keysend---")
  454. tabTitle := l.ToString(-6)
  455. tabUrl := l.ToString(-5)
  456. timeout := l.ToInt64(-4)
  457. words := l.ToString(-3)
  458. selectorType := l.ToInt(-2)
  459. selector := l.ToString(-1)
  460. fmt.Println(selector, words, selectorType, timeout)
  461. err := b.KeySend(tabTitle, tabUrl, selector, words, selectorType, timeout)
  462. if err != nil {
  463. l.Push(lua.LString(err.Error()))
  464. } else {
  465. l.Push(lua.LString("ok"))
  466. }
  467. return 1
  468. }))
  469. //点击
  470. s.SetGlobal("browser_click", s.NewFunction(func(l *lua.LState) int {
  471. fmt.Println("---browser_click---")
  472. tabTitle := l.ToString(-5)
  473. tabUrl := l.ToString(-4)
  474. timeout := l.ToInt64(-3)
  475. selectorType := l.ToInt(-2)
  476. selector := l.ToString(-1)
  477. err := b.Click(tabTitle, tabUrl, selector, selectorType, timeout)
  478. if err != nil {
  479. l.Push(lua.LString(err.Error()))
  480. } else {
  481. l.Push(lua.LString("ok"))
  482. }
  483. return 1
  484. }))
  485. s.SetGlobal("browser_waitvisible", s.NewFunction(func(l *lua.LState) int {
  486. fmt.Println("---browser_waitvisible---")
  487. tabTitle := l.ToString(-5)
  488. tabUrl := l.ToString(-4)
  489. timeout := l.ToInt64(-3)
  490. selectorType := l.ToInt(-2) //选择器类型
  491. selector := l.ToString(-1) //选择器
  492. err := b.WaitVisible(tabTitle, tabUrl, selector, selectorType, timeout)
  493. if err != nil {
  494. l.Push(lua.LString(err.Error()))
  495. } else {
  496. l.Push(lua.LString("ok"))
  497. }
  498. return 1
  499. }))
  500. //点击
  501. s.SetGlobal("browser_downloadfile", s.NewFunction(func(l *lua.LState) int {
  502. tabTitle := l.ToString(-6)
  503. tabUrl := l.ToString(-5)
  504. timeout := l.ToInt64(-4)
  505. selectorType := l.ToInt(-3)
  506. selector := l.ToString(-2)
  507. save2dir := l.ToString(-1)
  508. err := b.DownloadFile(tabTitle, tabUrl, timeout, selector, selectorType, save2dir)
  509. if err != nil {
  510. l.Push(lua.LString(err.Error()))
  511. } else {
  512. l.Push(lua.LString("ok"))
  513. }
  514. return 1
  515. }))
  516. //注册打开地址
  517. s.SetGlobal("browser_navagite_download_res", s.NewFunction(func(l *lua.LState) int {
  518. tabTitle := l.ToString(-7)
  519. tabUrl := l.ToString(-6)
  520. timeout := l.ToInt64(-5)
  521. isNewTab := l.ToBool(-4)
  522. targetUrl := l.ToString(-3)
  523. saveFileTypeList := l.ToString(-2)
  524. savedir := l.ToString(-1)
  525. if err := b.NavigateAndSaveRes(tabTitle, tabUrl, timeout, isNewTab, targetUrl, saveFileTypeList, savedir); err != nil {
  526. l.Push(lua.LString(err.Error()))
  527. } else {
  528. l.Push(lua.LString("ok"))
  529. }
  530. return 1
  531. }))
  532. //保存数据
  533. s.SetGlobal("browser_savedata", s.NewFunction(func(l *lua.LState) int {
  534. fmt.Println("---browser_upsertdata---")
  535. page := l.ToString(-2)
  536. data := l.ToTable(-1)
  537. result := TableToMap(data)
  538. if page == "list" {
  539. result["recordid"] = recordId
  540. }
  541. DataCache <- result
  542. return 1
  543. }))
  544. //获取数据
  545. s.SetGlobal("browser_getdata", s.NewFunction(func(l *lua.LState) int {
  546. fmt.Println("---browser_getdata---")
  547. num := l.ToInt(-1) //获取多少条数据
  548. count := len(Datas)
  549. if count == 0 {
  550. l.Push(lua.LString("err"))
  551. l.Push(lua.LString("当前可下载量为0"))
  552. } else {
  553. resultTable := &lua.LTable{}
  554. for i := 0; i < num && i < count; i++ {
  555. resultTable.Append(MapToTable(Datas[i]))
  556. }
  557. l.Push(lua.LString("ok"))
  558. l.Push(resultTable)
  559. }
  560. return 2
  561. }))
  562. }