worker.go 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. package vm
  2. import (
  3. "container/list"
  4. "fmt"
  5. "log"
  6. be "spidercreator/backend"
  7. "sync"
  8. "time"
  9. "github.com/chromedp/chromedp"
  10. )
  11. // 销毁
  12. func (w *Worker) Destory() {
  13. if w.incCancel != nil {
  14. w.incCancel()
  15. }
  16. if w.baseCancel != nil {
  17. w.baseCancel()
  18. }
  19. }
  20. // NewWorker
  21. func NewWorker(headless bool, showImage bool, proxyServe string, contentDelay int64, js string, vm *VM) *Worker {
  22. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
  23. return &Worker{baseCancel: baseCancel,
  24. incCancel: cancel,
  25. ctx: ctx,
  26. js: js,
  27. contentDelay: contentDelay,
  28. vm: vm,
  29. }
  30. }
  31. // 执行作业
  32. func (w *Worker) Run(v *be.ResultItem, ch chan *Worker, wg *sync.WaitGroup) {
  33. defer func() {
  34. ch <- w
  35. wg.Done()
  36. }()
  37. w.vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 待 下载详情页 %s ", v.No, v.ListTitle))
  38. var result string = ""
  39. err := chromedp.Run(w.ctx, chromedp.Tasks{
  40. chromedp.Navigate(v.Href),
  41. chromedp.WaitReady(`document.body`, chromedp.ByJSPath),
  42. chromedp.Sleep(time.Duration(w.contentDelay) * time.Millisecond),
  43. chromedp.Evaluate(w.js, v),
  44. })
  45. if err != nil {
  46. log.Println("执行JS代码失败_详情", err.Error())
  47. }
  48. if len(v.AttachLinks) > 0 { //有附件
  49. w.vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. 下载附件"))
  50. //TODO 下载附件
  51. downloadAttaches(v, w.vm.attachesDir)
  52. }
  53. //关闭当前TAB页
  54. chromedp.Run(w.ctx, chromedp.Tasks{
  55. chromedp.Evaluate(`var ret="";window.close();ret`, &result),
  56. })
  57. w.vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 下载详情页 %s 完成", v.No, v.Title))
  58. }
  59. // RunSpiderMulThreads
  60. func (vm *VM) RunSpiderMulThreads(url string, maxPages int, listDealy int64,
  61. trunPageDelay int64, contentDelay int64,
  62. headless bool, showImage bool, proxyServe string, threads int,
  63. exit chan bool,
  64. currentSpiderConfig *be.SpiderConfig, currentResult *list.List) {
  65. sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Url: url})
  66. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
  67. log.Println("1浏览器打开")
  68. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  69. defer func() {
  70. cancel()
  71. baseCancel()
  72. log.Println("0浏览器已经销毁")
  73. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  74. close(exit)
  75. }()
  76. var runListJs, runContentJs string = sc.ListJSCode, sc.ContentJSCode
  77. if runListJs == "" {
  78. runListJs = renderJavascriptCoder(loadListItemsJS, sc)
  79. }
  80. if runContentJs == "" {
  81. runContentJs = renderJavascriptCoder(loadContentJS, sc)
  82. }
  83. log.Println("获取列表JS代码", runListJs)
  84. ws := make([]*Worker, threads)
  85. ch := make(chan *Worker, threads)
  86. wg := new(sync.WaitGroup)
  87. for i := 0; i < threads; i++ {
  88. w := NewWorker(headless, showImage, proxyServe, contentDelay, runContentJs, vm)
  89. ws = append(ws, w)
  90. ch <- w
  91. }
  92. //批量销毁
  93. defer func() {
  94. for _, w := range ws {
  95. if w != nil {
  96. w.Destory()
  97. }
  98. }
  99. }()
  100. no := 1
  101. //TODO 1.翻页操作,需要在外层打开列表页
  102. chromedp.Run(ctx, chromedp.Tasks{
  103. chromedp.Navigate(sc.Url),
  104. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  105. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
  106. })
  107. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  108. log.Println("2页面打开")
  109. for i := 0; i < maxPages; i++ {
  110. listResult := make(be.ResultItems, 0)
  111. //TODO 2. 执行JS代码,获取列表页信息
  112. err := chromedp.Run(ctx, chromedp.Tasks{
  113. chromedp.Evaluate(runListJs, &listResult),
  114. })
  115. if err != nil {
  116. log.Println("执行JS代码失败_列表", err.Error())
  117. vm.dnf.Dispatch("debug_event", "2 列表-执行JS代码失败")
  118. return
  119. }
  120. vm.dnf.Dispatch("debug_event", "3 获取列表完成")
  121. log.Println("3获取列表完成")
  122. //TODO 3. 打开详情页 ,支持多线程
  123. for _, v := range listResult {
  124. select {
  125. case <-exit:
  126. return
  127. default:
  128. w := <-ch
  129. wg.Add(1)
  130. no += 1
  131. v.No = no
  132. currentResult.PushBack(v)
  133. go w.Run(v, ch, wg)
  134. }
  135. }
  136. wg.Wait()
  137. vm.dnf.Dispatch("debug_event", "4 当前页采集完成,准备执行翻页逻辑//"+currentSpiderConfig.ListNextPageCss)
  138. if err = trunPage(currentSpiderConfig, trunPageDelay, ctx); err != nil {
  139. log.Println("翻页失败", err.Error())
  140. vm.dnf.Dispatch("debug_event", "6 翻页失败: "+err.Error())
  141. time.Sleep(3 * time.Second)
  142. break
  143. }
  144. }
  145. vm.dnf.Dispatch("debug_event", "6 采集测试完成")
  146. log.Println("6 采集测试完成")
  147. }