check.go 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. package vm
import (
	"container/list"
	"fmt"
	"log"
	"time"

	be "spider_creator/backend"

	"github.com/chromedp/chromedp"
)
  9. // VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
  10. func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
  11. verifyResult := list.New()
  12. ret := &be.SpiderConfigVerifyResult{true, true, true, true, true, true, true}
  13. _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
  14. _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, "") //详情页使用
  15. defer func() {
  16. incCancelFn2()
  17. baseCancelFn2()
  18. incCancelFn()
  19. baseCancelFn()
  20. }()
  21. listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
  22. //TODO 2. 执行JS代码,获取列表页信息
  23. if listRunJs == "" {
  24. listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
  25. }
  26. if contentRunJs == "" {
  27. contentRunJs = renderJavascriptCoder(loadContentJS, sc)
  28. }
  29. //TODO 3.打开列表,获取条目清单
  30. chromedp.Run(ctx, chromedp.Tasks{
  31. chromedp.Navigate(sc.Href),
  32. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  33. //chromedp.Sleep(1000 * time.Millisecond),
  34. chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
  35. })
  36. no := 1
  37. T:
  38. for j := 0; j < 2; j++ { //最多检查2页
  39. listResult := make(be.ResultItems, 0)
  40. err := chromedp.Run(ctx, chromedp.Tasks{
  41. chromedp.Evaluate(listRunJs, &listResult),
  42. })
  43. if err != nil {
  44. log.Println("执行JS代码失败", err.Error())
  45. continue
  46. }
  47. //TODO 5.操作详情页
  48. for contentIndex, r := range listResult {
  49. if contentIndex > 1 { //每页只采集2条
  50. break
  51. }
  52. //打开详情页
  53. err = chromedp.Run(ctx2, chromedp.Tasks{
  54. chromedp.Navigate(r.Href),
  55. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  56. //chromedp.Sleep(1000 * time.Millisecond),
  57. chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
  58. })
  59. if err != nil {
  60. continue
  61. }
  62. //获取详情页内容
  63. err = chromedp.Run(ctx2, chromedp.Tasks{
  64. chromedp.Evaluate(contentRunJs, r),
  65. })
  66. if err != nil {
  67. continue
  68. }
  69. if sc.AttachCss != "" {
  70. downloadAttaches(r, vm.attachesDir)
  71. }
  72. r.Site = sc.Site
  73. r.Channel = sc.Channel
  74. if r.Title == "" {
  75. r.Title = r.ListTitle
  76. }
  77. if r.PublishTime == "" {
  78. r.PublishTime = r.ListPubTime
  79. }
  80. r.No = no
  81. no += 1
  82. //结果放入缓存
  83. verifyResult.PushBack(r)
  84. }
  85. //TODO 6.翻页
  86. //if err = trunPage(sc, 2000, ctx); err != nil {
  87. if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil {
  88. ret.ListTrunPage = false
  89. break T
  90. }
  91. }
  92. //检查
  93. for el := verifyResult.Front(); el != nil; el = el.Next() {
  94. r, _ := el.Value.(*be.ResultItem)
  95. if ret.Title {
  96. ret.Title = r.Title != ""
  97. }
  98. if ret.PublishUnit {
  99. ret.PublishUnit = r.PublishUnit != ""
  100. }
  101. if ret.PublishTime {
  102. ret.PublishTime = r.PublishTime != ""
  103. }
  104. if ret.Content {
  105. ret.Content = r.Content != ""
  106. }
  107. if ret.Attaches {
  108. ret.Attaches = len(r.AttachLinks) > 0
  109. }
  110. }
  111. if ret.ListItems {
  112. ret.ListItems = verifyResult.Len() > 2
  113. }
  114. //TODO:每次验证结果存库、内存?
  115. return ret, nil
  116. }