check.go 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. package vm
  import (
  	"container/list"
  	"fmt"
  	"log"
  	"time"

  	be "spidercreator/backend"

  	"github.com/chromedp/chromedp"
  )
  9. // VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
  10. func (vm *VM) VerifySpiderConfig(sf *be.SpiderConfig, verifyResult *list.List) (*be.SpiderConfigVerifyResult, error) {
  11. ret := &be.SpiderConfigVerifyResult{true, true, true, true, true, true, true}
  12. _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
  13. _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, "") //详情页使用
  14. defer func() {
  15. incCancelFn2()
  16. baseCancelFn2()
  17. incCancelFn()
  18. baseCancelFn()
  19. }()
  20. listRunJs, contentRunJs := sf.ListJSCode, sf.ContentJSCode
  21. //TODO 2. 执行JS代码,获取列表页信息
  22. if listRunJs == "" {
  23. listRunJs = renderJavascriptCoder(loadListItemsJS, sf)
  24. }
  25. if contentRunJs == "" {
  26. contentRunJs = renderJavascriptCoder(loadContentJS, sf)
  27. }
  28. //TODO 3.打开列表,获取条目清单
  29. chromedp.Run(ctx, chromedp.Tasks{
  30. chromedp.Navigate(sf.Url),
  31. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  32. chromedp.Sleep(1000 * time.Millisecond),
  33. })
  34. no := 1
  35. T:
  36. for j := 0; j < 2; j++ { //最多检查2页
  37. listResult := make(be.ResultItems, 0)
  38. err := chromedp.Run(ctx, chromedp.Tasks{
  39. chromedp.Evaluate(listRunJs, &listResult),
  40. })
  41. if err != nil {
  42. log.Println("执行JS代码失败", err.Error())
  43. continue
  44. }
  45. //TODO 5.操作详情页
  46. for contentIndex, r := range listResult {
  47. if contentIndex > 1 { //每页只采集2条
  48. break
  49. }
  50. //打开详情页
  51. err = chromedp.Run(ctx2, chromedp.Tasks{
  52. chromedp.Navigate(r.Href),
  53. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  54. chromedp.Sleep(1000 * time.Millisecond),
  55. })
  56. if err != nil {
  57. continue
  58. }
  59. //获取详情页内容
  60. err = chromedp.Run(ctx2, chromedp.Tasks{
  61. chromedp.Evaluate(contentRunJs, r),
  62. })
  63. if err != nil {
  64. continue
  65. }
  66. if sf.AttachCss != "" {
  67. downloadAttaches(r, vm.attachesDir)
  68. }
  69. r.Site = sf.Site
  70. r.Channel = sf.Channel
  71. if r.Title == "" {
  72. r.Title = r.ListTitle
  73. }
  74. if r.PublishTime == "" {
  75. r.PublishTime = r.ListPubTime
  76. }
  77. r.No = no
  78. no += 1
  79. //结果放入缓存
  80. verifyResult.PushBack(r)
  81. }
  82. //TODO 6.翻页
  83. if err = trunPage(sf, 2000, ctx); err != nil {
  84. ret.ListTrunPage = false
  85. break T
  86. }
  87. }
  88. //检查
  89. for el := verifyResult.Front(); el != nil; el = el.Next() {
  90. r, _ := el.Value.(*be.ResultItem)
  91. if ret.Title {
  92. ret.Title = r.Title != ""
  93. }
  94. if ret.PublishUnit {
  95. ret.PublishUnit = r.PublishUnit != ""
  96. }
  97. if ret.PublishTime {
  98. ret.PublishTime = r.PublishTime != ""
  99. }
  100. if ret.Content {
  101. ret.Content = r.Content != ""
  102. }
  103. if ret.Attaches {
  104. ret.Attaches = len(r.AttachLinks) > 0
  105. }
  106. }
  107. if ret.ListItems {
  108. ret.ListItems = verifyResult.Len() > 2
  109. }
  110. //TODO:每次验证结果存库、内存?
  111. return ret, nil
  112. }