check.go 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. package vm
  2. import (
  3. "container/list"
  4. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  5. be "spider_creator/backend"
  6. "time"
  7. "github.com/chromedp/chromedp"
  8. )
  9. // VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
  10. func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
  11. qu.Debug("sc---", *sc)
  12. verifyResult := list.New()
  13. ret := &be.SpiderConfigVerifyResult{true, true, true, true, true, true, true}
  14. _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
  15. _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, "") //详情页使用
  16. defer func() {
  17. incCancelFn2()
  18. baseCancelFn2()
  19. incCancelFn()
  20. baseCancelFn()
  21. }()
  22. listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
  23. //TODO 2. 执行JS代码,获取列表页信息
  24. if listRunJs == "" {
  25. listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
  26. }
  27. if contentRunJs == "" {
  28. contentRunJs = renderJavascriptCoder(loadContentJS, sc)
  29. }
  30. //TODO 3.打开列表,获取条目清单
  31. chromedp.Run(ctx, chromedp.Tasks{
  32. chromedp.Navigate(sc.Href),
  33. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  34. //chromedp.Sleep(1000 * time.Millisecond),
  35. chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
  36. })
  37. no := 1
  38. T:
  39. for j := 0; j < 2; j++ { //最多检查2页
  40. listResult := make(be.ResultItems, 0)
  41. err := chromedp.Run(ctx, chromedp.Tasks{
  42. chromedp.Evaluate(listRunJs, &listResult),
  43. })
  44. if err != nil {
  45. qu.Debug("执行JS代码失败", err.Error())
  46. continue
  47. }
  48. //TODO 5.操作详情页
  49. for contentIndex, r := range listResult {
  50. if contentIndex > 1 { //每页只采集2条
  51. break
  52. }
  53. //打开详情页
  54. err = chromedp.Run(ctx2, chromedp.Tasks{
  55. chromedp.Navigate(r.Href),
  56. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  57. //chromedp.Sleep(1000 * time.Millisecond),
  58. chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
  59. })
  60. if err != nil {
  61. continue
  62. }
  63. //获取详情页内容
  64. err = chromedp.Run(ctx2, chromedp.Tasks{
  65. chromedp.Evaluate(contentRunJs, r),
  66. })
  67. if err != nil {
  68. continue
  69. }
  70. if sc.AttachCss != "" {
  71. downloadAttaches(r, vm.attachesDir)
  72. }
  73. r.Site = sc.Site
  74. r.Channel = sc.Channel
  75. if r.Title == "" {
  76. r.Title = r.ListTitle
  77. }
  78. if r.PublishTime == "" {
  79. r.PublishTime = r.ListPubTime
  80. }
  81. r.No = no
  82. no += 1
  83. //结果放入缓存
  84. verifyResult.PushBack(r)
  85. }
  86. //TODO 6.翻页
  87. //if err = trunPage(sc, 2000, ctx); err != nil {
  88. if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil {
  89. ret.ListTrunPage = false
  90. break T
  91. }
  92. }
  93. //检查
  94. for el := verifyResult.Front(); el != nil; el = el.Next() {
  95. r, _ := el.Value.(*be.ResultItem)
  96. if ret.Title {
  97. ret.Title = r.Title != ""
  98. }
  99. if ret.PublishUnit {
  100. ret.PublishUnit = r.PublishUnit != ""
  101. }
  102. if ret.PublishTime {
  103. ret.PublishTime = r.PublishTime != ""
  104. }
  105. if ret.Content {
  106. ret.Content = r.Content != ""
  107. }
  108. if ret.Attaches {
  109. ret.Attaches = len(r.AttachLinks) > 0
  110. }
  111. }
  112. qu.Debug(verifyResult.Len())
  113. if ret.ListItems {
  114. ret.ListItems = verifyResult.Len() > 2
  115. }
  116. //TODO:每次验证结果存库、内存?
  117. return ret, nil
  118. }