check.go 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. package vm
  2. import (
  3. "container/list"
  4. "errors"
  5. "fmt"
  6. "github.com/chromedp/chromedp"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. be "spider_creator/backend"
  9. "time"
  10. )
  11. // 爬虫验证
  12. func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
  13. if be.Cfg.IsOnly4MainSite {
  14. return vm.verifySpiderConfig4MainSite(sc) //重点网站
  15. } else {
  16. return vm.verifySpiderConfig4Prod(sc) //正式环境
  17. }
  18. }
  19. // verifySpiderConfig4Prod 验证爬虫配置,支持翻页,列表项数据只提取2条
  20. // 正式环境
  21. func (vm *VM) verifySpiderConfig4Prod(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
  22. qu.Debug("sc---", *sc)
  23. verifyResult := list.New()
  24. be.DataResults[sc.Code] = verifyResult
  25. ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
  26. _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href) //列表页使用
  27. _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, true, false, sc.Href) //详情页使用
  28. defer func() {
  29. incCancelFn2()
  30. baseCancelFn2()
  31. incCancelFn()
  32. baseCancelFn()
  33. }()
  34. listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
  35. //2. 执行JS代码,获取列表页信息
  36. if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
  37. listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
  38. }
  39. if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
  40. contentRunJs = renderJavascriptCoder(loadContentJS, sc)
  41. }
  42. qu.Debug("获取列表页JS代码:", listRunJs)
  43. qu.Debug("获取详情页JS代码:", contentRunJs)
  44. //3.打开列表,获取条目清单
  45. chromedp.Run(ctx, chromedp.Tasks{
  46. chromedp.Navigate(sc.Href),
  47. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  48. //chromedp.Sleep(1000 * time.Millisecond),
  49. chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
  50. })
  51. //初始化列表页信息
  52. if !vm.InitListPage(ctx, sc) {
  53. qu.Debug("初始化列表页失败,退出")
  54. return ret, errors.New("初始化列表页失败")
  55. }
  56. no := 1
  57. ret.ListTrunPage = true
  58. T:
  59. for j := 0; j < VERIVY_MAX_TRUN_PAGE && j < int(sc.MaxPages); j++ { //最多检查2页
  60. qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
  61. listResult := make(be.ResultItems, 0)
  62. err := chromedp.Run(ctx, chromedp.Tasks{
  63. chromedp.Evaluate(listRunJs, &listResult),
  64. })
  65. if err != nil {
  66. qu.Debug("执行列表页JS代码失败", err.Error())
  67. continue
  68. }
  69. //TODO 5.操作详情页
  70. qu.Debug("列表采集条数:", len(listResult))
  71. for contentIndex, r := range listResult {
  72. qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
  73. if contentIndex > 1 { //每页只校验2条
  74. break
  75. }
  76. //打开详情页
  77. err = chromedp.Run(ctx2, chromedp.Tasks{
  78. chromedp.Navigate(r.Href),
  79. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  80. //chromedp.Sleep(2000 * time.Millisecond),
  81. chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
  82. })
  83. if err != nil {
  84. qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
  85. continue
  86. }
  87. //获取详情页内容
  88. err = chromedp.Run(ctx2, chromedp.Tasks{
  89. chromedp.Evaluate(contentRunJs, r),
  90. })
  91. if err != nil {
  92. qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
  93. continue
  94. }
  95. //下载附件
  96. if sc.AttachCss != "" {
  97. downloadAttaches(r, vm.attachesDir)
  98. }
  99. r.Site = sc.Site
  100. r.Channel = sc.Channel
  101. if r.Title == "" {
  102. r.Title = r.ListTitle
  103. }
  104. if r.PublishTime == "" {
  105. r.PublishTime = r.ListPubTime
  106. }
  107. r.No = no
  108. no += 1
  109. //结果放入缓存
  110. verifyResult.PushBack(r)
  111. }
  112. qu.Debug("第"+fmt.Sprint(j+1)+"页校验成功数据条数:", verifyResult.Len())
  113. //翻页
  114. if verifyResult.Len() > 0 {
  115. if sc.MaxPages > 1 && j < VERIVY_MAX_TRUN_PAGE-1 && j < int(sc.MaxPages)-1 { //&& !ret.ListTrunPage {
  116. if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
  117. qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
  118. ret.ListTrunPage = false
  119. break T
  120. }
  121. }
  122. } else {
  123. ret.ListTrunPage = false
  124. break T
  125. }
  126. }
  127. //检查
  128. for el := verifyResult.Front(); el != nil; el = el.Next() {
  129. r, _ := el.Value.(*be.ResultItem)
  130. ret.Title = r.Title != ""
  131. ret.PublishUnit = r.PublishUnit != ""
  132. ret.PublishTime = r.PublishTime != "" && Reg_Date.MatchString(r.PublishTime)
  133. ret.Content = r.Content != ""
  134. ret.Attaches = len(r.AttachLinks) > 0
  135. }
  136. qu.Debug(verifyResult.Len())
  137. ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
  138. return ret, nil
  139. }
  140. // verifySpiderConfig4MainSite 只验证列表标注
  141. // 重点网站测试环境
  142. func (vm *VM) verifySpiderConfig4MainSite(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
  143. qu.Debug("sc---", *sc)
  144. verifyResult := list.New()
  145. be.DataResults[sc.Code] = verifyResult
  146. ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
  147. _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href) //列表页使用
  148. defer func() {
  149. incCancelFn()
  150. baseCancelFn()
  151. }()
  152. listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
  153. //2. 执行JS代码,获取列表页信息
  154. if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
  155. listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
  156. }
  157. if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
  158. contentRunJs = renderJavascriptCoder(loadContentJS, sc)
  159. }
  160. qu.Debug("列表页JS:", listRunJs)
  161. //3.打开列表,获取条目清单
  162. chromedp.Run(ctx, chromedp.Tasks{
  163. chromedp.Navigate(sc.Href),
  164. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  165. //chromedp.Sleep(1000 * time.Millisecond),
  166. chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
  167. })
  168. //4.初始化列表页信息
  169. if !vm.InitListPage(ctx, sc) {
  170. qu.Debug("初始化列表页失败,退出")
  171. return ret, errors.New("初始化列表页失败")
  172. }
  173. no := 1
  174. ret.ListTrunPage = true
  175. T:
  176. for j := 0; j < VERIVY_MAX_TRUN_PAGE && j < int(sc.MaxPages); j++ { //最多检查2页
  177. qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
  178. listResult := make(be.ResultItems, 0)
  179. err := chromedp.Run(ctx, chromedp.Tasks{
  180. chromedp.Evaluate(listRunJs, &listResult),
  181. })
  182. if err != nil {
  183. qu.Debug("执行列表页JS代码失败", err.Error())
  184. continue
  185. }
  186. //5.操作详情页
  187. qu.Debug("列表采集条数:", len(listResult))
  188. for contentIndex, r := range listResult {
  189. if contentIndex > 1 { //每页只校验2条
  190. break
  191. }
  192. qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
  193. r.Site = sc.Site
  194. r.Channel = sc.Channel
  195. //qu.Debug(r.Title, r.ListTitle)
  196. if r.Title == "" {
  197. r.Title = r.ListTitle
  198. }
  199. //qu.Debug(r.PublishTime, r.ListPubTime)
  200. if r.PublishTime == "" {
  201. r.PublishTime = r.ListPubTime
  202. }
  203. r.No = no
  204. no += 1
  205. //结果放入缓存
  206. verifyResult.PushBack(r)
  207. }
  208. qu.Debug("列表采集条数结果:", verifyResult.Len())
  209. //6.翻页
  210. if verifyResult.Len() > 0 {
  211. if sc.MaxPages > 1 && j < VERIVY_MAX_TRUN_PAGE-1 && j < int(sc.MaxPages)-1 { //&& !ret.ListTrunPage {
  212. if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
  213. qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
  214. ret.ListTrunPage = false
  215. break T
  216. }
  217. }
  218. } else {
  219. ret.ListTrunPage = false
  220. break T
  221. }
  222. }
  223. //检查
  224. for el := verifyResult.Front(); el != nil; el = el.Next() {
  225. r, _ := el.Value.(*be.ResultItem)
  226. ret.Title = r.Title != ""
  227. qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
  228. ret.PublishTime = r.PublishTime != "" && Reg_Date.MatchString(r.PublishTime)
  229. qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
  230. }
  231. if ret.ListItems {
  232. ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
  233. }
  234. return ret, nil
  235. }