check.go 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. package vm
  2. import (
  3. "container/list"
  4. "errors"
  5. "fmt"
  6. "github.com/chromedp/chromedp"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. be "spider_creator/backend"
  9. "time"
  10. )
  11. // 爬虫验证
  12. func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error, []string) {
  13. if be.Cfg.IsOnly4MainSite {
  14. return vm.verifySpiderConfig4MainSite(sc) //重点网站
  15. } else {
  16. return vm.verifySpiderConfig4Prod(sc) //正式环境
  17. }
  18. }
  19. // verifySpiderConfig4Prod 验证爬虫配置,支持翻页,列表项数据只提取2条
  20. // 正式环境
  21. func (vm *VM) verifySpiderConfig4Prod(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error, []string) {
  22. qu.Debug("sc---", *sc)
  23. var errMsg []string
  24. verifyResult := list.New()
  25. be.DataResults[sc.Code] = verifyResult
  26. ret := &be.SpiderConfigVerifyResult{false, false, false, false, false, false, false}
  27. _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href, sc.FilterResource) //列表页使用
  28. _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, true, false, sc.Href, sc.FilterResource) //详情页使用
  29. defer func() {
  30. incCancelFn2()
  31. baseCancelFn2()
  32. incCancelFn()
  33. baseCancelFn()
  34. }()
  35. listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
  36. //2. 执行JS代码,获取列表页信息
  37. if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
  38. listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
  39. }
  40. if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
  41. contentRunJs = renderJavascriptCoder(loadContentJS, sc)
  42. }
  43. qu.Debug("获取列表页JS代码:", listRunJs)
  44. qu.Debug("获取详情页JS代码:", contentRunJs)
  45. //3.打开列表,获取条目清单
  46. chromedp.Run(ctx, chromedp.Tasks{
  47. chromedp.Navigate(sc.Href),
  48. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  49. //chromedp.Sleep(1000 * time.Millisecond),
  50. chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
  51. })
  52. //初始化列表页信息
  53. if !vm.InitListPage(ctx, sc) {
  54. qu.Debug("初始化列表页失败,退出")
  55. return ret, errors.New("初始化列表页失败"), errMsg
  56. }
  57. no := 1
  58. ret.ListTrunPage = true
  59. T:
  60. for j := 0; j < VERIVY_MAX_TRUN_PAGE && j < int(sc.MaxPages); j++ { //最多检查2页
  61. qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
  62. listResult := make(be.ResultItems, 0)
  63. err := chromedp.Run(ctx, chromedp.Tasks{
  64. chromedp.Evaluate(listRunJs, &listResult),
  65. })
  66. if err != nil {
  67. qu.Debug("执行列表页JS代码失败", err.Error())
  68. errMsg = append(errMsg, fmt.Sprintf("%s%d%s%s", "第", j+1, "页列表数据获取失败:", err.Error()))
  69. continue
  70. }
  71. //TODO 5.操作详情页
  72. qu.Debug("列表采集条数:", len(listResult))
  73. for contentIndex, r := range listResult {
  74. qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
  75. if contentIndex > 1 { //每页只校验2条
  76. break
  77. }
  78. //打开详情页
  79. err = chromedp.Run(ctx2, chromedp.Tasks{
  80. chromedp.Navigate(r.Href),
  81. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  82. //chromedp.Sleep(2000 * time.Millisecond),
  83. chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
  84. })
  85. if err != nil {
  86. qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
  87. errMsg = append(errMsg, fmt.Sprintf("%s%d%s%d%s%s", "第", j+1, "页,第", contentIndex+1, "条打开详情页失败:", err.Error()))
  88. continue
  89. }
  90. //获取详情页内容
  91. err = chromedp.Run(ctx2, chromedp.Tasks{
  92. chromedp.Evaluate(contentRunJs, r),
  93. })
  94. if err != nil {
  95. qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
  96. errMsg = append(errMsg, fmt.Sprintf("%s%d%s%d%s%s", "第", j+1, "页,第", contentIndex+1, "条详情页内容获取失败:", err.Error()))
  97. continue
  98. }
  99. //下载附件
  100. if sc.AttachCss != "" {
  101. downloadAttaches(r, vm.attachesDir)
  102. }
  103. r.Site = sc.Site
  104. r.Channel = sc.Channel
  105. if r.Title == "" {
  106. r.Title = r.ListTitle
  107. }
  108. if r.PublishTime == "" {
  109. r.PublishTime = r.ListPubTime
  110. }
  111. r.No = no
  112. no += 1
  113. //结果放入缓存
  114. verifyResult.PushBack(r)
  115. }
  116. qu.Debug("第"+fmt.Sprint(j+1)+"页校验成功数据条数:", verifyResult.Len())
  117. //翻页
  118. if verifyResult.Len() > 0 {
  119. if sc.MaxPages > 1 && j < VERIVY_MAX_TRUN_PAGE-1 && j < int(sc.MaxPages)-1 { //&& !ret.ListTrunPage {
  120. if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
  121. qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
  122. errMsg = append(errMsg, fmt.Sprintf("%s%d%s%s", "第", j+1, "页翻页失败:", err.Error()))
  123. ret.ListTrunPage = false
  124. break T
  125. }
  126. }
  127. } else {
  128. ret.ListTrunPage = false
  129. break T
  130. }
  131. }
  132. //检查
  133. for el := verifyResult.Front(); el != nil; el = el.Next() {
  134. r, _ := el.Value.(*be.ResultItem)
  135. ret.Title = r.Title != ""
  136. ret.PublishUnit = r.PublishUnit != ""
  137. ret.PublishTime = r.PublishTime != "" && Reg_Date.MatchString(r.PublishTime)
  138. ret.Content = r.Content != ""
  139. ret.Attaches = len(r.AttachLinks) > 0
  140. }
  141. qu.Debug(verifyResult.Len())
  142. ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
  143. return ret, nil, errMsg
  144. }
  145. // verifySpiderConfig4MainSite 只验证列表标注
  146. // 重点网站测试环境
  147. func (vm *VM) verifySpiderConfig4MainSite(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error, []string) {
  148. qu.Debug("sc---", *sc)
  149. var errMsg []string
  150. verifyResult := list.New()
  151. be.DataResults[sc.Code] = verifyResult
  152. ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}
  153. _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href, sc.FilterResource) //列表页使用
  154. defer func() {
  155. incCancelFn()
  156. baseCancelFn()
  157. }()
  158. listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
  159. //2. 执行JS代码,获取列表页信息
  160. if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
  161. listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
  162. }
  163. if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
  164. contentRunJs = renderJavascriptCoder(loadContentJS, sc)
  165. }
  166. qu.Debug("列表页JS:", listRunJs)
  167. //3.打开列表,获取条目清单
  168. chromedp.Run(ctx, chromedp.Tasks{
  169. chromedp.Navigate(sc.Href),
  170. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  171. //chromedp.Sleep(1000 * time.Millisecond),
  172. chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
  173. })
  174. //4.初始化列表页信息
  175. if !vm.InitListPage(ctx, sc) {
  176. qu.Debug("初始化列表页失败,退出")
  177. return ret, errors.New("初始化列表页失败"), errMsg
  178. }
  179. no := 1
  180. ret.ListTrunPage = true
  181. T:
  182. for j := 0; j < VERIVY_MAX_TRUN_PAGE && j < int(sc.MaxPages); j++ { //最多检查2页
  183. qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
  184. listResult := make(be.ResultItems, 0)
  185. err := chromedp.Run(ctx, chromedp.Tasks{
  186. chromedp.Evaluate(listRunJs, &listResult),
  187. })
  188. if err != nil {
  189. qu.Debug("执行列表页JS代码失败", err.Error())
  190. errMsg = append(errMsg, fmt.Sprintf("%s%d%s%s", "第", j+1, "页列表数据获取失败:", err.Error()))
  191. continue
  192. }
  193. //5.操作详情页
  194. qu.Debug("列表采集条数:", len(listResult))
  195. for contentIndex, r := range listResult {
  196. if contentIndex > 1 { //每页只校验2条
  197. break
  198. }
  199. qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
  200. r.Site = sc.Site
  201. r.Channel = sc.Channel
  202. //qu.Debug(r.Title, r.ListTitle)
  203. if r.Title == "" {
  204. r.Title = r.ListTitle
  205. }
  206. //qu.Debug(r.PublishTime, r.ListPubTime)
  207. if r.PublishTime == "" {
  208. r.PublishTime = r.ListPubTime
  209. }
  210. r.No = no
  211. no += 1
  212. //结果放入缓存
  213. verifyResult.PushBack(r)
  214. }
  215. qu.Debug("列表采集条数结果:", verifyResult.Len())
  216. //6.翻页
  217. if verifyResult.Len() > 0 {
  218. if sc.MaxPages > 1 && j < VERIVY_MAX_TRUN_PAGE-1 && j < int(sc.MaxPages)-1 { //&& !ret.ListTrunPage {
  219. if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil { //翻页失败
  220. qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
  221. errMsg = append(errMsg, fmt.Sprintf("%s%d%s%s", "第", j+1, "页翻页失败:", err.Error()))
  222. ret.ListTrunPage = false
  223. break T
  224. }
  225. }
  226. } else {
  227. ret.ListTrunPage = false
  228. break T
  229. }
  230. }
  231. //检查
  232. for el := verifyResult.Front(); el != nil; el = el.Next() {
  233. r, _ := el.Value.(*be.ResultItem)
  234. ret.Title = r.Title != ""
  235. qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
  236. ret.PublishTime = r.PublishTime != "" && Reg_Date.MatchString(r.PublishTime)
  237. qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
  238. }
  239. if ret.ListItems {
  240. ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
  241. }
  242. return ret, nil, errMsg
  243. }