package vm

import (
	"container/list"
	"errors"
	"fmt"
	"time"

	"github.com/chromedp/chromedp"
	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
	be "spider_creator/backend"
)

// VerifySpiderConfig verifies a spider configuration.
//
// When be.Cfg.IsOnly4MainSite is set it delegates to
// verifySpiderConfig4MainSite (list-only check); otherwise it delegates to
// verifySpiderConfig4Prod (list + detail check).
//
// It returns the verification flags, a fatal error (non-nil only when the
// list page could not be initialised) and the non-fatal error messages
// collected while verifying.
func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error, []string) {
	if be.Cfg.IsOnly4MainSite {
		return vm.verifySpiderConfig4MainSite(sc) // key-site mode
	}
	return vm.verifySpiderConfig4Prod(sc) // production mode
}

// verifySpiderConfig4Prod verifies a spider configuration in production
// mode: it walks the list pages (turning pages where configured), and for at
// most two list items per page opens the detail page and extracts its
// content.
//
// The verified items are stored in a list that is also published under
// be.DataResults[sc.Code]. Problems on individual pages or items are appended
// to the returned message slice and do not abort the run; only a failed
// InitListPage produces a non-nil error.
func (vm *VM) verifySpiderConfig4Prod(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error, []string) {
	qu.Debug("sc---", *sc)
	var errMsg []string
	verifyResult := list.New()
	be.DataResults[sc.Code] = verifyResult
	// Zero value: every verification flag starts out false.
	ret := &be.SpiderConfigVerifyResult{}

	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href, sc.FilterResource)     // used for the list page
	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, true, false, sc.Href, sc.FilterResource) // used for detail pages
	defer func() {
		incCancelFn2()
		baseCancelFn2()
		incCancelFn()
		baseCancelFn()
	}()

	// 2. Pick the JS to run: the configured code, or the rendered default
	// template when the configured code is blank.
	listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
	if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
		listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
	}
	if be.RegSpace.ReplaceAllString(contentRunJs, "") == "" {
		contentRunJs = renderJavascriptCoder(loadContentJS, sc)
	}
	qu.Debug("获取列表页JS代码:", listRunJs)
	qu.Debug("获取详情页JS代码:", contentRunJs)

	// 3. Open the list page. A failure here is recorded rather than
	// swallowed; verification still continues so that the returned error
	// contract stays the same as before.
	if err := chromedp.Run(ctx, chromedp.Tasks{
		chromedp.Navigate(sc.Href),
		chromedp.WaitReady("document.body", chromedp.ByJSPath),
		chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
	}); err != nil {
		qu.Debug("列表页打开失败", err.Error())
		errMsg = append(errMsg, fmt.Sprintf("列表页打开失败:%s", err.Error()))
	}

	// Initialise the list page state.
	if !vm.InitListPage(ctx, sc) {
		qu.Debug("初始化列表页失败,退出")
		return ret, errors.New("初始化列表页失败"), errMsg
	}

	no := 1
	ret.ListTrunPage = true
	// The number of pages checked is capped by both VERIVY_MAX_TRUN_PAGE and
	// sc.MaxPages.
	for j := 0; j < VERIVY_MAX_TRUN_PAGE && j < int(sc.MaxPages); j++ {
		qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
		listResult := make(be.ResultItems, 0)
		err := chromedp.Run(ctx, chromedp.Tasks{
			chromedp.Evaluate(listRunJs, &listResult),
		})
		if err != nil {
			qu.Debug("执行列表页JS代码失败", err.Error())
			errMsg = append(errMsg, fmt.Sprintf("第%d页列表数据获取失败:%s", j+1, err.Error()))
			continue
		}

		// 5. Visit the detail pages.
		qu.Debug("列表采集条数:", len(listResult))
		for contentIndex, r := range listResult {
			if contentIndex > 1 { // only the first two items of each page are verified
				break
			}
			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")

			// Open the detail page.
			err = chromedp.Run(ctx2, chromedp.Tasks{
				chromedp.Navigate(r.Href),
				chromedp.WaitReady("document.body", chromedp.ByJSPath),
				chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
			})
			if err != nil {
				qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页打开异常")
				errMsg = append(errMsg, fmt.Sprintf("第%d页,第%d条打开详情页失败:%s", j+1, contentIndex+1, err.Error()))
				continue
			}

			// Extract the detail page content into the same item.
			err = chromedp.Run(ctx2, chromedp.Tasks{
				chromedp.Evaluate(contentRunJs, r),
			})
			if err != nil {
				qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条详情页内容获取失败")
				errMsg = append(errMsg, fmt.Sprintf("第%d页,第%d条详情页内容获取失败:%s", j+1, contentIndex+1, err.Error()))
				continue
			}

			// Download attachments only when an attachment selector is configured.
			if sc.AttachCss != "" {
				downloadAttaches(r, vm.attachesDir)
			}
			r.Site = sc.Site
			r.Channel = sc.Channel
			// Fall back to the values captured on the list page.
			if r.Title == "" {
				r.Title = r.ListTitle
			}
			if r.PublishTime == "" {
				r.PublishTime = r.ListPubTime
			}
			r.No = no
			no++
			// Cache the verified item.
			verifyResult.PushBack(r)
		}
		qu.Debug("第"+fmt.Sprint(j+1)+"页校验成功数据条数:", verifyResult.Len())

		// Paging: stop when nothing has been collected so far. Note that
		// verifyResult accumulates across pages, so this is a cumulative count.
		if verifyResult.Len() == 0 {
			ret.ListTrunPage = false
			break
		}
		// Turn the page only if another page will actually be checked.
		if sc.MaxPages > 1 && j < VERIVY_MAX_TRUN_PAGE-1 && j < int(sc.MaxPages)-1 {
			if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil {
				qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
				errMsg = append(errMsg, fmt.Sprintf("第%d页翻页失败:%s", j+1, err.Error()))
				ret.ListTrunPage = false
				break
			}
		}
	}

	// Check the collected items. Each iteration overwrites the flags, so the
	// final values reflect the last item in the list.
	// NOTE(review): confirm whether "last item wins" is intended, or whether
	// the flags should require every item to pass.
	for el := verifyResult.Front(); el != nil; el = el.Next() {
		r, ok := el.Value.(*be.ResultItem)
		if !ok || r == nil {
			// Skip anything that is not a usable item instead of dereferencing nil.
			continue
		}
		ret.Title = r.Title != ""
		ret.PublishUnit = r.PublishUnit != ""
		ret.PublishTime = r.PublishTime != "" && Reg_Date.MatchString(r.PublishTime)
		ret.Content = r.Content != ""
		ret.Attaches = len(r.AttachLinks) > 0
	}
	qu.Debug(verifyResult.Len())
	// A single-page config needs at least one item; a multi-page config needs
	// more than two.
	ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
	return ret, nil, errMsg
}

// verifySpiderConfig4MainSite verifies only the list-page part of a spider
// configuration (key-site test mode): it walks the list pages, keeps at most
// two items per page and never opens a detail page.
//
// The collected items are stored in a list that is also published under
// be.DataResults[sc.Code]. Problems on individual pages are appended to the
// returned message slice; only a failed InitListPage produces a non-nil
// error.
func (vm *VM) verifySpiderConfig4MainSite(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error, []string) {
	qu.Debug("sc---", *sc)
	var errMsg []string
	verifyResult := list.New()
	be.DataResults[sc.Code] = verifyResult
	// Positional literal kept as-is: the field order is declared in package be.
	// NOTE(review): presumably the flags preset to true are the ones this mode
	// does not verify; confirm against the struct definition.
	ret := &be.SpiderConfigVerifyResult{false, true, false, true, true, true, false}

	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, true, false, sc.Href, sc.FilterResource) // used for the list page
	defer func() {
		incCancelFn()
		baseCancelFn()
	}()

	// 2. Pick the list JS to run: the configured code, or the rendered
	// default template when the configured code is blank. The detail-page JS
	// is not prepared here because this mode never evaluates it.
	listRunJs := sc.ListJSCode
	if be.RegSpace.ReplaceAllString(listRunJs, "") == "" {
		listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
	}
	qu.Debug("列表页JS:", listRunJs)

	// 3. Open the list page. A failure here is recorded rather than
	// swallowed; verification still continues so that the returned error
	// contract stays the same as before.
	if err := chromedp.Run(ctx, chromedp.Tasks{
		chromedp.Navigate(sc.Href),
		chromedp.WaitReady("document.body", chromedp.ByJSPath),
		chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
	}); err != nil {
		qu.Debug("列表页打开失败", err.Error())
		errMsg = append(errMsg, fmt.Sprintf("列表页打开失败:%s", err.Error()))
	}

	// 4. Initialise the list page state.
	if !vm.InitListPage(ctx, sc) {
		qu.Debug("初始化列表页失败,退出")
		return ret, errors.New("初始化列表页失败"), errMsg
	}

	no := 1
	ret.ListTrunPage = true
	// The number of pages checked is capped by both VERIVY_MAX_TRUN_PAGE and
	// sc.MaxPages.
	for j := 0; j < VERIVY_MAX_TRUN_PAGE && j < int(sc.MaxPages); j++ {
		qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
		listResult := make(be.ResultItems, 0)
		err := chromedp.Run(ctx, chromedp.Tasks{
			chromedp.Evaluate(listRunJs, &listResult),
		})
		if err != nil {
			qu.Debug("执行列表页JS代码失败", err.Error())
			errMsg = append(errMsg, fmt.Sprintf("第%d页列表数据获取失败:%s", j+1, err.Error()))
			continue
		}

		// 5. Collect list items (no detail page is opened in this mode).
		qu.Debug("列表采集条数:", len(listResult))
		for contentIndex, r := range listResult {
			if contentIndex > 1 { // only the first two items of each page are verified
				break
			}
			qu.Debug("当前列表页第" + fmt.Sprint(contentIndex+1) + "条")
			r.Site = sc.Site
			r.Channel = sc.Channel
			// Fall back to the values captured on the list page.
			if r.Title == "" {
				r.Title = r.ListTitle
			}
			if r.PublishTime == "" {
				r.PublishTime = r.ListPubTime
			}
			r.No = no
			no++
			// Cache the collected item.
			verifyResult.PushBack(r)
		}
		qu.Debug("列表采集条数结果:", verifyResult.Len())

		// 6. Paging: stop when nothing has been collected so far. Note that
		// verifyResult accumulates across pages, so this is a cumulative count.
		if verifyResult.Len() == 0 {
			ret.ListTrunPage = false
			break
		}
		// Turn the page only if another page will actually be checked.
		if sc.MaxPages > 1 && j < VERIVY_MAX_TRUN_PAGE-1 && j < int(sc.MaxPages)-1 {
			if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil {
				qu.Debug("第" + fmt.Sprint(j+1) + "页翻页失败")
				errMsg = append(errMsg, fmt.Sprintf("第%d页翻页失败:%s", j+1, err.Error()))
				ret.ListTrunPage = false
				break
			}
		}
	}

	// Check the collected items. Each iteration overwrites the flags, so the
	// final values reflect the last item in the list.
	// NOTE(review): confirm whether "last item wins" is intended, or whether
	// the flags should require every item to pass.
	for el := verifyResult.Front(); el != nil; el = el.Next() {
		r, ok := el.Value.(*be.ResultItem)
		if !ok || r == nil {
			// Skip anything that is not a usable item instead of dereferencing nil.
			continue
		}
		ret.Title = r.Title != ""
		qu.Debug("Check Title:", ret.Title, r.Title, r.ListTitle)
		ret.PublishTime = r.PublishTime != "" && Reg_Date.MatchString(r.PublishTime)
		qu.Debug("Check PublishTime:", ret.PublishTime, r.PublishTime, r.ListPubTime)
	}
	// ListItems is only recomputed while it is still true; a single-page
	// config needs at least one item, a multi-page config needs more than two.
	if ret.ListItems {
		ret.ListItems = (sc.MaxPages == 1 && verifyResult.Len() > 0) || (sc.MaxPages > 1 && verifyResult.Len() > 2)
	}
	return ret, nil, errMsg
}