package vm

import (
	"container/list"
	"log"
	"time"

	"github.com/chromedp/chromedp"
	be "spider_creator/backend"
)

// VerifySpiderConfig verifies a spider configuration by actually running it
// against the target site.
//
// It opens the list page (sc.Href) in one browser, evaluates the list JS to
// obtain the list items, and then opens up to two items per page in a second
// browser to evaluate the content JS. At most two list pages are checked; the
// page is turned via trunPage after each one.
//
// The returned result starts with every flag set to true; a flag is cleared
// when the matching field is empty on any collected item (Title, PublishUnit,
// PublishTime, Content, Attaches), when page turning fails (ListTrunPage), or
// when no more than two items were collected in total (ListItems). Note that
// when no item was collected at all, the per-field flags keep their initial
// true value and only ListItems reports the failure.
//
// Browser and JS failures are logged and skipped rather than returned, so the
// error result is currently always nil.
func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
	const (
		maxPages        = 2 // check at most two list pages
		maxItemsPerPage = 2 // collect at most two items from each page
	)

	verifyResult := list.New()
	// Keyed literal: every flag starts out true and is cleared on failure.
	ret := &be.SpiderConfigVerifyResult{
		Title:        true,
		PublishUnit:  true,
		PublishTime:  true,
		Content:      true,
		Attaches:     true,
		ListItems:    true,
		ListTrunPage: true,
	}

	_, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "")     // used for the list page
	_, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, "") // used for the detail pages
	defer func() {
		incCancelFn2()
		baseCancelFn2()
		incCancelFn()
		baseCancelFn()
	}()

	// TODO 2: run the JS code to fetch the list page information.
	// Fall back to the rendered built-in templates when no custom JS is configured.
	listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
	if listRunJs == "" {
		listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
	}
	if contentRunJs == "" {
		contentRunJs = renderJavascriptCoder(loadContentJS, sc)
	}

	// TODO 3: open the list page and fetch the item list.
	if err := chromedp.Run(ctx, chromedp.Tasks{
		chromedp.Navigate(sc.Href),
		chromedp.WaitReady("document.body", chromedp.ByJSPath),
		chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
	}); err != nil {
		// Best effort: keep going so the remaining checks still report their
		// own failures, but do not hide the root cause.
		log.Println("打开列表页失败", err.Error())
	}

	no := 1
	for page := 0; page < maxPages; page++ {
		listResult := make(be.ResultItems, 0)
		if err := chromedp.Run(ctx, chromedp.Tasks{
			chromedp.Evaluate(listRunJs, &listResult),
		}); err != nil {
			log.Println("执行JS代码失败", err.Error())
			// The page is not turned here, so the next iteration evaluates
			// the same page again.
			continue
		}

		// TODO 5: process the detail pages.
		for contentIndex, r := range listResult {
			if contentIndex >= maxItemsPerPage {
				break
			}
			if r == nil {
				// Guard against null entries produced by the list JS.
				continue
			}

			// Open the detail page.
			if err := chromedp.Run(ctx2, chromedp.Tasks{
				chromedp.Navigate(r.Href),
				chromedp.WaitReady("document.body", chromedp.ByJSPath),
				chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
			}); err != nil {
				log.Println("打开详情页失败", r.Href, err.Error())
				continue
			}

			// Extract the detail page content into the same item.
			if err := chromedp.Run(ctx2, chromedp.Tasks{
				chromedp.Evaluate(contentRunJs, r),
			}); err != nil {
				log.Println("提取详情页内容失败", r.Href, err.Error())
				continue
			}

			if sc.AttachCss != "" {
				downloadAttaches(r, vm.attachesDir)
			}

			r.Site = sc.Site
			r.Channel = sc.Channel
			// Fall back to the values captured on the list page.
			if r.Title == "" {
				r.Title = r.ListTitle
			}
			if r.PublishTime == "" {
				r.PublishTime = r.ListPubTime
			}
			r.No = no
			no++

			// Cache the result for the final check.
			verifyResult.PushBack(r)
		}

		// TODO 6: turn the page.
		if err := trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil {
			ret.ListTrunPage = false
			break
		}
	}

	// Check the collected items: a flag stays true only while every item so
	// far has a non-empty value for that field.
	for el := verifyResult.Front(); el != nil; el = el.Next() {
		r, ok := el.Value.(*be.ResultItem)
		if !ok || r == nil {
			// Should not happen; skip instead of dereferencing a nil pointer.
			continue
		}
		if ret.Title {
			ret.Title = r.Title != ""
		}
		if ret.PublishUnit {
			ret.PublishUnit = r.PublishUnit != ""
		}
		if ret.PublishTime {
			ret.PublishTime = r.PublishTime != ""
		}
		if ret.Content {
			ret.Content = r.Content != ""
		}
		if ret.Attaches {
			ret.Attaches = len(r.AttachLinks) > 0
		}
	}
	if ret.ListItems {
		// With at most two items per page, more than two items means at
		// least one item came from the second page.
		ret.ListItems = verifyResult.Len() > 2
	}

	// TODO: persist each verification result (database or memory?).
	return ret, nil
}