123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124 |
- package vm
- import (
- "container/list"
- qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- be "spider_creator/backend"
- "time"
- "github.com/chromedp/chromedp"
- )
- // VerifySpiderConfig 验证爬虫配置,支持翻页,列表项数据只提取2条
- func (vm *VM) VerifySpiderConfig(sc *be.SpiderConfig) (*be.SpiderConfigVerifyResult, error) {
- qu.Debug("sc---", *sc)
- verifyResult := list.New()
- ret := &be.SpiderConfigVerifyResult{true, true, true, true, true, true, true}
- _, baseCancelFn, _, _, ctx, incCancelFn := be.NewBrowser(false, false, "") //列表页使用
- _, baseCancelFn2, _, _, ctx2, incCancelFn2 := be.NewBrowser(false, false, "") //详情页使用
- defer func() {
- incCancelFn2()
- baseCancelFn2()
- incCancelFn()
- baseCancelFn()
- }()
- listRunJs, contentRunJs := sc.ListJSCode, sc.ContentJSCode
- //TODO 2. 执行JS代码,获取列表页信息
- if listRunJs == "" {
- listRunJs = renderJavascriptCoder(loadListItemsJS, sc)
- }
- if contentRunJs == "" {
- contentRunJs = renderJavascriptCoder(loadContentJS, sc)
- }
- //TODO 3.打开列表,获取条目清单
- chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Navigate(sc.Href),
- chromedp.WaitReady("document.body", chromedp.ByJSPath),
- //chromedp.Sleep(1000 * time.Millisecond),
- chromedp.Sleep(time.Duration(sc.ListDelayTime) * time.Millisecond),
- })
- no := 1
- T:
- for j := 0; j < 2; j++ { //最多检查2页
- listResult := make(be.ResultItems, 0)
- err := chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(listRunJs, &listResult),
- })
- if err != nil {
- qu.Debug("执行JS代码失败", err.Error())
- continue
- }
- //TODO 5.操作详情页
- for contentIndex, r := range listResult {
- if contentIndex > 1 { //每页只采集2条
- break
- }
- //打开详情页
- err = chromedp.Run(ctx2, chromedp.Tasks{
- chromedp.Navigate(r.Href),
- chromedp.WaitReady("document.body", chromedp.ByJSPath),
- //chromedp.Sleep(1000 * time.Millisecond),
- chromedp.Sleep(time.Duration(sc.ContentDelayTime) * time.Millisecond),
- })
- if err != nil {
- continue
- }
- //获取详情页内容
- err = chromedp.Run(ctx2, chromedp.Tasks{
- chromedp.Evaluate(contentRunJs, r),
- })
- if err != nil {
- continue
- }
- if sc.AttachCss != "" {
- downloadAttaches(r, vm.attachesDir)
- }
- r.Site = sc.Site
- r.Channel = sc.Channel
- if r.Title == "" {
- r.Title = r.ListTitle
- }
- if r.PublishTime == "" {
- r.PublishTime = r.ListPubTime
- }
- r.No = no
- no += 1
- //结果放入缓存
- verifyResult.PushBack(r)
- }
- //TODO 6.翻页
- //if err = trunPage(sc, 2000, ctx); err != nil {
- if err = trunPage(sc, sc.ListTurnDelayTime, ctx); err != nil {
- ret.ListTrunPage = false
- break T
- }
- }
- //检查
- for el := verifyResult.Front(); el != nil; el = el.Next() {
- r, _ := el.Value.(*be.ResultItem)
- if ret.Title {
- ret.Title = r.Title != ""
- }
- if ret.PublishUnit {
- ret.PublishUnit = r.PublishUnit != ""
- }
- if ret.PublishTime {
- ret.PublishTime = r.PublishTime != ""
- }
- if ret.Content {
- ret.Content = r.Content != ""
- }
- if ret.Attaches {
- ret.Attaches = len(r.AttachLinks) > 0
- }
- }
- qu.Debug(verifyResult.Len())
- if ret.ListItems {
- ret.ListItems = verifyResult.Len() > 2
- }
- //TODO:每次验证结果存库、内存?
- return ret, nil
- }
|