package vm import ( "container/list" "context" _ "embed" "fmt" "github.com/chromedp/chromedp" qu "jygit.jydev.jianyu360.cn/data_processing/common_utils" be "spider_creator/backend" "spider_creator/backend/ai" "strconv" "time" ) // NewVM func NewVM(attachesDir string, dnf be.EventNotifyFace) *VM { return &VM{ attachesDir, dnf, } } func (vm *VM) RunSpiderTmp(url string, maxPages int, listDealy, trunPageDelay, contentDelay int64, headless bool, showImage bool, proxyServe bool, exit chan bool, cssMark map[string]interface{}) { sc, err := be.NewSpiderConfig(cssMark) if err != nil { qu.Debug("标注信息传输失败!") vm.dnf.Dispatch("debug_event", "标注信息传输失败!") return } if url != "" { sc.Href = url } _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe) qu.Debug("1浏览器打开", *sc) vm.dnf.Dispatch("debug_event", "1 浏览器打开") defer func() { cancel() baseCancel() qu.Debug("0浏览器已经销毁") vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁") close(exit) }() chromedp.Run(ctx, chromedp.Tasks{ chromedp.Navigate(sc.Href), //打开页面 chromedp.WaitReady("document.body", chromedp.ByJSPath), //等待body加载完毕 chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待 }) vm.dnf.Dispatch("debug_event", "2 页面已经打开") qu.Debug("2页面打开") var runJs string = sc.ListJSCode //列表页信息初始化 vm.dnf.Dispatch("debug_event", "3 初始化列表页信息") if !vm.InitListPage(ctx, sc) { vm.dnf.Dispatch("debug_event", "3 初始化列表页失败,退出") return } //TODO 2. 执行JS代码,获取列表页信息 if be.RegSpace.ReplaceAllString(runJs, "") == "" { runJs = renderJavascriptCoder(loadListItemsJS, sc) } qu.Debug("列表页执行JS:", runJs) currentResult := list.New() be.DataResults[sc.Code] = currentResult no := 1 for j := 0; j < maxPages; j++ { qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...") listResult := make(be.ResultItems, 0) err = chromedp.Run(ctx, chromedp.Tasks{ chromedp.Evaluate(runJs, &listResult), }) if err != nil { qu.Debug("执行JS代码失败", err.Error()) vm.dnf.Dispatch("debug_event", "2 第"+fmt.Sprint(j+1)+"页执行JS代码失败") continue } qu.Debug("第"+fmt.Sprint(j+1)+"页列表采集条数:", len(listResult)) vm.dnf.Dispatch("debug_event", "3 第"+fmt.Sprint(j+1)+"获取列表完成") for _, v := range listResult { select { case <-exit: return default: qu.Debug(v.No, v.ListTitle, v.Href, v.Title, v.PublishTime, v.ListPubTime) v.Site = sc.Site v.Channel = sc.Channel v.Title = v.ListTitle v.Content = "详见正文" v.PublishTime = v.ListPubTime v.No = no no++ currentResult.PushBack(v) } } vm.dnf.Dispatch("debug_event", "4 第"+fmt.Sprint(j+1)+"页采集完成,准备执行翻页") if j < maxPages-1 { if err = trunPage(sc, trunPageDelay, ctx); err != nil { qu.Debug("翻页失败", err.Error()) vm.dnf.Dispatch("debug_event", "5 第"+fmt.Sprint(j+1)+"页翻页失败") time.Sleep(3 * time.Second) break } } } vm.dnf.Dispatch("debug_event", "6 采集测试完成") qu.Debug("6采集测试完成") } // RunSpider 适用于测试1页数据 func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe bool, exit chan bool, cssMark map[string]interface{}) { sc, err := be.NewSpiderConfig(cssMark) if err != nil { qu.Debug("标注信息传输失败!") vm.dnf.Dispatch("debug_event", "标注信息传输失败!") return } if url != "" { sc.Href = url } _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe) qu.Debug("1浏览器打开", *sc) vm.dnf.Dispatch("debug_event", "1 浏览器打开") defer func() { cancel() baseCancel() qu.Debug("0浏览器已经销毁") vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁") close(exit) }() chromedp.Run(ctx, chromedp.Tasks{ chromedp.Navigate(sc.Href), //打开页面 chromedp.WaitReady("document.body", chromedp.ByJSPath), //等待body加载完毕 chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待 }) vm.dnf.Dispatch("debug_event", "2 页面已经打开") vm.dnf.Dispatch("debug_event", "3 初始化列表页信息") //1、列表页信息初始化 if !vm.InitListPage(ctx, sc) { vm.dnf.Dispatch("debug_event", "3 初始化列表页失败,退出") return } var runJs string = sc.ListJSCode listResult := make(be.ResultItems, 0) //2、执行JS代码,获取列表页信息 if be.RegSpace.ReplaceAllString(runJs, "") == "" { runJs = renderJavascriptCoder(loadListItemsJS, sc) } qu.Debug("获取列表JS代码:", runJs) err = chromedp.Run(ctx, chromedp.Tasks{ chromedp.Evaluate(runJs, &listResult), }) if err != nil { qu.Debug("执行JS代码失败", err.Error()) vm.dnf.Dispatch("debug_event", "4 执行JS代码失败") return } vm.dnf.Dispatch("debug_event", "4 获取列表完成") qu.Debug("3获取列表完成", len(listResult)) //3、打开详情页 runJs = sc.ContentJSCode if be.RegSpace.ReplaceAllString(runJs, "") == "" { runJs = renderJavascriptCoder(loadContentJS, sc) } currentResult := list.New() be.DataResults[sc.Code] = currentResult qu.Debug("详情页JS代码:", runJs) no := 1 for _, v := range listResult { select { case <-exit: return default: qu.Debug(v.No, v.Href, v.ListTitle, v.ListPubTime) vm.dnf.Dispatch("debug_event", fmt.Sprintf("5. %d- 待 下载详情页 %s ", v.No, v.ListTitle)) var result string = "" err = chromedp.Run(ctx, chromedp.Tasks{ chromedp.Navigate(v.Href), chromedp.WaitReady(`document.body`, chromedp.ByJSPath), chromedp.Sleep(time.Duration(contentDelay) * time.Millisecond), chromedp.Evaluate(runJs, v), }) v.No = no no++ v.Site = sc.Site v.Channel = sc.Channel if err != nil { qu.Debug("执行JS代码失败", err.Error()) } if len(v.AttachLinks) > 0 { //有附件 vm.dnf.Dispatch("debug_event", fmt.Sprintf("6. 下载附件")) //4、下载附件 downloadAttaches(v, vm.attachesDir) } //关闭当前TAB页 chromedp.Run(ctx, chromedp.Tasks{ chromedp.Evaluate(`var ret="";window.close();ret`, &result), }) vm.dnf.Dispatch("debug_event", fmt.Sprintf("5. %d- 下载详情页 %s 完成", v.No, v.Title)) currentResult.PushBack(v) } } vm.dnf.Dispatch("debug_event", "7 采集测试完成") qu.Debug("5采集测试完成") } // InitPage 初始化页面 func (vm *VM) InitListPage(ctx context.Context, sc *be.SpiderConfig) (initPage bool) { if len(sc.InitList) == 0 { //没有初始化页面行为 return true } for j, ac := range sc.InitList { arc := vm.RunAction(ctx, ac, j) //itype 0:执行成功 1:执行错误 2:超时 if !arc.Result { //动作执行失败,不再执行后续动作 return false } } return true } // RunAction 执行动作 func (vm *VM) RunAction(ctx context.Context, ac *be.Actions, num int) *be.ActionRunResult { ctxTmp, cancel := context.WithTimeout(context.Background(), time.Duration(ac.SleepTime+5000)*time.Millisecond) defer cancel() done := make(chan *be.ActionRunResult) go func() { for { select { case <-ctxTmp.Done(): done <- &be.ActionRunResult{ Result: false, RunResult: be.RUN_ACTION_TIMEOUT, CheckResult: be.CHECK_ACTION_TIMEOUT, } vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+fmt.Sprint(num)+"个动作JS,超时") qu.Debug("3.3 初始化列表页,执行第" + fmt.Sprint(num+1) + "个动作JS,超时") return default: //执行动作 vm.dnf.Dispatch("debug_event", "3.1 初始化列表页,执行第"+fmt.Sprint(num)+"个动作JS") qu.Debug("3.1 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS:", ac.ActionJs) var result string err := chromedp.Run(ctx, chromedp.Tasks{ chromedp.Evaluate(ac.ActionJs, &result), chromedp.Sleep(time.Duration(ac.SleepTime) * time.Millisecond), }) if err != nil { done <- &be.ActionRunResult{ Result: false, RunResult: be.RUN_ACTION_ERROR, CheckResult: be.CHECK_ACTION_NOTCHECK, } vm.dnf.Dispatch("debug_event", "3.1 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS,异常") qu.Debug("3.1 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS,异常", err) return } //检查结果 var checkResult string if ac.CheckJs != "" { vm.dnf.Dispatch("debug_event", "3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS") qu.Debug("3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS:", ac.CheckJs) err = chromedp.Run(ctx, chromedp.Tasks{ chromedp.Evaluate(ac.CheckJs, &checkResult), }) if err != nil { vm.dnf.Dispatch("debug_event", "3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS,异常") qu.Debug("3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS,异常", err) done <- &be.ActionRunResult{ Result: false, RunResult: be.RUN_ACTION_SUCCESS, CheckResult: be.CHECK_ACTION_ERROR, } return } vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS结果:"+checkResult) qu.Debug("3.3 初始化列表页,执行第" + fmt.Sprint(num+1) + "个动作JS结果:" + checkResult) done <- &be.ActionRunResult{ Result: checkResult == be.CHECH_RESULT, RunResult: be.RUN_ACTION_SUCCESS, CheckResult: be.CHECK_ACTION_SUCCESS, } return } vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS结果:true") qu.Debug("3.3 初始化列表页,执行第" + fmt.Sprint(num+1) + "个动作JS结果:true") done <- &be.ActionRunResult{ Result: true, RunResult: be.RUN_ACTION_SUCCESS, CheckResult: be.CHECK_ACTION_NOTCHECK, } return } } }() return <-done } // InitPageTmp 初始化页面 func (vm *VM) InitPageTmp(ctx context.Context, timeout int) bool { //1、页面初始化需要执行的事件(多个动作) initPageJs := `var clicklabel = document.querySelector("#app-base > div > div.IndexContent > div > div > form > div > div.ant-col.ant-col-4 > button.ant-btn.ant-btn-primary > span");if(clicklabel)clicklabel.click();"";` var result string err := chromedp.Run(ctx, chromedp.Tasks{ chromedp.Evaluate(initPageJs, &result), //chromedp.WaitReady(".ant-list-items"), }) if err != nil { qu.Debug("初始化页面JS执行失败", err.Error()) return false } ctxTmp, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Millisecond) defer cancel() done := make(chan bool) go func() { for { select { case <-ctxTmp.Done(): done <- false return default: getJs := `var label = document.querySelector(".ant-list-items");if(label)label.outerText;` err = chromedp.Run(ctx, chromedp.Tasks{ chromedp.Evaluate(getJs, &result), }) if result != "" { done <- true return } time.Sleep(1 * time.Second) // 模拟工作负载 } } }() get := <-done return get } // CountYestodayArts 统计昨日信息发布量 func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64, headless bool, showImage bool, exit chan bool, currentSpiderConfig *be.SpiderConfig) (count int) { sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Href: url}) _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, false) qu.Debug("1浏览器打开") vm.dnf.Dispatch("debug_event", "1 浏览器打开") defer func() { cancel() baseCancel() qu.Debug("0浏览器已经销毁") vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁") vm.dnf.Dispatch("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count)) close(exit) }() //时间比较 now := time.Now() yesterday := now.AddDate(0, 0, -1) // 获取昨天的日期 startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location()) endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond) //TODO 1. chromedp.Run(ctx, chromedp.Tasks{ chromedp.Navigate(sc.Href), chromedp.WaitReady("document.body", chromedp.ByJSPath), chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), }) vm.dnf.Dispatch("debug_event", "2 页面已经打开") qu.Debug("2页面打开") //TODO 2. 执行JS代码,获取列表页信息 runJs := renderJavascriptCoder(loadListItemsJS, sc) tmp := map[string]bool{} //最多翻页1000页 for i := 0; i < MAX_TRUN_PAGE; i++ { select { case <-exit: return default: vm.dnf.Dispatch("debug_event", "3 执行列表页JS") listResult := make(be.ResultItems, 0) err := chromedp.Run(ctx, chromedp.Tasks{ chromedp.Evaluate(runJs, &listResult), }) if err != nil { qu.Debug("执行JS代码失败", err.Error()) vm.dnf.Dispatch("debug_event", "3 执行JS代码失败") return } //TODO 人工智能转换采集到的日期 callAIState := false for j := 0; j < 5; j++ { vm.dnf.Dispatch("debug_event", "3 执行AI提取列表发布时间"+strconv.Itoa(j+1)) err := ai.UpdateResultDateStr(listResult) if err == nil { callAIState = true break } } if !callAIState { vm.dnf.Dispatch("debug_event", "3 多轮次调用AI均未得到合理结果") return } //TODO 日期统计 for _, r := range listResult { day, err := time.Parse("2006-01-02", r.ListPubTime) if err != nil { continue } if _, ok := tmp[r.Href]; ok { //去重 continue } if day.After(startOfYesterday) && day.Before(endOfYesterday) { count += 1 } else if day.Before(startOfYesterday) { return } } vm.dnf.Dispatch("debug_event", fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count)) //TODO 翻页 //fmt.Println("下一页CSS选择器", currentSpiderConfig.ListNextPageCss) chromedp.Run(ctx, chromedp.Tasks{ chromedp.Click(fmt.Sprintf(`document.querySelector("%s")`, currentSpiderConfig.ListNextPageCss), chromedp.ByJSPath), chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond), }) } } return }