123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424 |
- package vm
- import (
- "container/list"
- "context"
- _ "embed"
- "fmt"
- "github.com/chromedp/chromedp"
- qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- be "spider_creator/backend"
- "spider_creator/backend/ai"
- "strconv"
- "time"
- )
- // NewVM
- func NewVM(attachesDir string, dnf be.EventNotifyFace) *VM {
- return &VM{
- attachesDir, dnf,
- }
- }
- func (vm *VM) RunSpiderTmp(url string, maxPages int, listDealy, trunPageDelay, contentDelay int64, headless bool, showImage bool, proxyServe bool, exit chan bool, cssMark map[string]interface{}) {
- sc, err := be.NewSpiderConfig(cssMark)
- if err != nil {
- qu.Debug("标注信息传输失败!")
- vm.dnf.Dispatch("debug_event", "标注信息传输失败!")
- return
- }
- if url != "" {
- sc.Href = url
- }
- _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
- qu.Debug("1浏览器打开", *sc)
- vm.dnf.Dispatch("debug_event", "1 浏览器打开")
- defer func() {
- cancel()
- baseCancel()
- qu.Debug("0浏览器已经销毁")
- vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
- close(exit)
- }()
- chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Navigate(sc.Href), //打开页面
- chromedp.WaitReady("document.body", chromedp.ByJSPath), //等待body加载完毕
- chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
- })
- vm.dnf.Dispatch("debug_event", "2 页面已经打开")
- qu.Debug("2页面打开")
- var runJs string = sc.ListJSCode
- //列表页信息初始化
- vm.dnf.Dispatch("debug_event", "3 初始化列表页信息")
- if !vm.InitListPage(ctx, sc) {
- vm.dnf.Dispatch("debug_event", "3 初始化列表页失败,退出")
- return
- }
- //TODO 2. 执行JS代码,获取列表页信息
- if be.RegSpace.ReplaceAllString(runJs, "") == "" {
- runJs = renderJavascriptCoder(loadListItemsJS, sc)
- }
- qu.Debug("列表页执行JS:", runJs)
- currentResult := list.New()
- be.DataResults[sc.Code] = currentResult
- no := 1
- for j := 0; j < maxPages; j++ {
- qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
- listResult := make(be.ResultItems, 0)
- err = chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(runJs, &listResult),
- })
- if err != nil {
- qu.Debug("执行JS代码失败", err.Error())
- vm.dnf.Dispatch("debug_event", "2 第"+fmt.Sprint(j+1)+"页执行JS代码失败")
- continue
- }
- qu.Debug("第"+fmt.Sprint(j+1)+"页列表采集条数:", len(listResult))
- vm.dnf.Dispatch("debug_event", "3 第"+fmt.Sprint(j+1)+"获取列表完成")
- for _, v := range listResult {
- select {
- case <-exit:
- return
- default:
- qu.Debug(v.No, v.ListTitle, v.Href, v.Title, v.PublishTime, v.ListPubTime)
- v.Site = sc.Site
- v.Channel = sc.Channel
- v.Title = v.ListTitle
- v.Content = "详见正文"
- v.PublishTime = v.ListPubTime
- v.No = no
- no++
- currentResult.PushBack(v)
- }
- }
- vm.dnf.Dispatch("debug_event", "4 第"+fmt.Sprint(j+1)+"页采集完成,准备执行翻页")
- if j < maxPages-1 {
- if err = trunPage(sc, trunPageDelay, ctx); err != nil {
- qu.Debug("翻页失败", err.Error())
- vm.dnf.Dispatch("debug_event", "5 第"+fmt.Sprint(j+1)+"页翻页失败")
- time.Sleep(3 * time.Second)
- break
- }
- }
- }
- vm.dnf.Dispatch("debug_event", "6 采集测试完成")
- qu.Debug("6采集测试完成")
- }
- // RunSpider 适用于测试1页数据
- func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe bool, exit chan bool, cssMark map[string]interface{}) {
- sc, err := be.NewSpiderConfig(cssMark)
- if err != nil {
- qu.Debug("标注信息传输失败!")
- vm.dnf.Dispatch("debug_event", "标注信息传输失败!")
- return
- }
- if url != "" {
- sc.Href = url
- }
- _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
- qu.Debug("1浏览器打开", *sc)
- vm.dnf.Dispatch("debug_event", "1 浏览器打开")
- defer func() {
- cancel()
- baseCancel()
- qu.Debug("0浏览器已经销毁")
- vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
- close(exit)
- }()
- chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Navigate(sc.Href), //打开页面
- chromedp.WaitReady("document.body", chromedp.ByJSPath), //等待body加载完毕
- chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
- })
- vm.dnf.Dispatch("debug_event", "2 页面已经打开")
- vm.dnf.Dispatch("debug_event", "3 初始化列表页信息")
- //1、列表页信息初始化
- if !vm.InitListPage(ctx, sc) {
- vm.dnf.Dispatch("debug_event", "3 初始化列表页失败,退出")
- return
- }
- var runJs string = sc.ListJSCode
- listResult := make(be.ResultItems, 0)
- //2、执行JS代码,获取列表页信息
- if be.RegSpace.ReplaceAllString(runJs, "") == "" {
- runJs = renderJavascriptCoder(loadListItemsJS, sc)
- }
- qu.Debug("获取列表JS代码:", runJs)
- err = chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(runJs, &listResult),
- })
- if err != nil {
- qu.Debug("执行JS代码失败", err.Error())
- vm.dnf.Dispatch("debug_event", "4 执行JS代码失败")
- return
- }
- vm.dnf.Dispatch("debug_event", "4 获取列表完成")
- qu.Debug("3获取列表完成", len(listResult))
- //3、打开详情页
- runJs = sc.ContentJSCode
- if be.RegSpace.ReplaceAllString(runJs, "") == "" {
- runJs = renderJavascriptCoder(loadContentJS, sc)
- }
- currentResult := list.New()
- be.DataResults[sc.Code] = currentResult
- qu.Debug("详情页JS代码:", runJs)
- no := 1
- for _, v := range listResult {
- select {
- case <-exit:
- return
- default:
- qu.Debug(v.No, v.Href, v.ListTitle, v.ListPubTime)
- vm.dnf.Dispatch("debug_event", fmt.Sprintf("5. %d- 待 下载详情页 %s ", v.No, v.ListTitle))
- var result string = ""
- err = chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Navigate(v.Href),
- chromedp.WaitReady(`document.body`, chromedp.ByJSPath),
- chromedp.Sleep(time.Duration(contentDelay) * time.Millisecond),
- chromedp.Evaluate(runJs, v),
- })
- v.No = no
- no++
- v.Site = sc.Site
- v.Channel = sc.Channel
- if err != nil {
- qu.Debug("执行JS代码失败", err.Error())
- }
- if len(v.AttachLinks) > 0 { //有附件
- vm.dnf.Dispatch("debug_event", fmt.Sprintf("6. 下载附件"))
- //4、下载附件
- downloadAttaches(v, vm.attachesDir)
- }
- //关闭当前TAB页
- chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(`var ret="";window.close();ret`, &result),
- })
- vm.dnf.Dispatch("debug_event", fmt.Sprintf("5. %d- 下载详情页 %s 完成", v.No, v.Title))
- currentResult.PushBack(v)
- }
- }
- vm.dnf.Dispatch("debug_event", "7 采集测试完成")
- qu.Debug("5采集测试完成")
- }
- // InitPage 初始化页面
- func (vm *VM) InitListPage(ctx context.Context, sc *be.SpiderConfig) (initPage bool) {
- if len(sc.InitList) == 0 { //没有初始化页面行为
- return true
- }
- for j, ac := range sc.InitList {
- arc := vm.RunAction(ctx, ac, j) //itype 0:执行成功 1:执行错误 2:超时
- if !arc.Result { //动作执行失败,不再执行后续动作
- return false
- }
- }
- return true
- }
- // RunAction 执行动作
- func (vm *VM) RunAction(ctx context.Context, ac *be.Actions, num int) *be.ActionRunResult {
- ctxTmp, cancel := context.WithTimeout(context.Background(), time.Duration(ac.SleepTime+5000)*time.Millisecond)
- defer cancel()
- done := make(chan *be.ActionRunResult)
- go func() {
- for {
- select {
- case <-ctxTmp.Done():
- done <- &be.ActionRunResult{
- Result: false,
- RunResult: be.RUN_ACTION_TIMEOUT,
- CheckResult: be.CHECK_ACTION_TIMEOUT,
- }
- vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+fmt.Sprint(num)+"个动作JS,超时")
- qu.Debug("3.3 初始化列表页,执行第" + fmt.Sprint(num+1) + "个动作JS,超时")
- return
- default:
- //执行动作
- vm.dnf.Dispatch("debug_event", "3.1 初始化列表页,执行第"+fmt.Sprint(num)+"个动作JS")
- qu.Debug("3.1 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS:", ac.ActionJs)
- var result string
- err := chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(ac.ActionJs, &result),
- chromedp.Sleep(time.Duration(ac.SleepTime) * time.Millisecond),
- })
- if err != nil {
- done <- &be.ActionRunResult{
- Result: false,
- RunResult: be.RUN_ACTION_ERROR,
- CheckResult: be.CHECK_ACTION_NOTCHECK,
- }
- vm.dnf.Dispatch("debug_event", "3.1 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS,异常")
- qu.Debug("3.1 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS,异常", err)
- return
- }
- //检查结果
- var checkResult string
- if ac.CheckJs != "" {
- vm.dnf.Dispatch("debug_event", "3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS")
- qu.Debug("3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS:", ac.CheckJs)
- err = chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(ac.CheckJs, &checkResult),
- })
- if err != nil {
- vm.dnf.Dispatch("debug_event", "3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS,异常")
- qu.Debug("3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS,异常", err)
- done <- &be.ActionRunResult{
- Result: false,
- RunResult: be.RUN_ACTION_SUCCESS,
- CheckResult: be.CHECK_ACTION_ERROR,
- }
- return
- }
- vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS结果:"+checkResult)
- qu.Debug("3.3 初始化列表页,执行第" + fmt.Sprint(num+1) + "个动作JS结果:" + checkResult)
- done <- &be.ActionRunResult{
- Result: checkResult == be.CHECH_RESULT,
- RunResult: be.RUN_ACTION_SUCCESS,
- CheckResult: be.CHECK_ACTION_SUCCESS,
- }
- return
- }
- vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS结果:true")
- qu.Debug("3.3 初始化列表页,执行第" + fmt.Sprint(num+1) + "个动作JS结果:true")
- done <- &be.ActionRunResult{
- Result: true,
- RunResult: be.RUN_ACTION_SUCCESS,
- CheckResult: be.CHECK_ACTION_NOTCHECK,
- }
- return
- }
- }
- }()
- return <-done
- }
- // InitPageTmp 初始化页面
- func (vm *VM) InitPageTmp(ctx context.Context, timeout int) bool {
- //1、页面初始化需要执行的事件(多个动作)
- initPageJs := `var clicklabel = document.querySelector("#app-base > div > div.IndexContent > div > div > form > div > div.ant-col.ant-col-4 > button.ant-btn.ant-btn-primary > span");if(clicklabel)clicklabel.click();"";`
- var result string
- err := chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(initPageJs, &result),
- //chromedp.WaitReady(".ant-list-items"),
- })
- if err != nil {
- qu.Debug("初始化页面JS执行失败", err.Error())
- return false
- }
- ctxTmp, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Millisecond)
- defer cancel()
- done := make(chan bool)
- go func() {
- for {
- select {
- case <-ctxTmp.Done():
- done <- false
- return
- default:
- getJs := `var label = document.querySelector(".ant-list-items");if(label)label.outerText;`
- err = chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(getJs, &result),
- })
- if result != "" {
- done <- true
- return
- }
- time.Sleep(1 * time.Second) // 模拟工作负载
- }
- }
- }()
- get := <-done
- return get
- }
- // CountYestodayArts 统计昨日信息发布量
- func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
- headless bool, showImage bool, exit chan bool, currentSpiderConfig *be.SpiderConfig) (count int) {
- sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Href: url})
- _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, false)
- qu.Debug("1浏览器打开")
- vm.dnf.Dispatch("debug_event", "1 浏览器打开")
- defer func() {
- cancel()
- baseCancel()
- qu.Debug("0浏览器已经销毁")
- vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
- vm.dnf.Dispatch("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count))
- close(exit)
- }()
- //时间比较
- now := time.Now()
- yesterday := now.AddDate(0, 0, -1) // 获取昨天的日期
- startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location())
- endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond)
- //TODO 1.
- chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Navigate(sc.Href),
- chromedp.WaitReady("document.body", chromedp.ByJSPath),
- chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
- })
- vm.dnf.Dispatch("debug_event", "2 页面已经打开")
- qu.Debug("2页面打开")
- //TODO 2. 执行JS代码,获取列表页信息
- runJs := renderJavascriptCoder(loadListItemsJS, sc)
- tmp := map[string]bool{}
- //最多翻页1000页
- for i := 0; i < MAX_TRUN_PAGE; i++ {
- select {
- case <-exit:
- return
- default:
- vm.dnf.Dispatch("debug_event", "3 执行列表页JS")
- listResult := make(be.ResultItems, 0)
- err := chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(runJs, &listResult),
- })
- if err != nil {
- qu.Debug("执行JS代码失败", err.Error())
- vm.dnf.Dispatch("debug_event", "3 执行JS代码失败")
- return
- }
- //TODO 人工智能转换采集到的日期
- callAIState := false
- for j := 0; j < 5; j++ {
- vm.dnf.Dispatch("debug_event", "3 执行AI提取列表发布时间"+strconv.Itoa(j+1))
- err := ai.UpdateResultDateStr(listResult)
- if err == nil {
- callAIState = true
- break
- }
- }
- if !callAIState {
- vm.dnf.Dispatch("debug_event", "3 多轮次调用AI均未得到合理结果")
- return
- }
- //TODO 日期统计
- for _, r := range listResult {
- day, err := time.Parse("2006-01-02", r.ListPubTime)
- if err != nil {
- continue
- }
- if _, ok := tmp[r.Href]; ok { //去重
- continue
- }
- if day.After(startOfYesterday) && day.Before(endOfYesterday) {
- count += 1
- } else if day.Before(startOfYesterday) {
- return
- }
- }
- vm.dnf.Dispatch("debug_event", fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count))
- //TODO 翻页
- //fmt.Println("下一页CSS选择器", currentSpiderConfig.ListNextPageCss)
- chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Click(fmt.Sprintf(`document.querySelector("%s")`, currentSpiderConfig.ListNextPageCss),
- chromedp.ByJSPath),
- chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
- })
- }
- }
- return
- }
|