// Package browser drives a headless/visible Chrome instance (via chromedp)
// to inspect configured list pages and count the items published yesterday.
package browser

import (
	"bytes"
	"container/list"
	_ "embed"
	"fmt"
	"log"
	"text/template"
	"time"

	"KeyWebsiteMonitor/spider/db"
	. "KeyWebsiteMonitor/spider/types"
	. "KeyWebsiteMonitor/spider/util"

	qu "app.yhyue.com/moapp/jybase/common"
	"app.yhyue.com/moapp/jybase/date"
	"github.com/chromedp/chromedp"
)

const (
	// MAX_TRUN_PAGE is the upper bound on how many list pages
	// CountYestodayArts will walk through for a single channel.
	MAX_TRUN_PAGE = 1000
)

var (
	// loadListItemsJS is the JS template evaluated on a list page to
	// extract its items.
	//go:embed js/load_list_items.js
	loadListItemsJS string
	// loadContentJS is the embedded JS template for detail pages; it is
	// not referenced in this part of the file.
	//go:embed js/load_content.js
	loadContentJS string

	currentResult = make(ResultItems, 0)
	AttachesDir   = ""
)

// renderJavascriptCoder renders the JS template tpl with sc as the template
// data and returns the resulting source. On a parse or execution error the
// error is logged and an empty string is returned.
func renderJavascriptCoder(tpl string, sc *SpiderConfig) string {
	t, err := template.New("").Parse(tpl)
	if err != nil {
		log.Println("创建JS代码模板失败", err.Error())
		return ""
	}
	var buf bytes.Buffer
	if err := t.Execute(&buf, sc); err != nil {
		log.Println("执行JS代码模板失败", err.Error())
		return ""
	}
	return buf.String()
}

var (
	// exitCh is the stop channel handed to CountYestodayArts; DoTask
	// replaces it for every record because the callee closes it on return.
	exitCh         chan bool
	currentResults = list.New()
)

// DoTask iterates over every document in zxl.luaconfig that has a "cssmark"
// field, counts yesterday's publications for each one through
// CountYestodayArts, and saves one summary document per record into the
// "checklist" collection.
func DoTask() {
	query := map[string]interface{}{
		"cssmark": map[string]interface{}{
			"$exists": true,
		},
	}
	sess := db.Mgo.GetMgoConn()
	defer db.Mgo.DestoryMongoConn(sess)

	it := sess.DB("zxl").C("luaconfig").Find(query).Sort("_id").Select(map[string]interface{}{
		"code":    1,
		"href":    1,
		"cssmark": 1,
		"site":    1,
		"channel": 1,
	}).Iter()
	// Release the cursor and surface any iteration error; deferred after
	// DestoryMongoConn so it runs before the session is handed back.
	defer func() {
		if err := it.Close(); err != nil {
			log.Println("遍历luaconfig失败", err.Error())
		}
	}()

	for total := 0; ; total++ {
		// A fresh map per record so that keys from a previous document
		// can never leak into the next one.
		record := make(map[string]interface{})
		if !it.Next(&record) {
			break
		}
		if total%5000 == 0 {
			log.Println("cur index ", total)
		}
		processSiteRecord(record)
	}
}

// processSiteRecord builds a SpiderConfig from one luaconfig document, runs
// CountYestodayArts for it and stores the outcome in the "checklist"
// collection. Records whose "cssmark" cannot be read as a map are logged
// and skipped.
func processSiteRecord(record map[string]interface{}) {
	href := qu.ObjToString(record["href"])
	site := qu.ObjToString(record["site"])
	channel := qu.ObjToString(record["channel"])
	code := qu.ObjToString(record["code"])

	cssmark := qu.ObjToMap(record["cssmark"])
	if cssmark == nil {
		// Dereferencing a nil map pointer would panic and abort the
		// whole scan; skip just this record instead.
		log.Println("cssmark配置无效,跳过", code)
		return
	}
	css := *cssmark

	listDealy := qu.Int64All(css["listDelayTime"])
	trunPageDelay := qu.Int64All(css["listTurnDelayTime"])

	// --- visual spider platform v1.0 selectors and custom JS snippets
	currentSpiderConfig := &SpiderConfig{
		Site:               site,
		Channel:            channel,
		Url:                href,
		Code:               code,
		ListBodyCss:        qu.ObjToString(css["listBodyCss"]),
		ListItemCss:        qu.ObjToString(css["listItemCss"]),
		ListLinkCss:        qu.ObjToString(css["listLinkCss"]),
		ListPubtimeCss:     qu.ObjToString(css["listPublishTimeCss"]),
		ListNextPageCss:    qu.ObjToString(css["listNextPageCss"]),
		TitleCss:           qu.ObjToString(css["titleCss"]),
		PublishUnitCss:     qu.ObjToString(css["publishUnitCss"]),
		PublishTimeCss:     qu.ObjToString(css["publishTimeCss"]),
		ContentCss:         qu.ObjToString(css["contentCss"]),
		AttachCss:          qu.ObjToString(css["attachCss"]),
		ListJSCode:         qu.ObjToString(css["listJSCode"]),
		ContentJSCode:      qu.ObjToString(css["contentJSCode"]),
		AttachJSCode:       qu.ObjToString(css["attachJSCode"]),
		ListTrunPageJSCode: qu.ObjToString(css["listTrunPageJSCode"]),
	}

	headless := false
	showImage := false
	// CountYestodayArts closes the channel it is given, so a new one is
	// required for every call.
	exitCh = make(chan bool, 1)

	c, lrt := CountYestodayArts(href, listDealy, trunPageDelay, headless, showImage, exitCh, currentSpiderConfig)

	todayData := map[string]interface{}{}
	for k, v := range lrt {
		// Keys are 1-based positions rendered as strings.
		todayData[fmt.Sprint(k+1)] = map[string]interface{}{
			"text": v.Title,
			"date": v.ListPubTime,
			"href": v.Href,
		}
	}
	result := map[string]interface{}{
		"count":        c,
		"todayHasData": c > 0,
		"todayData":    todayData,
	}

	// Persist the summary, keyed by yesterday's date.
	day := time.Now().AddDate(0, 0, -1).Format(date.Date_Short_Layout)
	db.Mgo.Save("checklist", map[string]interface{}{
		"day":        day,
		"site":       site,
		"spidercode": code,
		"channel":    channel,
		"channelurl": href,
		"comeintime": time.Now().Unix(),
		"result":     result,
	})
}

// CountYestodayArts counts the list items whose publish date is yesterday
// (in the local time zone) for a single channel.
//
// It opens a browser, navigates to url, waits listDealy milliseconds, then
// repeatedly evaluates the list-extraction JS and turns the page (waiting
// trunPageDelay milliseconds after each turn) for at most MAX_TRUN_PAGE
// pages. Walking stops as soon as an item older than yesterday is seen, a
// JS evaluation fails, the config lacks a list-body or next-page selector,
// or the list body text did not change after a page turn.
//
// headless and showImage are forwarded to NewBrowser. exit is polled before
// every page so a caller can abort the walk; it is CLOSED by this function
// when it returns, so it must not be reused or closed by the caller.
//
// It returns the number of yesterday's items and the items themselves.
// Items whose ListPubTime is not in "2006-01-02" form are ignored.
func CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
	headless bool, showImage bool, exit chan bool,
	currentSpiderConfig *SpiderConfig) (count int, lrt []*ResultItem) {
	sc := MergeSpiderConfig(currentSpiderConfig, &SpiderConfig{Url: url})
	_, baseCancel, _, _, ctx, cancel := NewBrowser(headless, showImage, "")
	log.Println("1浏览器打开")
	defer func() {
		cancel()
		baseCancel()
		log.Println("0浏览器已经销毁")
		// Reads the named result, so it reports the final count.
		log.Printf("99 昨日信息发布量:%d ", count)
		close(exit)
	}()

	// Bounds of "yesterday" in the local time zone.
	now := time.Now()
	yesterday := now.AddDate(0, 0, -1)
	startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location())
	endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond)
	log.Println(startOfYesterday)
	log.Println(endOfYesterday)

	// Step 1: open the list page and give it time to render.
	err := chromedp.Run(ctx, chromedp.Tasks{
		chromedp.Navigate(sc.Url),
		chromedp.WaitReady("document.body", chromedp.ByJSPath),
		chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
	})
	if err != nil {
		log.Println("打开页面失败", err.Error())
		return count, lrt
	}
	log.Println("2页面打开")

	// Step 2: run the extraction JS on each page of the list.
	listJS := renderJavascriptCoder(loadListItemsJS, sc)
	// Hrefs already processed, so an item that reappears on a later page
	// is not counted twice.
	seen := map[string]struct{}{}

	for i := 0; i < MAX_TRUN_PAGE; i++ {
		select {
		case <-exit:
			log.Println("$$$")
			return count, lrt
		default:
		}

		log.Println("3执行列表页JS")
		listResult := make(ResultItems, 0)
		err := chromedp.Run(ctx, chromedp.Tasks{
			chromedp.Evaluate(listJS, &listResult),
		})
		if err != nil {
			log.Println("执行JS代码失败", err.Error())
			return count, lrt
		}

		for _, r := range listResult {
			// Parse in the local zone so the date is comparable with
			// the local-time bounds above regardless of UTC offset.
			day, err := time.ParseInLocation("2006-01-02", r.ListPubTime, now.Location())
			if err != nil {
				continue
			}
			// Items without an href cannot be told apart, so they are
			// never treated as duplicates.
			if r.Href != "" {
				if _, dup := seen[r.Href]; dup {
					continue
				}
				seen[r.Href] = struct{}{}
			}
			switch {
			case day.Before(startOfYesterday):
				// The first item older than yesterday ends the walk.
				return count, lrt
			case !day.After(endOfYesterday):
				count++
				lrt = append(lrt, &ResultItem{
					SpiderCode:  r.SpiderCode,
					Site:        r.Site,
					Href:        r.Href,
					ListPubTime: r.ListPubTime,
					Title:       RemoveNewlinesAndSpaces(r.ListTitle),
				})
			}
		}
		log.Printf("4 当前观测昨日信息发布量:%d ", count)

		// Step 3: turn the page.
		if currentSpiderConfig.ListBodyCss == "" || sc.ListNextPageCss == "" {
			log.Println("当前爬虫配置,不具备翻页条件")
			return count, lrt
		}
		// NOTE(review): the selectors are interpolated into JS without
		// escaping, so a selector containing a double quote yields
		// invalid JS — confirm how configs are written before changing.
		turnPageJS := currentSpiderConfig.ListTrunPageJSCode
		if turnPageJS == "" {
			turnPageJS = fmt.Sprintf(`var link=document.querySelector("%s");if(link)link.click();""`, sc.ListNextPageCss)
		}
		checkRunJs := fmt.Sprintf(`document.querySelector("%s").outerText`, sc.ListBodyCss)
		log.Println("检查翻页是否成功,执行的JS", checkRunJs)

		// Snapshot the list body text before the turn...
		var before, after, turnResult string
		err = chromedp.Run(ctx, chromedp.Tasks{
			chromedp.Evaluate(checkRunJs, &before),
		})
		if err != nil {
			log.Println("翻页检查1失败,", checkRunJs)
			return count, lrt
		}
		// ...trigger the turn (the site may have no further page)...
		err = chromedp.Run(ctx, chromedp.Tasks{
			chromedp.Evaluate(turnPageJS, &turnResult),
			chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
		})
		if err != nil {
			log.Println("翻页操作失败,", turnPageJS)
			return count, lrt
		}
		// ...and snapshot again; unchanged text means the turn failed.
		err = chromedp.Run(ctx, chromedp.Tasks{
			chromedp.Evaluate(checkRunJs, &after),
		})
		if err != nil {
			log.Println("翻页检查2失败,", checkRunJs)
			return count, lrt
		}
		if before == "" || after == "" || before == after {
			log.Println("翻页失败,两次翻页获取到的列表区域块不符合要求")
			return count, lrt
		}
	}
	return count, lrt
}