123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272 |
- package browser
- import (
- "KeyWebsiteMonitor/spider/db"
- . "KeyWebsiteMonitor/spider/types"
- "bytes"
- "container/list"
- _ "embed"
- "fmt"
- "log"
- "text/template"
- "time"
- . "KeyWebsiteMonitor/spider/util"
- qu "app.yhyue.com/moapp/jybase/common"
- "app.yhyue.com/moapp/jybase/date"
- "github.com/chromedp/chromedp"
- )
const (
	// MAX_TRUN_PAGE caps how many pagination attempts a single list crawl
	// performs (see CountYestodayArts).
	MAX_TRUN_PAGE = 1000
)

var (
	// Templated JS that extracts list items from a list page.
	//go:embed js/load_list_items.js
	loadListItemsJS string
	// Templated JS that extracts fields from a content/detail page.
	//go:embed js/load_content.js
	loadContentJS string
	// currentResult accumulates scraped items.
	currentResult = make(ResultItems, 0)
	// AttachesDir is the directory for downloaded attachments (empty = unset).
	AttachesDir = ""
)
- // renderJavascriptCoder
- func renderJavascriptCoder(tpl string, sc *SpiderConfig) string {
- t, err := template.New("").Parse(tpl)
- if err != nil {
- log.Println("创建JS代码模板失败", err.Error())
- return ""
- }
- buf := new(bytes.Buffer)
- err = t.Execute(buf, sc)
- if err != nil {
- log.Println("执行JS代码模板失败", err.Error())
- return ""
- }
- return buf.String()
- }
var (
	// exitCh signals a running crawl to stop; DoTask recreates it per
	// config and CountYestodayArts closes it on completion.
	exitCh chan bool
	// currentResults holds in-flight result items (list-based counterpart
	// of ResultItems).
	currentResults = list.New()
)
// DoTask walks every "luaconfig" document that carries a "cssmark" field
// (selector configuration produced by the visual spider platform), replays
// each configured list page through CountYestodayArts, and saves yesterday's
// publication count per site/channel into the "checklist" collection.
func DoTask() {
	// Only visual-platform configs have "cssmark".
	query := map[string]interface{}{
		"cssmark": map[string]interface{}{
			"$exists": true,
		},
	}
	sess := db.Mgo.GetMgoConn()
	defer db.Mgo.DestoryMongoConn(sess)
	// Project only the fields needed to drive a crawl.
	it := sess.DB("zxl").C("luaconfig").Find(query).Sort("_id").Select(map[string]interface{}{
		"code":    1,
		"href":    1,
		"cssmark": 1,
		"site":    1,
		"channel": 1,
	}).Iter()
	total := 0
	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
		if total%5000 == 0 {
			log.Println("cur index ", total)
		}
		// NOTE(review): assumes qu.ObjToMap never returns nil here because the
		// query guarantees "cssmark" exists — confirm, otherwise the
		// dereferences below panic.
		cssmark := qu.ObjToMap(tmp["cssmark"])
		href := qu.ObjToString(tmp["href"])
		site := qu.ObjToString(tmp["site"])
		channel := qu.ObjToString(tmp["channel"])
		code := qu.ObjToString(tmp["code"])
		// Delays in milliseconds: after initial page load / after each page turn.
		listDealy := qu.Int64All((*cssmark)["listDelayTime"])
		trunPageDelay := qu.Int64All((*cssmark)["listTurnDelayTime"])
		///--- selector set from the visual spider platform v1.0
		listBodyCss := qu.ObjToString((*cssmark)["listBodyCss"])
		listItemCss := qu.ObjToString((*cssmark)["listItemCss"])
		listLinkCss := qu.ObjToString((*cssmark)["listLinkCss"])
		listPubtimeCss := qu.ObjToString((*cssmark)["listPublishTimeCss"])
		listNextPageCss := qu.ObjToString((*cssmark)["listNextPageCss"])
		titleCss := qu.ObjToString((*cssmark)["titleCss"])
		publishUnitCss := qu.ObjToString((*cssmark)["publishUnitCss"])
		publishTimeCss := qu.ObjToString((*cssmark)["publishTimeCss"])
		contentCss := qu.ObjToString((*cssmark)["contentCss"])
		attachCss := qu.ObjToString((*cssmark)["attachCss"])
		listJSCode := qu.ObjToString((*cssmark)["listJSCode"])
		contentJSCode := qu.ObjToString((*cssmark)["contentJSCode"])
		attachJSCode := qu.ObjToString((*cssmark)["attachJSCode"])
		listTrunPageJSCode := qu.ObjToString((*cssmark)["listTrunPageJSCode"])
		// Browser flags passed through to NewBrowser.
		headless := false
		showImage := false
		// Fresh stop channel per config; CountYestodayArts closes it on return.
		exitCh = make(chan bool, 1)
		// Bundle the selectors/JS for this channel into a spider config.
		currentSpiderConfig := &SpiderConfig{
			Site:               site,
			Channel:            channel,
			Url:                href,
			Code:               code,
			ListBodyCss:        listBodyCss,
			ListItemCss:        listItemCss,
			ListLinkCss:        listLinkCss,
			ListPubtimeCss:     listPubtimeCss,
			ListNextPageCss:    listNextPageCss,
			TitleCss:           titleCss,
			PublishUnitCss:     publishUnitCss,
			PublishTimeCss:     publishTimeCss,
			ContentCss:         contentCss,
			AttachCss:          attachCss,
			ListJSCode:         listJSCode,
			ContentJSCode:      contentJSCode,
			AttachJSCode:       attachJSCode,
			ListTrunPageJSCode: listTrunPageJSCode,
		}
		// Crawl and count yesterday's publications for this channel.
		c, lrt := CountYestodayArts(href, listDealy, trunPageDelay, headless, showImage, exitCh, currentSpiderConfig)
		result := map[string]interface{}{
			"count":        c,
			"todayHasData": c > 0,
		}
		// Flatten the matched items into a 1-based-index → fields map.
		todayData := map[string]interface{}{}
		for k, v := range lrt {
			todayData[fmt.Sprint(k+1)] = map[string]interface{}{
				"text": v.Title,
				"date": v.ListPubTime,
				"href": v.Href,
			}
		}
		result["todayData"] = todayData
		// Persist the check result keyed by yesterday's date.
		day := time.Now().AddDate(0, 0, -1).Format(date.Date_Short_Layout)
		db.Mgo.Save("checklist", map[string]interface{}{
			"day":        day,
			"site":       site,
			"spidercode": code,
			"channel":    channel,
			"channelurl": href,
			"comeintime": time.Now().Unix(),
			"result":     result,
		})
		// Reset the reusable decode target before the next it.Next.
		tmp = make(map[string]interface{})
	}
}
- // CountYestodayArts 统计昨日信息发布量
- func CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
- headless bool, showImage bool, exit chan bool, currentSpiderConfig *SpiderConfig) (count int, lrt []*ResultItem) {
- sc := MergeSpiderConfig(currentSpiderConfig, &SpiderConfig{Url: url})
- _, baseCancel, _, _, ctx, cancel := NewBrowser(headless, showImage, "")
- log.Println("1浏览器打开")
- defer func() {
- cancel()
- baseCancel()
- log.Println("0浏览器已经销毁")
- log.Println(fmt.Sprintf("99 昨日信息发布量:%d ", count))
- close(exit)
- }()
- //时间比较
- // now := time.Unix(1721836800, 0)
- now := time.Now() /*.AddDate(0, 0, 1)*/
- yesterday := now.AddDate(0, 0, -1) // 获取昨天的日期
- startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location())
- endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond)
- log.Println(startOfYesterday)
- log.Println(endOfYesterday)
- //TODO 1.
- chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Navigate(sc.Url),
- chromedp.WaitReady("document.body", chromedp.ByJSPath),
- chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
- })
- log.Println("2页面打开")
- //TODO 2. 执行JS代码,获取列表页信息
- runJs := renderJavascriptCoder(loadListItemsJS, sc)
- tmp := map[string]bool{}
- //最多翻页1000页
- for i := 0; i < MAX_TRUN_PAGE; i++ {
- select {
- case <-exit:
- log.Println("$$$")
- return
- default:
- log.Println("3执行列表页JS")
- listResult := make(ResultItems, 0)
- err := chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(runJs, &listResult),
- })
- if err != nil {
- log.Println("执行JS代码失败", err.Error())
- return
- }
- //采集列表页发布时间转换
- //TODO 日期统计
- for _, r := range listResult {
- day, err := time.Parse("2006-01-02", r.ListPubTime)
- if err != nil {
- continue
- }
- if _, ok := tmp[r.Href]; ok { //去重
- continue
- }
- if day.After(startOfYesterday) && day.Before(endOfYesterday) {
- count += 1
- lrt = append(lrt, &ResultItem{
- SpiderCode: r.SpiderCode,
- Site: r.Site,
- Href: r.Href,
- ListPubTime: r.ListPubTime,
- Title: RemoveNewlinesAndSpaces(r.ListTitle),
- })
- } else if day.Before(startOfYesterday) {
- return
- }
- }
- log.Println(fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count))
- //TODO 翻页
- if currentSpiderConfig.ListBodyCss == "" || sc.ListNextPageCss == "" {
- log.Println("当前爬虫配置,不具备翻页条件")
- return
- }
- var runJs, result string = currentSpiderConfig.ListTrunPageJSCode, ""
- if runJs == "" {
- runJs = fmt.Sprintf(`var link=document.querySelector("%s");if(link)link.click();""`, sc.ListNextPageCss)
- }
- var result1, result2 string
- var checkRunJs = fmt.Sprintf(`document.querySelector("%s").outerText`, sc.ListBodyCss)
- log.Println("检查翻页是否成功,执行的JS", checkRunJs)
- err = chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(checkRunJs, &result1),
- })
- if err != nil {
- log.Println("翻页检查1失败,", checkRunJs)
- return
- }
- //可能就没有分页
- err = chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(runJs, &result),
- chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
- })
- if err != nil {
- log.Println("翻页操作失败,", runJs)
- return
- }
- err = chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(checkRunJs, &result2),
- })
- if err != nil {
- log.Println("翻页检查2失败,", checkRunJs)
- return
- }
- if result1 == "" || result2 == "" || result1 == result2 {
- log.Println("翻页失败,两次翻页获取到的列表区域块不符合要求")
- return
- }
- }
- }
- return
- }
|