package main

import (
	"bytes"
	_ "embed"
	"fmt"
	"log"
	"os"
	"strconv"
	"strings"
	"text/template"
	"time"

	"github.com/bmaupin/go-epub"
	"github.com/chromedp/chromedp"
)

const (
	// MAX_TRUN_PAGE is the upper bound on how many list pages
	// CountYestodayArts evaluates before giving up.
	MAX_TRUN_PAGE = 1000
)

var (
	// loadListItemsJS is the embedded JS template evaluated on list pages.
	//go:embed tpl/load_list_items.js
	loadListItemsJS string

	// loadContentJS is the embedded JS template evaluated on detail pages.
	//go:embed tpl/load_content.js
	loadContentJS string

	// currentResult holds the items collected by the most recent RunSpider
	// call; ExportEpubFile reads it.
	currentResult = make(ResultItems, 0)
)

// renderJavascriptCoder executes the text/template source tpl with sc as its
// data and returns the rendered JavaScript. It logs and returns "" when the
// template cannot be parsed or executed.
func renderJavascriptCoder(tpl string, sc *SpiderConfig) string {
	t, err := template.New("").Parse(tpl)
	if err != nil {
		log.Println("创建JS代码模板失败", err.Error())
		return ""
	}
	var buf bytes.Buffer
	if err := t.Execute(&buf, sc); err != nil {
		log.Println("执行JS代码模板失败", err.Error())
		return ""
	}
	return buf.String()
}

// RunSpider opens url in a browser, evaluates the list-page script to obtain
// the list items, then visits every item's Href and evaluates the content
// script into that item. Visited items are appended to currentResult.
//
// listDealy and contentDelay are the waits, in milliseconds, after the list
// page and each detail page have loaded. A receive on exit aborts the detail
// loop. exit is closed when RunSpider returns, so callers must not close it
// themselves.
func RunSpider(url string, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool) {
	sc := MergeSpiderConfig(currentSpiderConfig, &SpiderConfig{Url: url})
	_, baseCancel, _, _, ctx, cancel := NewBrowser(headless, showImage, proxyServe)
	log.Println("1浏览器打开")
	app.pushMessage("debug_event", "1 浏览器打开")
	defer func() {
		cancel()
		baseCancel()
		log.Println("0浏览器已经销毁")
		app.pushMessage("debug_event", "0 浏览器已经销毁")
		close(exit)
	}()
	currentResult = make(ResultItems, 0)

	// Step 1: open the list page. Without a loaded page the later steps can
	// only produce empty results, so a failure here ends the run.
	err := chromedp.Run(ctx, chromedp.Tasks{
		chromedp.Navigate(sc.Url),
		chromedp.WaitReady("document.body", chromedp.ByJSPath),
		chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
	})
	if err != nil {
		log.Println("打开页面失败", err.Error())
		app.pushMessage("debug_event", "2 打开页面失败")
		return
	}
	app.pushMessage("debug_event", "2 页面已经打开")
	log.Println("2页面打开")

	// Step 2: run the list script and collect the list items.
	listResult := make(ResultItems, 0)
	runJs := renderJavascriptCoder(loadListItemsJS, sc)
	err = chromedp.Run(ctx, chromedp.Tasks{
		chromedp.Evaluate(runJs, &listResult),
	})
	if err != nil {
		log.Println("执行JS代码失败", err.Error())
		app.pushMessage("debug_event", "2 执行JS代码失败")
		return
	}
	app.pushMessage("debug_event", "3 获取列表完成")
	log.Println("3获取列表完成")

	// Step 3: open every detail page. No cap on the number of items is
	// enforced here.
	runJs = renderJavascriptCoder(loadContentJS, sc)
	for _, v := range listResult {
		select {
		case <-exit:
			return
		default:
		}

		app.pushMessage("debug_event", fmt.Sprintf("4. %d- 待 下载详情页 %s ", v.No, v.Title))
		// NOTE(review): v is handed to Evaluate as the result target, which
		// presumably requires ResultItems to hold pointers — confirm.
		err = chromedp.Run(ctx, chromedp.Tasks{
			chromedp.Navigate(v.Href),
			chromedp.WaitReady(`document.body`, chromedp.ByJSPath),
			chromedp.Sleep(time.Duration(contentDelay) * time.Millisecond),
			chromedp.Evaluate(runJs, v),
		})
		if err != nil {
			// Best effort: the item is still recorded below.
			log.Println("执行JS代码失败", err.Error())
		}

		// Ask the page to close the current tab; a failure is only logged.
		var result string
		err = chromedp.Run(ctx, chromedp.Tasks{
			chromedp.Evaluate(`var ret="";window.close();ret`, &result),
		})
		if err != nil {
			log.Println("关闭详情页失败", err.Error())
		}

		app.pushMessage("debug_event", fmt.Sprintf("4. %d- 下载详情页 %s 完成", v.No, v.Title))
		currentResult = append(currentResult, v)
	}
	app.pushMessage("debug_event", "5 采集测试完成")
	log.Println("5采集测试完成")
}

// ExportEpubFile writes the items in currentResult to an EPUB file at
// filepath, one section per item. Errors are reported through the
// "debug_event" message channel; nothing is written when the file cannot be
// created.
func ExportEpubFile(filepath string) {
	output := epub.NewEpub("")
	output.SetTitle(currentSpiderConfig.Site)
	output.SetAuthor("unknow")
	for i, art := range currentResult {
		// NOTE(review): the three literals on this line were garbled (raw
		// line breaks) in the file under review; they are reconstructed as
		// paragraph tags on the assumption that each content line becomes
		// one paragraph — confirm against the original source.
		body := "<p>" + strings.ReplaceAll(art.Content, "\n", "</p><p>") + "</p>"
		if _, err := output.AddSection(body, art.Title, fmt.Sprintf("%06d.xhtml", i+1), ""); err != nil {
			app.pushMessage("debug_event", err.Error())
		}
	}

	fo, err := os.Create(filepath)
	if err != nil {
		app.pushMessage("debug_event", err.Error())
		return
	}
	if _, err := output.WriteTo(fo); err != nil {
		app.pushMessage("debug_event", err.Error())
	}
	if err := fo.Close(); err != nil {
		app.pushMessage("debug_event", err.Error())
	}
}

// yesterdayBounds returns the half-open interval [start, end) covering the
// calendar day before now, in now's location.
func yesterdayBounds(now time.Time) (start, end time.Time) {
	y, m, d := now.Date()
	end = time.Date(y, m, d, 0, 0, 0, 0, now.Location())
	start = end.AddDate(0, 0, -1)
	return start, end
}

// CountYestodayArts counts the list items published yesterday. It opens url,
// then repeatedly evaluates the list script, normalises the publish dates
// with UpdateResultDateStr, tallies items dated yesterday, and clicks the
// element selected by currentSpiderConfig.ListNextPageCss to turn the page.
//
// Counting stops at the first item dated before yesterday, when a page yields
// no item that has not been seen already, after MAX_TRUN_PAGE pages, on any
// browser error, or on a receive from exit. listDealy and trunPageDelay are
// waits in milliseconds. exit is closed when the function returns, so callers
// must not close it themselves.
func CountYestodayArts(url string, listDealy int64, trunPageDelay int64, headless bool, showImage bool, exit chan bool) (count int) {
	sc := MergeSpiderConfig(currentSpiderConfig, &SpiderConfig{Url: url})
	_, baseCancel, _, _, ctx, cancel := NewBrowser(headless, showImage, "")
	log.Println("1浏览器打开")
	app.pushMessage("debug_event", "1 浏览器打开")
	// count is a named result so that this deferred report sees the final
	// value.
	defer func() {
		cancel()
		baseCancel()
		log.Println("0浏览器已经销毁")
		app.pushMessage("debug_event", "0 浏览器已经销毁")
		app.pushMessage("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count))
		close(exit)
	}()

	now := time.Now()
	startOfYesterday, startOfToday := yesterdayBounds(now)

	// Step 1: open the list page.
	err := chromedp.Run(ctx, chromedp.Tasks{
		chromedp.Navigate(sc.Url),
		chromedp.WaitReady("document.body", chromedp.ByJSPath),
		chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
	})
	if err != nil {
		log.Println("打开页面失败", err.Error())
		app.pushMessage("debug_event", "2 打开页面失败")
		return count
	}
	app.pushMessage("debug_event", "2 页面已经打开")
	log.Println("2页面打开")

	// Step 2: evaluate the list script page by page.
	runJs := renderJavascriptCoder(loadListItemsJS, sc)
	seen := make(map[string]struct{}) // Hrefs already tallied, for de-duplication.
	for i := 0; i < MAX_TRUN_PAGE; i++ {
		select {
		case <-exit:
			return count
		default:
		}

		app.pushMessage("debug_event", "3 执行列表页JS")
		listResult := make(ResultItems, 0)
		err := chromedp.Run(ctx, chromedp.Tasks{
			chromedp.Evaluate(runJs, &listResult),
		})
		if err != nil {
			log.Println("执行JS代码失败", err.Error())
			app.pushMessage("debug_event", "3 执行JS代码失败")
			return count
		}

		// Let the AI helper rewrite the scraped dates; up to five attempts.
		converted := false
		for attempt := 1; attempt <= 5; attempt++ {
			app.pushMessage("debug_event", "3 执行AI提取列表发布时间"+strconv.Itoa(attempt))
			if err := UpdateResultDateStr(listResult); err == nil {
				converted = true
				break
			}
		}
		if !converted {
			app.pushMessage("debug_event", "3 多轮次调用AI均未得到合理结果")
			return count
		}

		// Tally the items on this page.
		fresh := 0
		for _, r := range listResult {
			if _, dup := seen[r.Href]; dup {
				continue
			}
			seen[r.Href] = struct{}{}
			fresh++

			// Parse in the local zone so the date lines up with the
			// local-midnight bounds computed above.
			day, err := time.ParseInLocation("2006-01-02", r.ListPubTime, now.Location())
			if err != nil {
				continue
			}
			if day.Before(startOfYesterday) {
				// Older than yesterday: stop counting here.
				return count
			}
			if day.Before(startOfToday) {
				count++
			}
		}
		app.pushMessage("debug_event", fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count))

		if fresh == 0 {
			// The page showed nothing new, so turning pages is no longer
			// making progress.
			log.Println("列表页没有新的条目,停止翻页")
			return count
		}

		// Turn the page. %q quotes the selector so that quotes inside it
		// cannot break the generated JavaScript expression.
		nextPage := fmt.Sprintf("document.querySelector(%q)", currentSpiderConfig.ListNextPageCss)
		err = chromedp.Run(ctx, chromedp.Tasks{
			chromedp.Click(nextPage, chromedp.ByJSPath),
			chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
		})
		if err != nil {
			log.Println("翻页失败", err.Error())
			app.pushMessage("debug_event", "4 翻页失败")
			return count
		}
	}
	return count
}