123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218 |
- package main
- import (
- "bytes"
- _ "embed"
- "fmt"
- "log"
- "os"
- "strconv"
- "strings"
- "text/template"
- "time"
- "github.com/bmaupin/go-epub"
- "github.com/chromedp/chromedp"
- )
// MAX_TRUN_PAGE is the upper bound on how many list pages the paging loop in
// CountYestodayArts will process in a single run.
const MAX_TRUN_PAGE = 1000
- var (
- //go:embed tpl/load_list_items.js
- loadListItemsJS string
- //go:embed tpl/load_content.js
- loadContentJS string
- currentResult = make(ResultItems, 0)
- )
- // renderJavascriptCoder
- func renderJavascriptCoder(tpl string, sc *SpiderConfig) string {
- t, err := template.New("").Parse(tpl)
- if err != nil {
- log.Println("创建JS代码模板失败", err.Error())
- return ""
- }
- buf := new(bytes.Buffer)
- err = t.Execute(buf, sc)
- if err != nil {
- log.Println("执行JS代码模板失败", err.Error())
- return ""
- }
- return buf.String()
- }
- // RunSpider
- func RunSpider(url string, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool) {
- sc := MergeSpiderConfig(currentSpiderConfig, &SpiderConfig{Url: url})
- _, baseCancel, _, _, ctx, cancel := NewBrowser(headless, showImage, proxyServe)
- log.Println("1浏览器打开")
- app.pushMessage("debug_event", "1 浏览器打开")
- defer func() {
- cancel()
- baseCancel()
- log.Println("0浏览器已经销毁")
- app.pushMessage("debug_event", "0 浏览器已经销毁")
- close(exit)
- }()
- currentResult = make(ResultItems, 0, 0)
- chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Navigate(sc.Url),
- chromedp.WaitReady("document.body", chromedp.ByJSPath),
- chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
- })
- app.pushMessage("debug_event", "2 页面已经打开")
- log.Println("2页面打开")
- listResult := make(ResultItems, 0)
- //TODO 2. 执行JS代码,获取列表页信息
- runJs := renderJavascriptCoder(loadListItemsJS, sc)
- err := chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(runJs, &listResult),
- })
- if err != nil {
- log.Println("执行JS代码失败", err.Error())
- app.pushMessage("debug_event", "2 执行JS代码失败")
- return
- }
- app.pushMessage("debug_event", "3 获取列表完成")
- log.Println("3获取列表完成")
- //TODO 3. 打开详情页 ,最多打开10条
- runJs = renderJavascriptCoder(loadContentJS, sc)
- for _, v := range listResult {
- select {
- case <-exit:
- return
- default:
- app.pushMessage("debug_event", fmt.Sprintf("4. %d- 待 下载详情页 %s ", v.No, v.Title))
- var result string = ""
- err = chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Navigate(v.Href),
- chromedp.WaitReady(`document.body`, chromedp.ByJSPath),
- chromedp.Sleep(time.Duration(contentDelay) * time.Millisecond),
- chromedp.Evaluate(runJs, v),
- })
- if err != nil {
- log.Println("执行JS代码失败", err.Error())
- }
- //关闭当前TAB页
- chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(`var ret="";window.close();ret`, &result),
- })
- app.pushMessage("debug_event", fmt.Sprintf("4. %d- 下载详情页 %s 完成", v.No, v.Title))
- currentResult = append(currentResult, v)
- }
- }
- app.pushMessage("debug_event", "5 采集测试完成")
- log.Println("5采集测试完成")
- }
- // ExportEpubFile 导出epub文件
- func ExportEpubFile(filepath string) {
- output := epub.NewEpub("")
- output.SetTitle(currentSpiderConfig.Site)
- output.SetAuthor("unknow")
- for i, art := range currentResult {
- body := "<h2>" + art.Title + "</h2><p>" + strings.Join(strings.Split(art.Content, "\n"), "</p><p>") + "</p>"
- output.AddSection(body, art.Title, fmt.Sprintf("%06d.xhtml", i+1), "")
- }
- fo, err := os.Create(filepath)
- if err != nil {
- app.pushMessage("debug_event", err.Error())
- }
- output.WriteTo(fo)
- fo.Close()
- }
- // CountYestodayArts 统计昨日信息发布量
- func CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
- headless bool, showImage bool, exit chan bool) (count int) {
- sc := MergeSpiderConfig(currentSpiderConfig, &SpiderConfig{Url: url})
- _, baseCancel, _, _, ctx, cancel := NewBrowser(headless, showImage, "")
- log.Println("1浏览器打开")
- app.pushMessage("debug_event", "1 浏览器打开")
- defer func() {
- cancel()
- baseCancel()
- log.Println("0浏览器已经销毁")
- app.pushMessage("debug_event", "0 浏览器已经销毁")
- app.pushMessage("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count))
- close(exit)
- }()
- //时间比较
- now := time.Now()
- yesterday := now.AddDate(0, 0, -1) // 获取昨天的日期
- startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location())
- endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond)
- //TODO 1.
- chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Navigate(sc.Url),
- chromedp.WaitReady("document.body", chromedp.ByJSPath),
- chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
- })
- app.pushMessage("debug_event", "2 页面已经打开")
- log.Println("2页面打开")
- //TODO 2. 执行JS代码,获取列表页信息
- runJs := renderJavascriptCoder(loadListItemsJS, sc)
- tmp := map[string]bool{}
- //最多翻页1000页
- for i := 0; i < MAX_TRUN_PAGE; i++ {
- select {
- case <-exit:
- return
- default:
- app.pushMessage("debug_event", "3 执行列表页JS")
- listResult := make(ResultItems, 0)
- err := chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(runJs, &listResult),
- })
- if err != nil {
- log.Println("执行JS代码失败", err.Error())
- app.pushMessage("debug_event", "3 执行JS代码失败")
- return
- }
- //TODO 人工智能转换采集到的日期
- callAIState := false
- for j := 0; j < 5; j++ {
- app.pushMessage("debug_event", "3 执行AI提取列表发布时间"+strconv.Itoa(j+1))
- err := UpdateResultDateStr(listResult)
- if err == nil {
- callAIState = true
- break
- }
- }
- if !callAIState {
- app.pushMessage("debug_event", "3 多轮次调用AI均未得到合理结果")
- return
- }
- //TODO 日期统计
- for _, r := range listResult {
- day, err := time.Parse("2006-01-02", r.ListPubTime)
- if err != nil {
- continue
- }
- if _, ok := tmp[r.Href]; ok { //去重
- continue
- }
- if day.After(startOfYesterday) && day.Before(endOfYesterday) {
- count += 1
- } else if day.Before(startOfYesterday) {
- return
- }
- }
- app.pushMessage("debug_event", fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count))
- //TODO 翻页
- //fmt.Println("下一页CSS选择器", currentSpiderConfig.ListNextPageCss)
- chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Click(fmt.Sprintf(`document.querySelector("%s")`, currentSpiderConfig.ListNextPageCss),
- chromedp.ByJSPath),
- chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
- })
- }
- }
- return
- }
|