vm.go 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. package main
  2. import (
  3. "bytes"
  4. _ "embed"
  5. "fmt"
  6. "log"
  7. "os"
  8. "strconv"
  9. "strings"
  10. "text/template"
  11. "time"
  12. "github.com/bmaupin/go-epub"
  13. "github.com/chromedp/chromedp"
  14. )
  // MAX_TRUN_PAGE is the upper bound on how many list pages
  // CountYestodayArts walks through before it stops paging.
  const MAX_TRUN_PAGE = 1000
  18. var (
  19. //go:embed tpl/load_list_items.js
  20. loadListItemsJS string
  21. //go:embed tpl/load_content.js
  22. loadContentJS string
  23. currentResult = make(ResultItems, 0)
  24. )
  25. // renderJavascriptCoder
  26. func renderJavascriptCoder(tpl string, sc *SpiderConfig) string {
  27. t, err := template.New("").Parse(tpl)
  28. if err != nil {
  29. log.Println("创建JS代码模板失败", err.Error())
  30. return ""
  31. }
  32. buf := new(bytes.Buffer)
  33. err = t.Execute(buf, sc)
  34. if err != nil {
  35. log.Println("执行JS代码模板失败", err.Error())
  36. return ""
  37. }
  38. return buf.String()
  39. }
  40. // RunSpider
  41. func RunSpider(url string, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool) {
  42. sc := MergeSpiderConfig(currentSpiderConfig, &SpiderConfig{Url: url})
  43. _, baseCancel, _, _, ctx, cancel := NewBrowser(headless, showImage, proxyServe)
  44. log.Println("1浏览器打开")
  45. app.pushMessage("debug_event", "1 浏览器打开")
  46. defer func() {
  47. cancel()
  48. baseCancel()
  49. log.Println("0浏览器已经销毁")
  50. app.pushMessage("debug_event", "0 浏览器已经销毁")
  51. close(exit)
  52. }()
  53. currentResult = make(ResultItems, 0, 0)
  54. chromedp.Run(ctx, chromedp.Tasks{
  55. chromedp.Navigate(sc.Url),
  56. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  57. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
  58. })
  59. app.pushMessage("debug_event", "2 页面已经打开")
  60. log.Println("2页面打开")
  61. listResult := make(ResultItems, 0)
  62. //TODO 2. 执行JS代码,获取列表页信息
  63. runJs := renderJavascriptCoder(loadListItemsJS, sc)
  64. err := chromedp.Run(ctx, chromedp.Tasks{
  65. chromedp.Evaluate(runJs, &listResult),
  66. })
  67. if err != nil {
  68. log.Println("执行JS代码失败", err.Error())
  69. app.pushMessage("debug_event", "2 执行JS代码失败")
  70. return
  71. }
  72. app.pushMessage("debug_event", "3 获取列表完成")
  73. log.Println("3获取列表完成")
  74. //TODO 3. 打开详情页 ,最多打开10条
  75. runJs = renderJavascriptCoder(loadContentJS, sc)
  76. for _, v := range listResult {
  77. select {
  78. case <-exit:
  79. return
  80. default:
  81. app.pushMessage("debug_event", fmt.Sprintf("4. %d- 待 下载详情页 %s ", v.No, v.Title))
  82. var result string = ""
  83. err = chromedp.Run(ctx, chromedp.Tasks{
  84. chromedp.Navigate(v.Href),
  85. chromedp.WaitReady(`document.body`, chromedp.ByJSPath),
  86. chromedp.Sleep(time.Duration(contentDelay) * time.Millisecond),
  87. chromedp.Evaluate(runJs, v),
  88. })
  89. if err != nil {
  90. log.Println("执行JS代码失败", err.Error())
  91. }
  92. //关闭当前TAB页
  93. chromedp.Run(ctx, chromedp.Tasks{
  94. chromedp.Evaluate(`var ret="";window.close();ret`, &result),
  95. })
  96. app.pushMessage("debug_event", fmt.Sprintf("4. %d- 下载详情页 %s 完成", v.No, v.Title))
  97. currentResult = append(currentResult, v)
  98. }
  99. }
  100. app.pushMessage("debug_event", "5 采集测试完成")
  101. log.Println("5采集测试完成")
  102. }
  103. // ExportEpubFile 导出epub文件
  104. func ExportEpubFile(filepath string) {
  105. output := epub.NewEpub("")
  106. output.SetTitle(currentSpiderConfig.Site)
  107. output.SetAuthor("unknow")
  108. for i, art := range currentResult {
  109. body := "<h2>" + art.Title + "</h2><p>" + strings.Join(strings.Split(art.Content, "\n"), "</p><p>") + "</p>"
  110. output.AddSection(body, art.Title, fmt.Sprintf("%06d.xhtml", i+1), "")
  111. }
  112. fo, err := os.Create(filepath)
  113. if err != nil {
  114. app.pushMessage("debug_event", err.Error())
  115. }
  116. output.WriteTo(fo)
  117. fo.Close()
  118. }
  119. // CountYestodayArts 统计昨日信息发布量
  120. func CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
  121. headless bool, showImage bool, exit chan bool) (count int) {
  122. sc := MergeSpiderConfig(currentSpiderConfig, &SpiderConfig{Url: url})
  123. _, baseCancel, _, _, ctx, cancel := NewBrowser(headless, showImage, "")
  124. log.Println("1浏览器打开")
  125. app.pushMessage("debug_event", "1 浏览器打开")
  126. defer func() {
  127. cancel()
  128. baseCancel()
  129. log.Println("0浏览器已经销毁")
  130. app.pushMessage("debug_event", "0 浏览器已经销毁")
  131. app.pushMessage("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count))
  132. close(exit)
  133. }()
  134. //时间比较
  135. now := time.Now()
  136. yesterday := now.AddDate(0, 0, -1) // 获取昨天的日期
  137. startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location())
  138. endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond)
  139. //TODO 1.
  140. chromedp.Run(ctx, chromedp.Tasks{
  141. chromedp.Navigate(sc.Url),
  142. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  143. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
  144. })
  145. app.pushMessage("debug_event", "2 页面已经打开")
  146. log.Println("2页面打开")
  147. //TODO 2. 执行JS代码,获取列表页信息
  148. runJs := renderJavascriptCoder(loadListItemsJS, sc)
  149. tmp := map[string]bool{}
  150. //最多翻页1000页
  151. for i := 0; i < MAX_TRUN_PAGE; i++ {
  152. select {
  153. case <-exit:
  154. return
  155. default:
  156. app.pushMessage("debug_event", "3 执行列表页JS")
  157. listResult := make(ResultItems, 0)
  158. err := chromedp.Run(ctx, chromedp.Tasks{
  159. chromedp.Evaluate(runJs, &listResult),
  160. })
  161. if err != nil {
  162. log.Println("执行JS代码失败", err.Error())
  163. app.pushMessage("debug_event", "3 执行JS代码失败")
  164. return
  165. }
  166. //TODO 人工智能转换采集到的日期
  167. callAIState := false
  168. for j := 0; j < 5; j++ {
  169. app.pushMessage("debug_event", "3 执行AI提取列表发布时间"+strconv.Itoa(j+1))
  170. err := UpdateResultDateStr(listResult)
  171. if err == nil {
  172. callAIState = true
  173. break
  174. }
  175. }
  176. if !callAIState {
  177. app.pushMessage("debug_event", "3 多轮次调用AI均未得到合理结果")
  178. return
  179. }
  180. //TODO 日期统计
  181. for _, r := range listResult {
  182. day, err := time.Parse("2006-01-02", r.ListPubTime)
  183. if err != nil {
  184. continue
  185. }
  186. if _, ok := tmp[r.Href]; ok { //去重
  187. continue
  188. }
  189. if day.After(startOfYesterday) && day.Before(endOfYesterday) {
  190. count += 1
  191. } else if day.Before(startOfYesterday) {
  192. return
  193. }
  194. }
  195. app.pushMessage("debug_event", fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count))
  196. //TODO 翻页
  197. //fmt.Println("下一页CSS选择器", currentSpiderConfig.ListNextPageCss)
  198. chromedp.Run(ctx, chromedp.Tasks{
  199. chromedp.Click(fmt.Sprintf(`document.querySelector("%s")`, currentSpiderConfig.ListNextPageCss),
  200. chromedp.ByJSPath),
  201. chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
  202. })
  203. }
  204. }
  205. return
  206. }