single.go 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. package vm
  2. import (
  3. "container/list"
  4. _ "embed"
  5. "fmt"
  6. "log"
  7. be "spidercreator/backend"
  8. ai "spidercreator/backend/ai"
  9. "strconv"
  10. "time"
  11. "github.com/chromedp/chromedp"
  12. )
  13. // NewVM
  14. func NewVM(attachesDir string, dnf be.EventNotifyFace) *VM {
  15. return &VM{
  16. attachesDir, dnf,
  17. }
  18. }
  19. // RunSpider
  20. func (vm *VM) RunSpider(url string, maxPages int,
  21. listDealy int64, contentDelay int64,
  22. headless bool, showImage bool,
  23. proxyServe string, exit chan bool,
  24. currentSpiderConfig *be.SpiderConfig,
  25. currentResult *list.List) {
  26. sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Url: url})
  27. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
  28. log.Println("1浏览器打开", *sc)
  29. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  30. defer func() {
  31. cancel()
  32. baseCancel()
  33. log.Println("0浏览器已经销毁")
  34. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  35. close(exit)
  36. }()
  37. chromedp.Run(ctx, chromedp.Tasks{
  38. chromedp.Navigate(sc.Url),
  39. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  40. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
  41. })
  42. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  43. log.Println("2页面打开")
  44. var runJs string = sc.ListJSCode
  45. listResult := make(be.ResultItems, 0)
  46. //TODO 2. 执行JS代码,获取列表页信息
  47. if runJs == "" {
  48. runJs = renderJavascriptCoder(loadListItemsJS, sc)
  49. }
  50. //log.Println("execute list jscode", runJs)
  51. err := chromedp.Run(ctx, chromedp.Tasks{
  52. chromedp.Evaluate(runJs, &listResult),
  53. })
  54. if err != nil {
  55. log.Println("执行JS代码失败", err.Error())
  56. vm.dnf.Dispatch("debug_event", "2 执行JS代码失败")
  57. return
  58. }
  59. vm.dnf.Dispatch("debug_event", "3 获取列表完成")
  60. log.Println("3获取列表完成")
  61. //TODO 3. 打开详情页 ,最多打开10条
  62. runJs = sc.ContentJSCode
  63. if runJs == "" {
  64. runJs = renderJavascriptCoder(loadContentJS, sc)
  65. }
  66. //log.Println("execute content js", runJs)
  67. for _, v := range listResult {
  68. select {
  69. case <-exit:
  70. return
  71. default:
  72. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 待 下载详情页 %s ", v.No, v.ListTitle))
  73. var result string = ""
  74. err = chromedp.Run(ctx, chromedp.Tasks{
  75. chromedp.Navigate(v.Href),
  76. chromedp.WaitReady(`document.body`, chromedp.ByJSPath),
  77. chromedp.Sleep(time.Duration(contentDelay) * time.Millisecond),
  78. chromedp.Evaluate(runJs, v),
  79. })
  80. if err != nil {
  81. log.Println("执行JS代码失败", err.Error())
  82. }
  83. if len(v.AttachLinks) > 0 { //有附件
  84. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. 下载附件"))
  85. //TODO 下载附件
  86. downloadAttaches(v, vm.attachesDir)
  87. }
  88. //关闭当前TAB页
  89. chromedp.Run(ctx, chromedp.Tasks{
  90. chromedp.Evaluate(`var ret="";window.close();ret`, &result),
  91. })
  92. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 下载详情页 %s 完成", v.No, v.Title))
  93. currentResult.PushBack(v)
  94. }
  95. }
  96. vm.dnf.Dispatch("debug_event", "5 采集测试完成")
  97. log.Println("5采集测试完成")
  98. }
  99. // CountYestodayArts 统计昨日信息发布量
  100. func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
  101. headless bool, showImage bool, exit chan bool, currentSpiderConfig *be.SpiderConfig) (count int) {
  102. sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Url: url})
  103. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, "")
  104. log.Println("1浏览器打开")
  105. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  106. defer func() {
  107. cancel()
  108. baseCancel()
  109. log.Println("0浏览器已经销毁")
  110. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  111. vm.dnf.Dispatch("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count))
  112. close(exit)
  113. }()
  114. //时间比较
  115. now := time.Now()
  116. yesterday := now.AddDate(0, 0, -1) // 获取昨天的日期
  117. startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location())
  118. endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond)
  119. //TODO 1.
  120. chromedp.Run(ctx, chromedp.Tasks{
  121. chromedp.Navigate(sc.Url),
  122. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  123. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
  124. })
  125. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  126. log.Println("2页面打开")
  127. //TODO 2. 执行JS代码,获取列表页信息
  128. runJs := renderJavascriptCoder(loadListItemsJS, sc)
  129. tmp := map[string]bool{}
  130. //最多翻页1000页
  131. for i := 0; i < MAX_TRUN_PAGE; i++ {
  132. select {
  133. case <-exit:
  134. return
  135. default:
  136. vm.dnf.Dispatch("debug_event", "3 执行列表页JS")
  137. listResult := make(be.ResultItems, 0)
  138. err := chromedp.Run(ctx, chromedp.Tasks{
  139. chromedp.Evaluate(runJs, &listResult),
  140. })
  141. if err != nil {
  142. log.Println("执行JS代码失败", err.Error())
  143. vm.dnf.Dispatch("debug_event", "3 执行JS代码失败")
  144. return
  145. }
  146. //TODO 人工智能转换采集到的日期
  147. callAIState := false
  148. for j := 0; j < 5; j++ {
  149. vm.dnf.Dispatch("debug_event", "3 执行AI提取列表发布时间"+strconv.Itoa(j+1))
  150. err := ai.UpdateResultDateStr(listResult)
  151. if err == nil {
  152. callAIState = true
  153. break
  154. }
  155. }
  156. if !callAIState {
  157. vm.dnf.Dispatch("debug_event", "3 多轮次调用AI均未得到合理结果")
  158. return
  159. }
  160. //TODO 日期统计
  161. for _, r := range listResult {
  162. day, err := time.Parse("2006-01-02", r.ListPubTime)
  163. if err != nil {
  164. continue
  165. }
  166. if _, ok := tmp[r.Href]; ok { //去重
  167. continue
  168. }
  169. if day.After(startOfYesterday) && day.Before(endOfYesterday) {
  170. count += 1
  171. } else if day.Before(startOfYesterday) {
  172. return
  173. }
  174. }
  175. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count))
  176. //TODO 翻页
  177. //fmt.Println("下一页CSS选择器", currentSpiderConfig.ListNextPageCss)
  178. chromedp.Run(ctx, chromedp.Tasks{
  179. chromedp.Click(fmt.Sprintf(`document.querySelector("%s")`, currentSpiderConfig.ListNextPageCss),
  180. chromedp.ByJSPath),
  181. chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
  182. })
  183. }
  184. }
  185. return
  186. }