single.go 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. package vm
  2. import (
  3. "container/list"
  4. _ "embed"
  5. "fmt"
  6. "github.com/chromedp/chromedp"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. be "spider_creator/backend"
  9. ai "spider_creator/backend/ai"
  10. "strconv"
  11. "time"
  12. )
  13. // NewVM
  14. func NewVM(attachesDir string, dnf be.EventNotifyFace) *VM {
  15. return &VM{
  16. attachesDir, dnf,
  17. }
  18. }
  19. // RunSpider
  20. func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool, cssMark map[string]interface{}) {
  21. sc, err := be.NewSpiderConfig(cssMark)
  22. if err != nil {
  23. qu.Debug("标注信息传输失败!")
  24. vm.dnf.Dispatch("debug_event", "标注信息传输失败!")
  25. return
  26. }
  27. if url != "" {
  28. sc.Href = url
  29. }
  30. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
  31. qu.Debug("1浏览器打开", *sc)
  32. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  33. defer func() {
  34. cancel()
  35. baseCancel()
  36. qu.Debug("0浏览器已经销毁")
  37. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  38. close(exit)
  39. }()
  40. chromedp.Run(ctx, chromedp.Tasks{
  41. chromedp.Navigate(sc.Href), //打开页面
  42. chromedp.WaitReady("document.body", chromedp.ByJSPath), //等待body加载完毕
  43. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
  44. })
  45. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  46. qu.Debug("2页面打开")
  47. var runJs string = sc.ListJSCode
  48. listResult := make(be.ResultItems, 0)
  49. //TODO 2. 执行JS代码,获取列表页信息
  50. if runJs == "" {
  51. runJs = renderJavascriptCoder(loadListItemsJS, sc)
  52. }
  53. qu.Debug("execute list jscode", runJs)
  54. err = chromedp.Run(ctx, chromedp.Tasks{
  55. chromedp.Evaluate(runJs, &listResult),
  56. })
  57. if err != nil {
  58. qu.Debug("执行JS代码失败", err.Error())
  59. vm.dnf.Dispatch("debug_event", "2 执行JS代码失败")
  60. return
  61. }
  62. vm.dnf.Dispatch("debug_event", "3 获取列表完成")
  63. qu.Debug("3获取列表完成")
  64. //TODO 3. 打开详情页 ,最多打开10条
  65. runJs = sc.ContentJSCode
  66. if runJs == "" {
  67. runJs = renderJavascriptCoder(loadContentJS, sc)
  68. }
  69. currentResult := list.New()
  70. be.DataResults[sc.Code] = currentResult
  71. qu.Debug("execute content js", runJs)
  72. for _, v := range listResult {
  73. select {
  74. case <-exit:
  75. return
  76. default:
  77. qu.Debug(v.No, v.ListTitle, v.Href)
  78. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 待 下载详情页 %s ", v.No, v.ListTitle))
  79. var result string = ""
  80. err = chromedp.Run(ctx, chromedp.Tasks{
  81. chromedp.Navigate(v.Href),
  82. chromedp.WaitReady(`document.body`, chromedp.ByJSPath),
  83. chromedp.Sleep(time.Duration(contentDelay) * time.Millisecond),
  84. chromedp.Evaluate(runJs, v),
  85. })
  86. v.Site = sc.Site
  87. v.Channel = sc.Channel
  88. if err != nil {
  89. qu.Debug("执行JS代码失败", err.Error())
  90. }
  91. if len(v.AttachLinks) > 0 { //有附件
  92. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. 下载附件"))
  93. //TODO 下载附件
  94. downloadAttaches(v, vm.attachesDir)
  95. }
  96. //关闭当前TAB页
  97. chromedp.Run(ctx, chromedp.Tasks{
  98. chromedp.Evaluate(`var ret="";window.close();ret`, &result),
  99. })
  100. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 下载详情页 %s 完成", v.No, v.Title))
  101. currentResult.PushBack(v)
  102. }
  103. }
  104. vm.dnf.Dispatch("debug_event", "5 采集测试完成")
  105. qu.Debug("5采集测试完成")
  106. }
  107. // CountYestodayArts 统计昨日信息发布量
  108. func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
  109. headless bool, showImage bool, exit chan bool, currentSpiderConfig *be.SpiderConfig) (count int) {
  110. sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Href: url})
  111. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, "")
  112. qu.Debug("1浏览器打开")
  113. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  114. defer func() {
  115. cancel()
  116. baseCancel()
  117. qu.Debug("0浏览器已经销毁")
  118. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  119. vm.dnf.Dispatch("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count))
  120. close(exit)
  121. }()
  122. //时间比较
  123. now := time.Now()
  124. yesterday := now.AddDate(0, 0, -1) // 获取昨天的日期
  125. startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location())
  126. endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond)
  127. //TODO 1.
  128. chromedp.Run(ctx, chromedp.Tasks{
  129. chromedp.Navigate(sc.Href),
  130. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  131. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
  132. })
  133. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  134. qu.Debug("2页面打开")
  135. //TODO 2. 执行JS代码,获取列表页信息
  136. runJs := renderJavascriptCoder(loadListItemsJS, sc)
  137. tmp := map[string]bool{}
  138. //最多翻页1000页
  139. for i := 0; i < MAX_TRUN_PAGE; i++ {
  140. select {
  141. case <-exit:
  142. return
  143. default:
  144. vm.dnf.Dispatch("debug_event", "3 执行列表页JS")
  145. listResult := make(be.ResultItems, 0)
  146. err := chromedp.Run(ctx, chromedp.Tasks{
  147. chromedp.Evaluate(runJs, &listResult),
  148. })
  149. if err != nil {
  150. qu.Debug("执行JS代码失败", err.Error())
  151. vm.dnf.Dispatch("debug_event", "3 执行JS代码失败")
  152. return
  153. }
  154. //TODO 人工智能转换采集到的日期
  155. callAIState := false
  156. for j := 0; j < 5; j++ {
  157. vm.dnf.Dispatch("debug_event", "3 执行AI提取列表发布时间"+strconv.Itoa(j+1))
  158. err := ai.UpdateResultDateStr(listResult)
  159. if err == nil {
  160. callAIState = true
  161. break
  162. }
  163. }
  164. if !callAIState {
  165. vm.dnf.Dispatch("debug_event", "3 多轮次调用AI均未得到合理结果")
  166. return
  167. }
  168. //TODO 日期统计
  169. for _, r := range listResult {
  170. day, err := time.Parse("2006-01-02", r.ListPubTime)
  171. if err != nil {
  172. continue
  173. }
  174. if _, ok := tmp[r.Href]; ok { //去重
  175. continue
  176. }
  177. if day.After(startOfYesterday) && day.Before(endOfYesterday) {
  178. count += 1
  179. } else if day.Before(startOfYesterday) {
  180. return
  181. }
  182. }
  183. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count))
  184. //TODO 翻页
  185. //fmt.Println("下一页CSS选择器", currentSpiderConfig.ListNextPageCss)
  186. chromedp.Run(ctx, chromedp.Tasks{
  187. chromedp.Click(fmt.Sprintf(`document.querySelector("%s")`, currentSpiderConfig.ListNextPageCss),
  188. chromedp.ByJSPath),
  189. chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
  190. })
  191. }
  192. }
  193. return
  194. }