single.go 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. package vm
  2. import (
  3. "container/list"
  4. _ "embed"
  5. "fmt"
  6. "github.com/chromedp/chromedp"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. be "spider_creator/backend"
  9. "spider_creator/backend/ai"
  10. "strconv"
  11. "time"
  12. )
  13. // NewVM
  14. func NewVM(attachesDir string, dnf be.EventNotifyFace) *VM {
  15. return &VM{
  16. attachesDir, dnf,
  17. }
  18. }
  19. func (vm *VM) RunSpiderTmp(url string, maxPages int, listDealy, trunPageDelay, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool, cssMark map[string]interface{}) {
  20. sc, err := be.NewSpiderConfig(cssMark)
  21. if err != nil {
  22. qu.Debug("标注信息传输失败!")
  23. vm.dnf.Dispatch("debug_event", "标注信息传输失败!")
  24. return
  25. }
  26. if url != "" {
  27. sc.Href = url
  28. }
  29. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
  30. qu.Debug("1浏览器打开", *sc)
  31. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  32. defer func() {
  33. cancel()
  34. baseCancel()
  35. qu.Debug("0浏览器已经销毁")
  36. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  37. close(exit)
  38. }()
  39. chromedp.Run(ctx, chromedp.Tasks{
  40. chromedp.Navigate(sc.Href), //打开页面
  41. chromedp.WaitReady("document.body", chromedp.ByJSPath), //等待body加载完毕
  42. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
  43. })
  44. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  45. qu.Debug("2页面打开")
  46. var runJs string = sc.ListJSCode
  47. //TODO 2. 执行JS代码,获取列表页信息
  48. if be.RegSpace.ReplaceAllString(runJs, "") == "" {
  49. runJs = renderJavascriptCoder(loadListItemsJS, sc)
  50. }
  51. qu.Debug("列表页执行JS:", runJs)
  52. currentResult := list.New()
  53. be.DataResults[sc.Code] = currentResult
  54. no := 1
  55. for j := 0; j < maxPages; j++ {
  56. qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
  57. listResult := make(be.ResultItems, 0)
  58. err = chromedp.Run(ctx, chromedp.Tasks{
  59. chromedp.Evaluate(runJs, &listResult),
  60. })
  61. if err != nil {
  62. qu.Debug("执行JS代码失败", err.Error())
  63. vm.dnf.Dispatch("debug_event", "2 第"+fmt.Sprint(j+1)+"页执行JS代码失败")
  64. continue
  65. }
  66. qu.Debug("第"+fmt.Sprint(j+1)+"页列表采集条数:", len(listResult))
  67. vm.dnf.Dispatch("debug_event", "3 第"+fmt.Sprint(j+1)+"获取列表完成")
  68. for _, v := range listResult {
  69. select {
  70. case <-exit:
  71. return
  72. default:
  73. qu.Debug(v.No, v.ListTitle, v.Href, v.Title, v.PublishTime, v.ListPubTime)
  74. v.Site = sc.Site
  75. v.Channel = sc.Channel
  76. v.Title = v.ListTitle
  77. v.Content = "详见正文"
  78. v.PublishTime = v.ListPubTime
  79. v.No = no
  80. no++
  81. currentResult.PushBack(v)
  82. }
  83. }
  84. vm.dnf.Dispatch("debug_event", "4 第"+fmt.Sprint(j+1)+"页采集完成,准备执行翻页")
  85. if j < maxPages-1 {
  86. if err = trunPage(sc, trunPageDelay, ctx); err != nil {
  87. qu.Debug("翻页失败", err.Error())
  88. vm.dnf.Dispatch("debug_event", "5 第"+fmt.Sprint(j+1)+"页翻页失败")
  89. time.Sleep(3 * time.Second)
  90. break
  91. }
  92. }
  93. }
  94. vm.dnf.Dispatch("debug_event", "6 采集测试完成")
  95. qu.Debug("6采集测试完成")
  96. }
  97. // RunSpider
  98. func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool, cssMark map[string]interface{}) {
  99. sc, err := be.NewSpiderConfig(cssMark)
  100. if err != nil {
  101. qu.Debug("标注信息传输失败!")
  102. vm.dnf.Dispatch("debug_event", "标注信息传输失败!")
  103. return
  104. }
  105. if url != "" {
  106. sc.Href = url
  107. }
  108. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
  109. qu.Debug("1浏览器打开", *sc)
  110. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  111. defer func() {
  112. cancel()
  113. baseCancel()
  114. qu.Debug("0浏览器已经销毁")
  115. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  116. close(exit)
  117. }()
  118. chromedp.Run(ctx, chromedp.Tasks{
  119. chromedp.Navigate(sc.Href), //打开页面
  120. chromedp.WaitReady("document.body", chromedp.ByJSPath), //等待body加载完毕
  121. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
  122. })
  123. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  124. qu.Debug("2页面打开")
  125. var runJs string = sc.ListJSCode
  126. listResult := make(be.ResultItems, 0)
  127. //TODO 2. 执行JS代码,获取列表页信息
  128. if be.RegSpace.ReplaceAllString(runJs, "") == "" {
  129. runJs = renderJavascriptCoder(loadListItemsJS, sc)
  130. }
  131. qu.Debug("execute list jscode", runJs)
  132. err = chromedp.Run(ctx, chromedp.Tasks{
  133. chromedp.Evaluate(runJs, &listResult),
  134. })
  135. if err != nil {
  136. qu.Debug("执行JS代码失败", err.Error())
  137. vm.dnf.Dispatch("debug_event", "2 执行JS代码失败")
  138. return
  139. }
  140. vm.dnf.Dispatch("debug_event", "3 获取列表完成")
  141. qu.Debug("3获取列表完成", len(listResult))
  142. //TODO 3. 打开详情页 ,最多打开10条
  143. runJs = sc.ContentJSCode
  144. if be.RegSpace.ReplaceAllString(runJs, "") == "" {
  145. runJs = renderJavascriptCoder(loadContentJS, sc)
  146. }
  147. currentResult := list.New()
  148. be.DataResults[sc.Code] = currentResult
  149. qu.Debug("execute content js", runJs)
  150. no := 1
  151. for _, v := range listResult {
  152. select {
  153. case <-exit:
  154. return
  155. default:
  156. qu.Debug(v.No, v.ListTitle, v.Href)
  157. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 待 下载详情页 %s ", v.No, v.ListTitle))
  158. var result string = ""
  159. err = chromedp.Run(ctx, chromedp.Tasks{
  160. chromedp.Navigate(v.Href),
  161. chromedp.WaitReady(`document.body`, chromedp.ByJSPath),
  162. chromedp.Sleep(time.Duration(contentDelay) * time.Millisecond),
  163. chromedp.Evaluate(runJs, v),
  164. })
  165. v.No = no
  166. no++
  167. v.Site = sc.Site
  168. v.Channel = sc.Channel
  169. if err != nil {
  170. qu.Debug("执行JS代码失败", err.Error())
  171. }
  172. if len(v.AttachLinks) > 0 { //有附件
  173. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. 下载附件"))
  174. //TODO 下载附件
  175. downloadAttaches(v, vm.attachesDir)
  176. }
  177. //关闭当前TAB页
  178. chromedp.Run(ctx, chromedp.Tasks{
  179. chromedp.Evaluate(`var ret="";window.close();ret`, &result),
  180. })
  181. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 下载详情页 %s 完成", v.No, v.Title))
  182. currentResult.PushBack(v)
  183. }
  184. }
  185. vm.dnf.Dispatch("debug_event", "5 采集测试完成")
  186. qu.Debug("5采集测试完成")
  187. }
  188. // CountYestodayArts 统计昨日信息发布量
  189. func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
  190. headless bool, showImage bool, exit chan bool, currentSpiderConfig *be.SpiderConfig) (count int) {
  191. sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Href: url})
  192. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, "")
  193. qu.Debug("1浏览器打开")
  194. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  195. defer func() {
  196. cancel()
  197. baseCancel()
  198. qu.Debug("0浏览器已经销毁")
  199. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  200. vm.dnf.Dispatch("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count))
  201. close(exit)
  202. }()
  203. //时间比较
  204. now := time.Now()
  205. yesterday := now.AddDate(0, 0, -1) // 获取昨天的日期
  206. startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location())
  207. endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond)
  208. //TODO 1.
  209. chromedp.Run(ctx, chromedp.Tasks{
  210. chromedp.Navigate(sc.Href),
  211. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  212. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
  213. })
  214. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  215. qu.Debug("2页面打开")
  216. //TODO 2. 执行JS代码,获取列表页信息
  217. runJs := renderJavascriptCoder(loadListItemsJS, sc)
  218. tmp := map[string]bool{}
  219. //最多翻页1000页
  220. for i := 0; i < MAX_TRUN_PAGE; i++ {
  221. select {
  222. case <-exit:
  223. return
  224. default:
  225. vm.dnf.Dispatch("debug_event", "3 执行列表页JS")
  226. listResult := make(be.ResultItems, 0)
  227. err := chromedp.Run(ctx, chromedp.Tasks{
  228. chromedp.Evaluate(runJs, &listResult),
  229. })
  230. if err != nil {
  231. qu.Debug("执行JS代码失败", err.Error())
  232. vm.dnf.Dispatch("debug_event", "3 执行JS代码失败")
  233. return
  234. }
  235. //TODO 人工智能转换采集到的日期
  236. callAIState := false
  237. for j := 0; j < 5; j++ {
  238. vm.dnf.Dispatch("debug_event", "3 执行AI提取列表发布时间"+strconv.Itoa(j+1))
  239. err := ai.UpdateResultDateStr(listResult)
  240. if err == nil {
  241. callAIState = true
  242. break
  243. }
  244. }
  245. if !callAIState {
  246. vm.dnf.Dispatch("debug_event", "3 多轮次调用AI均未得到合理结果")
  247. return
  248. }
  249. //TODO 日期统计
  250. for _, r := range listResult {
  251. day, err := time.Parse("2006-01-02", r.ListPubTime)
  252. if err != nil {
  253. continue
  254. }
  255. if _, ok := tmp[r.Href]; ok { //去重
  256. continue
  257. }
  258. if day.After(startOfYesterday) && day.Before(endOfYesterday) {
  259. count += 1
  260. } else if day.Before(startOfYesterday) {
  261. return
  262. }
  263. }
  264. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count))
  265. //TODO 翻页
  266. //fmt.Println("下一页CSS选择器", currentSpiderConfig.ListNextPageCss)
  267. chromedp.Run(ctx, chromedp.Tasks{
  268. chromedp.Click(fmt.Sprintf(`document.querySelector("%s")`, currentSpiderConfig.ListNextPageCss),
  269. chromedp.ByJSPath),
  270. chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
  271. })
  272. }
  273. }
  274. return
  275. }