single.go 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. package vm
  2. import (
  3. "container/list"
  4. _ "embed"
  5. "fmt"
  6. "github.com/chromedp/chromedp"
  7. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. be "spider_creator/backend"
  9. ai "spider_creator/backend/ai"
  10. "strconv"
  11. "time"
  12. )
  13. // NewVM
  14. func NewVM(attachesDir string, dnf be.EventNotifyFace) *VM {
  15. return &VM{
  16. attachesDir, dnf,
  17. }
  18. }
  19. func (vm *VM) RunSpiderTmp(url string, maxPages int, listDealy, trunPageDelay, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool, cssMark map[string]interface{}) {
  20. sc, err := be.NewSpiderConfig(cssMark)
  21. if err != nil {
  22. qu.Debug("标注信息传输失败!")
  23. vm.dnf.Dispatch("debug_event", "标注信息传输失败!")
  24. return
  25. }
  26. if url != "" {
  27. sc.Href = url
  28. }
  29. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
  30. qu.Debug("1浏览器打开", *sc)
  31. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  32. defer func() {
  33. cancel()
  34. baseCancel()
  35. qu.Debug("0浏览器已经销毁")
  36. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  37. close(exit)
  38. }()
  39. chromedp.Run(ctx, chromedp.Tasks{
  40. chromedp.Navigate(sc.Href), //打开页面
  41. chromedp.WaitReady("document.body", chromedp.ByJSPath), //等待body加载完毕
  42. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
  43. })
  44. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  45. qu.Debug("2页面打开")
  46. var runJs string = sc.ListJSCode
  47. //TODO 2. 执行JS代码,获取列表页信息
  48. if runJs == "" {
  49. runJs = renderJavascriptCoder(loadListItemsJS, sc)
  50. }
  51. qu.Debug("列表页执行JS:", runJs)
  52. currentResult := list.New()
  53. be.DataResults[sc.Code] = currentResult
  54. no := 1
  55. for j := 0; j < maxPages; j++ {
  56. qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
  57. listResult := make(be.ResultItems, 0)
  58. err = chromedp.Run(ctx, chromedp.Tasks{
  59. chromedp.Evaluate(runJs, &listResult),
  60. })
  61. if err != nil {
  62. qu.Debug("执行JS代码失败", err.Error())
  63. vm.dnf.Dispatch("debug_event", "2 第"+fmt.Sprint(j+1)+"页执行JS代码失败")
  64. continue
  65. }
  66. qu.Debug("第"+fmt.Sprint(j+1)+"页列表采集条数:", len(listResult))
  67. vm.dnf.Dispatch("debug_event", "3 第"+fmt.Sprint(j+1)+"获取列表完成")
  68. for _, v := range listResult {
  69. select {
  70. case <-exit:
  71. return
  72. default:
  73. v.Site = sc.Site
  74. v.Channel = sc.Channel
  75. v.Title = v.ListTitle
  76. v.Content = "详见正文"
  77. v.No = no
  78. qu.Debug(v.No, v.ListTitle, v.Href)
  79. no++
  80. currentResult.PushBack(v)
  81. }
  82. }
  83. vm.dnf.Dispatch("debug_event", "4 第"+fmt.Sprint(j+1)+"页采集完成,准备执行翻页")
  84. if j < maxPages-1 {
  85. if err = trunPage(sc, trunPageDelay, ctx); err != nil {
  86. qu.Debug("翻页失败", err.Error())
  87. vm.dnf.Dispatch("debug_event", "5 第"+fmt.Sprint(j+1)+"页翻页失败")
  88. time.Sleep(3 * time.Second)
  89. break
  90. }
  91. }
  92. }
  93. vm.dnf.Dispatch("debug_event", "6 采集测试完成")
  94. qu.Debug("6采集测试完成")
  95. }
  96. // RunSpider
  97. func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe string, exit chan bool, cssMark map[string]interface{}) {
  98. sc, err := be.NewSpiderConfig(cssMark)
  99. if err != nil {
  100. qu.Debug("标注信息传输失败!")
  101. vm.dnf.Dispatch("debug_event", "标注信息传输失败!")
  102. return
  103. }
  104. if url != "" {
  105. sc.Href = url
  106. }
  107. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
  108. qu.Debug("1浏览器打开", *sc)
  109. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  110. defer func() {
  111. cancel()
  112. baseCancel()
  113. qu.Debug("0浏览器已经销毁")
  114. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  115. close(exit)
  116. }()
  117. chromedp.Run(ctx, chromedp.Tasks{
  118. chromedp.Navigate(sc.Href), //打开页面
  119. chromedp.WaitReady("document.body", chromedp.ByJSPath), //等待body加载完毕
  120. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
  121. })
  122. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  123. qu.Debug("2页面打开")
  124. var runJs string = sc.ListJSCode
  125. listResult := make(be.ResultItems, 0)
  126. //TODO 2. 执行JS代码,获取列表页信息
  127. if runJs == "" {
  128. runJs = renderJavascriptCoder(loadListItemsJS, sc)
  129. }
  130. qu.Debug("execute list jscode", runJs)
  131. err = chromedp.Run(ctx, chromedp.Tasks{
  132. chromedp.Evaluate(runJs, &listResult),
  133. })
  134. if err != nil {
  135. qu.Debug("执行JS代码失败", err.Error())
  136. vm.dnf.Dispatch("debug_event", "2 执行JS代码失败")
  137. return
  138. }
  139. vm.dnf.Dispatch("debug_event", "3 获取列表完成")
  140. qu.Debug("3获取列表完成")
  141. //TODO 3. 打开详情页 ,最多打开10条
  142. runJs = sc.ContentJSCode
  143. if runJs == "" {
  144. runJs = renderJavascriptCoder(loadContentJS, sc)
  145. }
  146. currentResult := list.New()
  147. be.DataResults[sc.Code] = currentResult
  148. qu.Debug("execute content js", runJs)
  149. no := 1
  150. for _, v := range listResult {
  151. select {
  152. case <-exit:
  153. return
  154. default:
  155. qu.Debug(v.No, v.ListTitle, v.Href)
  156. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 待 下载详情页 %s ", v.No, v.ListTitle))
  157. var result string = ""
  158. err = chromedp.Run(ctx, chromedp.Tasks{
  159. chromedp.Navigate(v.Href),
  160. chromedp.WaitReady(`document.body`, chromedp.ByJSPath),
  161. chromedp.Sleep(time.Duration(contentDelay) * time.Millisecond),
  162. chromedp.Evaluate(runJs, v),
  163. })
  164. v.No = no
  165. no++
  166. v.Site = sc.Site
  167. v.Channel = sc.Channel
  168. if err != nil {
  169. qu.Debug("执行JS代码失败", err.Error())
  170. }
  171. if len(v.AttachLinks) > 0 { //有附件
  172. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. 下载附件"))
  173. //TODO 下载附件
  174. downloadAttaches(v, vm.attachesDir)
  175. }
  176. //关闭当前TAB页
  177. chromedp.Run(ctx, chromedp.Tasks{
  178. chromedp.Evaluate(`var ret="";window.close();ret`, &result),
  179. })
  180. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4. %d- 下载详情页 %s 完成", v.No, v.Title))
  181. currentResult.PushBack(v)
  182. }
  183. }
  184. vm.dnf.Dispatch("debug_event", "5 采集测试完成")
  185. qu.Debug("5采集测试完成")
  186. }
  187. // CountYestodayArts 统计昨日信息发布量
  188. func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
  189. headless bool, showImage bool, exit chan bool, currentSpiderConfig *be.SpiderConfig) (count int) {
  190. sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Href: url})
  191. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, "")
  192. qu.Debug("1浏览器打开")
  193. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  194. defer func() {
  195. cancel()
  196. baseCancel()
  197. qu.Debug("0浏览器已经销毁")
  198. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  199. vm.dnf.Dispatch("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count))
  200. close(exit)
  201. }()
  202. //时间比较
  203. now := time.Now()
  204. yesterday := now.AddDate(0, 0, -1) // 获取昨天的日期
  205. startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location())
  206. endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond)
  207. //TODO 1.
  208. chromedp.Run(ctx, chromedp.Tasks{
  209. chromedp.Navigate(sc.Href),
  210. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  211. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
  212. })
  213. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  214. qu.Debug("2页面打开")
  215. //TODO 2. 执行JS代码,获取列表页信息
  216. runJs := renderJavascriptCoder(loadListItemsJS, sc)
  217. tmp := map[string]bool{}
  218. //最多翻页1000页
  219. for i := 0; i < MAX_TRUN_PAGE; i++ {
  220. select {
  221. case <-exit:
  222. return
  223. default:
  224. vm.dnf.Dispatch("debug_event", "3 执行列表页JS")
  225. listResult := make(be.ResultItems, 0)
  226. err := chromedp.Run(ctx, chromedp.Tasks{
  227. chromedp.Evaluate(runJs, &listResult),
  228. })
  229. if err != nil {
  230. qu.Debug("执行JS代码失败", err.Error())
  231. vm.dnf.Dispatch("debug_event", "3 执行JS代码失败")
  232. return
  233. }
  234. //TODO 人工智能转换采集到的日期
  235. callAIState := false
  236. for j := 0; j < 5; j++ {
  237. vm.dnf.Dispatch("debug_event", "3 执行AI提取列表发布时间"+strconv.Itoa(j+1))
  238. err := ai.UpdateResultDateStr(listResult)
  239. if err == nil {
  240. callAIState = true
  241. break
  242. }
  243. }
  244. if !callAIState {
  245. vm.dnf.Dispatch("debug_event", "3 多轮次调用AI均未得到合理结果")
  246. return
  247. }
  248. //TODO 日期统计
  249. for _, r := range listResult {
  250. day, err := time.Parse("2006-01-02", r.ListPubTime)
  251. if err != nil {
  252. continue
  253. }
  254. if _, ok := tmp[r.Href]; ok { //去重
  255. continue
  256. }
  257. if day.After(startOfYesterday) && day.Before(endOfYesterday) {
  258. count += 1
  259. } else if day.Before(startOfYesterday) {
  260. return
  261. }
  262. }
  263. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count))
  264. //TODO 翻页
  265. //fmt.Println("下一页CSS选择器", currentSpiderConfig.ListNextPageCss)
  266. chromedp.Run(ctx, chromedp.Tasks{
  267. chromedp.Click(fmt.Sprintf(`document.querySelector("%s")`, currentSpiderConfig.ListNextPageCss),
  268. chromedp.ByJSPath),
  269. chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
  270. })
  271. }
  272. }
  273. return
  274. }