single.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. package vm
  2. import (
  3. "container/list"
  4. "context"
  5. _ "embed"
  6. "fmt"
  7. "github.com/chromedp/chromedp"
  8. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  9. be "spider_creator/backend"
  10. "spider_creator/backend/ai"
  11. "strconv"
  12. "time"
  13. )
  14. // NewVM
  15. func NewVM(attachesDir string, dnf be.EventNotifyFace) *VM {
  16. return &VM{
  17. attachesDir, dnf,
  18. }
  19. }
  20. func (vm *VM) RunSpiderTmp(url string, maxPages int, listDealy, trunPageDelay, contentDelay int64, headless bool, showImage bool, proxyServe bool, exit chan bool, cssMark map[string]interface{}) {
  21. sc, err := be.NewSpiderConfig(cssMark)
  22. if err != nil {
  23. qu.Debug("标注信息传输失败!")
  24. vm.dnf.Dispatch("debug_event", "标注信息传输失败!")
  25. return
  26. }
  27. if url != "" {
  28. sc.Href = url
  29. }
  30. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
  31. qu.Debug("1浏览器打开", *sc)
  32. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  33. defer func() {
  34. cancel()
  35. baseCancel()
  36. qu.Debug("0浏览器已经销毁")
  37. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  38. close(exit)
  39. }()
  40. chromedp.Run(ctx, chromedp.Tasks{
  41. chromedp.Navigate(sc.Href), //打开页面
  42. chromedp.WaitReady("document.body", chromedp.ByJSPath), //等待body加载完毕
  43. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
  44. })
  45. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  46. qu.Debug("2页面打开")
  47. var runJs string = sc.ListJSCode
  48. //列表页信息初始化
  49. vm.dnf.Dispatch("debug_event", "3 初始化列表页信息")
  50. if !vm.InitListPage(ctx, sc) {
  51. vm.dnf.Dispatch("debug_event", "3 初始化列表页失败,退出")
  52. return
  53. }
  54. //TODO 2. 执行JS代码,获取列表页信息
  55. if be.RegSpace.ReplaceAllString(runJs, "") == "" {
  56. runJs = renderJavascriptCoder(loadListItemsJS, sc)
  57. }
  58. qu.Debug("列表页执行JS:", runJs)
  59. currentResult := list.New()
  60. be.DataResults[sc.Code] = currentResult
  61. no := 1
  62. for j := 0; j < maxPages; j++ {
  63. qu.Debug("开始检查第" + fmt.Sprint(j+1) + "页...")
  64. listResult := make(be.ResultItems, 0)
  65. err = chromedp.Run(ctx, chromedp.Tasks{
  66. chromedp.Evaluate(runJs, &listResult),
  67. })
  68. if err != nil {
  69. qu.Debug("执行JS代码失败", err.Error())
  70. vm.dnf.Dispatch("debug_event", "2 第"+fmt.Sprint(j+1)+"页执行JS代码失败")
  71. continue
  72. }
  73. qu.Debug("第"+fmt.Sprint(j+1)+"页列表采集条数:", len(listResult))
  74. vm.dnf.Dispatch("debug_event", "3 第"+fmt.Sprint(j+1)+"获取列表完成")
  75. for _, v := range listResult {
  76. select {
  77. case <-exit:
  78. return
  79. default:
  80. qu.Debug(v.No, v.ListTitle, v.Href, v.Title, v.PublishTime, v.ListPubTime)
  81. v.Site = sc.Site
  82. v.Channel = sc.Channel
  83. v.Title = v.ListTitle
  84. v.Content = "详见正文"
  85. v.PublishTime = v.ListPubTime
  86. v.No = no
  87. no++
  88. currentResult.PushBack(v)
  89. }
  90. }
  91. vm.dnf.Dispatch("debug_event", "4 第"+fmt.Sprint(j+1)+"页采集完成,准备执行翻页")
  92. if j < maxPages-1 {
  93. if err = trunPage(sc, trunPageDelay, ctx); err != nil {
  94. qu.Debug("翻页失败", err.Error())
  95. vm.dnf.Dispatch("debug_event", "5 第"+fmt.Sprint(j+1)+"页翻页失败")
  96. time.Sleep(3 * time.Second)
  97. break
  98. }
  99. }
  100. }
  101. vm.dnf.Dispatch("debug_event", "6 采集测试完成")
  102. qu.Debug("6采集测试完成")
  103. }
  104. // RunSpider 适用于测试1页数据
  105. func (vm *VM) RunSpider(url string, maxPages int, listDealy int64, contentDelay int64, headless bool, showImage bool, proxyServe bool, exit chan bool, cssMark map[string]interface{}) {
  106. sc, err := be.NewSpiderConfig(cssMark)
  107. if err != nil {
  108. qu.Debug("标注信息传输失败!")
  109. vm.dnf.Dispatch("debug_event", "标注信息传输失败!")
  110. return
  111. }
  112. if url != "" {
  113. sc.Href = url
  114. }
  115. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, proxyServe)
  116. qu.Debug("1浏览器打开", *sc)
  117. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  118. defer func() {
  119. cancel()
  120. baseCancel()
  121. qu.Debug("0浏览器已经销毁")
  122. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  123. close(exit)
  124. }()
  125. chromedp.Run(ctx, chromedp.Tasks{
  126. chromedp.Navigate(sc.Href), //打开页面
  127. chromedp.WaitReady("document.body", chromedp.ByJSPath), //等待body加载完毕
  128. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond), //列表页等待
  129. })
  130. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  131. vm.dnf.Dispatch("debug_event", "3 初始化列表页信息")
  132. //1、列表页信息初始化
  133. if !vm.InitListPage(ctx, sc) {
  134. vm.dnf.Dispatch("debug_event", "3 初始化列表页失败,退出")
  135. return
  136. }
  137. var runJs string = sc.ListJSCode
  138. listResult := make(be.ResultItems, 0)
  139. //2、执行JS代码,获取列表页信息
  140. if be.RegSpace.ReplaceAllString(runJs, "") == "" {
  141. runJs = renderJavascriptCoder(loadListItemsJS, sc)
  142. }
  143. qu.Debug("获取列表JS代码:", runJs)
  144. err = chromedp.Run(ctx, chromedp.Tasks{
  145. chromedp.Evaluate(runJs, &listResult),
  146. })
  147. if err != nil {
  148. qu.Debug("执行JS代码失败", err.Error())
  149. vm.dnf.Dispatch("debug_event", "4 执行JS代码失败")
  150. return
  151. }
  152. vm.dnf.Dispatch("debug_event", "4 获取列表完成")
  153. qu.Debug("3获取列表完成", len(listResult))
  154. //3、打开详情页
  155. runJs = sc.ContentJSCode
  156. if be.RegSpace.ReplaceAllString(runJs, "") == "" {
  157. runJs = renderJavascriptCoder(loadContentJS, sc)
  158. }
  159. currentResult := list.New()
  160. be.DataResults[sc.Code] = currentResult
  161. qu.Debug("详情页JS代码:", runJs)
  162. no := 1
  163. for _, v := range listResult {
  164. select {
  165. case <-exit:
  166. return
  167. default:
  168. qu.Debug(v.No, v.Href, v.ListTitle, v.ListPubTime)
  169. vm.dnf.Dispatch("debug_event", fmt.Sprintf("5. %d- 待 下载详情页 %s ", v.No, v.ListTitle))
  170. var result string = ""
  171. err = chromedp.Run(ctx, chromedp.Tasks{
  172. chromedp.Navigate(v.Href),
  173. chromedp.WaitReady(`document.body`, chromedp.ByJSPath),
  174. chromedp.Sleep(time.Duration(contentDelay) * time.Millisecond),
  175. chromedp.Evaluate(runJs, v),
  176. })
  177. v.No = no
  178. no++
  179. v.Site = sc.Site
  180. v.Channel = sc.Channel
  181. if err != nil {
  182. qu.Debug("执行JS代码失败", err.Error())
  183. }
  184. if len(v.AttachLinks) > 0 { //有附件
  185. vm.dnf.Dispatch("debug_event", fmt.Sprintf("6. 下载附件"))
  186. //4、下载附件
  187. downloadAttaches(v, vm.attachesDir)
  188. }
  189. //关闭当前TAB页
  190. chromedp.Run(ctx, chromedp.Tasks{
  191. chromedp.Evaluate(`var ret="";window.close();ret`, &result),
  192. })
  193. vm.dnf.Dispatch("debug_event", fmt.Sprintf("5. %d- 下载详情页 %s 完成", v.No, v.Title))
  194. currentResult.PushBack(v)
  195. }
  196. }
  197. vm.dnf.Dispatch("debug_event", "7 采集测试完成")
  198. qu.Debug("5采集测试完成")
  199. }
  200. // InitPage 初始化页面
  201. func (vm *VM) InitListPage(ctx context.Context, sc *be.SpiderConfig) (initPage bool) {
  202. if len(sc.InitList) == 0 { //没有初始化页面行为
  203. return true
  204. }
  205. for j, ac := range sc.InitList {
  206. arc := vm.RunAction(ctx, ac, j) //itype 0:执行成功 1:执行错误 2:超时
  207. if !arc.Result { //动作执行失败,不再执行后续动作
  208. return false
  209. }
  210. }
  211. return true
  212. }
  213. // RunAction 执行动作
  214. func (vm *VM) RunAction(ctx context.Context, ac *be.Actions, num int) *be.ActionRunResult {
  215. ctxTmp, cancel := context.WithTimeout(context.Background(), time.Duration(ac.SleepTime+5000)*time.Millisecond)
  216. defer cancel()
  217. done := make(chan *be.ActionRunResult)
  218. go func() {
  219. for {
  220. select {
  221. case <-ctxTmp.Done():
  222. done <- &be.ActionRunResult{
  223. Result: false,
  224. RunResult: be.RUN_ACTION_TIMEOUT,
  225. CheckResult: be.CHECK_ACTION_TIMEOUT,
  226. }
  227. vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+fmt.Sprint(num)+"个动作JS,超时")
  228. qu.Debug("3.3 初始化列表页,执行第" + fmt.Sprint(num+1) + "个动作JS,超时")
  229. return
  230. default:
  231. //执行动作
  232. vm.dnf.Dispatch("debug_event", "3.1 初始化列表页,执行第"+fmt.Sprint(num)+"个动作JS")
  233. qu.Debug("3.1 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS:", ac.ActionJs)
  234. var result string
  235. err := chromedp.Run(ctx, chromedp.Tasks{
  236. chromedp.Evaluate(ac.ActionJs, &result),
  237. chromedp.Sleep(time.Duration(ac.SleepTime) * time.Millisecond),
  238. })
  239. if err != nil {
  240. done <- &be.ActionRunResult{
  241. Result: false,
  242. RunResult: be.RUN_ACTION_ERROR,
  243. CheckResult: be.CHECK_ACTION_NOTCHECK,
  244. }
  245. vm.dnf.Dispatch("debug_event", "3.1 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS,异常")
  246. qu.Debug("3.1 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS,异常", err)
  247. return
  248. }
  249. //检查结果
  250. var checkResult string
  251. if ac.CheckJs != "" {
  252. vm.dnf.Dispatch("debug_event", "3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS")
  253. qu.Debug("3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS:", ac.CheckJs)
  254. err = chromedp.Run(ctx, chromedp.Tasks{
  255. chromedp.Evaluate(ac.CheckJs, &checkResult),
  256. })
  257. if err != nil {
  258. vm.dnf.Dispatch("debug_event", "3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS,异常")
  259. qu.Debug("3.2 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作检查JS,异常", err)
  260. done <- &be.ActionRunResult{
  261. Result: false,
  262. RunResult: be.RUN_ACTION_SUCCESS,
  263. CheckResult: be.CHECK_ACTION_ERROR,
  264. }
  265. return
  266. }
  267. vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS结果:"+checkResult)
  268. qu.Debug("3.3 初始化列表页,执行第" + fmt.Sprint(num+1) + "个动作JS结果:" + checkResult)
  269. done <- &be.ActionRunResult{
  270. Result: checkResult == be.CHECH_RESULT,
  271. RunResult: be.RUN_ACTION_SUCCESS,
  272. CheckResult: be.CHECK_ACTION_SUCCESS,
  273. }
  274. return
  275. }
  276. vm.dnf.Dispatch("debug_event", "3.3 初始化列表页,执行第"+fmt.Sprint(num+1)+"个动作JS结果:true")
  277. qu.Debug("3.3 初始化列表页,执行第" + fmt.Sprint(num+1) + "个动作JS结果:true")
  278. done <- &be.ActionRunResult{
  279. Result: true,
  280. RunResult: be.RUN_ACTION_SUCCESS,
  281. CheckResult: be.CHECK_ACTION_NOTCHECK,
  282. }
  283. return
  284. }
  285. }
  286. }()
  287. return <-done
  288. }
  289. // InitPageTmp 初始化页面
  290. func (vm *VM) InitPageTmp(ctx context.Context, timeout int) bool {
  291. //1、页面初始化需要执行的事件(多个动作)
  292. initPageJs := `var clicklabel = document.querySelector("#app-base > div > div.IndexContent > div > div > form > div > div.ant-col.ant-col-4 > button.ant-btn.ant-btn-primary > span");if(clicklabel)clicklabel.click();"";`
  293. var result string
  294. err := chromedp.Run(ctx, chromedp.Tasks{
  295. chromedp.Evaluate(initPageJs, &result),
  296. //chromedp.WaitReady(".ant-list-items"),
  297. })
  298. if err != nil {
  299. qu.Debug("初始化页面JS执行失败", err.Error())
  300. return false
  301. }
  302. ctxTmp, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Millisecond)
  303. defer cancel()
  304. done := make(chan bool)
  305. go func() {
  306. for {
  307. select {
  308. case <-ctxTmp.Done():
  309. done <- false
  310. return
  311. default:
  312. getJs := `var label = document.querySelector(".ant-list-items");if(label)label.outerText;`
  313. err = chromedp.Run(ctx, chromedp.Tasks{
  314. chromedp.Evaluate(getJs, &result),
  315. })
  316. if result != "" {
  317. done <- true
  318. return
  319. }
  320. time.Sleep(1 * time.Second) // 模拟工作负载
  321. }
  322. }
  323. }()
  324. get := <-done
  325. return get
  326. }
  327. // CountYestodayArts 统计昨日信息发布量
  328. func (vm *VM) CountYestodayArts(url string, listDealy int64, trunPageDelay int64,
  329. headless bool, showImage bool, exit chan bool, currentSpiderConfig *be.SpiderConfig) (count int) {
  330. sc := be.MergeSpiderConfig(currentSpiderConfig, &be.SpiderConfig{Href: url})
  331. _, baseCancel, _, _, ctx, cancel := be.NewBrowser(headless, showImage, false)
  332. qu.Debug("1浏览器打开")
  333. vm.dnf.Dispatch("debug_event", "1 浏览器打开")
  334. defer func() {
  335. cancel()
  336. baseCancel()
  337. qu.Debug("0浏览器已经销毁")
  338. vm.dnf.Dispatch("debug_event", "0 浏览器已经销毁")
  339. vm.dnf.Dispatch("debug_event", fmt.Sprintf("99 昨日信息发布量:%d ", count))
  340. close(exit)
  341. }()
  342. //时间比较
  343. now := time.Now()
  344. yesterday := now.AddDate(0, 0, -1) // 获取昨天的日期
  345. startOfYesterday := time.Date(yesterday.Year(), yesterday.Month(), yesterday.Day(), 0, 0, 0, 0, now.Location())
  346. endOfYesterday := startOfYesterday.AddDate(0, 0, 1).Add(-time.Nanosecond)
  347. //TODO 1.
  348. chromedp.Run(ctx, chromedp.Tasks{
  349. chromedp.Navigate(sc.Href),
  350. chromedp.WaitReady("document.body", chromedp.ByJSPath),
  351. chromedp.Sleep(time.Duration(listDealy) * time.Millisecond),
  352. })
  353. vm.dnf.Dispatch("debug_event", "2 页面已经打开")
  354. qu.Debug("2页面打开")
  355. //TODO 2. 执行JS代码,获取列表页信息
  356. runJs := renderJavascriptCoder(loadListItemsJS, sc)
  357. tmp := map[string]bool{}
  358. //最多翻页1000页
  359. for i := 0; i < MAX_TRUN_PAGE; i++ {
  360. select {
  361. case <-exit:
  362. return
  363. default:
  364. vm.dnf.Dispatch("debug_event", "3 执行列表页JS")
  365. listResult := make(be.ResultItems, 0)
  366. err := chromedp.Run(ctx, chromedp.Tasks{
  367. chromedp.Evaluate(runJs, &listResult),
  368. })
  369. if err != nil {
  370. qu.Debug("执行JS代码失败", err.Error())
  371. vm.dnf.Dispatch("debug_event", "3 执行JS代码失败")
  372. return
  373. }
  374. //TODO 人工智能转换采集到的日期
  375. callAIState := false
  376. for j := 0; j < 5; j++ {
  377. vm.dnf.Dispatch("debug_event", "3 执行AI提取列表发布时间"+strconv.Itoa(j+1))
  378. err := ai.UpdateResultDateStr(listResult)
  379. if err == nil {
  380. callAIState = true
  381. break
  382. }
  383. }
  384. if !callAIState {
  385. vm.dnf.Dispatch("debug_event", "3 多轮次调用AI均未得到合理结果")
  386. return
  387. }
  388. //TODO 日期统计
  389. for _, r := range listResult {
  390. day, err := time.Parse("2006-01-02", r.ListPubTime)
  391. if err != nil {
  392. continue
  393. }
  394. if _, ok := tmp[r.Href]; ok { //去重
  395. continue
  396. }
  397. if day.After(startOfYesterday) && day.Before(endOfYesterday) {
  398. count += 1
  399. } else if day.Before(startOfYesterday) {
  400. return
  401. }
  402. }
  403. vm.dnf.Dispatch("debug_event", fmt.Sprintf("4 当前观测昨日信息发布量:%d ", count))
  404. //TODO 翻页
  405. //fmt.Println("下一页CSS选择器", currentSpiderConfig.ListNextPageCss)
  406. chromedp.Run(ctx, chromedp.Tasks{
  407. chromedp.Click(fmt.Sprintf(`document.querySelector("%s")`, currentSpiderConfig.ListNextPageCss),
  408. chromedp.ByJSPath),
  409. chromedp.Sleep(time.Duration(trunPageDelay) * time.Millisecond),
  410. })
  411. }
  412. }
  413. return
  414. }