vm.go 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. package vm
  2. import (
  3. "bytes"
  4. "context"
  5. _ "embed"
  6. "errors"
  7. "fmt"
  8. "io/ioutil"
  9. "log"
  10. "math/rand"
  11. "net/http"
  12. "os"
  13. be "spidercreator/backend"
  14. "strings"
  15. "text/template"
  16. "time"
  17. "github.com/chromedp/chromedp"
  18. "github.com/gabriel-vasile/mimetype"
  19. )
  // MAX_TRUN_PAGE is a page-turn limit with the value 1000.
  //
  // NOTE(review): the constant is not referenced in the visible part of this
  // file; presumably it caps how many times a list may be paged — confirm at
  // the use site.
  const MAX_TRUN_PAGE = 1000
  23. type (
  24. //单一任务
  25. VM struct {
  26. attachesDir string
  27. dnf be.EventNotifyFace
  28. }
  29. //执行单元
  30. Worker struct {
  31. vm *VM
  32. baseCancel, incCancel context.CancelFunc
  33. ctx context.Context
  34. js string
  35. contentDelay int64
  36. }
  37. )
  38. var (
  39. //go:embed load_list_items.js
  40. loadListItemsJS string
  41. //go:embed load_content.js
  42. loadContentJS string
  43. )
  44. // renderJavascriptCoder
  45. func renderJavascriptCoder(tpl string, sc *be.SpiderConfig) string {
  46. t, err := template.New("").Parse(tpl)
  47. if err != nil {
  48. log.Println("创建JS代码模板失败", err.Error())
  49. return ""
  50. }
  51. buf := new(bytes.Buffer)
  52. err = t.Execute(buf, sc)
  53. if err != nil {
  54. log.Println("执行JS代码模板失败", err.Error())
  55. return ""
  56. }
  57. return buf.String()
  58. }
  59. // downloadAttaches 下载附件
  60. func downloadAttaches(v *be.ResultItem, attachesDir string) {
  61. client := &http.Client{
  62. Timeout: 30 * time.Second,
  63. }
  64. for _, attach := range v.AttachLinks {
  65. log.Println("准备下载附件,", attach.Href, attach.Title)
  66. req, err := http.NewRequest("GET", attach.Href, nil)
  67. if err != nil {
  68. log.Println(" 下载附件 构建req 出错:", attach.Href, attach.FileName, err.Error())
  69. continue
  70. }
  71. req.Header.Add("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
  72. resp, err := client.Do(req)
  73. if err != nil {
  74. log.Println(" 下载附件 发送请求 出错:", attach.Href, attach.FileName, err.Error())
  75. continue
  76. }
  77. bs, err := ioutil.ReadAll(resp.Body)
  78. if err != nil {
  79. log.Println(" 下载附件 下载 出错:", attach.Href, attach.FileName, err.Error())
  80. continue
  81. }
  82. resp.Body.Close()
  83. //TODO 写入文件
  84. mtype := mimetype.Detect(bs)
  85. //不要HTML网页
  86. if strings.Contains(strings.ToLower(mtype.String()), "html") {
  87. continue
  88. }
  89. fileName := fmt.Sprintf("%s_%04d_%04d_%04d%s", time.Now().Format("20060102150405"), rand.Intn(9999),
  90. rand.Intn(9999), rand.Intn(9999), mtype.Extension())
  91. save2File := attachesDir + "/" + fileName
  92. fo, err := os.Create(save2File)
  93. if err != nil {
  94. log.Println(" 下载附件 生成文件 出错:", attach.Href, attach.FileName, save2File, err.Error())
  95. continue
  96. }
  97. fo.Write(bs)
  98. fo.Close()
  99. attach.FileName = fileName
  100. attach.FilePath = save2File
  101. attach.FileType = mtype.String()
  102. attach.FileSize = fmt.Sprintf("%.02fMB", float32(len(bs))/1024/1024)
  103. }
  104. //只过滤有效的附件
  105. newAttachesLinks := make([]*be.AttachLink, 0)
  106. for _, a := range v.AttachLinks {
  107. if a.FilePath != "" {
  108. newAttachesLinks = append(newAttachesLinks, a)
  109. }
  110. }
  111. v.AttachLinks = newAttachesLinks
  112. }
  113. // trunPage 翻页,需要作检查
  114. func trunPage(sc *be.SpiderConfig, delay int64, ctx context.Context) error {
  115. if sc.ListBodyCss == "" || (sc.ListNextPageCss == "" && sc.ListTrunPageJSCode == "") {
  116. return errors.New("当前爬虫配置,不具备翻页条件")
  117. }
  118. var runJs, result string = sc.ListTrunPageJSCode, ""
  119. if runJs == "" {
  120. runJs = fmt.Sprintf(`var link=document.querySelector("%s");if(link)link.click();""`, sc.ListNextPageCss)
  121. }
  122. log.Println("将要执行翻页的JS代码,", runJs)
  123. //TODO 1. 获取当前列表当前页的内容快照,以便与翻页后的结果对比
  124. var result1, result2 string
  125. var checkRunJs = fmt.Sprintf(`document.querySelector("%s").outerText`, sc.ListBodyCss)
  126. log.Println("检查翻页是否成功,执行的JS", checkRunJs)
  127. err := chromedp.Run(ctx, chromedp.Tasks{
  128. chromedp.Evaluate(checkRunJs, &result1),
  129. })
  130. if err != nil {
  131. log.Println("翻页检查1失败,", checkRunJs)
  132. return err
  133. }
  134. if runJs != "" {
  135. //可能就没有分页
  136. err = chromedp.Run(ctx, chromedp.Tasks{
  137. chromedp.Evaluate(runJs, &result),
  138. chromedp.Sleep(time.Duration(delay) * time.Millisecond),
  139. })
  140. if err != nil {
  141. log.Println("翻页操作失败,", runJs)
  142. return err
  143. }
  144. } else {
  145. return errors.New("trun page error ")
  146. }
  147. err = chromedp.Run(ctx, chromedp.Tasks{
  148. chromedp.Evaluate(checkRunJs, &result2),
  149. })
  150. if err != nil {
  151. log.Println("翻页检查2失败,", checkRunJs)
  152. return err
  153. }
  154. if result1 == "" || result2 == "" || result1 == result2 {
  155. return errors.New("翻页失败,两次翻页获取到的列表区域块不符合要求")
  156. }
  157. return nil
  158. }