vm.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. package vm
  2. import (
  3. "bytes"
  4. "context"
  5. _ "embed"
  6. "errors"
  7. "fmt"
  8. "io/ioutil"
  9. qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  10. "math/rand"
  11. "net/http"
  12. "net/url"
  13. "os"
  14. "regexp"
  15. be "spider_creator/backend"
  16. "strings"
  17. "text/template"
  18. "time"
  19. "github.com/chromedp/chromedp"
  20. "github.com/gabriel-vasile/mimetype"
  21. )
  22. const (
  23. MAX_TRUN_PAGE = 1000
  24. VERIVY_MAX_TRUN_PAGE = 3
  25. )
  26. var (
  27. Reg_Date = regexp.MustCompile(`\d`)
  28. Reg_File_ContentType = regexp.MustCompile(`(?i)^(application/(vnd\.(openxmlformats-officedocument|ms-excel)|msword|pdf)|image/(png|jpeg))`)
  29. Reg_File_Type = regexp.MustCompile(`(?i)\.(pdf|doc|docx|xls|xlsx|ppt|pptx|jpg|jpeg|png|gif|bmp|zip|rar|7z|gz|csv|swf)$`)
  30. )
  31. type (
  32. //单一任务
  33. VM struct {
  34. attachesDir string
  35. dnf be.EventNotifyFace
  36. }
  37. //执行单元
  38. Worker struct {
  39. vm *VM
  40. baseCancel, incCancel context.CancelFunc
  41. ctx context.Context
  42. js string
  43. contentDelay int64
  44. }
  45. )
  46. var (
  47. //go:embed load_list_items.js
  48. loadListItemsJS string
  49. //go:embed load_content.js
  50. loadContentJS string
  51. )
  52. // renderJavascriptCoder
  53. func renderJavascriptCoder(tpl string, sc *be.SpiderConfig) string {
  54. t, err := template.New("").Parse(tpl)
  55. if err != nil {
  56. qu.Debug("创建JS代码模板失败", err.Error())
  57. return ""
  58. }
  59. buf := new(bytes.Buffer)
  60. err = t.Execute(buf, sc)
  61. if err != nil {
  62. qu.Debug("执行JS代码模板失败", err.Error())
  63. return ""
  64. }
  65. return buf.String()
  66. }
  67. // downloadAttaches 下载附件
  68. func downloadAttaches(v *be.ResultItem, attachesDir string) {
  69. client := &http.Client{
  70. Timeout: 30 * time.Second,
  71. }
  72. for _, attach := range v.AttachLinks {
  73. qu.Debug("准备下载附件,", attach.Href, attach.Title)
  74. //if !Reg_File_Type.MatchString(attach.Title) {
  75. req, err := http.NewRequest("HEAD", attach.Href, nil)
  76. if err != nil {
  77. continue
  78. }
  79. resp, err := client.Do(req)
  80. if err != nil || resp.StatusCode != http.StatusOK {
  81. continue
  82. }
  83. ft := resp.Header.Get("Content-Type")
  84. fl := resp.Header.Get("Content-Length")
  85. qu.Debug("------------", ft, qu.IntAll(fl), qu.IntAll(fl)/1024)
  86. if !Reg_File_ContentType.MatchString(ft) || qu.IntAll(fl) < 1024*5 {
  87. continue
  88. }
  89. //}
  90. req, err = http.NewRequest("GET", attach.Href, nil)
  91. if err != nil {
  92. qu.Debug(" 下载附件 构建req 出错:", attach.Href, attach.FileName, err.Error())
  93. continue
  94. }
  95. //构造请求头
  96. var hostName string
  97. if parsedURL, err := url.Parse(attach.Href); err == nil {
  98. hostName = parsedURL.Host
  99. }
  100. req.Header.Add("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
  101. req.Header.Add("host", hostName)
  102. req.Header.Add("referer", v.Href)
  103. resp, err = client.Do(req)
  104. if err != nil {
  105. qu.Debug(" 下载附件 发送请求 出错:", attach.Href, attach.FileName, err.Error())
  106. continue
  107. }
  108. bs, err := ioutil.ReadAll(resp.Body)
  109. if err != nil {
  110. qu.Debug(" 下载附件 下载 出错:", attach.Href, attach.FileName, err.Error())
  111. continue
  112. }
  113. resp.Body.Close()
  114. //TODO 写入文件
  115. mtype := mimetype.Detect(bs)
  116. //不要HTML网页
  117. if strings.Contains(strings.ToLower(mtype.String()), "html") {
  118. qu.Debug("附件为网页类型,过滤")
  119. continue
  120. }
  121. fileName := fmt.Sprintf("%s_%04d_%04d_%04d%s", time.Now().Format("20060102150405"), rand.Intn(9999),
  122. rand.Intn(9999), rand.Intn(9999), mtype.Extension())
  123. save2File := attachesDir + "/" + fileName
  124. fo, err := os.Create(save2File)
  125. if err != nil {
  126. qu.Debug(" 下载附件 生成文件 出错:", attach.Href, attach.FileName, save2File, err.Error())
  127. continue
  128. }
  129. fo.Write(bs)
  130. fo.Close()
  131. attach.FileName = fileName
  132. attach.FilePath = save2File
  133. attach.FileType = mtype.String()
  134. attach.FileSize = fmt.Sprintf("%.02fMB", float32(len(bs))/1024/1024)
  135. }
  136. //只过滤有效的附件
  137. newAttachesLinks := make([]*be.AttachLink, 0)
  138. for _, a := range v.AttachLinks {
  139. if a.FilePath != "" {
  140. newAttachesLinks = append(newAttachesLinks, a)
  141. }
  142. }
  143. v.AttachLinks = newAttachesLinks
  144. }
  145. // trunPage 翻页,需要作检查
  146. func trunPage(sc *be.SpiderConfig, delay int64, ctx context.Context) error {
  147. if sc.ListBodyCss == "" || (sc.ListNextPageCss == "" && sc.ListTurnPageJSCode == "") {
  148. return errors.New("当前爬虫配置,不具备翻页条件")
  149. }
  150. var runJs, result string = sc.ListTurnPageJSCode, ""
  151. if be.RegSpace.ReplaceAllString(runJs, "") == "" {
  152. runJs = fmt.Sprintf(`var link=document.querySelector("%s");if(link)link.click();""`, sc.ListNextPageCss)
  153. }
  154. qu.Debug("将要执行翻页的JS代码,", runJs)
  155. //TODO 1. 获取当前列表当前页的内容快照,以便与翻页后的结果对比
  156. var result1, result2 string
  157. var checkRunJs = fmt.Sprintf(`document.querySelector("%s").outerText`, sc.ListBodyCss)
  158. qu.Debug("获取当前页内容,执行的JS", checkRunJs)
  159. //获取当前页内容
  160. err := chromedp.Run(ctx, chromedp.Tasks{
  161. chromedp.Evaluate(checkRunJs, &result1),
  162. })
  163. if err != nil {
  164. qu.Debug("翻页检查1失败,", checkRunJs)
  165. return err
  166. }
  167. qu.Debug("第一页:", result1)
  168. qu.Debug("执行翻页JS:", runJs, delay)
  169. //执行翻页
  170. if runJs != "" {
  171. //可能就没有分页
  172. err = chromedp.Run(ctx, chromedp.Tasks{
  173. chromedp.Evaluate(runJs, &result),
  174. chromedp.Sleep(time.Duration(delay) * time.Millisecond),
  175. })
  176. if err != nil {
  177. qu.Debug("翻页操作失败,", runJs)
  178. return err
  179. }
  180. } else {
  181. return errors.New("trun page error ")
  182. }
  183. //获取翻页后内容
  184. err = chromedp.Run(ctx, chromedp.Tasks{
  185. chromedp.Evaluate(checkRunJs, &result2),
  186. })
  187. qu.Debug("第二页:", result2)
  188. if err != nil {
  189. qu.Debug("翻页检查2失败,", checkRunJs)
  190. return err
  191. }
  192. if result1 == "" || result2 == "" || result1 == result2 {
  193. return errors.New("翻页失败,两次翻页获取到的列表区域块不符合要求")
  194. }
  195. return nil
  196. }