123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171 |
- package vm
- import (
- "bytes"
- "context"
- _ "embed"
- "errors"
- "fmt"
- "io/ioutil"
- qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "math/rand"
- "net/http"
- "os"
- be "spider_creator/backend"
- "strings"
- "text/template"
- "time"
- "github.com/chromedp/chromedp"
- "github.com/gabriel-vasile/mimetype"
- )
// MAX_TRUN_PAGE is the page-turn limit constant (1000).
// NOTE(review): its consumers are outside this section; presumably it caps how
// many list pages a single run may advance — confirm at the call sites.
const MAX_TRUN_PAGE = 1000
- type (
- //单一任务
- VM struct {
- attachesDir string
- dnf be.EventNotifyFace
- }
- //执行单元
- Worker struct {
- vm *VM
- baseCancel, incCancel context.CancelFunc
- ctx context.Context
- js string
- contentDelay int64
- }
- )
- var (
- //go:embed load_list_items.js
- loadListItemsJS string
- //go:embed load_content.js
- loadContentJS string
- )
- // renderJavascriptCoder
- func renderJavascriptCoder(tpl string, sc *be.SpiderConfig) string {
- t, err := template.New("").Parse(tpl)
- if err != nil {
- qu.Debug("创建JS代码模板失败", err.Error())
- return ""
- }
- buf := new(bytes.Buffer)
- err = t.Execute(buf, sc)
- if err != nil {
- qu.Debug("执行JS代码模板失败", err.Error())
- return ""
- }
- return buf.String()
- }
- // downloadAttaches 下载附件
- func downloadAttaches(v *be.ResultItem, attachesDir string) {
- client := &http.Client{
- Timeout: 30 * time.Second,
- }
- for _, attach := range v.AttachLinks {
- qu.Debug("准备下载附件,", attach.Href, attach.Title)
- req, err := http.NewRequest("GET", attach.Href, nil)
- if err != nil {
- qu.Debug(" 下载附件 构建req 出错:", attach.Href, attach.FileName, err.Error())
- continue
- }
- req.Header.Add("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
- resp, err := client.Do(req)
- if err != nil {
- qu.Debug(" 下载附件 发送请求 出错:", attach.Href, attach.FileName, err.Error())
- continue
- }
- bs, err := ioutil.ReadAll(resp.Body)
- if err != nil {
- qu.Debug(" 下载附件 下载 出错:", attach.Href, attach.FileName, err.Error())
- continue
- }
- resp.Body.Close()
- //TODO 写入文件
- mtype := mimetype.Detect(bs)
- //不要HTML网页
- if strings.Contains(strings.ToLower(mtype.String()), "html") {
- continue
- }
- fileName := fmt.Sprintf("%s_%04d_%04d_%04d%s", time.Now().Format("20060102150405"), rand.Intn(9999),
- rand.Intn(9999), rand.Intn(9999), mtype.Extension())
- save2File := attachesDir + "/" + fileName
- fo, err := os.Create(save2File)
- if err != nil {
- qu.Debug(" 下载附件 生成文件 出错:", attach.Href, attach.FileName, save2File, err.Error())
- continue
- }
- fo.Write(bs)
- fo.Close()
- attach.FileName = fileName
- attach.FilePath = save2File
- attach.FileType = mtype.String()
- attach.FileSize = fmt.Sprintf("%.02fMB", float32(len(bs))/1024/1024)
- }
- //只过滤有效的附件
- newAttachesLinks := make([]*be.AttachLink, 0)
- for _, a := range v.AttachLinks {
- if a.FilePath != "" {
- newAttachesLinks = append(newAttachesLinks, a)
- }
- }
- v.AttachLinks = newAttachesLinks
- }
- // trunPage 翻页,需要作检查
- func trunPage(sc *be.SpiderConfig, delay int64, ctx context.Context) error {
- if sc.ListBodyCss == "" || (sc.ListNextPageCss == "" && sc.ListTurnPageJSCode == "") {
- return errors.New("当前爬虫配置,不具备翻页条件")
- }
- var runJs, result string = sc.ListTurnPageJSCode, ""
- if runJs == "" {
- runJs = fmt.Sprintf(`var link=document.querySelector("%s");if(link)link.click();""`, sc.ListNextPageCss)
- }
- qu.Debug("将要执行翻页的JS代码,", runJs)
- //TODO 1. 获取当前列表当前页的内容快照,以便与翻页后的结果对比
- var result1, result2 string
- var checkRunJs = fmt.Sprintf(`document.querySelector("%s").outerText`, sc.ListBodyCss)
- qu.Debug("检查翻页是否成功,执行的JS", checkRunJs)
- err := chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(checkRunJs, &result1),
- })
- if err != nil {
- qu.Debug("翻页检查1失败,", checkRunJs)
- return err
- }
- qu.Debug("第一页:", result1)
- qu.Debug("runJs:", runJs, delay)
- if runJs != "" {
- //可能就没有分页
- err = chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(runJs, &result),
- chromedp.Sleep(time.Duration(delay) * time.Millisecond),
- })
- if err != nil {
- qu.Debug("翻页操作失败,", runJs)
- return err
- }
- } else {
- return errors.New("trun page error ")
- }
- qu.Debug("--------------------------")
- err = chromedp.Run(ctx, chromedp.Tasks{
- chromedp.Evaluate(checkRunJs, &result2),
- })
- qu.Debug("第二页:", result2)
- if err != nil {
- qu.Debug("翻页检查2失败,", checkRunJs)
- return err
- }
- if result1 == "" || result2 == "" || result1 == result2 {
- return errors.New("翻页失败,两次翻页获取到的列表区域块不符合要求")
- }
- return nil
- }
|