package vm

import (
	"bytes"
	"context"
	_ "embed"
	"errors"
	"fmt"
	"io/ioutil"
	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
	"math/rand"
	"net/http"
	"os"
	be "spider_creator/backend"
	"strings"
	"text/template"
	"time"

	"github.com/chromedp/chromedp"
	"github.com/gabriel-vasile/mimetype"
)

const (
	// MAX_TRUN_PAGE is not referenced in this part of the file.
	// NOTE(review): presumably the upper bound on page turns per task — confirm
	// against the code that reads it.
	MAX_TRUN_PAGE = 1000
)

type (
	// VM is a single task.
	VM struct {
		attachesDir string             // NOTE(review): presumably the directory attachments are saved to — confirm
		dnf         be.EventNotifyFace // NOTE(review): presumably the event notification sink — confirm
	}
	// Worker is an execution unit bound to a VM.
	Worker struct {
		vm                    *VM
		baseCancel, incCancel context.CancelFunc
		ctx                   context.Context
		js                    string
		contentDelay          int64
	}
)

var (
	//go:embed load_list_items.js
	loadListItemsJS string
	//go:embed load_content.js
	loadContentJS string
)

// renderJavascriptCoder parses tpl as a text/template and executes it with sc
// as the data. It returns the rendered text, or "" when parsing or execution
// fails (the failure is only logged).
func renderJavascriptCoder(tpl string, sc *be.SpiderConfig) string {
	t, err := template.New("").Parse(tpl)
	if err != nil {
		qu.Debug("创建JS代码模板失败", err.Error())
		return ""
	}
	buf := new(bytes.Buffer)
	err = t.Execute(buf, sc)
	if err != nil {
		qu.Debug("执行JS代码模板失败", err.Error())
		return ""
	}
	return buf.String()
}

// downloadAttaches downloads every link in v.AttachLinks into attachesDir.
//
// Each link is fetched with a 30-second client timeout. Responses with a
// non-2xx status, and bodies whose detected MIME type contains "html", are
// skipped. A successfully stored link gets FileName, FilePath, FileType and
// FileSize filled in. Failures are logged and the link is skipped; nothing is
// returned to the caller. On return v.AttachLinks holds only the links whose
// FilePath is non-empty.
func downloadAttaches(v *be.ResultItem, attachesDir string) {
	client := &http.Client{
		Timeout: 30 * time.Second,
	}
	for _, attach := range v.AttachLinks {
		qu.Debug("准备下载附件,", attach.Href, attach.Title)
		req, err := http.NewRequest("GET", attach.Href, nil)
		if err != nil {
			qu.Debug(" 下载附件 构建req 出错:", attach.Href, attach.FileName, err.Error())
			continue
		}
		req.Header.Add("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
		resp, err := client.Do(req)
		if err != nil {
			qu.Debug(" 下载附件 发送请求 出错:", attach.Href, attach.FileName, err.Error())
			continue
		}
		// An error status means the body is an error payload, not the attachment.
		if resp.StatusCode < 200 || resp.StatusCode >= 300 {
			resp.Body.Close()
			qu.Debug(" 下载附件 响应状态 异常:", attach.Href, attach.FileName, resp.Status)
			continue
		}
		bs, err := ioutil.ReadAll(resp.Body)
		// Close before looking at err so a failed read does not leak the connection.
		resp.Body.Close()
		if err != nil {
			qu.Debug(" 下载附件 下载 出错:", attach.Href, attach.FileName, err.Error())
			continue
		}
		mtype := mimetype.Detect(bs)
		// HTML pages are not attachments.
		if strings.Contains(strings.ToLower(mtype.String()), "html") {
			continue
		}
		// Timestamp plus three random numbers keeps names distinct within a run.
		fileName := fmt.Sprintf("%s_%04d_%04d_%04d%s", time.Now().Format("20060102150405"), rand.Intn(9999), rand.Intn(9999),
			rand.Intn(9999), mtype.Extension())
		save2File := attachesDir + "/" + fileName
		if err := os.WriteFile(save2File, bs, 0666); err != nil {
			// Best effort: do not leave a truncated file behind.
			_ = os.Remove(save2File)
			qu.Debug(" 下载附件 生成文件 出错:", attach.Href, attach.FileName, save2File, err.Error())
			continue
		}
		attach.FileName = fileName
		attach.FilePath = save2File
		attach.FileType = mtype.String()
		attach.FileSize = fmt.Sprintf("%.02fMB", float32(len(bs))/1024/1024)
	}
	// Keep only the attachments that ended up with a file path.
	newAttachesLinks := make([]*be.AttachLink, 0)
	for _, a := range v.AttachLinks {
		if a.FilePath != "" {
			newAttachesLinks = append(newAttachesLinks, a)
		}
	}
	v.AttachLinks = newAttachesLinks
}

// trunPage turns the list to the next page and verifies that the list changed.
//
// It runs sc.ListTurnPageJSCode when set, otherwise a script that clicks the
// element matching sc.ListNextPageCss, then sleeps delay milliseconds. The
// outerText of the element matching sc.ListBodyCss is read before and after;
// the turn counts as failed when either snapshot is empty or both are equal.
//
// It returns an error when sc lacks ListBodyCss or lacks both paging settings,
// when any chromedp evaluation fails, or when the snapshot comparison fails.
//
// NOTE(review): both selectors are interpolated verbatim inside double-quoted
// JS string literals, so a selector containing `"` produces invalid JS.
func trunPage(sc *be.SpiderConfig, delay int64, ctx context.Context) error {
	if sc.ListBodyCss == "" || (sc.ListNextPageCss == "" && sc.ListTurnPageJSCode == "") {
		return errors.New("当前爬虫配置,不具备翻页条件")
	}
	// After the guard above at least one paging setting is present, so runJs is
	// never empty past this point.
	runJs := sc.ListTurnPageJSCode
	if runJs == "" {
		runJs = fmt.Sprintf(`var link=document.querySelector("%s");if(link)link.click();""`, sc.ListNextPageCss)
	}
	qu.Debug("将要执行翻页的JS代码,", runJs)

	// Snapshot the list area now so it can be compared with the state after paging.
	var result, result1, result2 string
	checkRunJs := fmt.Sprintf(`document.querySelector("%s").outerText`, sc.ListBodyCss)
	qu.Debug("获取当前页内容,执行的JS", checkRunJs)
	err := chromedp.Run(ctx, chromedp.Tasks{
		chromedp.Evaluate(checkRunJs, &result1),
	})
	if err != nil {
		qu.Debug("翻页检查1失败,", checkRunJs)
		return err
	}
	qu.Debug("第一页:", result1)

	// Trigger the page turn, then wait for the list to refresh.
	qu.Debug("执行翻页JS:", runJs, delay)
	err = chromedp.Run(ctx, chromedp.Tasks{
		chromedp.Evaluate(runJs, &result),
		chromedp.Sleep(time.Duration(delay) * time.Millisecond),
	})
	if err != nil {
		qu.Debug("翻页操作失败,", runJs)
		return err
	}
	qu.Debug("--------------------------")

	// Snapshot the list area again after paging.
	err = chromedp.Run(ctx, chromedp.Tasks{
		chromedp.Evaluate(checkRunJs, &result2),
	})
	if err != nil {
		qu.Debug("翻页检查2失败,", checkRunJs)
		return err
	}
	qu.Debug("第二页:", result2)

	if result1 == "" || result2 == "" || result1 == result2 {
		return errors.New("翻页失败,两次翻页获取到的列表区域块不符合要求")
	}
	return nil
}