// Source path: data_processing/spider_creater

							package vm

import (
	"bytes"
	"context"
	_ "embed"
	"errors"
	"fmt"
	"io/ioutil"
	"log"
	"math/rand"
	"net/http"
	"os"
	be "spidercreator/backend"
	"strings"
	"text/template"
	"time"

	"github.com/chromedp/chromedp"

	"github.com/gabriel-vasile/mimetype"
)

const (
	// MAX_TRUN_PAGE is fixed at 1000. Judging by the name it caps how many
	// page turns a crawl may perform, but nothing in this part of the file
	// reads it — TODO confirm against the caller.
	MAX_TRUN_PAGE = 1000
)

type (
	// VM represents a single task.
	VM struct {
		// attachesDir is presumably the directory downloaded attachments are
		// written to (compare downloadAttaches' attachesDir parameter) —
		// TODO confirm.
		attachesDir string
		// dnf is an event-notification interface from the backend package.
		dnf be.EventNotifyFace
	}
	// Worker is an execution unit.
	Worker struct {
		// vm points back to the VM this worker belongs to.
		vm *VM
		// baseCancel and incCancel are context cancel functions; which
		// contexts they cancel is not visible here.
		baseCancel, incCancel context.CancelFunc
		ctx                   context.Context
		// js holds JavaScript source text; how it is used is not visible here.
		js string
		// contentDelay is a delay value; the unit is presumably milliseconds,
		// as with trunPage's delay parameter — TODO confirm.
		contentDelay int64
	}
)

var (
	// loadListItemsJS holds the contents of load_list_items.js, embedded into
	// the binary at build time.
	//
	//go:embed load_list_items.js
	loadListItemsJS string
	// loadContentJS holds the contents of load_content.js, embedded into the
	// binary at build time.
	//
	//go:embed load_content.js
	loadContentJS string
)

// renderJavascriptCoder parses tpl as a text/template and executes it with sc
// as the template data. It returns the rendered text; when parsing or
// execution fails the error is logged and an empty string is returned.
func renderJavascriptCoder(tpl string, sc *be.SpiderConfig) string {
	parsed, parseErr := template.New("").Parse(tpl)
	if parseErr != nil {
		log.Println("创建JS代码模板失败", parseErr.Error())
		return ""
	}

	var rendered bytes.Buffer
	if execErr := parsed.Execute(&rendered, sc); execErr != nil {
		log.Println("执行JS代码模板失败", execErr.Error())
		return ""
	}
	return rendered.String()
}

// downloadAttaches downloads every attachment referenced by v.AttachLinks into
// attachesDir and then narrows v.AttachLinks to the links that were saved.
//
// For each link it sends a GET request (30s client timeout, browser-like
// user-agent), reads the whole body into memory, detects the MIME type from
// the bytes and writes the payload to a file named
// <timestamp>_<rand>_<rand>_<rand><ext> inside attachesDir. On success the
// link's FileName, FilePath, FileType and FileSize are filled in.
//
// A link is skipped when the request cannot be built or sent, the body cannot
// be read, the server answers with a non-2xx status, the payload is detected
// as HTML, or the file cannot be written completely. Failures are logged and
// never returned: the function is best-effort. attachesDir is used as given
// and is not created here.
func downloadAttaches(v *be.ResultItem, attachesDir string) {
	client := &http.Client{
		Timeout: 30 * time.Second,
	}
	for _, attach := range v.AttachLinks {
		log.Println("准备下载附件,", attach.Href, attach.Title)
		req, err := http.NewRequest("GET", attach.Href, nil)
		if err != nil {
			log.Println(" 下载附件 构建req 出错:", attach.Href, attach.FileName, err.Error())
			continue
		}
		req.Header.Add("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
		resp, err := client.Do(req)
		if err != nil {
			log.Println(" 下载附件 发送请求 出错:", attach.Href, attach.FileName, err.Error())
			continue
		}
		// Close right after the read, before looking at the error, so the
		// body is released on every path (a failed read used to leak it).
		bs, err := ioutil.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			log.Println(" 下载附件 下载 出错:", attach.Href, attach.FileName, err.Error())
			continue
		}
		// An error response (404, 500, ...) is not the attachment; without
		// this check a non-HTML error body would be stored as if it were.
		if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
			log.Println(" 下载附件 响应状态异常:", attach.Href, attach.FileName, resp.Status)
			continue
		}
		mtype := mimetype.Detect(bs)
		// HTML pages are not wanted as attachments.
		if strings.Contains(strings.ToLower(mtype.String()), "html") {
			continue
		}
		fileName := fmt.Sprintf("%s_%04d_%04d_%04d%s", time.Now().Format("20060102150405"), rand.Intn(9999),
			rand.Intn(9999), rand.Intn(9999), mtype.Extension())
		save2File := attachesDir + "/" + fileName
		// os.WriteFile reports create, write and close errors, so a truncated
		// file is never recorded as a valid attachment.
		if err := os.WriteFile(save2File, bs, 0666); err != nil {
			log.Println(" 下载附件 生成文件 出错:", attach.Href, attach.FileName, save2File, err.Error())
			// Best-effort cleanup of a partially written file; the write
			// error has already been reported above.
			_ = os.Remove(save2File)
			continue
		}
		attach.FileName = fileName
		attach.FilePath = save2File
		attach.FileType = mtype.String()
		attach.FileSize = fmt.Sprintf("%.02fMB", float32(len(bs))/1024/1024)
	}
	// Keep only the attachments that now have a file path. The slice stays
	// non-nil even when empty, as before.
	saved := make([]*be.AttachLink, 0, len(v.AttachLinks))
	for _, a := range v.AttachLinks {
		if a.FilePath != "" {
			saved = append(saved, a)
		}
	}
	v.AttachLinks = saved
}

// trunPage turns the list to its next page and verifies that the list changed.
//
// It requires sc.ListBodyCss plus either sc.ListTrunPageJSCode (a custom
// script, used when set) or sc.ListNextPageCss (a selector whose first match
// is clicked); otherwise it returns an error without evaluating anything.
// The outerText of the element matched by sc.ListBodyCss is captured before
// and after the page-turn script runs, with a pause of delay milliseconds
// after the script. The turn counts as successful only when both snapshots
// are non-empty and differ from each other.
//
// ctx is the context handed to chromedp.Run for every evaluation. Errors from
// chromedp are returned wrapped with the step that failed.
func trunPage(sc *be.SpiderConfig, delay int64, ctx context.Context) error {
	if sc.ListBodyCss == "" || (sc.ListNextPageCss == "" && sc.ListTrunPageJSCode == "") {
		return errors.New("当前爬虫配置，不具备翻页条件")
	}

	// NOTE(review): both selectors are spliced verbatim into double-quoted JS
	// string literals, so a selector containing a double quote or backslash
	// yields a broken or different script. Left unchanged because existing
	// configs may already pre-escape for this.
	//
	// After the guard above, runJs is always non-empty: either the custom
	// script or the generated click script.
	runJs := sc.ListTrunPageJSCode
	if runJs == "" {
		runJs = fmt.Sprintf(`var link=document.querySelector("%s");if(link)link.click();""`, sc.ListNextPageCss)
	}
	log.Println("将要执行翻页的JS代码,", runJs)

	checkRunJs := fmt.Sprintf(`document.querySelector("%s").outerText`, sc.ListBodyCss)
	log.Println("检查翻页是否成功，执行的JS", checkRunJs)

	// Snapshot the list area before turning the page.
	var before, after, result string
	if err := chromedp.Run(ctx, chromedp.Evaluate(checkRunJs, &before)); err != nil {
		log.Println("翻页检查1失败，", checkRunJs, err.Error())
		return fmt.Errorf("翻页检查1失败: %w", err)
	}

	// Run the page-turn script, then wait for the configured delay.
	err := chromedp.Run(ctx,
		chromedp.Evaluate(runJs, &result),
		chromedp.Sleep(time.Duration(delay)*time.Millisecond),
	)
	if err != nil {
		log.Println("翻页操作失败，", runJs, err.Error())
		return fmt.Errorf("翻页操作失败: %w", err)
	}

	// Snapshot the list area again for comparison.
	if err := chromedp.Run(ctx, chromedp.Evaluate(checkRunJs, &after)); err != nil {
		log.Println("翻页检查2失败，", checkRunJs, err.Error())
		return fmt.Errorf("翻页检查2失败: %w", err)
	}

	if before == "" || after == "" || before == after {
		return errors.New("翻页失败，两次翻页获取到的列表区域块不符合要求")
	}
	return nil
}