123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327 |
- package main
- import (
- "context"
- "fmt"
- "github.com/chromedp/cdproto/cdp"
- "github.com/chromedp/chromedp"
- "github.com/donnie4w/go-logger/logger"
- "time"
- )
// Action names accepted in the Actions.Action field and dispatched on by
// ChromedpTask.AssembleTasks.
const (
	ActionTypeNavigate    string = "navigate"    // open a URL
	ActionTypeClick       string = "click"       // click an element
	ActionTypeOuterHTML   string = "outerhtml"   // return an element's HTML
	ActionTypeEvaluate    string = "evaluate"    // execute JavaScript
	ActionTypeWaitReady   string = "waitready"   // wait until the element has loaded
	ActionTypeWaitVisible string = "waitvisible" // wait until the element is visible
	ActionTypeSleep       string = "wait"        // pause for a while
	ActionTypeNodes       string = "listhref"    // match all <a> link nodes
	ActionTypeList        string = "listhtml"    // match the list information to keep
	ActionTypeChangeIp    string = "changeip"    // switch IP

	// Commented-out action names:
	//ActionTypeInput string = "input"
	//ActionTypeScroll string = "scroll"
	//ActionTypeAssert string = "assert"
	//ActionTypeClose string = "close"
)

// GetListHtmlJS is a fmt template for a JavaScript snippet that looks a node up
// by XPath and yields the outerHTML of that node or of one of its ancestors
// (an empty string when the XPath matches nothing).
//
// It carries two %s verbs: the first is spliced directly after `node` (callers
// in this file fill it with zero or more ".parentNode" hops), the second is the
// XPath handed to getParentHTML.
const GetListHtmlJS string = `
function getParentHTML(xpath) {
let result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
let node = result.singleNodeValue;
return node ? node%s.outerHTML : '';
}
getParentHTML("%s");`
- //选择器
- var (
- Selectors = map[string]chromedp.QueryOption{
- //"": chromedp.BySearch, //默认值
- "ByQuery": chromedp.ByQuery, //根据document.querySelector的规则选择元素,返回单个节点
- "ByID": chromedp.ByID, //只id来选择元素
- "ByJsPath": chromedp.ByJSPath, //执行js
- "BySearch": chromedp.BySearch, //如果不写,默认会使用这个选择器,document.performSearch()选择元素,效果等同于`document.querySelector(...)`
- "ByQueryAll": chromedp.ByQueryAll, //根据document.querySelectorAll返回所有匹配的节点
- "ByNodeID": chromedp.ByNodeID, //检索特定节点(必须先有分配的节点ID),注意:必须与 []cdp.NodeID 一起使用
- "NodeNotPresent": chromedp.NodeNotPresent, //用于等待直到不存在与查询匹配的元素
- "NodeNotVisible": chromedp.NodeNotVisible, //用于等待所有查询的元素节点都已被浏览器发送并且不可见
- "NodeVisible": chromedp.NodeVisible, //用于等待所有查询的元素节点都已被浏览器发送并可见
- "NodeReady": chromedp.NodeReady, //用于等待浏览器发送所有查询的元素节点
- "NodeSelected": chromedp.NodeSelected, //用于等待浏览器发送所有查询的元素节点并选择它们(即具有“已选择”属性)
- }
- )
type (
	// ChromedpTask is the JSON description of one download job: a list of
	// browser actions plus, for sequential ("flow") collection, a second list
	// of actions that is run on each detail page.
	ChromedpTask struct {
		//Stype string `json:"stype"` // whether the request downloads a list page, a detail page, or both (list, detail, list_detail)

		Flow     bool      `json:"flow"`     // whether this is a sequential collection
		RunRedis bool      `json:"runredis"` // whether to de-duplicate through redis (sequential collection only)
		TimeOut  int64     `json:"timeout"`  // timeout for Actions
		Actions  []Actions `json:"actions"`  // the action list

		// Detail-page parameters, needed for sequential collection.
		OtherTimeOut int64     `json:"othertimeout"` // timeout for OtherActions
		OtherActions []Actions `json:"otheractions"` // the detail-page action list
	}

	// Actions is a single step of a ChromedpTask.
	Actions struct {
		Action   string      `json:"action"`   // action to perform (one of the ActionType* names)
		Param    interface{} `json:"param"`    // the action's argument, e.g. a URL, selector expression or number
		Selector string      `json:"selector"` // key into Selectors
	}
)
- //执行动作流程,下载html
- func DownloadHtmlByChromedp(task *ChromedpTask) (result []string) {
- defer Catch()
- if len(task.Actions) == 0 {
- return
- }
- //获取一个浏览器实例
- browser := <-BrowserGroup
- defer func() {
- BrowserGroup <- browser.Revert()
- }()
- //封装浏览器动作
- acts, _ := task.AssembleTasks(task.Actions, &result, nil, browser)
- //执行动作,下载
- browser.RunWithTimeout(func(ctx context.Context, exit chan<- bool) {
- defer func() {
- exit <- true
- }()
- ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端
- err := chromedp.Run(ctx, acts...)
- if err != nil {
- logger.Info("Chromedp Run Task Error :", err)
- //if strings.Contains(err.Error(), "ERR_PROXY_CONNECTION_FAILED") { //代理异常,切换IP
- browser.ChangeIp()
- //}
- }
- }, task.TimeOut)
- return
- }
- //执行动作流程,下载html
- func DownloadHtmlByChromedpForFlow(task *ChromedpTask) (result []string) {
- defer Catch()
- if len(task.Actions) == 0 || len(task.OtherActions) == 0 {
- return
- }
- if task.TimeOut == 0 {
- task.TimeOut = Int64All(ChromedpConfig["timeout"])
- }
- //获取一个浏览器实例
- browser := <-BrowserGroup
- defer func() {
- BrowserGroup <- browser.Revert()
- }()
- result = []string{} //记录动作集返回的html
- resultMap := map[int64]string{} //记录列表页nodes信息,用于匹配详情页tab
- //封装浏览器动作
- acts, _ := task.AssembleTasks(task.Actions, &result, resultMap, browser)
- //执行动作,下载
- browser.RunWithTimeout(func(ctx context.Context, exit chan<- bool) {
- defer func() {
- exit <- true
- }()
- ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端
- //执行动作
- err := chromedp.Run(ctx, acts...)
- if err != nil {
- logger.Info("Chromedp Run Task Error :", err)
- //if strings.Contains(err.Error(), "ERR_PROXY_CONNECTION_FAILED") { //代理异常
- browser.ChangeIp()
- //}
- }
- }, task.TimeOut)
- return
- }
- //执行动作流程,下载html
- //func DownloadHtmlByChromedpForFlow_back(task *ChromedpTask) (ret []string) {
- // defer util.Catch()
- // if len(task.Actions) == 0 || len(task.OtherActions) == 0 {
- // return
- // }
- // //获取一个浏览器实例
- // browser := <-BrowserGroup
- // defer func() {
- // BrowserGroup <- browser
- // }()
- // result := &[2]*[]string{} //记录动作集返回的html
- // resultMap := map[int64]string{} //记录列表页nodes信息,用于匹配详情页tab
- // //封装浏览器动作
- // acts, oacts, _ := AssembleTasks(task, result, resultMap, browser)
- // //执行动作,下载
- // fmt.Println("Running...")
- // browser.RunWithTimeout(func(ctx context.Context, exit chan<- bool) {
- // //defer func() {
- // // exit <- true
- // //}()
- // ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端
- // //执行列表页动作
- // err := chromedp.Run(ctx, acts...)
- // if err != nil {
- // fmt.Println("Chromedp Run Error :", err)
- // }
- // //获取详情页tab信息
- // targets, _ := chromedp.Targets(ctx) //targets无序不能一一对应nodes顺序
- // for _, target := range targets {
- // if target.OpenerFrameID == "" { //判断是否是子标签页
- // continue
- // }
- // fmt.Println("target id:", target.TargetID, target.URL, target.OpenerFrameID, target.OpenerID, target.BrowserContextID)
- // continue
- // newCtx, _ := chromedp.NewContext(ctx, chromedp.WithTargetID(target.TargetID)) //新标签页ctx
- // fmt.Println("11111111111111111", target.URL)
- // fmt.Println("==============================================================================================")
- // continue
- // err := chromedp.Run(newCtx, oacts...)
- // if err != nil {
- // fmt.Println("Chromedp Run Son Tab Error :", err)
- // }
- // }
- // }, task.TimeOut)
- // fmt.Println("Finish...")
- // return
- //}
- //生成动作集
- func (task *ChromedpTask) AssembleTasks(actions []Actions, result *[]string, resultMap map[int64]string, browser *Browser) (acts []chromedp.Action, nodes *[]*cdp.Node) {
- defer Catch()
- nodes = &[]*cdp.Node{}
- for _, act := range actions {
- switch act.Action {
- case ActionTypeNavigate: //打开网页
- acts = append(acts, chromedp.Navigate(fmt.Sprint(act.Param)))
- case ActionTypeClick: //点击
- acts = append(acts, chromedp.Click(fmt.Sprint(act.Param), Selectors[act.Selector]))
- case ActionTypeOuterHTML: //输出html
- acts = append(acts, OuterHTMLFunc(fmt.Sprint(act.Param), act.Selector, result))
- case ActionTypeEvaluate: //执行javascript
- acts = append(acts, EvaluateFunc(fmt.Sprint(act.Param), result))
- case ActionTypeWaitReady: //等待元素加载完毕
- acts = append(acts, chromedp.WaitReady(fmt.Sprint(act.Param), Selectors[act.Selector]))
- case ActionTypeWaitVisible: //等待元素可见
- acts = append(acts, chromedp.WaitVisible(fmt.Sprint(act.Param), Selectors[act.Selector]))
- case ActionTypeNodes: //匹配节点元素
- acts = append(acts, chromedp.Nodes(fmt.Sprint(act.Param), nodes))
- case ActionTypeList: //匹配列表要保留的信息
- parentNodeJs := ""
- for i := 0; i < IntAll(act.Param); i++ {
- parentNodeJs += ".parentNode"
- }
- htmlJsFormat := fmt.Sprintf(GetListHtmlJS, parentNodeJs, "%s")
- acts = append(acts, task.ClickNodesAndGetHtml(browser, nodes, htmlJsFormat, result, resultMap))
- case ActionTypeSleep:
- acts = append(acts, CdpSleep(IntAll(act.Param)))
- case ActionTypeChangeIp:
- acts = append(acts, ChangeIp(browser))
- }
- }
- return
- }
- //切换IP
- func ChangeIp(b *Browser) chromedp.ActionFunc {
- defer Catch()
- return func(ctx context.Context) (err error) {
- b.ChangeIp()
- return
- }
- }
- func CdpSleep(sleep int) chromedp.Action {
- defer Catch()
- if sleep < 1 {
- sleep = 1
- }
- return chromedp.Sleep(time.Duration(sleep) * time.Second)
- }
- //OuterHTML获取html
- func OuterHTMLFunc(sel, selector string, result *[]string) chromedp.ActionFunc {
- defer Catch()
- return func(ctx context.Context) (err error) {
- var html string
- //chromedp.OuterHTML(sel, &html).Do(ctx)
- chromedp.OuterHTML(sel, &html, Selectors[selector]).Do(ctx)
- *result = append(*result, html)
- return
- }
- }
- //Evaluate获取js执行结果
- func EvaluateFunc(sel string, result *[]string) chromedp.ActionFunc {
- defer Catch()
- return func(ctx context.Context) (err error) {
- var res string
- chromedp.Evaluate(sel, &res).Do(ctx)
- *result = append(*result, res)
- return
- }
- }
- //点击所有nodes节点并返回html
- func (task *ChromedpTask) ClickNodesAndGetHtml(browser *Browser, nodes *[]*cdp.Node, jsFormat string, result *[]string, resultMap map[int64]string) chromedp.ActionFunc {
- defer Catch()
- return func(ctx context.Context) (err error) {
- detailResult := []string{} //详情页html信息
- targetIdMap := map[string]string{}
- oacts, _ := task.AssembleTasks(task.OtherActions, &detailResult, nil, browser) //组合下载详情页动作集
- for i, node := range *nodes {
- chromedp.MouseClickNode(node).Do(ctx) //点击
- var listHtml string
- xpath := node.FullXPath()
- js := fmt.Sprintf(jsFormat, xpath)
- chromedp.Evaluate(js, &listHtml).Do(ctx)
- if listHtml != "" {
- //resultMap[node.NodeID.Int64()] = "<listhtml>" + html + "</listhtml>"
- listHtml = "<listhtml>" + listHtml + "</listhtml>"
- //获取多有tab信息
- targets, _ := chromedp.Targets(ctx) //
- for _, tgt := range targets {
- if tgt.Type != "page" || tgt.TargetID == "" || tgt.OpenerFrameID == "" {
- continue
- }
- targetID := tgt.TargetID.String()
- if targetIdMap[targetID] != "" { //过滤重复target(目前未能关闭指定target只能通过targeID过滤)
- continue
- }
- if tgt.Type != "page" || targetID == "" || tgt.OpenerFrameID == "" {
- continue
- }
- hashHref := HexText(tgt.URL)
- if task.RunRedis { //执行redis判重
- exists := RedisExist("list", "list_"+hashHref)
- if exists { //redis判重
- logger.Info("redis exists:", tgt.URL)
- //err := chromedp.Run(ctx, target.CloseTarget(tar.TargetID))
- continue
- }
- }
- targetIdMap[targetID] = targetID
- RunWithTimeoutByTatgetID(tgt.TargetID, ctx, func(ctx context.Context, exit chan<- bool) {
- defer func() {
- exit <- true
- }()
- ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端
- //执行动作
- err := chromedp.Run(ctx, oacts...)
- if err != nil { //获取三级页信息失败
- detailResult = append(detailResult, "") //取值失败赋空值,保证下方取值不报错
- logger.Info("Get Detail Infor Chromedp Run Error :", err)
- }
- }, task.OtherTimeOut) //设置超时时间
- detailHtml := detailResult[i] //详情页信息
- if task.RunRedis && detailHtml != "" { //执行redis判重且详情页获取到信息
- RedisSet("list", "list_"+hashHref, "", 86400*365*2)
- }
- detailHtml = listHtml + detailHtml //拼接列表页信息一起返回
- *result = append(*result, detailHtml)
- }
- }
- }
- return nil
- }
- }
|