package main import ( "context" "fmt" "github.com/chromedp/cdproto/cdp" "github.com/chromedp/chromedp" "github.com/donnie4w/go-logger/logger" "time" ) const ( ActionTypeNavigate string = "navigate" //打开 ActionTypeClick string = "click" //点击 ActionTypeOuterHTML string = "outerhtml" //返回html ActionTypeEvaluate string = "evaluate" //执行js ActionTypeWaitReady string = "waitready" //等待元素加载完毕 ActionTypeWaitVisible string = "waitvisible" //等待元素可见 ActionTypeSleep string = "wait" //等待休息 ActionTypeNodes string = "listhref" //匹配所有a链接节点 ActionTypeList string = "listhtml" //匹配列表要保留的信息 ActionTypeChangeIp string = "changeip" //切换IP //ActionTypeInput string = "input" //ActionTypeScroll string = "scroll" //ActionTypeAssert string = "assert" //ActionTypeClose string = "close" GetListHtmlJS string = ` function getParentHTML(xpath) { let result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null); let node = result.singleNodeValue; return node ? node%s.outerHTML : ''; } getParentHTML("%s");` ) //选择器 var ( Selectors = map[string]chromedp.QueryOption{ //"": chromedp.BySearch, //默认值 "ByQuery": chromedp.ByQuery, //根据document.querySelector的规则选择元素,返回单个节点 "ByID": chromedp.ByID, //只id来选择元素 "ByJsPath": chromedp.ByJSPath, //执行js "BySearch": chromedp.BySearch, //如果不写,默认会使用这个选择器,document.performSearch()选择元素,效果等同于`document.querySelector(...)` "ByQueryAll": chromedp.ByQueryAll, //根据document.querySelectorAll返回所有匹配的节点 "ByNodeID": chromedp.ByNodeID, //检索特定节点(必须先有分配的节点ID),注意:必须与 []cdp.NodeID 一起使用 "NodeNotPresent": chromedp.NodeNotPresent, //用于等待直到不存在与查询匹配的元素 "NodeNotVisible": chromedp.NodeNotVisible, //用于等待所有查询的元素节点都已被浏览器发送并且不可见 "NodeVisible": chromedp.NodeVisible, //用于等待所有查询的元素节点都已被浏览器发送并可见 "NodeReady": chromedp.NodeReady, //用于等待浏览器发送所有查询的元素节点 "NodeSelected": chromedp.NodeSelected, //用于等待浏览器发送所有查询的元素节点并选择它们(即具有“已选择”属性) } ) type ChromedpTask struct { //Stype string `json:"stype"` //表示当前请求是下载的列表页、详情页还是列表+详情页(list、detail、list_detail) Flow bool `json:"flow"` //是否是顺序采集 RunRedis bool `json:"runredis"` //是否执行redis判重(只用于顺序采集) TimeOut int64 `json:"timeout"` //超时时间 Actions []Actions `json:"actions"` //动作集 //顺序采集时需要下方采集详情页参数 OtherTimeOut int64 `json:"othertimeout"` //超时时间` OtherActions []Actions `json:"otheractions"` //动作集 } type Actions struct { Action string `json:"action"` //执行动作 Param interface{} `json:"param"` //选择器语句 Selector string `json:"selector"` //选择器Selectors } //执行动作流程,下载html func DownloadHtmlByChromedp(task *ChromedpTask) (result []string) { defer Catch() if len(task.Actions) == 0 { return } //获取一个浏览器实例 browser := <-BrowserGroup defer func() { BrowserGroup <- browser.Revert() }() //封装浏览器动作 acts, _ := task.AssembleTasks(task.Actions, &result, nil, browser) //执行动作,下载 browser.RunWithTimeout(func(ctx context.Context, exit chan<- bool) { defer func() { exit <- true }() ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端 err := chromedp.Run(ctx, acts...) if err != nil { logger.Info("Chromedp Run Task Error :", err) //if strings.Contains(err.Error(), "ERR_PROXY_CONNECTION_FAILED") { //代理异常,切换IP browser.ChangeIp() //} } }, task.TimeOut) return } //执行动作流程,下载html func DownloadHtmlByChromedpForFlow(task *ChromedpTask) (result []string) { defer Catch() if len(task.Actions) == 0 || len(task.OtherActions) == 0 { return } if task.TimeOut == 0 { task.TimeOut = Int64All(ChromedpConfig["timeout"]) } //获取一个浏览器实例 browser := <-BrowserGroup defer func() { BrowserGroup <- browser.Revert() }() result = []string{} //记录动作集返回的html resultMap := map[int64]string{} //记录列表页nodes信息,用于匹配详情页tab //封装浏览器动作 acts, _ := task.AssembleTasks(task.Actions, &result, resultMap, browser) //执行动作,下载 browser.RunWithTimeout(func(ctx context.Context, exit chan<- bool) { defer func() { exit <- true }() ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端 //执行动作 err := chromedp.Run(ctx, acts...) if err != nil { logger.Info("Chromedp Run Task Error :", err) //if strings.Contains(err.Error(), "ERR_PROXY_CONNECTION_FAILED") { //代理异常 browser.ChangeIp() //} } }, task.TimeOut) return } //执行动作流程,下载html //func DownloadHtmlByChromedpForFlow_back(task *ChromedpTask) (ret []string) { // defer util.Catch() // if len(task.Actions) == 0 || len(task.OtherActions) == 0 { // return // } // //获取一个浏览器实例 // browser := <-BrowserGroup // defer func() { // BrowserGroup <- browser // }() // result := &[2]*[]string{} //记录动作集返回的html // resultMap := map[int64]string{} //记录列表页nodes信息,用于匹配详情页tab // //封装浏览器动作 // acts, oacts, _ := AssembleTasks(task, result, resultMap, browser) // //执行动作,下载 // fmt.Println("Running...") // browser.RunWithTimeout(func(ctx context.Context, exit chan<- bool) { // //defer func() { // // exit <- true // //}() // ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端 // //执行列表页动作 // err := chromedp.Run(ctx, acts...) // if err != nil { // fmt.Println("Chromedp Run Error :", err) // } // //获取详情页tab信息 // targets, _ := chromedp.Targets(ctx) //targets无序不能一一对应nodes顺序 // for _, target := range targets { // if target.OpenerFrameID == "" { //判断是否是子标签页 // continue // } // fmt.Println("target id:", target.TargetID, target.URL, target.OpenerFrameID, target.OpenerID, target.BrowserContextID) // continue // newCtx, _ := chromedp.NewContext(ctx, chromedp.WithTargetID(target.TargetID)) //新标签页ctx // fmt.Println("11111111111111111", target.URL) // fmt.Println("==============================================================================================") // continue // err := chromedp.Run(newCtx, oacts...) // if err != nil { // fmt.Println("Chromedp Run Son Tab Error :", err) // } // } // }, task.TimeOut) // fmt.Println("Finish...") // return //} //生成动作集 func (task *ChromedpTask) AssembleTasks(actions []Actions, result *[]string, resultMap map[int64]string, browser *Browser) (acts []chromedp.Action, nodes *[]*cdp.Node) { defer Catch() nodes = &[]*cdp.Node{} for _, act := range actions { switch act.Action { case ActionTypeNavigate: //打开网页 acts = append(acts, chromedp.Navigate(fmt.Sprint(act.Param))) case ActionTypeClick: //点击 acts = append(acts, chromedp.Click(fmt.Sprint(act.Param), Selectors[act.Selector])) case ActionTypeOuterHTML: //输出html acts = append(acts, OuterHTMLFunc(fmt.Sprint(act.Param), act.Selector, result)) case ActionTypeEvaluate: //执行javascript acts = append(acts, EvaluateFunc(fmt.Sprint(act.Param), result)) case ActionTypeWaitReady: //等待元素加载完毕 acts = append(acts, chromedp.WaitReady(fmt.Sprint(act.Param), Selectors[act.Selector])) case ActionTypeWaitVisible: //等待元素可见 acts = append(acts, chromedp.WaitVisible(fmt.Sprint(act.Param), Selectors[act.Selector])) case ActionTypeNodes: //匹配节点元素 acts = append(acts, chromedp.Nodes(fmt.Sprint(act.Param), nodes)) case ActionTypeList: //匹配列表要保留的信息 parentNodeJs := "" for i := 0; i < IntAll(act.Param); i++ { parentNodeJs += ".parentNode" } htmlJsFormat := fmt.Sprintf(GetListHtmlJS, parentNodeJs, "%s") acts = append(acts, task.ClickNodesAndGetHtml(browser, nodes, htmlJsFormat, result, resultMap)) case ActionTypeSleep: acts = append(acts, CdpSleep(IntAll(act.Param))) case ActionTypeChangeIp: acts = append(acts, ChangeIp(browser)) } } return } //切换IP func ChangeIp(b *Browser) chromedp.ActionFunc { defer Catch() return func(ctx context.Context) (err error) { b.ChangeIp() return } } func CdpSleep(sleep int) chromedp.Action { defer Catch() if sleep < 1 { sleep = 1 } return chromedp.Sleep(time.Duration(sleep) * time.Second) } //OuterHTML获取html func OuterHTMLFunc(sel, selector string, result *[]string) chromedp.ActionFunc { defer Catch() return func(ctx context.Context) (err error) { var html string //chromedp.OuterHTML(sel, &html).Do(ctx) chromedp.OuterHTML(sel, &html, Selectors[selector]).Do(ctx) *result = append(*result, html) return } } //Evaluate获取js执行结果 func EvaluateFunc(sel string, result *[]string) chromedp.ActionFunc { defer Catch() return func(ctx context.Context) (err error) { var res string chromedp.Evaluate(sel, &res).Do(ctx) *result = append(*result, res) return } } //点击所有nodes节点并返回html func (task *ChromedpTask) ClickNodesAndGetHtml(browser *Browser, nodes *[]*cdp.Node, jsFormat string, result *[]string, resultMap map[int64]string) chromedp.ActionFunc { defer Catch() return func(ctx context.Context) (err error) { detailResult := []string{} //详情页html信息 targetIdMap := map[string]string{} oacts, _ := task.AssembleTasks(task.OtherActions, &detailResult, nil, browser) //组合下载详情页动作集 for i, node := range *nodes { chromedp.MouseClickNode(node).Do(ctx) //点击 var listHtml string xpath := node.FullXPath() js := fmt.Sprintf(jsFormat, xpath) chromedp.Evaluate(js, &listHtml).Do(ctx) if listHtml != "" { //resultMap[node.NodeID.Int64()] = "" + html + "" listHtml = "" + listHtml + "" //获取多有tab信息 targets, _ := chromedp.Targets(ctx) // for _, tgt := range targets { if tgt.Type != "page" || tgt.TargetID == "" || tgt.OpenerFrameID == "" { continue } targetID := tgt.TargetID.String() if targetIdMap[targetID] != "" { //过滤重复target(目前未能关闭指定target只能通过targeID过滤) continue } if tgt.Type != "page" || targetID == "" || tgt.OpenerFrameID == "" { continue } hashHref := HexText(tgt.URL) if task.RunRedis { //执行redis判重 exists := RedisExist("list", "list_"+hashHref) if exists { //redis判重 logger.Info("redis exists:", tgt.URL) //err := chromedp.Run(ctx, target.CloseTarget(tar.TargetID)) continue } } targetIdMap[targetID] = targetID RunWithTimeoutByTatgetID(tgt.TargetID, ctx, func(ctx context.Context, exit chan<- bool) { defer func() { exit <- true }() ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端 //执行动作 err := chromedp.Run(ctx, oacts...) if err != nil { //获取三级页信息失败 detailResult = append(detailResult, "") //取值失败赋空值,保证下方取值不报错 logger.Info("Get Detail Infor Chromedp Run Error :", err) } }, task.OtherTimeOut) //设置超时时间 detailHtml := detailResult[i] //详情页信息 if task.RunRedis && detailHtml != "" { //执行redis判重且详情页获取到信息 RedisSet("list", "list_"+hashHref, "", 86400*365*2) } detailHtml = listHtml + detailHtml //拼接列表页信息一起返回 *result = append(*result, detailHtml) } } } return nil } }