package main
import (
"context"
"fmt"
"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/chromedp"
"github.com/donnie4w/go-logger/logger"
"time"
)
// Action type identifiers accepted in Actions.Action, plus the JavaScript
// template used by the "listhtml" action.
const (
	ActionTypeNavigate    string = "navigate"    // open a page
	ActionTypeClick       string = "click"       // click an element
	ActionTypeOuterHTML   string = "outerhtml"   // collect the outer HTML of an element
	ActionTypeEvaluate    string = "evaluate"    // run a JavaScript expression
	ActionTypeWaitReady   string = "waitready"   // wait until the element has loaded
	ActionTypeWaitVisible string = "waitvisible" // wait until the element is visible
	ActionTypeSleep       string = "wait"        // pause for a number of seconds
	ActionTypeNodes       string = "listhref"    // collect the nodes matching a query (intended for <a> links)
	ActionTypeList        string = "listhtml"    // collect the list-entry html that should be kept
	ActionTypeChangeIp    string = "changeip"    // switch IP

	// Action types that are declared but not enabled:
	//ActionTypeInput string = "input"
	//ActionTypeScroll string = "scroll"
	//ActionTypeAssert string = "assert"
	//ActionTypeClose string = "close"

	// GetListHtmlJS is a fmt template holding two %s verbs. The first is a
	// property chain appended to the node found by the XPath (for example
	// ".parentNode.parentNode"); the second is the XPath itself. The script
	// evaluates to the outerHTML of the resulting node, or to '' when the
	// XPath matches nothing.
	GetListHtmlJS string = `
function getParentHTML(xpath) {
let result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
let node = result.singleNodeValue;
return node ? node%s.outerHTML : '';
}
getParentHTML("%s");`
)
// Selectors maps the selector names accepted in Actions.Selector to the
// chromedp query option each name stands for.
var Selectors = map[string]chromedp.QueryOption{
	//"": chromedp.BySearch, // default value (entry disabled)

	// Element lookup strategies.
	"ByQuery":    chromedp.ByQuery,    // document.querySelector rules; yields a single node
	"ByID":       chromedp.ByID,       // select by element id only
	"ByJsPath":   chromedp.ByJSPath,   // select by a JS path expression
	"BySearch":   chromedp.BySearch,   // DOM.performSearch; the strategy chromedp uses when no option is given
	"ByQueryAll": chromedp.ByQueryAll, // document.querySelectorAll rules; yields every matching node
	"ByNodeID":   chromedp.ByNodeID,   // fetch specific nodes by already-assigned node ID; must be used with []cdp.NodeID

	// Wait conditions.
	"NodeNotPresent": chromedp.NodeNotPresent, // wait until no element matches the query
	"NodeNotVisible": chromedp.NodeNotVisible, // wait until all queried nodes have been sent by the browser and are not visible
	"NodeVisible":    chromedp.NodeVisible,    // wait until all queried nodes have been sent by the browser and are visible
	"NodeReady":      chromedp.NodeReady,      // wait until the browser has sent all queried nodes
	"NodeSelected":   chromedp.NodeSelected,   // wait until all queried nodes have been sent and carry the "selected" attribute
}
// ChromedpTask describes one download job: the browser actions to run, their
// timeout, and, for sequential (list then detail) crawling, a second action
// set that is executed on every detail tab.
type ChromedpTask struct {
	//Stype string `json:"stype"` // whether the request downloads a list page, a detail page or both (list, detail, list_detail)

	Flow     bool      `json:"flow"`     // whether this is a sequential crawl
	RunRedis bool      `json:"runredis"` // whether to de-duplicate through redis (sequential crawl only)
	TimeOut  int64     `json:"timeout"`  // timeout handed to the browser run for Actions
	Actions  []Actions `json:"actions"`  // action set

	// The fields below describe how detail pages are fetched during a
	// sequential crawl.
	OtherTimeOut int64     `json:"othertimeout"` // timeout handed to each detail-tab run
	OtherActions []Actions `json:"otheractions"` // action set executed on each detail tab
}
// Actions is a single step of a ChromedpTask.
type Actions struct {
	// Action names the step to perform (one of the ActionType* values).
	Action string `json:"action"`
	// Param is the step argument: typically the selector expression, URL or
	// script, or a number for the steps that take a count.
	Param interface{} `json:"param"`
	// Selector is the name of the query option to apply (a key of Selectors).
	Selector string `json:"selector"`
}
// DownloadHtmlByChromedp runs task.Actions in a browser taken from
// BrowserGroup and returns the strings gathered by the collecting actions.
//
// A task without actions returns immediately with a nil result. The browser is
// handed back to BrowserGroup (after Revert) when the function exits.
func DownloadHtmlByChromedp(task *ChromedpTask) (result []string) {
	defer Catch()
	if len(task.Actions) == 0 {
		return nil
	}

	// Borrow a browser instance; this blocks until one is available.
	browser := <-BrowserGroup
	defer func() {
		BrowserGroup <- browser.Revert()
	}()

	// Translate the configured steps into chromedp actions. The collecting
	// steps append to result through the pointer.
	steps, _ := task.AssembleTasks(task.Actions, &result, nil, browser)

	run := func(ctx context.Context, exit chan<- bool) {
		// Always report completion, whatever the outcome of the run.
		defer func() {
			exit <- true
		}()
		ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) // emulate a PC browser
		if err := chromedp.Run(ctx, steps...); err != nil {
			logger.Info("Chromedp Run Task Error :", err)
			// Every run error triggers an IP switch; the narrower check for
			// ERR_PROXY_CONNECTION_FAILED is currently disabled.
			browser.ChangeIp()
		}
	}
	browser.RunWithTimeout(run, task.TimeOut)
	return result
}
// DownloadHtmlByChromedpForFlow runs a sequential crawl: task.Actions drive the
// list page, and the "listhtml" step among them visits the detail tabs with
// task.OtherActions. It returns the strings gathered along the way.
//
// Both action sets must be non-empty, otherwise nothing is run and the result
// is nil. A zero task.TimeOut is replaced in place by the configured
// ChromedpConfig["timeout"] value.
func DownloadHtmlByChromedpForFlow(task *ChromedpTask) (result []string) {
	defer Catch()
	if len(task.Actions) == 0 || len(task.OtherActions) == 0 {
		return nil
	}
	if task.TimeOut == 0 {
		task.TimeOut = Int64All(ChromedpConfig["timeout"])
	}

	// Borrow a browser instance; this blocks until one is available.
	browser := <-BrowserGroup
	defer func() {
		BrowserGroup <- browser.Revert()
	}()

	result = make([]string, 0)     // html returned by the action set
	nodeInfo := map[int64]string{} // list-page node info, meant for matching detail tabs

	// Translate the configured steps into chromedp actions.
	steps, _ := task.AssembleTasks(task.Actions, &result, nodeInfo, browser)

	run := func(ctx context.Context, exit chan<- bool) {
		// Always report completion, whatever the outcome of the run.
		defer func() {
			exit <- true
		}()
		ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) // emulate a PC browser
		if err := chromedp.Run(ctx, steps...); err != nil {
			logger.Info("Chromedp Run Task Error :", err)
			// Every run error triggers an IP switch; the narrower check for
			// ERR_PROXY_CONNECTION_FAILED is currently disabled.
			browser.ChangeIp()
		}
	}
	browser.RunWithTimeout(run, task.TimeOut)
	return result
}
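// Runs the action flow and downloads the html. Disabled backup variant (DownloadHtmlByChromedpForFlow_back), kept commented out for reference.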
//func DownloadHtmlByChromedpForFlow_back(task *ChromedpTask) (ret []string) {
// defer util.Catch()
// if len(task.Actions) == 0 || len(task.OtherActions) == 0 {
// return
// }
// //获取一个浏览器实例
// browser := <-BrowserGroup
// defer func() {
// BrowserGroup <- browser
// }()
// result := &[2]*[]string{} //记录动作集返回的html
// resultMap := map[int64]string{} //记录列表页nodes信息,用于匹配详情页tab
// //封装浏览器动作
// acts, oacts, _ := AssembleTasks(task, result, resultMap, browser)
// //执行动作,下载
// fmt.Println("Running...")
// browser.RunWithTimeout(func(ctx context.Context, exit chan<- bool) {
// //defer func() {
// // exit <- true
// //}()
// ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端
// //执行列表页动作
// err := chromedp.Run(ctx, acts...)
// if err != nil {
// fmt.Println("Chromedp Run Error :", err)
// }
// //获取详情页tab信息
// targets, _ := chromedp.Targets(ctx) //targets无序不能一一对应nodes顺序
// for _, target := range targets {
// if target.OpenerFrameID == "" { //判断是否是子标签页
// continue
// }
// fmt.Println("target id:", target.TargetID, target.URL, target.OpenerFrameID, target.OpenerID, target.BrowserContextID)
// continue
// newCtx, _ := chromedp.NewContext(ctx, chromedp.WithTargetID(target.TargetID)) //新标签页ctx
// fmt.Println("11111111111111111", target.URL)
// fmt.Println("==============================================================================================")
// continue
// err := chromedp.Run(newCtx, oacts...)
// if err != nil {
// fmt.Println("Chromedp Run Son Tab Error :", err)
// }
// }
// }, task.TimeOut)
// fmt.Println("Finish...")
// return
//}
// AssembleTasks converts the configured steps into chromedp actions.
//
// Parameters:
//   - actions: the steps to translate, in execution order.
//   - result: destination the collecting steps (outerhtml, evaluate, listhtml)
//     append to when the actions run.
//   - resultMap: forwarded unchanged to ClickNodesAndGetHtml.
//   - browser: the browser the actions will run in; used by the changeip and
//     listhtml steps.
//
// It returns the assembled actions and the node slice that a "listhref" step
// fills at run time and a later "listhtml" step consumes. Steps with an
// unrecognised Action are skipped.
//
// A step whose Selector is empty or not a key of Selectors is built without a
// query option, so chromedp applies its default strategy (BySearch). Passing
// the nil option obtained from a missed map lookup would panic inside
// chromedp, which calls every option it is given.
func (task *ChromedpTask) AssembleTasks(actions []Actions, result *[]string, resultMap map[int64]string, browser *Browser) (acts []chromedp.Action, nodes *[]*cdp.Node) {
	defer Catch()
	nodes = &[]*cdp.Node{}
	for _, act := range actions {
		param := fmt.Sprint(act.Param)

		// Only pass a query option when the selector name is registered.
		var opts []chromedp.QueryOption
		if opt, ok := Selectors[act.Selector]; ok && opt != nil {
			opts = append(opts, opt)
		}

		switch act.Action {
		case ActionTypeNavigate: // open a page
			acts = append(acts, chromedp.Navigate(param))
		case ActionTypeClick: // click
			acts = append(acts, chromedp.Click(param, opts...))
		case ActionTypeOuterHTML: // collect html
			acts = append(acts, OuterHTMLFunc(param, act.Selector, result))
		case ActionTypeEvaluate: // run javascript
			acts = append(acts, EvaluateFunc(param, result))
		case ActionTypeWaitReady: // wait until the element has loaded
			acts = append(acts, chromedp.WaitReady(param, opts...))
		case ActionTypeWaitVisible: // wait until the element is visible
			acts = append(acts, chromedp.WaitVisible(param, opts...))
		case ActionTypeNodes: // collect the matching nodes
			acts = append(acts, chromedp.Nodes(param, nodes))
		case ActionTypeList: // collect the list info to keep, then visit the detail tabs
			// Param is the number of ancestors to climb from each matched node
			// before taking its outerHTML.
			depth := IntAll(act.Param)
			parentNodeJs := ""
			for i := 0; i < depth; i++ {
				parentNodeJs += ".parentNode"
			}
			// Fill in the ancestor chain now and keep a %s for the XPath,
			// which is only known per node at run time.
			htmlJsFormat := fmt.Sprintf(GetListHtmlJS, parentNodeJs, "%s")
			acts = append(acts, task.ClickNodesAndGetHtml(browser, nodes, htmlJsFormat, result, resultMap))
		case ActionTypeSleep: // pause
			acts = append(acts, CdpSleep(IntAll(act.Param)))
		case ActionTypeChangeIp: // switch IP
			acts = append(acts, ChangeIp(browser))
		}
	}
	return acts, nodes
}
// ChangeIp returns a chromedp action that switches the IP of browser b when it
// is executed. The action always reports success.
//
// NOTE(review): the deferred Catch runs when this constructor returns, not
// while the returned action executes, so it does not cover b.ChangeIp().
func ChangeIp(b *Browser) chromedp.ActionFunc {
	defer Catch()
	switchIP := func(_ context.Context) error {
		b.ChangeIp()
		return nil
	}
	return switchIP
}
// CdpSleep returns a chromedp action that pauses for the given number of
// seconds. Values below 1 are raised to 1, so the pause is never shorter than
// one second.
func CdpSleep(sleep int) chromedp.Action {
	defer Catch()
	seconds := sleep
	if seconds < 1 {
		seconds = 1
	}
	pause := time.Duration(seconds) * time.Second
	return chromedp.Sleep(pause)
}
// OuterHTMLFunc returns a chromedp action that reads the outer HTML of the
// element matched by sel and appends it to *result.
//
// selector names the query option to use (a key of Selectors). When it is
// empty or unknown no option is passed and chromedp falls back to its default
// strategy (BySearch); forwarding the nil option from a missed map lookup
// would panic inside chromedp when the action runs.
//
// The action is best-effort: a failed lookup appends an empty string and still
// reports success, so exactly one entry is appended per execution and the
// remaining actions keep running.
//
// NOTE(review): the deferred Catch runs when this constructor returns, not
// while the returned action executes.
func OuterHTMLFunc(sel, selector string, result *[]string) chromedp.ActionFunc {
	defer Catch()
	var opts []chromedp.QueryOption
	if opt, ok := Selectors[selector]; ok && opt != nil {
		opts = append(opts, opt)
	}
	return func(ctx context.Context) error {
		var html string
		// Error deliberately ignored: html stays "" and is recorded as such.
		_ = chromedp.OuterHTML(sel, &html, opts...).Do(ctx)
		*result = append(*result, html)
		return nil
	}
}
// EvaluateFunc returns a chromedp action that evaluates the JavaScript in sel
// and appends its string result to *result.
//
// The evaluation error is not checked: on failure an empty string is appended
// and the action still reports success.
//
// NOTE(review): the deferred Catch runs when this constructor returns, not
// while the returned action executes.
func EvaluateFunc(sel string, result *[]string) chromedp.ActionFunc {
	defer Catch()
	collect := func(ctx context.Context) error {
		var out string
		_ = chromedp.Evaluate(sel, &out).Do(ctx)
		*result = append(*result, out)
		return nil
	}
	return collect
}
// ClickNodesAndGetHtml returns a chromedp action that walks every node in
// *nodes and, for each one:
//
//  1. clicks the node;
//  2. evaluates jsFormat (with the node's full XPath substituted for its %s)
//     to obtain the list-entry html, skipping the node when that is empty;
//  3. scans the browser targets for page tabs that have an opener and have not
//     been handled yet, runs task.OtherActions on each of them and appends
//     "list html + detail html" to *result.
//
// When task.RunRedis is set, tabs whose URL hash is already recorded in redis
// are skipped, and a tab is recorded once a non-empty detail html was obtained.
//
// The detail html of a tab is the first entry its own run appended, or "" when
// the run failed or appended nothing. It is located by position relative to
// the entries present before the run, not by node index, so a skipped node or
// a failed run can neither cause an index-out-of-range panic nor shift the
// pairing of later nodes.
//
// resultMap is accepted for interface compatibility and is not used. The
// action always returns nil; individual failures are logged.
//
// NOTE(review): the deferred Catch runs when this constructor returns, not
// while the returned action executes.
// NOTE(review): if RunWithTimeoutByTatgetID can return while its callback is
// still running, late appends to the detail buffer would race with this loop —
// confirm against that helper.
func (task *ChromedpTask) ClickNodesAndGetHtml(browser *Browser, nodes *[]*cdp.Node, jsFormat string, result *[]string, resultMap map[int64]string) chromedp.ActionFunc {
	defer Catch()
	return func(ctx context.Context) error {
		detailResult := []string{}      // html collected from detail tabs
		seenTargets := map[string]bool{} // targets already handled; tabs are not closed, so they are filtered by id
		// Build the detail-page actions once; they append to detailResult.
		oacts, _ := task.AssembleTasks(task.OtherActions, &detailResult, nil, browser)

		for _, node := range *nodes {
			if err := chromedp.MouseClickNode(node).Do(ctx); err != nil {
				logger.Info("Click List Node Error :", err)
			}

			var listHtml string
			js := fmt.Sprintf(jsFormat, node.FullXPath())
			if err := chromedp.Evaluate(js, &listHtml).Do(ctx); err != nil {
				logger.Info("Get List Html Error :", err)
			}
			if listHtml == "" {
				continue
			}
			// NOTE(review): as written this concatenation is a no-op; wrapper
			// markup may have been intended here — confirm.
			listHtml = "" + listHtml + ""

			// List every tab currently known to the browser.
			targets, err := chromedp.Targets(ctx)
			if err != nil {
				logger.Info("Get Targets Error :", err)
				continue
			}
			for _, tgt := range targets {
				// Only page tabs opened from another frame are detail tabs.
				if tgt.Type != "page" || tgt.TargetID == "" || tgt.OpenerFrameID == "" {
					continue
				}
				targetID := tgt.TargetID.String()
				if seenTargets[targetID] {
					continue
				}
				hashHref := HexText(tgt.URL)
				if task.RunRedis && RedisExist("list", "list_"+hashHref) { // redis de-duplication
					logger.Info("redis exists:", tgt.URL)
					continue
				}
				seenTargets[targetID] = true

				// Remember where this run's entries start in the detail buffer.
				before := len(detailResult)
				RunWithTimeoutByTatgetID(tgt.TargetID, ctx, func(tabCtx context.Context, exit chan<- bool) {
					defer func() {
						exit <- true
					}()
					ChangeBrowserDevice(tabCtx, BROWSER_DEVICE_TYPE_PC) // emulate a PC browser
					if runErr := chromedp.Run(tabCtx, oacts...); runErr != nil { // detail page could not be fetched
						logger.Info("Get Detail Infor Chromedp Run Error :", runErr)
					}
				}, task.OtherTimeOut)

				detailHtml := ""
				if len(detailResult) > before {
					detailHtml = detailResult[before]
				}
				if task.RunRedis && detailHtml != "" { // remember successfully fetched detail pages
					RedisSet("list", "list_"+hashHref, "", 86400*365*2)
				}
				// Return the list-entry html together with the detail html.
				*result = append(*result, listHtml+detailHtml)
			}
		}
		return nil
	}
}