|
@@ -0,0 +1,217 @@
|
|
|
+package spiderutil
|
|
|
+// ChromeActions describes one action entry; it is encoded to and decoded from JSON with the keys "action", "param" and "selector".
|
|
|
+type ChromeActions struct {
|
|
|
+ Action string `json:"action"` // action to execute
|
|
|
+ Param string `json:"param"` // selector statement (per the original comment) — NOTE(review): confirm how this differs from Selector
|
|
|
+ Selector string `json:"selector"` // selector
|
|
|
+}
|
|
|
+// ChromeTask is the JSON-configurable description of one task: mode flags, a timeout and an action list, plus a second timeout and action list for detail pages.
|
|
|
+type ChromeTask struct {
|
|
|
+ Flow bool `json:"flow"` // whether this is a sequential collection
|
|
|
+ RunRedis bool `json:"runredis"` // whether to run the Redis duplicate check (only used for sequential collection)
|
|
|
+ TimeOut int64 `json:"timeout"` // timeout — NOTE(review): unit is not stated here, presumably seconds; confirm against the consumer
|
|
|
+ Actions []ChromeActions `json:"actions"` // set of actions
|
|
|
+ // For sequential collection, the detail-page collection parameters below are required.
|
|
|
+ OtherTimeOut int64 `json:"othertimeout"` // timeout for the detail-page collection — NOTE(review): unit unconfirmed
|
|
|
+ OtherActions []ChromeActions `json:"otheractions"` // set of actions for the detail-page collection
|
|
|
+}
|
|
|
+
|
|
|
+//const (
|
|
|
+// ActionTypeNavigate string = "navigate"
|
|
|
+// ActionTypeClick string = "click"
|
|
|
+// ActionTypeOuterHTML string = "outerhtml"
|
|
|
+// ActionTypeEvaluate string = "evaluate"
|
|
|
+// ActionTypeWaitReady string = "waitready"
|
|
|
+// ActionTypeWaitVisible string = "waitvisible"
|
|
|
+// ActionTypeSleep string = "wait"
|
|
|
+//
|
|
|
+// //ActionTypeInput string = "input"
|
|
|
+// //ActionTypeScroll string = "scroll"
|
|
|
+// //ActionTypeAssert string = "assert"
|
|
|
+// //ActionTypeClose string = "close"
|
|
|
+//)
|
|
|
+//
|
|
|
+//type ChromedpCase struct {
|
|
|
+// TimeOunt int64
|
|
|
+// IsProxy bool
|
|
|
+// Actions []Action
|
|
|
+// //Flags map[string]interface{}
|
|
|
+// //Options []Option
|
|
|
+//}
|
|
|
+//
|
|
|
+//type Action struct {
|
|
|
+// Type string //动作类型
|
|
|
+// Params map[string]interface{} //参数
|
|
|
+//
|
|
|
+// //Name string //动作名称
|
|
|
+// //Wait int64 //延时
|
|
|
+// //WaitReady string //页面等待加载
|
|
|
+// //FailContinue bool //检索失败是否继续
|
|
|
+// //Checks *Check //检查点
|
|
|
+//}
|
|
|
+//
|
|
|
+//func GetChromedpCase(timeout int64, isProxy bool, task []map[string]interface{}) *ChromedpCase {
|
|
|
+// chromeCase := &ChromedpCase{
|
|
|
+// TimeOunt: timeout,
|
|
|
+// IsProxy: isProxy,
|
|
|
+// Actions: GetActions(task),
|
|
|
+// //Flags: map[string]interface{}{},
|
|
|
+// }
|
|
|
+// //if blink != "" {
|
|
|
+// // chromeCase.Flags = map[string]interface{}{"disable-blink-features": "AutomationControlled"}
|
|
|
+// //}
|
|
|
+// return chromeCase
|
|
|
+//}
|
|
|
+//
|
|
|
+//func GetActions(task []map[string]interface{}) (acts []Action) {
|
|
|
+// for _, method_param := range task {
|
|
|
+// for method, param := range method_param {
|
|
|
+// switch method {
|
|
|
+// case ActionTypeNavigate: //打开网页
|
|
|
+// acts = append(acts, Action{
|
|
|
+// Type: method,
|
|
|
+// Params: map[string]interface{}{"url": param},
|
|
|
+// })
|
|
|
+// case ActionTypeClick: //点击
|
|
|
+// acts = append(acts, Action{
|
|
|
+// Type: method,
|
|
|
+// Params: map[string]interface{}{"selector": param},
|
|
|
+// })
|
|
|
+// case ActionTypeOuterHTML: //输出html
|
|
|
+// acts = append(acts, Action{
|
|
|
+// Type: method,
|
|
|
+// Params: map[string]interface{}{"selector": param},
|
|
|
+// })
|
|
|
+// case ActionTypeEvaluate: //执行javascript
|
|
|
+// acts = append(acts, Action{
|
|
|
+// Type: method,
|
|
|
+// Params: map[string]interface{}{"selector": param},
|
|
|
+// })
|
|
|
+// case ActionTypeWaitReady: //等待元素加载完毕
|
|
|
+// acts = append(acts, Action{
|
|
|
+// Type: method,
|
|
|
+// Params: map[string]interface{}{"selector": param},
|
|
|
+// })
|
|
|
+// case ActionTypeWaitVisible: //等待元素可见
|
|
|
+// acts = append(acts, Action{
|
|
|
+// Type: method,
|
|
|
+// Params: map[string]interface{}{"selector": param},
|
|
|
+// })
|
|
|
+// case ActionTypeSleep:
|
|
|
+// acts = append(acts, Action{
|
|
|
+// Type: method,
|
|
|
+// Params: map[string]interface{}{"duration": qu.Int64All(param)},
|
|
|
+// })
|
|
|
+// }
|
|
|
+//
|
|
|
+// }
|
|
|
+// }
|
|
|
+// return
|
|
|
+//}
|
|
|
+//
|
|
|
+//func DownloadByChromedp(chromeCase *ChromedpCase) (resultHtml []string) {
|
|
|
+// if chromeCase != nil {
|
|
|
+// //1、设置浏览器
|
|
|
+// options := []chromedp.ExecAllocatorOption{
|
|
|
+// chromedp.Flag("headless", false), // headless controls whether Chrome/Chromium runs in headless mode
|
|
|
+// chromedp.Flag("disable-blink-features", "AutomationControlled"), // disables the AutomationControlled Blink feature
|
|
|
+// chromedp.Flag("disable-gpu", true), //关闭gpu
|
|
|
+// chromedp.Flag("disable-dev-shm-usage", true), //chromedp禁用系统文件存储/dev/shm
|
|
|
+// chromedp.Flag("default-browser-check", true), //禁用默认浏览器检查
|
|
|
+// chromedp.Flag("disable-plugins", true), //禁用插件
|
|
|
+// chromedp.Flag("ignore-certificate-errors", true), //忽略错误
|
|
|
+// chromedp.Flag("disable-web-security", true), //禁用网络安全标志
|
|
|
+// chromedp.Flag("mute-audio", true), // 关闭声音
|
|
|
+// chromedp.Flag("accept-language", `zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6`), //
|
|
|
+// //chromedp.Flag("blink-settings", "imageEnable=false"),//禁用页面图片
|
|
|
+// //chromedp.Flag("user-agent", ""), //客户端的类型和版本号
|
|
|
+// }
|
|
|
+// //其他设置
|
|
|
+// if chromeCase.IsProxy {
|
|
|
+// proxyAddr := "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
|
|
|
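+// proxyauthor := "Basic amlhbnl1MDAxOjEyM3F3ZSFB" // NOTE(review): hard-coded Basic-auth credential; load it from configuration and rotate this value before re-enabling this code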
|
|
|
+// proxyIp := GetProxyAddr(proxyAddr, proxyauthor)
|
|
|
+// options = append(options, chromedp.ProxyServer(proxyIp))
|
|
|
+// }
|
|
|
+// //for k, v := range chromeCase.Flags {
|
|
|
+// // if vv, ok := v.(string); ok {
|
|
|
+// // options = append(options, chromedp.Flag(k, vv))
|
|
|
+// // } else if vv, ok := v.(bool); ok {
|
|
|
+// // options = append(options, chromedp.Flag(k, vv))
|
|
|
+// // }
|
|
|
+// //}
|
|
|
+// options = append(chromedp.DefaultExecAllocatorOptions[:], options...)
|
|
|
+// //2、创建上下文
|
|
|
+// var ctx context.Context
|
|
|
+// var cancel context.CancelFunc
|
|
|
+// ctx, cancel = chromedp.NewExecAllocator(context.Background(), options...)
|
|
|
+// ctx, cancel = chromedp.NewContext(ctx)
|
|
|
+// if chromeCase.TimeOunt > 0 { //设置页面打开时长
|
|
|
+// ctx, cancel = context.WithTimeout(ctx, time.Duration(chromeCase.TimeOunt)*time.Second)
|
|
|
+// } else { //设置默认打开时长
|
|
|
+// ctx, cancel = context.WithTimeout(ctx, 30*time.Second)
|
|
|
+// }
|
|
|
+// defer cancel() //关闭浏览器
|
|
|
+// //3、执行动作集
|
|
|
+// act := []chromedp.Action{}
|
|
|
+// for _, action := range chromeCase.Actions {
|
|
|
+// switch action.Type {
|
|
|
+// case ActionTypeNavigate: //打开网页
|
|
|
+// act = append(act, chromedp.Navigate(action.Params["url"].(string)))
|
|
|
+// case ActionTypeClick: //点击
|
|
|
+// act = append(act, chromedp.Click(action.Params["selector"].(string)))
|
|
|
+// case ActionTypeOuterHTML: //输出html
|
|
|
+// //act = append(act, chromedp.OuterHTML(action.Params["selector"].(string), nil))
|
|
|
+// act = append(act, OuterHTMLFunc(action.Params["selector"].(string), &resultHtml))
|
|
|
+// case ActionTypeEvaluate: //执行javascript
|
|
|
+// //act = append(act, chromedp.Evaluate(action.Params["selector"].(string), nil))
|
|
|
+// act = append(act, EvaluateFunc(action.Params["selector"].(string), &resultHtml))
|
|
|
+// case ActionTypeWaitReady: //等待元素加载完毕
|
|
|
+// act = append(act, chromedp.WaitReady(action.Params["selector"].(string)))
|
|
|
+// case ActionTypeWaitVisible: //等待元素可见
|
|
|
+// act = append(act, chromedp.WaitVisible(action.Params["selector"].(string)))
|
|
|
+// case ActionTypeSleep:
|
|
|
+// act = append(act, CdpSleep(action.Params["duration"].(int64)))
|
|
|
+// }
|
|
|
+// }
|
|
|
+// err := chromedp.Run(ctx, act...)
|
|
|
+// if err != nil {
|
|
|
+// fmt.Println("Chromedp Run Error :", err)
|
|
|
+// }
|
|
|
+//
|
|
|
+// return
|
|
|
+// }
|
|
|
+// return
|
|
|
+//}
|
|
|
+//
|
|
|
+//func CdpSleep(sleep int64) chromedp.Action {
|
|
|
+// if sleep < 1 {
|
|
|
+// sleep = 1
|
|
|
+// }
|
|
|
+// return chromedp.Sleep(time.Duration(sleep) * time.Second)
|
|
|
+//}
|
|
|
+//
|
|
|
+////OuterHTML获取html
|
|
|
+//func OuterHTMLFunc(sel string, result *[]string) chromedp.ActionFunc {
|
|
|
+// return func(ctx context.Context) (err error) {
|
|
|
+// var html string
|
|
|
+// //chromedp.OuterHTML(sel, &html).Do(ctx)
|
|
|
+// chromedp.OuterHTML(sel, &html, chromedp.ByQuery).Do(ctx)
|
|
|
+// if html != "" {
|
|
|
+// *result = append(*result, html)
|
|
|
+// }
|
|
|
+// return
|
|
|
+// }
|
|
|
+//}
|
|
|
+//
|
|
|
+////Evaluate获取js执行结果
|
|
|
+//func EvaluateFunc(sel string, result *[]string) chromedp.ActionFunc {
|
|
|
+// return func(ctx context.Context) (err error) {
|
|
|
+// var res string
|
|
|
+// chromedp.Evaluate(sel, &res).Do(ctx)
|
|
|
+// if res != "" {
|
|
|
+// *result = append(*result, res)
|
|
|
+// }
|
|
|
+// return
|
|
|
+// }
|
|
|
+//}
|