123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196 |
- package backend
- import (
- "context"
- "encoding/json"
- "fmt"
- "io/ioutil"
- "math/rand"
- "net/http"
- "strings"
- "github.com/chromedp/cdproto/cdp"
- "github.com/chromedp/cdproto/network"
- "github.com/chromedp/cdproto/fetch"
- "github.com/chromedp/cdproto/page"
- "github.com/chromedp/chromedp"
- )
var (
	// useragent is the pool of desktop User-Agent strings that NewBrowser
	// picks from at random when launching Chrome.
	//
	// Every active entry must be exactly what a real browser sends in its
	// User-Agent header: no descriptive label in front ("Chrome 9 ",
	// "Safari: ", ...) and the usual single spaces between tokens. A value
	// that does not look like a genuine browser UA defeats the
	// anti-automation flags NewBrowser sets.
	//
	// Commented-out entries are disabled and kept only for reference.
	useragent = []string{
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
		"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0",
		//"Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_5 like Mac OS X) AppleWebKit/604.5.6 (KHTML, like Gecko) Version/11.0 Mobile/15D60 Safari/604.1",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
		"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
		"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
		"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
		//"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
		"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.70 Safari/537.36",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15 QQBrowserLite/1.3.0",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:93.0) Gecko/20100101 Firefox/93.0",
		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
		"Mozilla/5.0 (X11; U; U; Linux x86_64; zh-my) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 Puffin/8.3.1.41624AP",
		//"Mozilla/5.0 (Linux; BRAVIA 4K 2015 Build/LMY48E.S265) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36 OPR/28.0.1754.0",
		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36 HeyTapBrowser/40.7.29.1",
		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.58 Safari/537.36 Edg/93.0.961.33",
		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/15.0 Chrome/90.0.4430.210 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
		//"Mozilla/5.0 (Windows NT 10.0; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
		//"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36",
		//"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
		//"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0",
		"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
		"Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; LCJB; rv:11.0) like Gecko",
		"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",
		"Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
		"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400",
		"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
		"Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:93.0) Gecko/20100101 Firefox/93.0",
		//"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
		"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38",
	}
)
- func NewBrowser(headless bool, showImage bool, proxyServe bool, baseUrl string) (context.Context, context.CancelFunc, context.Context, context.CancelFunc, context.Context, context.CancelFunc) {
- ignoreCertificateErrors := false
- if strings.HasPrefix(baseUrl, "https") {
- ignoreCertificateErrors = true
- }
- ctx, cancelFn := chromedp.NewContext(context.Background())
- chromeOptions := append(chromedp.DefaultExecAllocatorOptions[:],
- chromedp.NoDefaultBrowserCheck, //不检查默认浏览器
- chromedp.Flag("enable-automation", false), // 防止监测webdriver
- chromedp.Flag("force-dev-mode-highlighting", true),
- //--设置禁止HTTP转HTTPS
- chromedp.Flag("disable-extensions", true), //是否禁用扩展
- chromedp.Flag("disable-blink-features", "AutomationControlled"), //禁用 blink 特征
- chromedp.Flag("disable-features", "SSLForcedForSafety"), // 禁用某些安全特性
- chromedp.Flag("disable-features", "SSLForced"),
- chromedp.Flag("disable-features", "AutoupgradeToHTTPS"),
- chromedp.Flag("disable-features", "ImprovedHTTPSUpgrade"),
- chromedp.Flag("ssl-protocol", "any"),
- chromedp.Flag("ignore-certificate-errors-spki-list", true),
- //--置禁止HTTP转HTTPS 结束
- chromedp.Flag("headless", headless),
- chromedp.Flag("user-agent", useragent[rand.Intn(20)]), //搞到底还是要在这里设置useragent
- chromedp.Flag("disable-keep-alive", true),
- chromedp.Flag("disable-gpu", true),
- chromedp.Flag("no-sandbox", true),
- chromedp.Flag("disable-dev-shm-usage", "false"),
- chromedp.Flag("default-browser-check", "false"),
- chromedp.Flag("mute-audio", "false"),
- chromedp.Flag("disable-web-security", true),
- chromedp.Flag("accept-language", `zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6`),
- )
- if proxyServe {
- chromeOptions = append(chromeOptions,
- //chromedp.ProxyServer(fmt.Sprintf("socks5://%s", proxyServe)),
- chromedp.ProxyServer(GetProxyAddr()),
- )
- }
- if ignoreCertificateErrors {
- chromeOptions = append(chromeOptions,
- // ignore-certificate-errors
- chromedp.Flag("ignore-certificate-errors", true),
- )
- }
- if showImage {
- chromeOptions = append(chromeOptions,
- chromedp.Flag("blink-settings", "imagesEnabled=true"),
- )
- } else {
- chromeOptions = append(chromeOptions,
- chromedp.Flag("blink-settings", "imagesEnabled=false"),
- )
- }
- allocCtx, allocCancelFn := chromedp.NewExecAllocator(ctx, chromeOptions...)
- // 创建一个浏览器实例
- incCtx, incCancelFn := chromedp.NewContext(allocCtx,
- chromedp.WithLogf(nil))
- trie := NewTrie()
- //TODO 这里默认构建通用的资源加载排除,最好是单个网站可以定制,
- // 对于纯后端渲染网站,可以屏蔽所有资源加载,达到平台最高性能目的
- trie.BatchInsert(Cfg.DisableLoadResource)
- chromedp.ListenTarget(ctx, func(event interface{}) {
- switch ev := event.(type) {
- case *fetch.EventRequestPaused:
- go func() {
- c := chromedp.FromContext(ctx)
- ctx := cdp.WithExecutor(ctx, c.Target)
- if trie.HasKeyword(ev.Request.URL) {
- fetch.FailRequest(ev.RequestID, network.ErrorReasonBlockedByClient).Do(ctx)
- } else {
- fetch.ContinueRequest(ev.RequestID).Do(ctx)
- }
- }()
- }
- })
- //
- chromedp.Run(ctx,
- fetch.Enable(),
- chromedp.ActionFunc(func(cxt context.Context) error {
- _, err := page.AddScriptToEvaluateOnNewDocument("Object.defineProperty(navigator, 'webdriver', { get: () => false, });").Do(cxt)
- return err
- }),
- )
- return ctx, cancelFn, allocCtx, allocCancelFn, incCtx, incCancelFn
- }
// GetProxyAddr requests a proxy endpoint from the proxy service and returns
// it as a string.
//
// The response body is expected to be JSON with a "data" object; the "http"
// entry is preferred and "https" is used when "http" is missing, not a
// string, or empty. Any failure (building or sending the request, a non-2xx
// status, an unreadable or undecodable body, no usable entry) yields "".
func GetProxyAddr() string {
	const (
		proxyAddr = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
		// NOTE(review): the Authorization value is identical to the service
		// URL, which looks like a placeholder; confirm the real credential.
		proxyAuthor = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
		// Ten seconds, written in nanoseconds as an untyped constant so it
		// can be assigned to http.Client.Timeout without a further import.
		requestTimeout = 10 * 1000 * 1000 * 1000
	)

	// Build the request.
	req, err := http.NewRequest(http.MethodGet, proxyAddr, nil)
	if err != nil {
		fmt.Println("get proxy request err:", err)
		return ""
	}
	req.Header.Add("Authorization", proxyAuthor)

	// Bound the call so a hanging proxy service cannot block the caller
	// forever.
	client := http.Client{Timeout: requestTimeout}
	resp, err := client.Do(req)
	if err != nil {
		fmt.Println("get proxy client err:", err)
		return ""
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		fmt.Println("get proxy unexpected status:", resp.Status)
		return ""
	}

	bodyByte, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("get proxy read body err:", err)
		return ""
	}

	tmp := map[string]interface{}{}
	if err := json.Unmarshal(bodyByte, &tmp); err != nil {
		fmt.Println("get proxy decode body err:", err)
		return ""
	}

	data, ok := tmp["data"].(map[string]interface{})
	if !ok {
		return ""
	}
	// Prefer the "http" entry, then fall back to "https".
	for _, key := range []string{"http", "https"} {
		if addr, isString := data[key].(string); isString && addr != "" {
			return addr
		}
	}
	return ""
}
|