browser.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. package backend
  import (
  	"context"
  	"encoding/json"
  	"fmt"
  	"io/ioutil"
  	"math/rand"
  	"net/http"
  	"strings"
  	"time"

  	"github.com/chromedp/cdproto/cdp"
  	"github.com/chromedp/cdproto/fetch"
  	"github.com/chromedp/cdproto/network"
  	"github.com/chromedp/cdproto/page"
  	"github.com/chromedp/chromedp"
  )
  var (
  	// useragent is the pool of User-Agent header values from which NewBrowser
  	// picks one at random for each browser it configures.
  	//
  	// Every active entry must be a complete, well-formed header value that
  	// starts with "Mozilla/5.0 (". Do not prepend descriptive labels such as
  	// "Chrome: " or "Safari 11 ", and keep the spaces between tokens: the
  	// string is sent verbatim, so a caption or squashed whitespace makes the
  	// client trivially identifiable.
  	//
  	// Commented-out entries are kept for reference only.
  	useragent = []string{
  		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
  		"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0",
  		//"Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_5 like Mac OS X) AppleWebKit/604.5.6 (KHTML, like Gecko) Version/11.0 Mobile/15D60 Safari/604.1",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14",
  		"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
  		"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
  		"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
  		//"Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Trident/4.0;SE2.XMetaSr1.0;SE2.XMetaSr1.0;.NETCLR2.0.50727;SE2.XMetaSr1.0)",
  		"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.70 Safari/537.36",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15 QQBrowserLite/1.3.0",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:93.0) Gecko/20100101 Firefox/93.0",
  		"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
  		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
  		"Mozilla/5.0 (X11; U; U; Linux x86_64; zh-my) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 Puffin/8.3.1.41624AP",
  		//"Mozilla/5.0 (Linux; BRAVIA 4K 2015 Build/LMY48E.S265) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36 OPR/28.0.1754.0",
  		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Safari/537.36 HeyTapBrowser/40.7.29.1",
  		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.58 Safari/537.36 Edg/93.0.961.33",
  		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/15.0 Chrome/90.0.4430.210 Safari/537.36",
  		"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36",
  		"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
  		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
  		//"Mozilla/5.0 (Windows NT 10.0; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
  		//"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
  		"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36",
  		//"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
  		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  		"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
  		//"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0",
  		"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
  		"Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; LCJB; rv:11.0) like Gecko",
  		"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",
  		"Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
  		"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3870.400 QQBrowser/10.8.4405.400",
  		"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
  		"Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:93.0) Gecko/20100101 Firefox/93.0",
  		//"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
  		"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38",
  	}
  )
  66. func NewBrowser(headless bool, showImage bool, proxyServe bool, baseUrl string, filterResource string) (context.Context, context.CancelFunc, context.Context, context.CancelFunc, context.Context, context.CancelFunc) {
  67. ignoreCertificateErrors := false
  68. if strings.HasPrefix(baseUrl, "https") {
  69. ignoreCertificateErrors = true
  70. }
  71. ctx, cancelFn := chromedp.NewContext(context.Background())
  72. chromeOptions := append(chromedp.DefaultExecAllocatorOptions[:],
  73. chromedp.NoDefaultBrowserCheck, //不检查默认浏览器
  74. chromedp.Flag("enable-automation", false), // 防止监测webdriver
  75. chromedp.Flag("force-dev-mode-highlighting", true),
  76. //--设置禁止HTTP转HTTPS
  77. chromedp.Flag("disable-extensions", true), //是否禁用扩展
  78. chromedp.Flag("disable-blink-features", "AutomationControlled"), //禁用 blink 特征
  79. chromedp.Flag("disable-features", "SSLForcedForSafety"), // 禁用某些安全特性
  80. chromedp.Flag("disable-features", "SSLForced"),
  81. chromedp.Flag("disable-features", "AutoupgradeToHTTPS"),
  82. chromedp.Flag("disable-features", "ImprovedHTTPSUpgrade"),
  83. chromedp.Flag("ssl-protocol", "any"),
  84. chromedp.Flag("ignore-certificate-errors-spki-list", true),
  85. //--置禁止HTTP转HTTPS 结束
  86. chromedp.Flag("headless", headless),
  87. chromedp.Flag("user-agent", useragent[rand.Intn(20)]), //搞到底还是要在这里设置useragent
  88. chromedp.Flag("disable-keep-alive", true),
  89. chromedp.Flag("disable-gpu", true),
  90. chromedp.Flag("no-sandbox", true),
  91. chromedp.Flag("disable-dev-shm-usage", "false"),
  92. chromedp.Flag("default-browser-check", "false"),
  93. chromedp.Flag("mute-audio", "false"),
  94. chromedp.Flag("disable-web-security", true),
  95. chromedp.Flag("accept-language", `zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6`),
  96. )
  97. if proxyServe {
  98. chromeOptions = append(chromeOptions,
  99. //chromedp.ProxyServer(fmt.Sprintf("socks5://%s", proxyServe)),
  100. chromedp.ProxyServer(GetProxyAddr()),
  101. )
  102. }
  103. if ignoreCertificateErrors {
  104. chromeOptions = append(chromeOptions,
  105. // ignore-certificate-errors
  106. chromedp.Flag("ignore-certificate-errors", true),
  107. )
  108. }
  109. if showImage {
  110. chromeOptions = append(chromeOptions,
  111. chromedp.Flag("blink-settings", "imagesEnabled=true"),
  112. )
  113. } else {
  114. chromeOptions = append(chromeOptions,
  115. chromedp.Flag("blink-settings", "imagesEnabled=false"),
  116. )
  117. }
  118. allocCtx, allocCancelFn := chromedp.NewExecAllocator(ctx, chromeOptions...)
  119. // 创建一个浏览器实例
  120. incCtx, incCancelFn := chromedp.NewContext(allocCtx,
  121. chromedp.WithLogf(nil))
  122. trie := NewTrie()
  123. //TODO 这里默认构建通用的资源加载排除,最好是单个网站可以定制,
  124. // 对于纯后端渲染网站,可以屏蔽所有资源加载,达到平台最高性能目的
  125. trie.BatchInsert(Cfg.DisableLoadResource) //全局过滤
  126. trie.BatchInsert(filterResource) //指定过滤
  127. chromedp.ListenTarget(incCtx, func(event interface{}) {
  128. switch ev := event.(type) {
  129. case *fetch.EventRequestPaused:
  130. go func() {
  131. c := chromedp.FromContext(incCtx)
  132. _ctx := cdp.WithExecutor(incCtx, c.Target)
  133. if trie.HasKeyword(ev.Request.URL) {
  134. fetch.FailRequest(ev.RequestID, network.ErrorReasonBlockedByClient).Do(_ctx)
  135. } else {
  136. fetch.ContinueRequest(ev.RequestID).Do(_ctx)
  137. }
  138. }()
  139. }
  140. })
  141. //
  142. chromedp.Run(incCtx,
  143. fetch.Enable(),
  144. chromedp.ActionFunc(func(cxt context.Context) error {
  145. _, err := page.AddScriptToEvaluateOnNewDocument("Object.defineProperty(navigator, 'webdriver', { get: () => false, });").Do(cxt)
  146. return err
  147. }),
  148. )
  149. return ctx, cancelFn, allocCtx, allocCancelFn, incCtx, incCancelFn
  150. }
  // GetProxyAddr requests a proxy address from the proxy-dispatch HTTP service
  // and returns it.
  //
  // The response body is expected to be a JSON object whose "data" member is an
  // object; the first non-empty string found under "http", then "https", is
  // returned. On any failure (request construction, transport error or timeout,
  // non-200 status, unreadable or malformed body, missing fields) the reason is
  // printed where known and the empty string is returned.
  func GetProxyAddr() string {
  	const (
  		proxyAddr = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
  		// NOTE(review): the Authorization value is identical to the URL;
  		// kept as-is, confirm this is the credential the service expects.
  		proxyAuthor = "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
  		// Upper bound for the whole exchange so a stalled service cannot
  		// block the caller indefinitely.
  		requestTimeout = 10 * time.Second
  	)

  	// Build the request.
  	req, err := http.NewRequest(http.MethodGet, proxyAddr, nil)
  	if err != nil {
  		fmt.Println("get proxy request err:", err)
  		return ""
  	}
  	req.Header.Add("Authorization", proxyAuthor)

  	// Send it.
  	client := http.Client{Timeout: requestTimeout}
  	resp, err := client.Do(req)
  	if err != nil {
  		fmt.Println("get proxy client err:", err)
  		return ""
  	}
  	defer resp.Body.Close()

  	if resp.StatusCode != http.StatusOK {
  		fmt.Println("get proxy unexpected status:", resp.Status)
  		return ""
  	}

  	bodyByte, err := ioutil.ReadAll(resp.Body)
  	if err != nil {
  		fmt.Println("get proxy read body err:", err)
  		return ""
  	}

  	tmp := map[string]interface{}{}
  	if err := json.Unmarshal(bodyByte, &tmp); err != nil {
  		fmt.Println("get proxy unmarshal err:", err)
  		return ""
  	}

  	data, ok := tmp["data"].(map[string]interface{})
  	if !ok {
  		return ""
  	}
  	// Prefer the "http" entry and fall back to "https"; an empty string is
  	// treated the same as a missing entry.
  	for _, key := range []string{"http", "https"} {
  		if addr, ok := data[key].(string); ok && addr != "" {
  			return addr
  		}
  	}
  	return ""
  }