张金坤 9 years ago
parent
commit
8e4409edd6

+ 55 - 0
common/src/github.com/surfer/README.md

@@ -0,0 +1,55 @@
+# surfer    [![GoDoc](https://godoc.org/github.com/henrylee2cn/surfer?status.png)](https://godoc.org/github.com/henrylee2cn/surfer) [![GitHub release](https://img.shields.io/github/release/henrylee2cn/surfer.svg)](https://github.com/henrylee2cn/surfer/releases)
+
+A high-concurrency downloader for web crawlers.
+
+<br>
+surfer是一款Go语言编写的高并发爬虫下载器,拥有surf与phantom两种下载内核。
+
+<br>
+支持固定UserAgent自动保存cookie与随机大量UserAgent禁用cookie两种模式,高度模拟浏览器行为,可实现模拟登录等功能。
+
+<br>
+高并发爬虫[Pholcus](https://github.com/henrylee2cn/pholcus)的专用下载器。(官方QQ群:Go大数据 42731170,欢迎加入我们的讨论)
+<br>
+
+
+
+### Usage
+
+```go
+package main
+
+import (
+    "github.com/henrylee2cn/surfer"
+    "io/ioutil"
+    "log"
+)
+
+func main() {
+    // 默认使用surf内核下载
+    resp, err := surfer.Download(&surfer.DefaultRequest{
+        Url: "http://github.com/henrylee2cn/surfer",
+    })
+    if err != nil {
+        log.Fatal(err)
+    }
+    b, err := ioutil.ReadAll(resp.Body)
+    log.Println(string(b), err)
+
+    // 指定使用phantomjs内核下载
+    resp, err = surfer.Download(&surfer.DefaultRequest{
+        Url:          "http://github.com/henrylee2cn",
+        DownloaderID: 1,
+    })
+    if err != nil {
+        log.Fatal(err)
+    }
+    b, err = ioutil.ReadAll(resp.Body)
+    log.Println(string(b), err)
+
+    resp.Body.Close()
+    surfer.DestroyJsFiles()
+}
+```
+
+详情参考:[example.go](https://github.com/henrylee2cn/surfer/blob/master/example/example.go)

+ 320 - 0
common/src/github.com/surfer/agent/agent.go

@@ -0,0 +1,320 @@
+// Package agent generates user agents strings for well known browsers
+// and for custom browsers.
+//
+// When submitting patches to add user agent formats, please *always* include
+// "{{.Coms}}" between the opening ( and closing ) parentheses, even if you're
+// sure the browser would never have additional comments.
+package agent
+
+import (
+	"bytes"
+	"math/rand"
+	"runtime"
+	"strings"
+	"text/template"
+	"time"
+)
+
+// Operating system identifiers. They are used as the keys of
+// DefaultOSAttributes and as the DefaultOS field of UAData.
+const (
+	// Windows operating system.
+	Windows int = iota
+	// Linux based operating system.
+	Linux
+	// Macintosh/OS X operating system.
+	Macintosh
+)
+
+// TemplateData holds the values that are substituted into a user agent
+// format string when it is executed as a text/template.
+type TemplateData struct {
+	Name string // browser name, referenced as {{.Name}}
+	Ver  string // browser version, referenced as {{.Ver}}
+	OSN  string // operating system name, referenced as {{.OSN}}
+	OSV  string // operating system version, referenced as {{.OSV}}
+	Coms string // already-joined comments (prefixed with "; " when non-empty), referenced as {{.Coms}}
+}
+
+// OSAttributes stores the operating system details that are rendered into a
+// user agent string.
+type OSAttributes struct {
+	// OSName is the operating system name.
+	OSName string
+	// OSVersion is the operating system version.
+	OSVersion string
+	// Comments are additional comments to add to a user agent string.
+	// createFromDetails joins them with "; " before rendering.
+	Comments []string
+}
+
+// DefaultOSAttributes maps each operating system identifier (Windows, Linux,
+// Macintosh) to the attributes used when a user agent is generated for it.
+var DefaultOSAttributes = map[int]OSAttributes{
+	Windows:   {"Windows NT", "6.3", []string{"x64"}},
+	Linux:     {"Linux", "3.16.1", []string{"x64"}},
+	Macintosh: {"Intel Mac OS X", "10_6_8", []string{}},
+}
+
+// Formats is a collection of UA format strings.
+// The key is the browser's major version (the part of the version before the
+// first "."), which is how Format looks entries up.
+// The value is the text/template format string for that major version.
+type Formats map[string]string
+
+// UAData stores information on a browser user agent.
+type UAData struct {
+	TopVersion string  // version returned by TopVersion and used by CreateDefault
+	DefaultOS  int     // operating system identifier used to index DefaultOSAttributes
+	Formats    Formats // format strings keyed by major version
+}
+
+// UATable is a collection of UAData values.
+// The key is the name of the browser; the lookups in this package lowercase
+// the requested name before indexing the table.
+type UATable map[string]UAData
+
+// Database is the built-in "database" of user agents, keyed by lowercase
+// browser name. Each entry lists the browser's top version, the operating
+// system it is rendered for by default, and one format string per major
+// version. The "default" entry is the fallback that Format and TopVersion use
+// for browser names that are not in the table.
+var Database = UATable{
+	"chrome": {
+		"37.0.2049.0",
+		Windows,
+		Formats{
+			"37": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
+			"36": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
+			"35": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
+			"34": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
+			"33": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
+			"32": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
+			"31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
+			"30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
+		},
+	},
+	"firefox": {
+		"31.0",
+		Windows,
+		Formats{
+			"31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:31.0) Gecko/20100101 Firefox/{{.Ver}}",
+			"30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:30.0) Gecko/20120101 Firefox/{{.Ver}}",
+			"29": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:29.0) Gecko/20120101 Firefox/{{.Ver}}",
+			"28": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:28.0) Gecko/20100101 Firefox/{{.Ver}}",
+			"27": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:27.0) Gecko/20130101 Firefox/{{.Ver}}",
+			"26": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:26.0) Gecko/20121011 Firefox/{{.Ver}}",
+			"25": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:25.0) Gecko/20100101 Firefox/{{.Ver}}",
+		},
+	},
+	"msie": {
+		"10.0",
+		Windows,
+		Formats{
+			"10": "Mozilla/5.0 (compatible; MSIE 10.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.5.30729)",
+			"9":  "Mozilla/5.0 (compatible; MSIE 9.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.0.30729)",
+			"8":  "Mozilla/5.0 (compatible; MSIE 8.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/4.0; .NET CLR 3.0.04320)",
+			"7":  "Mozilla/4.0 (compatible; MSIE 7.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}.NET CLR 2.0.50727)",
+		},
+	},
+	"opera": {
+		"12.14",
+		Windows,
+		Formats{
+			"12": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.9.181 Version/{{.Ver}}",
+			"11": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.7.62 Version/{{.Ver}}",
+			"10": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.2.15 Version/{{.Ver}}",
+			"9":  "Opera/9.00 ({{.OSN}} {{.OSV}}; U{{.Coms}})",
+		},
+	},
+	"safari": {
+		"6.0",
+		Macintosh,
+		Formats{
+			"6": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/536.26 (KHTML, like Gecko) Version/{{.Ver}} Safari/8536.25",
+			"5": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/531.2+ (KHTML, like Gecko) Version/{{.Ver}} Safari/531.2+",
+			"4": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/528.16 (KHTML, like Gecko) Version/{{.Ver}} Safari/528.16",
+		},
+	},
+	"itunes": {
+		"9.1.1",
+		Macintosh,
+		Formats{
+			"9": "iTunes/{{.Ver}}",
+			"8": "iTunes/{{.Ver}}",
+			"7": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.7{{.Coms}})",
+			"6": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.5{{.Coms}})",
+		},
+	},
+	"aol": {
+		"9.7",
+		Windows,
+		Formats{
+			"9": "Mozilla/5.0 (compatible; MSIE 9.0; AOL {{.Ver}}; AOLBuild 4343.19; {{.OSN}} {{.OSV}}; WOW64; Trident/5.0; FunWebProducts{{.Coms}})",
+			"8": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; GTB5; .NET CLR 1.1.4322; .NET CLR 2.0.50727{{.Coms}})",
+			"7": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; FunWebProducts{{.Coms}})",
+			"6": "Mozilla/4.0 (compatible; MSIE 6.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}{{.Coms}})",
+		},
+	},
+	"konqueror": {
+		"4.9",
+		Linux,
+		Formats{
+			"4": "Mozilla/5.0 (compatible; Konqueror/4.0; {{.OSN}}{{.Coms}}) KHTML/4.0.3 (like Gecko)",
+			"3": "Mozilla/5.0 (compatible; Konqueror/3.0-rc6; i686 {{.OSN}}; 20021127{{.Coms}})",
+			"2": "Mozilla/5.0 (compatible; Konqueror/2.1.1; {{.OSN}}{{.Coms}})",
+		},
+	},
+	"netscape": {
+		"9.1.0285",
+		Windows,
+		Formats{
+			"9": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.9.2.4{{.Coms}}) Gecko/20070321 Netscape/{{.Ver}}",
+			"8": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.7.5{{.Coms}}) Gecko/20050519 Netscape/{{.Ver}}",
+			"7": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.0.1{{.Coms}}) Gecko/20020921 Netscape/{{.Ver}}",
+		},
+	},
+	"lynx": {
+		"2.8.8dev.3",
+		Linux,
+		Formats{
+			"2": "Lynx/{{.Ver}} libwww-FM/2.14 SSL-MM/1.4.1",
+			"1": "Lynx (textmode)",
+		},
+	},
+	"googlebot": {
+		"2.1",
+		Linux,
+		Formats{
+			"2": "Mozilla/5.0 (compatible; Googlebot/{{.Ver}}; +http://www.google.com/bot.html{{.Coms}})",
+			"1": "Googlebot/{{.Ver}} (+http://www.google.com/bot.html{{.Coms}})",
+		},
+	},
+	"bingbot": {
+		"2.0",
+		Windows,
+		Formats{
+			"2": "Mozilla/5.0 (compatible; bingbot/{{.Ver}}; +http://www.bing.com/bingbot.htm{{.Coms}})",
+		},
+	},
+	"yahoobot": {
+		"2.0",
+		Linux,
+		Formats{
+			"2": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp{{.Coms}})",
+		},
+	},
+	"default": {
+		"1.0",
+		Linux,
+		Formats{
+			"1": "{{.Name}}/{{.Ver}} ({{.OSN}} {{.OSV}}{{.Coms}})",
+		},
+	},
+}
+
+// UserAgents holds all user agent strings generated by init.
+// The "all" key lists one string per browser/major-version format in
+// Database (the "default" entry excluded); the "common" key holds the same
+// strings minus those of itunes, lynx, googlebot, bingbot and yahoobot.
+var UserAgents = map[string][]string{}
+
+// init fills UserAgents from Database. Every browser except "default" gets
+// one rendered user agent per major version, using the attributes of the
+// browser's default operating system. All strings go into the "all" list;
+// those of ordinary browsers also go into the "common" list. Finally one
+// random index within the "common" list is swapped with position 0 in both
+// lists.
+func init() {
+	for name, entry := range Database {
+		if name == "default" {
+			continue
+		}
+
+		// Media players, text browsers and crawlers stay out of "common".
+		common := true
+		switch name {
+		case "itunes", "lynx", "googlebot", "bingbot", "yahoobot":
+			common = false
+		}
+
+		attrs := DefaultOSAttributes[entry.DefaultOS]
+		for major := range entry.Formats {
+			ua := createFromDetails(name, major, attrs.OSName, attrs.OSVersion, attrs.Comments)
+			UserAgents["all"] = append(UserAgents["all"], ua)
+			if common {
+				UserAgents["common"] = append(UserAgents["common"], ua)
+			}
+		}
+	}
+
+	// The local slices share their backing arrays with the map values, so the
+	// swaps below are visible through UserAgents.
+	allList, commonList := UserAgents["all"], UserAgents["common"]
+	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
+	pick := rng.Intn(len(commonList))
+	allList[0], allList[pick] = allList[pick], allList[0]
+	commonList[0], commonList[pick] = commonList[pick], commonList[0]
+}
+
+// CreateReal generates and returns a user agent string for the browser name
+// "Surfer" version "1.0", using the OS name and version of the running
+// system and the Go runtime version as the only comment.
+func CreateReal() string {
+	comments := []string{runtime.Version()}
+	return createFromDetails("Surfer", "1.0", osName(), osVersion(), comments)
+}
+
+// CreateDefault returns a user agent string for browser using the top
+// version and the default operating system recorded for it in Database.
+// The table lookup uses the lowercased name; the name itself is passed on
+// unchanged.
+func CreateDefault(browser string) string {
+	entry := Database[strings.ToLower(browser)]
+	attrs := DefaultOSAttributes[entry.DefaultOS]
+	return createFromDetails(browser, entry.TopVersion, attrs.OSName, attrs.OSVersion, attrs.Comments)
+}
+
+// CreateVersion generates and returns a complete user agent string for the
+// given browser and version, rendered for the browser's default operating
+// system as recorded in Database.
+func CreateVersion(browser, version string) string {
+	entry := Database[strings.ToLower(browser)]
+	attrs := DefaultOSAttributes[entry.DefaultOS]
+	return createFromDetails(browser, version, attrs.OSName, attrs.OSVersion, attrs.Comments)
+}
+
+// TopVersion returns the most recent version recorded for the given browser
+// name (matched case-insensitively). Unknown browsers get the top version of
+// the "default" entry.
+func TopVersion(bname string) string {
+	entry, known := Database[strings.ToLower(bname)]
+	if !known {
+		entry = Database["default"]
+	}
+	return entry.TopVersion
+}
+
+// Format returns the format string for the given browser name and version.
+//
+// The browser name is matched case-insensitively and only the major part of
+// the version (everything before the first ".") is used. When the browser has
+// no format for that major version, the format of its top version is
+// returned. When the browser itself is unknown, the default format is
+// returned.
+func Format(bname, bver string) string {
+	key := strings.ToLower(bname)
+	entry, known := Database[key]
+	if !known {
+		return Database["default"].Formats["1"]
+	}
+
+	major := strings.Split(bver, ".")[0]
+	if format, found := entry.Formats[major]; found {
+		return format
+	}
+
+	// Fall back to the format registered for the browser's top version.
+	topMajor := strings.Split(TopVersion(key), ".")[0]
+	return entry.Formats[topMajor]
+}
+
+// createFromDetails generates and returns a complete user agent string.
+func createFromDetails(bname, bver, osname, osver string, c []string) string {
+	if bver == "" {
+		bver = TopVersion(bname)
+	}
+	comments := strings.Join(c, "; ")
+	if comments != "" {
+		comments = "; " + comments
+	}
+
+	data := TemplateData{bname, bver, osname, osver, comments}
+	buff := &bytes.Buffer{}
+	t := template.New("formatter")
+	t.Parse(Format(bname, bver))
+	t.Execute(buff, data)
+
+	return buff.String()
+}

+ 26 - 0
common/src/github.com/surfer/agent/agent_bsd.go

@@ -0,0 +1,26 @@
+// +build darwin dragonfly freebsd netbsd openbsd
+
+package agent
+
+import (
+	"runtime"
+	"syscall"
+)
+
+// osName returns the value of the "kern.ostype" sysctl, or runtime.GOOS when
+// the sysctl cannot be read.
+func osName() string {
+	if name, err := syscall.Sysctl("kern.ostype"); err == nil {
+		return name
+	}
+	return runtime.GOOS
+}
+
+// osVersion returns the value of the "kern.osrelease" sysctl, or "0.0" when
+// the sysctl cannot be read.
+func osVersion() string {
+	if release, err := syscall.Sysctl("kern.osrelease"); err == nil {
+		return release
+	}
+	return "0.0"
+}

+ 41 - 0
common/src/github.com/surfer/agent/agent_linux.go

@@ -0,0 +1,41 @@
+// +build linux
+
+package agent
+
+import (
+	"runtime"
+	"syscall"
+)
+
+// osName returns the name of the OS.
+func osName() string {
+	buf := &syscall.Utsname{}
+	err := syscall.Uname(buf)
+	if err != nil {
+		return runtime.GOOS
+	}
+	return charsToString(buf.Sysname)
+}
+
+// osVersion returns the OS version.
+func osVersion() string {
+	buf := &syscall.Utsname{}
+	err := syscall.Uname(buf)
+	if err != nil {
+		return "0.0"
+	}
+	return charsToString(buf.Release)
+}
+
+// charsToString converts a [65]int8 byte array into a string.
+func charsToString(ca [65]int8) string {
+	s := make([]byte, len(ca))
+	var lens int
+	for ; lens < len(ca); lens++ {
+		if ca[lens] == 0 {
+			break
+		}
+		s[lens] = uint8(ca[lens])
+	}
+	return string(s[0:lens])
+}

+ 25 - 0
common/src/github.com/surfer/agent/agent_windows.go

@@ -0,0 +1,25 @@
+// +build windows
+
+package agent
+
+import (
+	"fmt"
+	"runtime"
+	"syscall"
+)
+
+// osName returns the name of the OS. In this Windows build of the package it
+// is simply runtime.GOOS.
+func osName() string {
+	return runtime.GOOS
+}
+
+// osVersion returns the OS version as "major.minor", taken from the two
+// lowest bytes of the value reported by syscall.GetVersion. It returns "0.0"
+// when that call fails.
+func osVersion() string {
+	ver, err := syscall.GetVersion()
+	if err != nil {
+		return "0.0"
+	}
+	return fmt.Sprintf("%d.%d", uint8(ver), uint8(ver>>8))
+}

+ 84 - 0
common/src/github.com/surfer/example/example.go

@@ -0,0 +1,84 @@
+package main
+
+import (
+	"github.com/henrylee2cn/surfer"
+	"io/ioutil"
+	"log"
+	"time"
+)
+
+// main exercises surfer.Download four times: a GET and a POST with the
+// default surf downloader, then a GET and a POST with the PhantomJS
+// downloader (DownloaderID 1). For every response it logs the header and the
+// body, and closes the body as soon as it has been read so that no response
+// is leaked. It then cleans up via surfer.DestroyJsFiles and sleeps for ten
+// minutes before exiting.
+func main() {
+	// Download with the default surf downloader.
+	log.Println("********************************************* surf内核GET下载测试开始 *********************************************")
+	resp, err := surfer.Download(&surfer.DefaultRequest{
+		Url: "http://www.baidu.com/",
+	})
+	if err != nil {
+		log.Fatal(err)
+	}
+	log.Println(resp.Header)
+
+	b, err := ioutil.ReadAll(resp.Body)
+	resp.Body.Close()
+	log.Println(string(b), err)
+
+	log.Println("********************************************* surf内核GET下载测试完毕 *********************************************")
+
+	// Download with the default surf downloader.
+	log.Println("********************************************* surf内核POST下载测试开始 *********************************************")
+	resp, err = surfer.Download(&surfer.DefaultRequest{
+		Url:      "http://accounts.lewaos.com/",
+		Method:   "POST",
+		PostData: "username=123456@qq.com&password=123456&login_btn=login_btn&submit=login_btn",
+	})
+	if err != nil {
+		log.Fatal(err)
+	}
+	log.Println(resp.Header)
+
+	b, err = ioutil.ReadAll(resp.Body)
+	resp.Body.Close()
+	log.Println(string(b), err)
+
+	log.Println("********************************************* surf内核POST下载测试完毕 *********************************************")
+
+	log.Println("********************************************* phantomjs内核GET下载测试开始 *********************************************")
+
+	// Download with the phantomjs downloader.
+	resp, err = surfer.Download(&surfer.DefaultRequest{
+		Url:          "http://www.baidu.com/",
+		DownloaderID: 1,
+	})
+	if err != nil {
+		log.Fatal(err)
+	}
+	log.Println(resp.Header)
+
+	b, err = ioutil.ReadAll(resp.Body)
+	resp.Body.Close()
+	log.Println(string(b), err)
+
+	log.Println("********************************************* phantomjs内核GET下载测试完毕 *********************************************")
+
+	log.Println("********************************************* phantomjs内核POST下载测试开始 *********************************************")
+
+	// Download with the phantomjs downloader.
+	resp, err = surfer.Download(&surfer.DefaultRequest{
+		DownloaderID: 1,
+		Url:          "http://accounts.lewaos.com/",
+		Method:       "POST",
+		PostData:     "username=123456@qq.com&password=123456&login_btn=login_btn&submit=login_btn",
+	})
+	if err != nil {
+		log.Fatal(err)
+	}
+	log.Println(resp.Header)
+
+	b, err = ioutil.ReadAll(resp.Body)
+	resp.Body.Close()
+	log.Println(string(b), err)
+
+	log.Println("********************************************* phantomjs内核POST下载测试完毕 *********************************************")
+
+	surfer.DestroyJsFiles()
+
+	// Same duration as the original 600e9 nanoseconds.
+	time.Sleep(10 * time.Minute)
+}

+ 141 - 0
common/src/github.com/surfer/param.go

@@ -0,0 +1,141 @@
+package surfer
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"math/rand"
+	"mime/multipart"
+	"net/http"
+	"net/url"
+	"strings"
+	"time"
+
+	"github.com/surfer/agent"
+	"github.com/surfer/util"
+)
+
+// Param is the normalized form of a Request that the Surf downloader works
+// with. It is built by NewParam.
+type Param struct {
+	method        string        // HTTP method actually sent: "GET", "HEAD" or "POST"
+	url           *url.URL      // target URL as returned by util.UrlEncode
+	proxy         *url.URL      // proxy URL; nil when the request names no proxy
+	contentType   string        // Content-Type of the POST body; empty for GET/HEAD
+	body          io.Reader     // request body; nil for GET/HEAD
+	header        http.Header   // headers sent with the request
+	enableCookie  bool          // whether the downloader's cookie jar is used
+	dialTimeout   time.Duration // timeout passed to net.DialTimeout; never negative
+	connTimeout   time.Duration // when > 0, deadline set on the connection after dialing
+	tryTimes      int           // maximum number of attempts; <= 0 retries until success
+	retryPause    time.Duration // sleep between failed attempts
+	redirectTimes int           // redirect limit, interpreted by checkRedirect
+	client        *http.Client  // HTTP client assigned by Surf.Download
+}
+
+// NewParam converts req into a Param.
+//
+// The URL goes through util.UrlEncode and the optional proxy through
+// url.Parse; an error from either is returned. The method is upper-cased:
+// "GET" and "HEAD" are kept, "POST" sends the post data as
+// application/x-www-form-urlencoded, "POST-M" re-encodes the post data as a
+// multipart form, and anything else becomes "GET". When the request supplies
+// no User-Agent header, one is taken from agent.UserAgents["common"]: the
+// first entry when cookies are enabled, a random entry otherwise.
+func NewParam(req Request) (param *Param, err error) {
+	param = new(Param)
+	if param.url, err = util.UrlEncode(req.GetUrl()); err != nil {
+		return nil, err
+	}
+
+	if rawProxy := req.GetProxy(); rawProxy != "" {
+		if param.proxy, err = url.Parse(rawProxy); err != nil {
+			return nil, err
+		}
+	}
+
+	switch strings.ToUpper(req.GetMethod()) {
+	case "HEAD":
+		param.method = "HEAD"
+	case "POST":
+		param.method = "POST"
+		param.contentType = "application/x-www-form-urlencoded"
+		param.body = strings.NewReader(req.GetPostData())
+	case "POST-M":
+		param.method = "POST"
+		payload := &bytes.Buffer{}
+		form := multipart.NewWriter(payload)
+		// The parse error is discarded; only the returned values are sent.
+		fields, _ := url.ParseQuery(req.GetPostData())
+		for name, vals := range fields {
+			for _, val := range vals {
+				form.WriteField(name, val)
+			}
+		}
+		if err := form.Close(); err != nil {
+			return nil, err
+		}
+		param.contentType = form.FormDataContentType()
+		param.body = payload
+	default:
+		// Covers "GET" as well as every unrecognized method.
+		param.method = "GET"
+	}
+
+	// The Content-Type is set before the caller's headers are added, so a
+	// caller-supplied Content-Type is appended as an additional value.
+	param.header = make(http.Header)
+	if param.contentType != "" {
+		param.header.Set("Content-Type", param.contentType)
+	}
+	for name, vals := range req.GetHeader() {
+		for _, val := range vals {
+			param.header.Add(name, val)
+		}
+	}
+
+	param.enableCookie = req.GetEnableCookie()
+
+	if param.header.Get("User-Agent") == "" {
+		pool := agent.UserAgents["common"]
+		pick := 0
+		if !param.enableCookie {
+			rng := rand.New(rand.NewSource(time.Now().UnixNano()))
+			pick = rng.Intn(len(pool))
+		}
+		param.header.Set("User-Agent", pool[pick])
+	}
+
+	// Negative dial timeouts are clamped to zero (the field's zero value).
+	if d := req.GetDialTimeout(); d > 0 {
+		param.dialTimeout = d
+	}
+
+	param.connTimeout = req.GetConnTimeout()
+	param.tryTimes = req.GetTryTimes()
+	param.retryPause = req.GetRetryPause()
+	param.redirectTimes = req.GetRedirectTimes()
+	return param, nil
+}
+
+// writeback copies the method, header and host of this Param into
+// resp.Request and returns resp. A nil resp, or a resp without a Request, is
+// given freshly allocated empty values first, so the result is never nil.
+func (self *Param) writeback(resp *http.Response) *http.Response {
+	if resp == nil {
+		resp = &http.Response{}
+	}
+	if resp.Request == nil {
+		resp.Request = &http.Request{}
+	}
+
+	request := resp.Request
+	request.Method = self.method
+	request.Header = self.header
+	request.Host = self.url.Host
+
+	return resp
+}
+
+// checkRedirect is used as the value of http.Client.CheckRedirect.
+// When redirectTimes is 0 the number of redirects is unlimited.
+// When redirectTimes is negative no redirect is allowed.
+// Otherwise an error is returned once len(via) reaches redirectTimes.
+func (self *Param) checkRedirect(req *http.Request, via []*http.Request) error {
+	limit := self.redirectTimes
+	switch {
+	case limit == 0:
+		return nil
+	case len(via) < limit:
+		return nil
+	case limit < 0:
+		return fmt.Errorf("not allow redirects.")
+	default:
+		return fmt.Errorf("stopped after %v redirects.", limit)
+	}
+}

+ 188 - 0
common/src/github.com/surfer/request.go

@@ -0,0 +1,188 @@
+package surfer
+
+import (
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+)
+
+// Request describes a single download job for a Surfer.
+type Request interface {
+	// GetUrl returns the URL to download.
+	GetUrl() string
+	// GetMethod returns the method: GET, POST, POST-M or HEAD.
+	GetMethod() string
+	// GetPostData returns the POST values.
+	GetPostData() string
+	// GetHeader returns the HTTP header to send.
+	GetHeader() http.Header
+	// GetEnableCookie reports whether HTTP cookies are enabled.
+	GetEnableCookie() bool
+	// GetDialTimeout returns the timeout for dialing the server
+	// (dial tcp: i/o timeout).
+	GetDialTimeout() time.Duration
+	// GetConnTimeout returns the timeout for the connection
+	// (WSARecv tcp: i/o timeout).
+	GetConnTimeout() time.Duration
+	// GetTryTimes returns the maximum number of download attempts.
+	GetTryTimes() int
+	// GetRetryPause returns the pause between retries.
+	GetRetryPause() time.Duration
+	// GetProxy returns the proxy used for the download.
+	GetProxy() string
+	// GetRedirectTimes returns the maximum number of redirects.
+	GetRedirectTimes() int
+	// GetDownloaderID selects the downloader: Surf or PhantomJS.
+	GetDownloaderID() int
+}
+
+// Downloader identifiers and the defaults that DefaultRequest.prepare applies.
+const (
+	SurfID             = 0               // identifier of the Surf downloader
+	PhomtomJsID        = 1               // identifier of the PhantomJS downloader
+	DefaultMethod      = "GET"           // default request method
+	DefaultDialTimeout = 2 * time.Minute // default timeout for connecting to the server
+	DefaultConnTimeout = 2 * time.Minute // default download timeout
+	DefaultTryTimes    = 3               // default maximum number of download attempts
+	DefaultRetryPause  = 2 * time.Second // default pause before a download is retried
+)
+
+// DefaultRequest is the default implementation of Request.
+type DefaultRequest struct {
+	// url (required)
+	Url string
+	// GET POST POST-M HEAD (defaults to GET)
+	Method string
+	// http header
+	Header http.Header
+	// whether cookies are used; set via the Spider's EnableCookie
+	EnableCookie bool
+	// POST values
+	PostData string
+	// dial tcp: i/o timeout
+	DialTimeout time.Duration
+	// WSARecv tcp: i/o timeout
+	ConnTimeout time.Duration
+	// the max times of download
+	TryTimes int
+	// how long to pause before a retry
+	RetryPause time.Duration
+	// max redirect times
+	// when RedirectTimes equal 0, redirect times is ∞
+	// when RedirectTimes less than 0, redirect times is 0
+	RedirectTimes int
+	// the download ProxyHost
+	Proxy string
+
+	// ID of the downloader to use.
+	// 0 is the high-concurrency Surf downloader with the full set of controls.
+	// 1 is the PhantomJS downloader: better at getting past anti-crawler
+	// defenses, but slow and with low concurrency.
+	DownloaderID int
+
+	// guarantees that prepare is called only once
+	once sync.Once
+}
+
+// prepare normalizes the request in place and fills in defaults:
+//   - an empty Method becomes DefaultMethod, and the method is upper-cased;
+//   - a nil Header becomes an empty header;
+//   - negative DialTimeout/ConnTimeout become 0, zero values become the
+//     respective defaults;
+//   - a zero TryTimes becomes DefaultTryTimes;
+//   - a non-positive RetryPause becomes DefaultRetryPause;
+//   - any DownloaderID other than PhomtomJsID becomes SurfID.
+func (self *DefaultRequest) prepare() {
+	if self.Method == "" {
+		self.Method = DefaultMethod
+	}
+	self.Method = strings.ToUpper(self.Method)
+
+	if self.Header == nil {
+		self.Header = make(http.Header)
+	}
+
+	switch {
+	case self.DialTimeout < 0:
+		self.DialTimeout = 0
+	case self.DialTimeout == 0:
+		self.DialTimeout = DefaultDialTimeout
+	}
+
+	switch {
+	case self.ConnTimeout < 0:
+		self.ConnTimeout = 0
+	case self.ConnTimeout == 0:
+		self.ConnTimeout = DefaultConnTimeout
+	}
+
+	if self.TryTimes == 0 {
+		self.TryTimes = DefaultTryTimes
+	}
+
+	if self.RetryPause <= 0 {
+		self.RetryPause = DefaultRetryPause
+	}
+
+	if self.DownloaderID != PhomtomJsID {
+		self.DownloaderID = SurfID
+	}
+}
+
+// GetUrl returns the request URL. Like every getter of DefaultRequest it
+// first runs prepare, at most once per request, so that defaults are set.
+func (self *DefaultRequest) GetUrl() string {
+	self.once.Do(self.prepare)
+	return self.Url
+}
+
+// GetMethod returns the upper-cased request method (GET, POST, POST-M or
+// HEAD), after prepare has run once.
+func (self *DefaultRequest) GetMethod() string {
+	self.once.Do(self.prepare)
+	return self.Method
+}
+
+// GetPostData returns the POST values, after prepare has run once.
+func (self *DefaultRequest) GetPostData() string {
+	self.once.Do(self.prepare)
+	return self.PostData
+}
+
+// GetHeader returns the HTTP header of the request. prepare runs first (at
+// most once) and replaces a nil Header with an empty one.
+func (self *DefaultRequest) GetHeader() http.Header {
+	self.once.Do(self.prepare)
+	return self.Header
+}
+
+// GetEnableCookie reports whether HTTP cookies are enabled, after prepare
+// has run once.
+func (self *DefaultRequest) GetEnableCookie() bool {
+	self.once.Do(self.prepare)
+	return self.EnableCookie
+}
+
+// GetDialTimeout returns the dial timeout (dial tcp: i/o timeout), as
+// normalized by prepare, which runs at most once.
+func (self *DefaultRequest) GetDialTimeout() time.Duration {
+	self.once.Do(self.prepare)
+	return self.DialTimeout
+}
+
+// GetConnTimeout returns the connection timeout (WSARecv tcp: i/o timeout),
+// as normalized by prepare, which runs at most once.
+func (self *DefaultRequest) GetConnTimeout() time.Duration {
+	self.once.Do(self.prepare)
+	return self.ConnTimeout
+}
+
+// GetTryTimes returns the maximum number of download attempts, as normalized
+// by prepare, which runs at most once.
+func (self *DefaultRequest) GetTryTimes() int {
+	self.once.Do(self.prepare)
+	return self.TryTimes
+}
+
+// GetRetryPause returns the pause between retries, as normalized by prepare,
+// which runs at most once.
+func (self *DefaultRequest) GetRetryPause() time.Duration {
+	self.once.Do(self.prepare)
+	return self.RetryPause
+}
+
+// GetProxy returns the proxy used for the download, after prepare has run
+// once.
+func (self *DefaultRequest) GetProxy() string {
+	self.once.Do(self.prepare)
+	return self.Proxy
+}
+
+// GetRedirectTimes returns the maximum number of redirects, after prepare
+// has run once.
+func (self *DefaultRequest) GetRedirectTimes() int {
+	self.once.Do(self.prepare)
+	return self.RedirectTimes
+}
+
+// GetDownloaderID returns the selected downloader (Surf or PhantomJS).
+// prepare runs first (at most once) and maps every ID other than
+// PhomtomJsID to SurfID.
+func (self *DefaultRequest) GetDownloaderID() int {
+	self.once.Do(self.prepare)
+	return self.DownloaderID
+}

+ 117 - 0
common/src/github.com/surfer/surf.go

@@ -0,0 +1,117 @@
+package surfer
+
+import (
+	"crypto/tls"
+	"math/rand"
+	"net"
+	"net/http"
+	"net/http/cookiejar"
+	"strings"
+	"time"
+
+	"github.com/surfer/agent"
+)
+
+// Surf is the default Download implementation. It keeps one cookie jar that
+// is attached to the HTTP client of every request with cookies enabled.
+type Surf struct {
+	cookieJar *cookiejar.Jar
+}
+
+// New returns a Surf downloader with a fresh cookie jar. The error returned
+// by cookiejar.New is discarded.
+func New() Surfer {
+	jar, _ := cookiejar.New(nil)
+	downloader := &Surf{}
+	downloader.cookieJar = jar
+	return downloader
+}
+
+func (self *Surf) Download(req Request) (resp *http.Response, err error) {
+	param, err := NewParam(req)
+	if err != nil {
+		return nil, err
+	}
+	param.client = self.buildClient(param)
+	resp, err = self.httpRequest(param)
+	resp = param.writeback(resp)
+	// if err != nil {
+	// 	resp.Status = "200 OK"
+	// 	resp.StatusCode = 200
+	// }
+	return
+}
+
+// buildClient creates, configures, and returns a *http.Client for param.
+//
+// The client uses param.checkRedirect, the downloader's cookie jar when
+// cookies are enabled, and a transport with keep-alives disabled that dials
+// with param.dialTimeout and, when param.connTimeout is positive, sets that
+// deadline on the new connection. A proxy is configured when param has one.
+func (self *Surf) buildClient(param *Param) *http.Client {
+	dial := func(network, addr string) (net.Conn, error) {
+		conn, err := net.DialTimeout(network, addr, param.dialTimeout)
+		if err != nil {
+			return nil, err
+		}
+		if param.connTimeout > 0 {
+			conn.SetDeadline(time.Now().Add(param.connTimeout))
+		}
+		return conn, nil
+	}
+
+	transport := &http.Transport{
+		DisableKeepAlives: true,
+		Dial:              dial,
+	}
+	if param.proxy != nil {
+		transport.Proxy = http.ProxyURL(param.proxy)
+	}
+	if strings.ToLower(param.url.Scheme) == "https" {
+		// NOTE(review): certificate verification is switched off for every
+		// https URL; confirm this is acceptable for all callers.
+		transport.TLSClientConfig = &tls.Config{RootCAs: nil, InsecureSkipVerify: true}
+		transport.DisableCompression = true
+	}
+
+	client := &http.Client{
+		CheckRedirect: param.checkRedirect,
+		Transport:     transport,
+	}
+	if param.enableCookie {
+		client.Jar = self.cookieJar
+	}
+	return client
+}
+
+// httpRequest builds an *http.Request from param and sends it with
+// param.client, retrying on failure.
+//
+// With a positive param.tryTimes at most that many attempts are made; with
+// tryTimes <= 0 the request is retried until it succeeds. After every failed
+// attempt a new random User-Agent is chosen when cookies are disabled, and
+// the function sleeps for param.retryPause. Before every retry the request
+// body is recreated through req.GetBody, because the previous attempt may
+// already have consumed it; without this a retried POST would be sent with
+// an empty or truncated body.
+//
+// It returns the response and error of the last attempt, or nil and an error
+// when the request cannot be built or its body cannot be recreated.
+func (self *Surf) httpRequest(param *Param) (resp *http.Response, err error) {
+	req, err := http.NewRequest(param.method, param.url.String(), param.body)
+	if err != nil {
+		return nil, err
+	}
+
+	req.Header = param.header
+
+	for attempt := 0; param.tryTimes <= 0 || attempt < param.tryTimes; attempt++ {
+		if attempt > 0 && req.GetBody != nil {
+			body, bodyErr := req.GetBody()
+			if bodyErr != nil {
+				return nil, bodyErr
+			}
+			req.Body = body
+		}
+
+		resp, err = param.client.Do(req)
+		if err == nil {
+			break
+		}
+
+		if !param.enableCookie {
+			pool := agent.UserAgents["common"]
+			rng := rand.New(rand.NewSource(time.Now().UnixNano()))
+			req.Header.Set("User-Agent", pool[rng.Intn(len(pool))])
+		}
+		time.Sleep(param.retryPause)
+	}
+
+	return resp, err
+}

+ 36 - 0
common/src/github.com/surfer/surfer.go

@@ -0,0 +1,36 @@
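+// Package surfer is a high-concurrency web crawler downloader written in Go.
+// It supports the GET/POST/HEAD methods and the http/https protocols, and
+// offers two modes: a fixed User-Agent with automatically saved cookies, or a
+// large pool of random User-Agents with cookies disabled. It closely imitates
+// browser behavior and can be used for tasks such as simulated login.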
+package surfer
+
+import (
+	"net/http"
+	"os"
+	"sync"
+)
+
+// Package-level downloader state.
+var (
+	surf          Surfer    // Surf downloader, created on first use by Download
+	phantom       Surfer    // NOTE(review): presumably the PhantomJS downloader; never assigned in this file
+	once_surf     sync.Once // guards the one-time creation of surf
+	once_phantom  sync.Once // NOTE(review): presumably guards creation of phantom; unused in this file
+	tempJsDir     = "./tmp" // NOTE(review): presumably a scratch directory for PhantomJS scripts; unused in this file
+	// phantomjsFile is a path below $GOPATH built with backslash separators.
+	// NOTE(review): backslashes only form a valid path on Windows — confirm
+	// whether other platforms are meant to be supported.
+	phantomjsFile = os.Getenv("GOPATH") + `\src\github.com\surfer\phantomjs\phantomjs`
+)
+
+// Download performs req with the downloader selected by
+// req.GetDownloaderID().
+//
+// SurfID uses the shared Surf downloader, which is created on first use.
+// Every other ID has no downloader in this file; instead of silently
+// returning a nil response together with a nil error, which made callers
+// dereference a nil *http.Response, http.ErrNotSupported is returned.
+func Download(req Request) (resp *http.Response, err error) {
+	switch req.GetDownloaderID() {
+	case SurfID:
+		once_surf.Do(func() { surf = New() })
+		return surf.Download(req)
+	default:
+		return nil, http.ErrNotSupported
+	}
+}
+
+// Surfer represents a download core of an HTTP web browser for crawlers.
+type Surfer interface {
+	// Download performs the given Request and returns the HTTP response.
+	// The notes below come from the original source and list what each
+	// method takes:
+	// GET @param url string, header http.Header, cookies []*http.Cookie
+	// HEAD @param url string, header http.Header, cookies []*http.Cookie
+	// POST PostForm @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
+	// POST-M PostMultipart @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
+	Download(Request) (resp *http.Response, err error)
+}

+ 94 - 0
common/src/github.com/surfer/util/util.go

@@ -0,0 +1,94 @@
+// Package util contains some utility methods used by other packages.
+package util
+
+import (
+	"fmt"
+	"hash/crc32"
+	"log"
+	"net/url"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// UrlEncode parses urlStr and re-encodes its query string.
+//
+// On success it returns the parsed URL with RawQuery replaced by the encoded
+// form of its query values. When urlStr cannot be parsed it returns nil and
+// the parse error; the parsed value is nil in that case and must not be
+// dereferenced, which previously caused a panic.
+func UrlEncode(urlStr string) (*url.URL, error) {
+	urlObj, err := url.Parse(urlStr)
+	if err != nil {
+		return nil, err
+	}
+	urlObj.RawQuery = urlObj.Query().Encode()
+	return urlObj, nil
+}
+
+// 制作特征值
+func MakeHash(s string) string {
+	const IEEE = 0xedb88320
+	var IEEETable = crc32.MakeTable(IEEE)
+	hash := fmt.Sprintf("%x", crc32.Checksum([]byte(s), IEEETable))
+	return hash
+}
+
+// GetWDPath returns the work directory path, which is the value of the
+// GOPATH environment variable. It panics when GOPATH is empty or not set.
+func GetWDPath() string {
+	if gopath := os.Getenv("GOPATH"); gopath != "" {
+		return gopath
+	}
+	panic("GOPATH is not setted in env.")
+}
+
+// IsDirExists reports whether path is a directory. When path cannot be
+// stat'ed, the result is os.IsExist applied to the stat error.
+func IsDirExists(path string) bool {
+	info, err := os.Stat(path)
+	if err != nil {
+		return os.IsExist(err)
+	}
+	return info.IsDir()
+}
+
+// IsFileExists reports whether path exists and is not a directory. When path
+// cannot be stat'ed, the result is os.IsExist applied to the stat error.
+func IsFileExists(path string) bool {
+	info, err := os.Stat(path)
+	if err != nil {
+		return os.IsExist(err)
+	}
+	return !info.IsDir()
+}
+
+// WalkDir walks the tree rooted at targpath (made absolute first) and
+// returns the paths of the directories it finds; regular files are skipped.
+//
+// Without suffixes every directory is returned. With suffixes a directory is
+// appended once for each suffix its path ends with, so it can appear more
+// than once. When the walk fails, the error is logged and the directories
+// collected so far are returned.
+func WalkDir(targpath string, suffixes ...string) (dirlist []string) {
+	if !filepath.IsAbs(targpath) {
+		targpath, _ = filepath.Abs(targpath)
+	}
+
+	visit := func(retpath string, f os.FileInfo, err error) error {
+		switch {
+		case err != nil:
+			return err
+		case !f.IsDir():
+			return nil
+		case len(suffixes) == 0:
+			dirlist = append(dirlist, retpath)
+			return nil
+		}
+		for _, suffix := range suffixes {
+			if strings.HasSuffix(retpath, suffix) {
+				dirlist = append(dirlist, retpath)
+			}
+		}
+		return nil
+	}
+
+	if err := filepath.Walk(targpath, visit); err != nil {
+		log.Printf("utils.WalkDir: %v\n", err)
+	}
+	return dirlist
+}