123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398 |
- package main
import (
	"encoding/json"
	"fmt"
	"log"
	"sync"
	"time"

	util "spider_chromedp/chromedp/mfw"
)
- type ChromedpParam struct {
- RunRedis bool `json:"runredis"` //是否redis判重
- TimeOut int64 `json:"timeout"` //超时时间
- OtherTimeOut int64 `json:"othertimeout"` //其他超时时间
- Actions []Actions `json:"actions"` //动作集
- OtherActions []Actions `json:"otheractions"` //其他动作集
- }
// Actions describes a single step of a download script.
type Actions struct {
	// Action is the name of the step to execute (e.g. "navigate", "click").
	Action string `json:"action"`

	// Param is the argument of the step; callers in this file pass selector
	// expressions, URLs, script text or integers.
	Param interface{} `json:"param"`

	// Selector names the selector kind used to interpret Param
	// (e.g. "ByID", "ByQuery"); empty when the step needs none.
	Selector string `json:"selector"`
}
// DynamicIPMap is one entry of the downloader-node table: the node's code
// and the Unix time (seconds) after which the entry is considered expired.
type DynamicIPMap struct {
	// Code identifies the node; it is also used as the table key.
	Code string

	// InvalidTime is the expiry instant as Unix seconds.
	InvalidTime int64
}
- var TimeChan = make(chan bool, 1)
- var Alldownloader map[string]DynamicIPMap = make(map[string]DynamicIPMap)
- var Msclient *util.Client
- func main() {
- InitMsgClient("127.0.0.1:801", "123")
- go Download6()
- //go Download5()
- //go Download1()
- //go Download1()
- //go Download2()
- //go Download3()
- //go Download4() //顺序采集
- ch := make(chan bool)
- <-ch
- }
- func Download6() {
- msgid := util.UUID(8)
- param := ChromedpParam{
- TimeOut: 60,
- Actions: []Actions{
- Actions{
- Action: "navigate",
- Param: "http://www.tnmg.com.cn/information/info_zxzb.aspx?classid=826&classname=%e8%af%a2%e4%bb%b7%e5%87%bd",
- Selector: "",
- },
- Actions{
- Action: "waitready",
- Param: "#GridView1_KXPortal_Pager1_btnNext",
- Selector: "ByID",
- },
- Actions{
- Action: "click",
- Param: `#GridView1_KXPortal_Pager1_btnNext`,
- Selector: "ByID",
- },
- Actions{
- Action: "wait",
- Param: 2,
- },
- Actions{
- Action: "waitready",
- Param: `#GridView1`,
- Selector: "ByID",
- },
- Actions{
- Action: "outerhtml",
- Param: `#GridView1`,
- Selector: "ByID",
- },
- },
- }
- ret, err := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
- tmp := []string{}
- json.Unmarshal(ret, &tmp)
- fmt.Println(err, tmp)
- }
- func Download5() {
- msgid := util.UUID(8)
- param := ChromedpParam{
- TimeOut: 60,
- Actions: []Actions{
- Actions{
- Action: "navigate",
- Param: "https://zbgl.ahmu.edu.cn/sfw_cms/e?page=cms.detail&cid=49832&nextcid=4181&aid=9652",
- Selector: "",
- },
- Actions{
- Action: "waitvisible",
- Param: "#main > div > div > div.contant > div > div.msbox > div > iframe",
- Selector: "ByQuery",
- },
- Actions{
- Action: "evaluate",
- //Param: "document.querySelector('iframe').contentDocument.body.children/[0/].contentWindow.document.body.outerHTML",
- Param: `document.querySelector("#main > div > div > div.contant > div > div.msbox > div > iframe").contentDocument.body.outerHTML;`,
- Selector: "",
- },
- },
- }
- ret, err := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
- tmp := []string{}
- json.Unmarshal(ret, &tmp)
- fmt.Println(err, tmp)
- }
- func Download() {
- msgid := util.UUID(8)
- param := ChromedpParam{
- TimeOut: 60,
- OtherTimeOut: 30,
- Actions: []Actions{
- Actions{
- Action: "navigate",
- Param: "https://neep.shop/html/portal/notice.html?type=rfqAnno&nodeurl=callback_list_enquiry_anno¬iceMoreUrl=https://gd-prod.oss-cn-beijing.aliyuncs.com/upload/cms/column/inquireListFive/index.html&pageTag=undefined&menu_code=&parent_menu_code=&root_menu_code=&tdsourcetag=s_pcqq_aiomsg",
- Selector: "",
- },
- Actions{
- Action: "waitready",
- Param: "#table > tbody > tr",
- Selector: "ByQuery",
- },
- Actions{
- Action: "listhref",
- Param: "#table > tbody > tr > td:nth-child(3) > a",
- Selector: "ByQuery",
- },
- Actions{
- Action: "listhtml",
- Param: 0,
- Selector: "",
- },
- },
- OtherActions: []Actions{
- Actions{
- Action: "waitready",
- Param: "#root > div.container.details-page > div.details-content > div",
- Selector: "ByQuery",
- },
- //Actions{
- // Action: "wait",
- // Param: 5,
- // Selector: "ByQuery",
- //},
- Actions{
- Action: "outerhtml",
- Param: "#root > div.container.details-page > div.details-content",
- Selector: "ByQuery",
- },
- },
- }
- ret, _ := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
- result := []string{}
- json.Unmarshal(ret, &result)
- for _, r := range result {
- log.Println(r)
- log.Println("==================================================================================================")
- }
- }
- func Download4() {
- msgid := util.UUID(8)
- param := ChromedpParam{
- TimeOut: 60,
- OtherTimeOut: 30,
- Actions: []Actions{
- Actions{
- Action: "navigate",
- Param: "http://www.ltcost.com/news/zaojiaxiehui/list_27_1.html",
- Selector: "",
- },
- Actions{
- Action: "waitready",
- Param: "#root > div.introduction > div.container.clearfix > div.right-content.fl > div > ul > li",
- Selector: "ByQuery",
- },
- Actions{
- Action: "listhref",
- Param: "#root > div.introduction > div.container.clearfix > div.right-content.fl > div > ul > li > h4 > a",
- Selector: "ByQuery",
- },
- Actions{
- Action: "listhtml",
- Param: 0,
- Selector: "",
- },
- },
- OtherActions: []Actions{
- Actions{
- Action: "waitready",
- Param: "#root > div.container.details-page > div.details-content > div",
- Selector: "ByQuery",
- },
- //Actions{
- // Action: "wait",
- // Param: 5,
- // Selector: "ByQuery",
- //},
- Actions{
- Action: "outerhtml",
- Param: "#root > div.container.details-page > div.details-content",
- Selector: "ByQuery",
- },
- },
- }
- ret, _ := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
- result := []string{}
- json.Unmarshal(ret, &result)
- for _, r := range result {
- log.Println(r)
- log.Println("==================================================================================================")
- }
- }
- func Download1() {
- msgid := util.UUID(8)
- param := ChromedpParam{
- TimeOut: 60,
- Actions: []Actions{
- //Actions{
- // Action: "changeip",
- // Param: "",
- // Selector: "",
- //},
- Actions{
- Action: "navigate",
- Param: "https://www.sprtc.com/index/qrtwo.htm?id=c1d01625213f11ee95a2d7772ab577a8",
- Selector: "",
- },
- Actions{
- Action: "waitready",
- Param: "#iframe",
- Selector: "ByQuery",
- },
- Actions{
- Action: "wait",
- Param: 5,
- Selector: "",
- },
- Actions{
- Action: "evaluate",
- //Param: "document.querySelector('iframe').contentDocument.body.children/[0/].contentWindow.document.body.outerHTML",
- Param: `document.querySelector('iframe').contentDocument.body.children[0].contentWindow.document.body.outerHTML`,
- Selector: "",
- },
- },
- }
- ret, err := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
- tmp := []string{}
- json.Unmarshal(ret, &tmp)
- fmt.Println(err, tmp)
- }
- func Download2() {
- msgid := util.UUID(8)
- param := ChromedpParam{
- TimeOut: 30,
- Actions: []Actions{
- //Actions{
- // Action: "navigate",
- // Param: "https://web.uutool.cn",
- // //Param: "http://www.baidu.com",
- // Selector: "",
- //},
- //Actions{
- // Action: "wait",
- // Param: 5,
- // Selector: "",
- //},
- //Actions{
- // Action: "changeip",
- // Param: "",
- // Selector: "",
- //},
- //Actions{
- // Action: "navigate",
- // Param: "https://web.uutool.cn",
- // //Param: "http://www.baidu.com",
- // Selector: "",
- //},
- //Actions{
- // Action: "wait",
- // Param: 5,
- // Selector: "",
- //},
- },
- }
- for i := 1; i <= 50; i++ {
- param.Actions = append(param.Actions, Actions{
- Action: "changeip",
- Param: "",
- Selector: "",
- })
- param.Actions = append(param.Actions, Actions{
- Action: "navigate",
- Param: "https://web.uutool.cn",
- //Param: "http://www.baidu.com",
- Selector: "",
- })
- param.Actions = append(param.Actions, Actions{
- Action: "wait",
- Param: 5,
- Selector: "",
- })
- }
- ret, err := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
- tmp := map[string]interface{}{}
- json.Unmarshal(ret, &tmp)
- fmt.Println(err, tmp)
- }
- func Download3() {
- time.Sleep(5 * time.Second)
- msgid := util.UUID(8)
- param := ChromedpParam{
- TimeOut: 600,
- Actions: []Actions{
- Actions{
- Action: "navigate",
- Param: "http://www.baidu.com",
- Selector: "",
- },
- },
- }
- ret, err := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
- tmp := map[string]interface{}{}
- json.Unmarshal(ret, &tmp)
- fmt.Println(err, tmp)
- }
- //初始化,启动消息客户端
- func InitMsgClient(serveraddr, name string) {
- Msclient, _ = util.NewClient(&util.ClientConfig{ClientName: name,
- MsgServerAddr: serveraddr,
- EventHandler: processevent,
- OnRequestConnect: func() {
- log.Println("重连", serveraddr, name)
- },
- OnConnectSuccess: func() {
- log.Println("重连成功")
- },
- CanHandleEvents: []int{util.SERVICE_DOWNLOAD_APPEND_NODE, util.SERVICE_DOWNLOAD_DELETE_NODE},
- ReadBufferSize: 500,
- WriteBufferSize: 500,
- })
- go gc4Alldownloader()
- }
- func processevent(p *util.Packet) {
- var data []byte
- switch p.Event {
- case util.SERVICE_DOWNLOAD_APPEND_NODE:
- data = p.GetBusinessData()
- //log.Println("获取动态地址:", len(data), string(data))
- for i := 0; i < len(data)/8; i++ {
- code := string(data[i*8 : (i+1)*8])
- Alldownloader[code] = DynamicIPMap{
- Code: code,
- InvalidTime: time.Now().Unix() + 60*10,
- }
- }
- case util.SERVICE_DOWNLOAD_DELETE_NODE:
- data = p.GetBusinessData()
- //log.Println("删除动态地址:", len(data), string(data))
- for i := 0; i < len(data)/8; i++ {
- code := string(data[i*8 : (i+1)*8])
- delete(Alldownloader, code)
- }
- }
- }
- func gc4Alldownloader() {
- n := time.Now().Unix()
- for _, v := range Alldownloader {
- if v.InvalidTime < n {
- delete(Alldownloader, v.Code)
- }
- }
- TimeAfterFunc(1*time.Minute, gc4Alldownloader, TimeChan)
- }
// TimeAfterFunc schedules f to run once after td.
//
// Before scheduling, it pushes a token into ch, holds it for 10 milliseconds
// and takes it back out. With a one-slot buffered channel, concurrent callers
// are therefore serialised through this short section. The call returns as
// soon as the timer is armed; f runs later on the timer's goroutine.
func TimeAfterFunc(td time.Duration, f func(), ch chan bool) {
	const holdTime = 10 * time.Millisecond

	ch <- true // occupy the slot
	time.Sleep(holdTime)
	<-ch // free the slot

	time.AfterFunc(td, f)
}
|