main.go 10.0 KB


  1. package main
import (
	"encoding/json"
	"fmt"
	"log"
	"sync"
	"time"

	util "spider_chromedp/chromedp/mfw"
)
  9. type ChromedpParam struct {
  10. RunRedis bool `json:"runredis"` //是否redis判重
  11. TimeOut int64 `json:"timeout"` //超时时间
  12. OtherTimeOut int64 `json:"othertimeout"` //其他超时时间
  13. Actions []Actions `json:"actions"` //动作集
  14. OtherActions []Actions `json:"otheractions"` //其他动作集
  15. }
  16. type Actions struct {
  17. Action string `json:"action"` //执行动作
  18. Param interface{} `json:"param"` //选择器语句
  19. Selector string `json:"selector"` //选择器Selectors
  20. }
  21. type DynamicIPMap struct {
  22. Code string
  23. InvalidTime int64
  24. }
  25. var TimeChan = make(chan bool, 1)
  26. var Alldownloader map[string]DynamicIPMap = make(map[string]DynamicIPMap)
  27. var Msclient *util.Client
  28. func main() {
  29. InitMsgClient("127.0.0.1:801", "123")
  30. go Download6()
  31. //go Download5()
  32. //go Download1()
  33. //go Download1()
  34. //go Download2()
  35. //go Download3()
  36. //go Download4() //顺序采集
  37. ch := make(chan bool)
  38. <-ch
  39. }
  40. func Download6() {
  41. msgid := util.UUID(8)
  42. param := ChromedpParam{
  43. TimeOut: 60,
  44. Actions: []Actions{
  45. Actions{
  46. Action: "navigate",
  47. Param: "http://www.tnmg.com.cn/information/info_zxzb.aspx?classid=826&classname=%e8%af%a2%e4%bb%b7%e5%87%bd",
  48. Selector: "",
  49. },
  50. Actions{
  51. Action: "waitready",
  52. Param: "#GridView1_KXPortal_Pager1_btnNext",
  53. Selector: "ByID",
  54. },
  55. Actions{
  56. Action: "click",
  57. Param: `#GridView1_KXPortal_Pager1_btnNext`,
  58. Selector: "ByID",
  59. },
  60. Actions{
  61. Action: "wait",
  62. Param: 2,
  63. },
  64. Actions{
  65. Action: "waitready",
  66. Param: `#GridView1`,
  67. Selector: "ByID",
  68. },
  69. Actions{
  70. Action: "outerhtml",
  71. Param: `#GridView1`,
  72. Selector: "ByID",
  73. },
  74. },
  75. }
  76. ret, err := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
  77. tmp := []string{}
  78. json.Unmarshal(ret, &tmp)
  79. fmt.Println(err, tmp)
  80. }
  81. func Download5() {
  82. msgid := util.UUID(8)
  83. param := ChromedpParam{
  84. TimeOut: 60,
  85. Actions: []Actions{
  86. Actions{
  87. Action: "navigate",
  88. Param: "https://zbgl.ahmu.edu.cn/sfw_cms/e?page=cms.detail&cid=49832&nextcid=4181&aid=9652",
  89. Selector: "",
  90. },
  91. Actions{
  92. Action: "waitvisible",
  93. Param: "#main > div > div > div.contant > div > div.msbox > div > iframe",
  94. Selector: "ByQuery",
  95. },
  96. Actions{
  97. Action: "evaluate",
  98. //Param: "document.querySelector('iframe').contentDocument.body.children/[0/].contentWindow.document.body.outerHTML",
  99. Param: `document.querySelector("#main > div > div > div.contant > div > div.msbox > div > iframe").contentDocument.body.outerHTML;`,
  100. Selector: "",
  101. },
  102. },
  103. }
  104. ret, err := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
  105. tmp := []string{}
  106. json.Unmarshal(ret, &tmp)
  107. fmt.Println(err, tmp)
  108. }
  109. func Download() {
  110. msgid := util.UUID(8)
  111. param := ChromedpParam{
  112. TimeOut: 60,
  113. OtherTimeOut: 30,
  114. Actions: []Actions{
  115. Actions{
  116. Action: "navigate",
  117. Param: "https://neep.shop/html/portal/notice.html?type=rfqAnno&nodeurl=callback_list_enquiry_anno&noticeMoreUrl=https://gd-prod.oss-cn-beijing.aliyuncs.com/upload/cms/column/inquireListFive/index.html&pageTag=undefined&menu_code=&parent_menu_code=&root_menu_code=&tdsourcetag=s_pcqq_aiomsg",
  118. Selector: "",
  119. },
  120. Actions{
  121. Action: "waitready",
  122. Param: "#table > tbody > tr",
  123. Selector: "ByQuery",
  124. },
  125. Actions{
  126. Action: "listhref",
  127. Param: "#table > tbody > tr > td:nth-child(3) > a",
  128. Selector: "ByQuery",
  129. },
  130. Actions{
  131. Action: "listhtml",
  132. Param: 0,
  133. Selector: "",
  134. },
  135. },
  136. OtherActions: []Actions{
  137. Actions{
  138. Action: "waitready",
  139. Param: "#root > div.container.details-page > div.details-content > div",
  140. Selector: "ByQuery",
  141. },
  142. //Actions{
  143. // Action: "wait",
  144. // Param: 5,
  145. // Selector: "ByQuery",
  146. //},
  147. Actions{
  148. Action: "outerhtml",
  149. Param: "#root > div.container.details-page > div.details-content",
  150. Selector: "ByQuery",
  151. },
  152. },
  153. }
  154. ret, _ := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
  155. result := []string{}
  156. json.Unmarshal(ret, &result)
  157. for _, r := range result {
  158. log.Println(r)
  159. log.Println("==================================================================================================")
  160. }
  161. }
  162. func Download4() {
  163. msgid := util.UUID(8)
  164. param := ChromedpParam{
  165. TimeOut: 60,
  166. OtherTimeOut: 30,
  167. Actions: []Actions{
  168. Actions{
  169. Action: "navigate",
  170. Param: "http://www.ltcost.com/news/zaojiaxiehui/list_27_1.html",
  171. Selector: "",
  172. },
  173. Actions{
  174. Action: "waitready",
  175. Param: "#root > div.introduction > div.container.clearfix > div.right-content.fl > div > ul > li",
  176. Selector: "ByQuery",
  177. },
  178. Actions{
  179. Action: "listhref",
  180. Param: "#root > div.introduction > div.container.clearfix > div.right-content.fl > div > ul > li > h4 > a",
  181. Selector: "ByQuery",
  182. },
  183. Actions{
  184. Action: "listhtml",
  185. Param: 0,
  186. Selector: "",
  187. },
  188. },
  189. OtherActions: []Actions{
  190. Actions{
  191. Action: "waitready",
  192. Param: "#root > div.container.details-page > div.details-content > div",
  193. Selector: "ByQuery",
  194. },
  195. //Actions{
  196. // Action: "wait",
  197. // Param: 5,
  198. // Selector: "ByQuery",
  199. //},
  200. Actions{
  201. Action: "outerhtml",
  202. Param: "#root > div.container.details-page > div.details-content",
  203. Selector: "ByQuery",
  204. },
  205. },
  206. }
  207. ret, _ := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
  208. result := []string{}
  209. json.Unmarshal(ret, &result)
  210. for _, r := range result {
  211. log.Println(r)
  212. log.Println("==================================================================================================")
  213. }
  214. }
  215. func Download1() {
  216. msgid := util.UUID(8)
  217. param := ChromedpParam{
  218. TimeOut: 60,
  219. Actions: []Actions{
  220. //Actions{
  221. // Action: "changeip",
  222. // Param: "",
  223. // Selector: "",
  224. //},
  225. Actions{
  226. Action: "navigate",
  227. Param: "https://www.sprtc.com/index/qrtwo.htm?id=c1d01625213f11ee95a2d7772ab577a8",
  228. Selector: "",
  229. },
  230. Actions{
  231. Action: "waitready",
  232. Param: "#iframe",
  233. Selector: "ByQuery",
  234. },
  235. Actions{
  236. Action: "wait",
  237. Param: 5,
  238. Selector: "",
  239. },
  240. Actions{
  241. Action: "evaluate",
  242. //Param: "document.querySelector('iframe').contentDocument.body.children/[0/].contentWindow.document.body.outerHTML",
  243. Param: `document.querySelector('iframe').contentDocument.body.children[0].contentWindow.document.body.outerHTML`,
  244. Selector: "",
  245. },
  246. },
  247. }
  248. ret, err := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
  249. tmp := []string{}
  250. json.Unmarshal(ret, &tmp)
  251. fmt.Println(err, tmp)
  252. }
  253. func Download2() {
  254. msgid := util.UUID(8)
  255. param := ChromedpParam{
  256. TimeOut: 30,
  257. Actions: []Actions{
  258. //Actions{
  259. // Action: "navigate",
  260. // Param: "https://web.uutool.cn",
  261. // //Param: "http://www.baidu.com",
  262. // Selector: "",
  263. //},
  264. //Actions{
  265. // Action: "wait",
  266. // Param: 5,
  267. // Selector: "",
  268. //},
  269. //Actions{
  270. // Action: "changeip",
  271. // Param: "",
  272. // Selector: "",
  273. //},
  274. //Actions{
  275. // Action: "navigate",
  276. // Param: "https://web.uutool.cn",
  277. // //Param: "http://www.baidu.com",
  278. // Selector: "",
  279. //},
  280. //Actions{
  281. // Action: "wait",
  282. // Param: 5,
  283. // Selector: "",
  284. //},
  285. },
  286. }
  287. for i := 1; i <= 50; i++ {
  288. param.Actions = append(param.Actions, Actions{
  289. Action: "changeip",
  290. Param: "",
  291. Selector: "",
  292. })
  293. param.Actions = append(param.Actions, Actions{
  294. Action: "navigate",
  295. Param: "https://web.uutool.cn",
  296. //Param: "http://www.baidu.com",
  297. Selector: "",
  298. })
  299. param.Actions = append(param.Actions, Actions{
  300. Action: "wait",
  301. Param: 5,
  302. Selector: "",
  303. })
  304. }
  305. ret, err := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
  306. tmp := map[string]interface{}{}
  307. json.Unmarshal(ret, &tmp)
  308. fmt.Println(err, tmp)
  309. }
  310. func Download3() {
  311. time.Sleep(5 * time.Second)
  312. msgid := util.UUID(8)
  313. param := ChromedpParam{
  314. TimeOut: 600,
  315. Actions: []Actions{
  316. Actions{
  317. Action: "navigate",
  318. Param: "http://www.baidu.com",
  319. Selector: "",
  320. },
  321. },
  322. }
  323. ret, err := Msclient.Call("", msgid, util.SERVICE_DOWNLOAD, util.SENDTO_TYPE_RAND_RECIVER, param, 300)
  324. tmp := map[string]interface{}{}
  325. json.Unmarshal(ret, &tmp)
  326. fmt.Println(err, tmp)
  327. }
  328. //初始化,启动消息客户端
  329. func InitMsgClient(serveraddr, name string) {
  330. Msclient, _ = util.NewClient(&util.ClientConfig{ClientName: name,
  331. MsgServerAddr: serveraddr,
  332. EventHandler: processevent,
  333. OnRequestConnect: func() {
  334. log.Println("重连", serveraddr, name)
  335. },
  336. OnConnectSuccess: func() {
  337. log.Println("重连成功")
  338. },
  339. CanHandleEvents: []int{util.SERVICE_DOWNLOAD_APPEND_NODE, util.SERVICE_DOWNLOAD_DELETE_NODE},
  340. ReadBufferSize: 500,
  341. WriteBufferSize: 500,
  342. })
  343. go gc4Alldownloader()
  344. }
  345. func processevent(p *util.Packet) {
  346. var data []byte
  347. switch p.Event {
  348. case util.SERVICE_DOWNLOAD_APPEND_NODE:
  349. data = p.GetBusinessData()
  350. //log.Println("获取动态地址:", len(data), string(data))
  351. for i := 0; i < len(data)/8; i++ {
  352. code := string(data[i*8 : (i+1)*8])
  353. Alldownloader[code] = DynamicIPMap{
  354. Code: code,
  355. InvalidTime: time.Now().Unix() + 60*10,
  356. }
  357. }
  358. case util.SERVICE_DOWNLOAD_DELETE_NODE:
  359. data = p.GetBusinessData()
  360. //log.Println("删除动态地址:", len(data), string(data))
  361. for i := 0; i < len(data)/8; i++ {
  362. code := string(data[i*8 : (i+1)*8])
  363. delete(Alldownloader, code)
  364. }
  365. }
  366. }
  367. func gc4Alldownloader() {
  368. n := time.Now().Unix()
  369. for _, v := range Alldownloader {
  370. if v.InvalidTime < n {
  371. delete(Alldownloader, v.Code)
  372. }
  373. }
  374. TimeAfterFunc(1*time.Minute, gc4Alldownloader, TimeChan)
  375. }
  376. func TimeAfterFunc(td time.Duration, f func(), ch chan bool) {
  377. ch <- true
  378. time.Sleep(10 * time.Millisecond)
  379. <-ch
  380. time.AfterFunc(td, func() {
  381. f()
  382. })
  383. }