server.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. package main
  2. import (
  3. "context"
  4. "fmt"
  5. "github.com/chromedp/cdproto/cdp"
  6. "github.com/chromedp/chromedp"
  7. "github.com/donnie4w/go-logger/logger"
  8. "time"
  9. )
  // Action names accepted in Actions.Action. AssembleTasks switches on these
  // values to decide which chromedp action to build for each step.
  const (
  	ActionTypeNavigate    string = "navigate"    // open the URL given in Param
  	ActionTypeClick       string = "click"       // click the element matched by Param
  	ActionTypeOuterHTML   string = "outerhtml"   // return the HTML of the matched element
  	ActionTypeEvaluate    string = "evaluate"    // execute the JavaScript given in Param
  	ActionTypeWaitReady   string = "waitready"   // wait until the element has finished loading
  	ActionTypeWaitVisible string = "waitvisible" // wait until the element is visible
  	ActionTypeSleep       string = "wait"        // pause (sleep) before the next step
  	ActionTypeNodes       string = "listhref"    // match all <a> link nodes
  	ActionTypeList        string = "listhtml"    // match the list-page information to keep for each node
  	ActionTypeChangeIp    string = "changeip"    // switch IP
  	//ActionTypeInput string = "input"
  	//ActionTypeScroll string = "scroll"
  	//ActionTypeAssert string = "assert"
  	//ActionTypeClose string = "close"

  	// GetListHtmlJS is a fmt template for a JavaScript snippet with two %s
  	// verbs. The first is appended directly after "node" (AssembleTasks fills
  	// it with zero or more ".parentNode" hops); the second is the XPath
  	// passed to document.evaluate. The snippet evaluates to the outerHTML of
  	// the resolved node, or '' when the XPath matches nothing.
  	GetListHtmlJS string = `
function getParentHTML(xpath) {
let result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
let node = result.singleNodeValue;
return node ? node%s.outerHTML : '';
}
getParentHTML("%s");`
  )
  // Selectors maps the selector names accepted in Actions.Selector to chromedp
  // query options. Looking up a name that is not registered here (including
  // the empty string) yields a nil chromedp.QueryOption.
  var (
  	Selectors = map[string]chromedp.QueryOption{
  		//"": chromedp.BySearch, // default value
  		"ByQuery":        chromedp.ByQuery,        // select following document.querySelector rules; returns a single node
  		"ByID":           chromedp.ByID,           // select by element id only
  		"ByJsPath":       chromedp.ByJSPath,       // select by evaluating a JS path expression
  		"BySearch":       chromedp.BySearch,       // what chromedp uses when no selector option is given; selects via the DOM search (performSearch)
  		"ByQueryAll":     chromedp.ByQueryAll,     // select following document.querySelectorAll; returns every matching node
  		"ByNodeID":       chromedp.ByNodeID,       // retrieve specific nodes (node IDs must already be assigned); must be used with []cdp.NodeID
  		"NodeNotPresent": chromedp.NodeNotPresent, // wait until no element matches the query
  		"NodeNotVisible": chromedp.NodeNotVisible, // wait until all queried nodes have been sent by the browser and are not visible
  		"NodeVisible":    chromedp.NodeVisible,    // wait until all queried nodes have been sent by the browser and are visible
  		"NodeReady":      chromedp.NodeReady,      // wait until the browser has sent all queried nodes
  		"NodeSelected":   chromedp.NodeSelected,   // wait until all queried nodes have been sent and are selected (have the "selected" attribute)
  	}
  )
  // ChromedpTask describes one download job: the actions to run on the list
  // page and, for sequential collection, the actions to run on every detail
  // page opened from it.
  type ChromedpTask struct {
  	//Stype string `json:"stype"` // whether the request downloads a list page, a detail page, or list+detail (list, detail, list_detail)
  	Flow     bool      `json:"flow"`     // whether this is a sequential (list then detail) collection
  	RunRedis bool      `json:"runredis"` // whether to de-duplicate through Redis (sequential collection only)
  	TimeOut  int64     `json:"timeout"`  // timeout handed to the browser runner for Actions
  	Actions  []Actions `json:"actions"`  // action list
  	// The fields below describe how detail pages are fetched during
  	// sequential collection.
  	OtherTimeOut int64     `json:"othertimeout"` // timeout for the detail-page actions
  	OtherActions []Actions `json:"otheractions"` // detail-page action list
  }
  // Actions is a single step of a ChromedpTask.
  type Actions struct {
  	Action   string      `json:"action"`   // action to perform (one of the ActionType* values)
  	Param    interface{} `json:"param"`    // selector expression, or another action-specific argument (URL, JavaScript, a number)
  	Selector string      `json:"selector"` // name of the query option, a key of Selectors
  }
  65. //执行动作流程,下载html
  66. func DownloadHtmlByChromedp(task *ChromedpTask) (result []string) {
  67. defer Catch()
  68. if len(task.Actions) == 0 {
  69. return
  70. }
  71. //获取一个浏览器实例
  72. browser := <-BrowserGroup
  73. defer func() {
  74. BrowserGroup <- browser.Revert()
  75. }()
  76. //封装浏览器动作
  77. acts, _ := task.AssembleTasks(task.Actions, &result, nil, browser)
  78. //执行动作,下载
  79. browser.RunWithTimeout(func(ctx context.Context, exit chan<- bool) {
  80. defer func() {
  81. exit <- true
  82. }()
  83. ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端
  84. err := chromedp.Run(ctx, acts...)
  85. if err != nil {
  86. logger.Info("Chromedp Run Task Error :", err)
  87. //if strings.Contains(err.Error(), "ERR_PROXY_CONNECTION_FAILED") { //代理异常,切换IP
  88. browser.ChangeIp()
  89. //}
  90. }
  91. }, task.TimeOut)
  92. return
  93. }
  94. //执行动作流程,下载html
  95. func DownloadHtmlByChromedpForFlow(task *ChromedpTask) (result []string) {
  96. defer Catch()
  97. if len(task.Actions) == 0 || len(task.OtherActions) == 0 {
  98. return
  99. }
  100. if task.TimeOut == 0 {
  101. task.TimeOut = Int64All(ChromedpConfig["timeout"])
  102. }
  103. //获取一个浏览器实例
  104. browser := <-BrowserGroup
  105. defer func() {
  106. BrowserGroup <- browser.Revert()
  107. }()
  108. result = []string{} //记录动作集返回的html
  109. resultMap := map[int64]string{} //记录列表页nodes信息,用于匹配详情页tab
  110. //封装浏览器动作
  111. acts, _ := task.AssembleTasks(task.Actions, &result, resultMap, browser)
  112. //执行动作,下载
  113. browser.RunWithTimeout(func(ctx context.Context, exit chan<- bool) {
  114. defer func() {
  115. exit <- true
  116. }()
  117. ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端
  118. //执行动作
  119. err := chromedp.Run(ctx, acts...)
  120. if err != nil {
  121. logger.Info("Chromedp Run Task Error :", err)
  122. //if strings.Contains(err.Error(), "ERR_PROXY_CONNECTION_FAILED") { //代理异常
  123. browser.ChangeIp()
  124. //}
  125. }
  126. }, task.TimeOut)
  127. return
  128. }
  129. //执行动作流程,下载html
  130. //func DownloadHtmlByChromedpForFlow_back(task *ChromedpTask) (ret []string) {
  131. // defer util.Catch()
  132. // if len(task.Actions) == 0 || len(task.OtherActions) == 0 {
  133. // return
  134. // }
  135. // //获取一个浏览器实例
  136. // browser := <-BrowserGroup
  137. // defer func() {
  138. // BrowserGroup <- browser
  139. // }()
  140. // result := &[2]*[]string{} //记录动作集返回的html
  141. // resultMap := map[int64]string{} //记录列表页nodes信息,用于匹配详情页tab
  142. // //封装浏览器动作
  143. // acts, oacts, _ := AssembleTasks(task, result, resultMap, browser)
  144. // //执行动作,下载
  145. // fmt.Println("Running...")
  146. // browser.RunWithTimeout(func(ctx context.Context, exit chan<- bool) {
  147. // //defer func() {
  148. // // exit <- true
  149. // //}()
  150. // ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端
  151. // //执行列表页动作
  152. // err := chromedp.Run(ctx, acts...)
  153. // if err != nil {
  154. // fmt.Println("Chromedp Run Error :", err)
  155. // }
  156. // //获取详情页tab信息
  157. // targets, _ := chromedp.Targets(ctx) //targets无序不能一一对应nodes顺序
  158. // for _, target := range targets {
  159. // if target.OpenerFrameID == "" { //判断是否是子标签页
  160. // continue
  161. // }
  162. // fmt.Println("target id:", target.TargetID, target.URL, target.OpenerFrameID, target.OpenerID, target.BrowserContextID)
  163. // continue
  164. // newCtx, _ := chromedp.NewContext(ctx, chromedp.WithTargetID(target.TargetID)) //新标签页ctx
  165. // fmt.Println("11111111111111111", target.URL)
  166. // fmt.Println("==============================================================================================")
  167. // continue
  168. // err := chromedp.Run(newCtx, oacts...)
  169. // if err != nil {
  170. // fmt.Println("Chromedp Run Son Tab Error :", err)
  171. // }
  172. // }
  173. // }, task.TimeOut)
  174. // fmt.Println("Finish...")
  175. // return
  176. //}
  177. //生成动作集
  178. func (task *ChromedpTask) AssembleTasks(actions []Actions, result *[]string, resultMap map[int64]string, browser *Browser) (acts []chromedp.Action, nodes *[]*cdp.Node) {
  179. defer Catch()
  180. nodes = &[]*cdp.Node{}
  181. for _, act := range actions {
  182. switch act.Action {
  183. case ActionTypeNavigate: //打开网页
  184. acts = append(acts, chromedp.Navigate(fmt.Sprint(act.Param)))
  185. case ActionTypeClick: //点击
  186. acts = append(acts, chromedp.Click(fmt.Sprint(act.Param), Selectors[act.Selector]))
  187. case ActionTypeOuterHTML: //输出html
  188. acts = append(acts, OuterHTMLFunc(fmt.Sprint(act.Param), act.Selector, result))
  189. case ActionTypeEvaluate: //执行javascript
  190. acts = append(acts, EvaluateFunc(fmt.Sprint(act.Param), result))
  191. case ActionTypeWaitReady: //等待元素加载完毕
  192. acts = append(acts, chromedp.WaitReady(fmt.Sprint(act.Param), Selectors[act.Selector]))
  193. case ActionTypeWaitVisible: //等待元素可见
  194. acts = append(acts, chromedp.WaitVisible(fmt.Sprint(act.Param), Selectors[act.Selector]))
  195. case ActionTypeNodes: //匹配节点元素
  196. acts = append(acts, chromedp.Nodes(fmt.Sprint(act.Param), nodes))
  197. case ActionTypeList: //匹配列表要保留的信息
  198. parentNodeJs := ""
  199. for i := 0; i < IntAll(act.Param); i++ {
  200. parentNodeJs += ".parentNode"
  201. }
  202. htmlJsFormat := fmt.Sprintf(GetListHtmlJS, parentNodeJs, "%s")
  203. acts = append(acts, task.ClickNodesAndGetHtml(browser, nodes, htmlJsFormat, result, resultMap))
  204. case ActionTypeSleep:
  205. acts = append(acts, CdpSleep(IntAll(act.Param)))
  206. case ActionTypeChangeIp:
  207. acts = append(acts, ChangeIp(browser))
  208. }
  209. }
  210. return
  211. }
  212. //切换IP
  213. func ChangeIp(b *Browser) chromedp.ActionFunc {
  214. defer Catch()
  215. return func(ctx context.Context) (err error) {
  216. b.ChangeIp()
  217. return
  218. }
  219. }
  220. func CdpSleep(sleep int) chromedp.Action {
  221. defer Catch()
  222. if sleep < 1 {
  223. sleep = 1
  224. }
  225. return chromedp.Sleep(time.Duration(sleep) * time.Second)
  226. }
  227. //OuterHTML获取html
  228. func OuterHTMLFunc(sel, selector string, result *[]string) chromedp.ActionFunc {
  229. defer Catch()
  230. return func(ctx context.Context) (err error) {
  231. var html string
  232. //chromedp.OuterHTML(sel, &html).Do(ctx)
  233. chromedp.OuterHTML(sel, &html, Selectors[selector]).Do(ctx)
  234. *result = append(*result, html)
  235. return
  236. }
  237. }
  238. //Evaluate获取js执行结果
  239. func EvaluateFunc(sel string, result *[]string) chromedp.ActionFunc {
  240. defer Catch()
  241. return func(ctx context.Context) (err error) {
  242. var res string
  243. chromedp.Evaluate(sel, &res).Do(ctx)
  244. *result = append(*result, res)
  245. return
  246. }
  247. }
  248. //点击所有nodes节点并返回html
  249. func (task *ChromedpTask) ClickNodesAndGetHtml(browser *Browser, nodes *[]*cdp.Node, jsFormat string, result *[]string, resultMap map[int64]string) chromedp.ActionFunc {
  250. defer Catch()
  251. return func(ctx context.Context) (err error) {
  252. detailResult := []string{} //详情页html信息
  253. targetIdMap := map[string]string{}
  254. oacts, _ := task.AssembleTasks(task.OtherActions, &detailResult, nil, browser) //组合下载详情页动作集
  255. for i, node := range *nodes {
  256. chromedp.MouseClickNode(node).Do(ctx) //点击
  257. var listHtml string
  258. xpath := node.FullXPath()
  259. js := fmt.Sprintf(jsFormat, xpath)
  260. chromedp.Evaluate(js, &listHtml).Do(ctx)
  261. if listHtml != "" {
  262. //resultMap[node.NodeID.Int64()] = "<listhtml>" + html + "</listhtml>"
  263. listHtml = "<listhtml>" + listHtml + "</listhtml>"
  264. //获取多有tab信息
  265. targets, _ := chromedp.Targets(ctx) //
  266. for _, tgt := range targets {
  267. if tgt.Type != "page" || tgt.TargetID == "" || tgt.OpenerFrameID == "" {
  268. continue
  269. }
  270. targetID := tgt.TargetID.String()
  271. if targetIdMap[targetID] != "" { //过滤重复target(目前未能关闭指定target只能通过targeID过滤)
  272. continue
  273. }
  274. if tgt.Type != "page" || targetID == "" || tgt.OpenerFrameID == "" {
  275. continue
  276. }
  277. hashHref := HexText(tgt.URL)
  278. if task.RunRedis { //执行redis判重
  279. exists := RedisExist("list", "list_"+hashHref)
  280. if exists { //redis判重
  281. logger.Info("redis exists:", tgt.URL)
  282. //err := chromedp.Run(ctx, target.CloseTarget(tar.TargetID))
  283. continue
  284. }
  285. }
  286. targetIdMap[targetID] = targetID
  287. RunWithTimeoutByTatgetID(tgt.TargetID, ctx, func(ctx context.Context, exit chan<- bool) {
  288. defer func() {
  289. exit <- true
  290. }()
  291. ChangeBrowserDevice(ctx, BROWSER_DEVICE_TYPE_PC) //浏览器类型,pc端
  292. //执行动作
  293. err := chromedp.Run(ctx, oacts...)
  294. if err != nil { //获取三级页信息失败
  295. detailResult = append(detailResult, "") //取值失败赋空值,保证下方取值不报错
  296. logger.Info("Get Detail Infor Chromedp Run Error :", err)
  297. }
  298. }, task.OtherTimeOut) //设置超时时间
  299. detailHtml := detailResult[i] //详情页信息
  300. if task.RunRedis && detailHtml != "" { //执行redis判重且详情页获取到信息
  301. RedisSet("list", "list_"+hashHref, "", 86400*365*2)
  302. }
  303. detailHtml = listHtml + detailHtml //拼接列表页信息一起返回
  304. *result = append(*result, detailHtml)
  305. }
  306. }
  307. }
  308. return nil
  309. }
  310. }