vm.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. /**
  2. * 虚拟机
  3. */
  4. package browser
  5. import (
  6. sp_config "KeyWebsiteMonitor/spider/config"
  7. . "KeyWebsiteMonitor/spider/util"
  8. spider_util "KeyWebsiteMonitor/util"
  9. "context"
  10. "encoding/json"
  11. "fmt"
  12. "io/ioutil"
  13. "log"
  14. "net/http"
  15. "net/url"
  16. "os"
  17. "time"
  18. . "KeyWebsiteMonitor/spider/db"
  19. qu "app.yhyue.com/moapp/jybase/common"
  20. "github.com/chromedp/chromedp"
  21. "github.com/yuin/gopher-lua"
  22. )
  // Run modes stored in VM.RunMode. ResetBrowser launches a local browser for
  // run_on_device_local and attaches to a remote one for any other value.
  const (
  	run_on_device_remote = 0 // attach to the browser reachable at VM.WsAddr
  	run_on_device_local  = 1 // start a browser on this machine
  )
  // VM is the "virtual machine": it carries the settings used to create (and
  // re-create) the browser and exposes helper functions to Lua scripts.
  type VM struct {
  	WsAddr       string   // address handed to the remote allocator in remote mode
  	Headless     bool     // value of Chrome's "headless" flag in local mode
  	ShowImage    bool     // whether images are enabled (blink-settings) in local mode
  	ProxyAddr    string   // optional proxy address for local mode; used as socks5://<ProxyAddr>
  	RunMode      int      // run_on_device_local or run_on_device_remote
  	DownloadPath string   // optional value of the "download-path" flag in local mode
  	B            *Browser // browser currently in use; replaced/refreshed by ResetBrowser
  }

  // Browser is a thin handle on a chromedp context. It deliberately does not
  // use the earlier browser wrapper; this one is lighter-weight.
  type Browser struct {
  	Ctx           context.Context    // chromedp context that actions run against
  	CancelFn      context.CancelFunc // releases the resources behind Ctx
  	ExecuteJSChan chan bool          // created with capacity 1; NOTE(review): its consumers are not in this file
  }
  45. // createRemoteBrowser 创建远程浏览器
  46. func createRemoteBrowser(wsAddr string) *Browser {
  47. allocCtx, cancelFn := chromedp.NewRemoteAllocator(context.TODO(),
  48. wsAddr)
  49. incCtx, _ := chromedp.NewContext(allocCtx)
  50. return &Browser{
  51. incCtx, cancelFn, make(chan bool, 1),
  52. }
  53. }
  54. // createLocalBrowser 创建本地浏览器
  55. func createLocalBrowser(headless,
  56. showImage bool,
  57. proxyAddr, downloadPath string) *Browser {
  58. baseCtx, _ := chromedp.NewContext(context.Background())
  59. chromeOptions := []chromedp.ExecAllocatorOption{
  60. chromedp.NoFirstRun,
  61. chromedp.NoDefaultBrowserCheck,
  62. chromedp.DisableGPU,
  63. chromedp.NoSandbox,
  64. chromedp.WindowSize(1920, 1080),
  65. chromedp.Flag("enable-automation", false), // 防止监测webdriver
  66. chromedp.Flag("disable-blink-features", "AutomationControlled"), //禁用 blink 特征
  67. chromedp.Flag("lang", "zh-CN"),
  68. chromedp.Flag("mixed-forms-disable-autofill", true), //从https转http不再检查
  69. chromedp.Flag("ignore-certificate-errors", true), //忽略错误
  70. chromedp.Flag("ignore-urlfetcher-cert-requests", true),
  71. chromedp.Flag("force-dev-mode-highlighting", true),
  72. chromedp.Flag("disable-extensions", true), //是否禁用扩展
  73. chromedp.Flag("headless", headless),
  74. chromedp.Flag("user-agent", "Chrome 9 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/93.0.4577.58 Safari/537.36 Edg/93.0.961.33"),
  75. chromedp.Flag("disable-keep-alive", true),
  76. chromedp.Flag("disable-dev-shm-usage", true),
  77. chromedp.Flag("disable-web-security", true), //禁用网络安全标志
  78. chromedp.Flag("mute-audio", true),
  79. chromedp.Flag("https-upgrades", "disabled"),
  80. chromedp.Flag("accept-language", `zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6`),
  81. //chromedp.Flag("blink-settings", "imagesEnabled=true"),
  82. //chromedp.Flag("incognito", true), //隐私模式
  83. chromedp.Flag("disable-cache", true), //不用缓存
  84. }
  85. if proxyAddr != "" {
  86. chromeOptions = append(chromeOptions,
  87. chromedp.ProxyServer(fmt.Sprintf("socks5://%s", proxyAddr)))
  88. }
  89. if downloadPath != "" {
  90. chromeOptions = append(chromeOptions,
  91. chromedp.Flag("download-path", downloadPath))
  92. }
  93. if showImage {
  94. chromeOptions = append(chromeOptions,
  95. chromedp.Flag("blink-settings", "imagesEnabled=true"),
  96. )
  97. } else {
  98. chromeOptions = append(chromeOptions,
  99. chromedp.Flag("blink-settings", "imagesEnabled=false"),
  100. )
  101. }
  102. allocCtx, _ := chromedp.NewExecAllocator(baseCtx, chromeOptions...)
  103. // 创建一个浏览器实例
  104. incCtx, incCancelFn := chromedp.NewContext(allocCtx,
  105. chromedp.WithLogf(log.Printf))
  106. return &Browser{
  107. incCtx, incCancelFn, make(chan bool, 1),
  108. }
  109. }
  110. // 重置浏览器
  111. func (vm *VM) ResetBrowser() {
  112. if vm.B != nil && vm.B.CancelFn != nil {
  113. // err := chromedp.Cancel(vm.B.Ctx)
  114. // log.Println("重置啦")
  115. vm.B.CancelFn()
  116. vm.B.Ctx = nil
  117. vm.B.CancelFn = nil
  118. // log.Println("vmb:", vm.B == nil, vm.B)
  119. }
  120. var b *Browser
  121. if vm.RunMode == run_on_device_local {
  122. b = createLocalBrowser(vm.Headless, vm.ShowImage, vm.ProxyAddr, vm.DownloadPath)
  123. // log.Println("新创建")
  124. } else {
  125. b = createRemoteBrowser(vm.WsAddr)
  126. }
  127. if vm.B == nil {
  128. vm.B = b
  129. // log.Println("重置了就不该进来了")
  130. } else {
  131. vm.B.Ctx, vm.B.CancelFn = b.Ctx, b.CancelFn
  132. // log.Println("重新赋值")
  133. }
  134. }
  // Sum counts how many times the Lua global browser_getTask has been called
  // in this process; it starts at zero.
  var Sum int
  136. // BindLuaState 绑定虚拟机函数
  137. func (vm *VM) BindLuaState(state *lua.LState) {
  138. state.SetGlobal("browser_reset", state.NewFunction(func(l *lua.LState) int {
  139. vm.ResetBrowser()
  140. log.Println("重置浏览器执行完毕..")
  141. return 0
  142. }))
  143. state.SetGlobal("browser_save", state.NewFunction(func(l *lua.LState) int {
  144. siteName := l.ToString(-5)
  145. spiderCode := l.ToString(-4)
  146. siteChannelName := l.ToString(-3)
  147. siteChannelUrl := l.ToString(-2)
  148. data_table := l.ToTable(-1)
  149. data := TableToMap(data_table)
  150. data["site"] = siteName
  151. data["spidercode"] = spiderCode
  152. data["channel"] = siteChannelName
  153. data["channelurl"] = siteChannelUrl
  154. data["comeintime"] = time.Now().Add(time.Duration(1) * time.Hour).Unix()
  155. content := qu.ObjToString(data["content"])
  156. // data["content"] = spider_util.NewCut().ClearHtml(content)
  157. data["detail"] = spider_util.NewCut().ClearHtml(content)
  158. data["contenthtml"] = content
  159. data["s_title"] = data["title"]
  160. if data["title"] == nil || data["href"] == nil || data["content"] == nil ||
  161. qu.ObjToString(data["title"]) == "" ||
  162. qu.ObjToString(data["href"]) == "" ||
  163. qu.ObjToString(data["content"]) == "" {
  164. Mgo.Save("data_bak_commmon_err", map[string]interface{}{
  165. "createtime": time.Now().Unix(),
  166. "spidercode": spiderCode,
  167. "err": "缺失字段",
  168. "data": data,
  169. "href": qu.ObjToString(data["href"]) == "",
  170. "title": qu.ObjToString(data["title"]) == "",
  171. "content": qu.ObjToString(data["content"]) == "",
  172. })
  173. log.Println(fmt.Sprint("%s字段缺失,地址:%s", spiderCode, qu.ObjToString(data["href"])), qu.ObjToString(data["title"]) == "", qu.ObjToString(data["href"]) == "", qu.ObjToString(data["content"]) == "")
  174. return 0
  175. }
  176. href := qu.ObjToString(data["href"])
  177. //TODO 测试关闭
  178. // if ok := GetRedisKeyExists(href); ok {
  179. // //存在 过滤掉该条信息
  180. // return 0
  181. // }
  182. PutRedisKey(href)
  183. //清洗
  184. t := ParseTimeStr(qu.ObjToString(data["publishtime"]))
  185. if t > 0 {
  186. data["l_np_publishtime"] = t
  187. }
  188. data["sendflag"] = "false"
  189. data["_d"] = "comeintime"
  190. data["dataging"] = 0
  191. data["publishdept"] = ""
  192. data["create_time"] = time.Now().Unix()
  193. // lua中处理过了
  194. // data["infoformat"] = "infoformat"
  195. // data["iscompete"] = "iscompete"
  196. // data["T"] = "bidding" //TODO
  197. id := MgoCommon.Save("data_bak_commmon", data)
  198. log.Println("id:", id)
  199. return 0
  200. }))
  201. // state.SetGlobal("browser_url_last_segs", state.NewFunction(func(l *lua.LState) int {
  202. // segs := l.ToInt(-2)
  203. // href := l.ToString(-1)
  204. // if segs == 0 {
  205. // segs = 2
  206. // }
  207. // s := urlLastSegs(href, segs)
  208. // l.Push(lua.LString(s))
  209. // return 1
  210. // }))
  211. //最多传10个string参数,不支持其他类型
  212. state.SetGlobal("browser_log", state.NewFunction(func(l *lua.LState) int {
  213. params := []string{}
  214. for i := -10; i < 0; i++ {
  215. p := l.ToString(i)
  216. if p != "" {
  217. params = append(params, p)
  218. }
  219. }
  220. if sp_config.Sl != nil {
  221. sp_config.Sl.Log(params...)
  222. }
  223. return 0
  224. }))
  225. //browser_getTask 获取任务
  226. state.SetGlobal("browser_getTask", state.NewFunction(func(l *lua.LState) int {
  227. log.Println("开始获取任务")
  228. Sum++
  229. if Sum%50 == 0 {
  230. log.Println("已运行", Sum)
  231. }
  232. if Sum == 600 {
  233. log.Println("运行600个停止程序")
  234. os.Exit(0)
  235. }
  236. now := time.Now()
  237. // 获得分钟
  238. minute := now.Minute()
  239. // 判断分钟是否在 00 到 05 之间
  240. if minute >= 0 && minute < 15 {
  241. // l.Push(lua.LString("stop"))
  242. // os.Exit(0)
  243. // return 2
  244. }
  245. resp, err := http.Get(sp_config.Sysconfig.GetTaskUrl)
  246. if err != nil {
  247. log.Println("get task url err:", err)
  248. }
  249. defer resp.Body.Close() // 确保在函数结束时关闭响应体
  250. // 读取响应体
  251. body, err := ioutil.ReadAll(resp.Body)
  252. if err != nil {
  253. log.Println("get body err:", err)
  254. }
  255. // 将响应体转换为 map
  256. var data map[string]interface{}
  257. json.Unmarshal(body, &data)
  258. //q := qu.ObjToMap(data["data"])
  259. //附件
  260. //(*q)["ChannelUrl"] = "http://jnggzy.jnzbtb.cn/JiNing/Bulletins?CategoryCode=553001"
  261. //(*q)["ChannelUrl"] = "http://www.guangxibid.com.cn/zbcg/002002/list.html"
  262. //(*q)["ChannelUrl"] = "https://jtt.nx.gov.cn/zfxxgk/zfxxgkml/zdlygk/jtjsxmztb/zbgg/"
  263. //未取到href
  264. //(*q)["ChannelUrl"] = "http://changs.ccgp-hunan.gov.cn/gp/newsSerach.html?categoryId=170&basicArea=changsha"
  265. //(*q)["ChannelUrl"] = "https://ggzy.hebi.gov.cn:8060/jyxx/006001/006001003/transaction_infos.html?cnum=006001003"
  266. //超时的
  267. //(*q)["ChannelUrl"] = "http://60.166.52.108:8090/xxgkweb/blue/index.jsp?unit=002986280"
  268. // (*q)["ChannelUrl"] = "http://old.zmzb.com/bggghw/index.jhtml" //打不开页面的
  269. //(*q)["ChannelUrl"] = "http://www.bigdatahefei.com/index.php/index/category/index?id=92"
  270. //(*q)["ChannelUrl"] = "https://www.ahwxgt.com/a/tzgg/hangye/"
  271. //(*q)["ChannelUrl"] = "http://www.gxnd.gov.cn/xxgk/zdlyxxgk/ggzypz/ggzyjy/"
  272. //(*q)["ChannelUrl"] = "http://www.cqhysl.net/"
  273. //(*q)["ChannelUrl"] = "http://ggzyjy.dl.gov.cn/TPFront/jyxx/071002/071002003/?COLLCC=993736632&"
  274. // log.Println("---", data)
  275. l.Push(MapToTable(data))
  276. l.Push(lua.LString("ok"))
  277. return 2
  278. }))
  279. //browser_setTask 改变任务状态
  280. state.SetGlobal("browser_setTask", state.NewFunction(func(l *lua.LState) int {
  281. status := l.ToInt(-1)
  282. spidercode := l.ToString(-2)
  283. site := l.ToString(-3)
  284. channelurl := l.ToString(-4)
  285. channel := l.ToString(-5)
  286. ret := l.ToString(-6)
  287. listSelector := l.ToString(-7)
  288. titleSelector := l.ToString(-8)
  289. publishtimeSelector := l.ToString(-9)
  290. contentSelector := l.ToString(-10)
  291. count := l.ToInt(-11)
  292. // parameter := "?status=%v&spidercode=%s&channel=%s&site=%s&channelurl=%s&spidersource=%s"
  293. // parameter = fmt.Sprintf(parameter, status, spidercode, channel, site, channelurl, config.Name)
  294. // resp, err := http.Get(config.CompletionTaskUrl + parameter)
  295. // if err != nil {
  296. // log.Println("get task url err:", err)
  297. // }
  298. // defer resp.Body.Close() // 确保在函数结束时关闭响应体
  299. queryParams := url.Values{}
  300. queryParams.Add("status", fmt.Sprintf("%v", status))
  301. queryParams.Add("spidercode", spidercode)
  302. queryParams.Add("channel", channel)
  303. queryParams.Add("site", site)
  304. queryParams.Add("channelurl", channelurl)
  305. dockerid := GetDockerId()
  306. if dockerid == "" {
  307. dockerid = sp_config.Sysconfig.Name
  308. }
  309. queryParams.Add("spidersource", dockerid)
  310. queryParams.Add("count", fmt.Sprintf("%v", count))
  311. // 构建完整的 URL
  312. baseURL, err := url.Parse(sp_config.Sysconfig.CompletionTaskUrl)
  313. if err != nil {
  314. log.Fatalf("Failed to parse base URL: %v", err)
  315. }
  316. baseURL.RawQuery = queryParams.Encode()
  317. resp, err := http.Get(baseURL.String())
  318. if err != nil {
  319. log.Fatalf("HTTP request failed: %v", err)
  320. }
  321. defer resp.Body.Close()
  322. if status == -1 {
  323. //失败了存库
  324. MgoCommon.Save("data_bak_commmon_err", map[string]interface{}{
  325. "createtime": time.Now().Unix(),
  326. "spidercode": spidercode,
  327. "err": ret,
  328. })
  329. }
  330. setmap := map[string]interface{}{}
  331. if listSelector != "" {
  332. setmap["listSelector"] = listSelector
  333. }
  334. if titleSelector != "" {
  335. setmap["titleSelector"] = titleSelector
  336. }
  337. if publishtimeSelector != "" {
  338. setmap["publishtimeSelector"] = publishtimeSelector
  339. }
  340. if contentSelector != "" {
  341. setmap["contentSelector"] = contentSelector
  342. }
  343. queryParams.Add("channel", channel)
  344. queryParams.Add("site", site)
  345. queryParams.Add("channelurl", channelurl)
  346. if len(setmap) > 0 {
  347. setmap["channel_common"] = channel
  348. setmap["site_common"] = site
  349. setmap["channelurl_common"] = channelurl
  350. Mgo.Update("luaconfig", map[string]interface{}{
  351. "code": spidercode,
  352. }, map[string]interface{}{
  353. "$set": setmap,
  354. }, true, false)
  355. }
  356. l.Push(lua.LString("ok"))
  357. return 1
  358. }))
  359. //browser_autocheck 自动检测
  360. state.SetGlobal("browser_autocheck", state.NewFunction(func(l *lua.LState) int {
  361. sess := Mgo.GetMgoConn()
  362. defer Mgo.DestoryMongoConn(sess)
  363. data := []map[string]interface{}{}
  364. it := sess.DB("zxl").C("luaconfig").Find(nil).Sort("-_id").Limit(8000).Select(map[string]interface{}{
  365. "code": 1,
  366. "param_common": 1,
  367. "site": 1,
  368. }).Iter()
  369. total := 0
  370. for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
  371. if total%500 == 0 {
  372. log.Println("cur index ", total)
  373. }
  374. code := qu.ObjToString(tmp["code"])
  375. site := qu.ObjToString(tmp["site"])
  376. param_common, _ := tmp["param_common"].([]interface{})
  377. if len(param_common) < 11 {
  378. continue
  379. }
  380. href := qu.ObjToString(param_common[11])
  381. name := qu.ObjToString(param_common[3])
  382. tab, _ := ExtractDomain(href)
  383. data = append(data, map[string]interface{}{
  384. "url": href,
  385. "tab": tab,
  386. "name": name,
  387. "site": site,
  388. "code": code,
  389. })
  390. tmp = make(map[string]interface{})
  391. }
  392. l.Push(MapToTable(map[string]interface{}{
  393. "data": data,
  394. }))
  395. l.Push(lua.LString("ok"))
  396. return 2
  397. }))
  398. }