/** * 虚拟机 */ package browser import ( sp_config "KeyWebsiteMonitor/spider/config" . "KeyWebsiteMonitor/spider/util" spider_util "KeyWebsiteMonitor/util" "context" "encoding/json" "fmt" "io/ioutil" "log" "net/http" "net/url" "os" "time" . "KeyWebsiteMonitor/spider/db" qu "app.yhyue.com/moapp/jybase/common" "github.com/chromedp/chromedp" "github.com/yuin/gopher-lua" ) const ( run_on_device_remote = iota run_on_device_local ) type ( //虚拟机 VM struct { WsAddr string Headless bool ShowImage bool ProxyAddr string RunMode int DownloadPath string B *Browser } //浏览器,(不用之前封装的,这个更轻量) Browser struct { Ctx context.Context CancelFn context.CancelFunc ExecuteJSChan chan bool } ) // createRemoteBrowser 创建远程浏览器 func createRemoteBrowser(wsAddr string) *Browser { allocCtx, cancelFn := chromedp.NewRemoteAllocator(context.TODO(), wsAddr) incCtx, _ := chromedp.NewContext(allocCtx) return &Browser{ incCtx, cancelFn, make(chan bool, 1), } } // createLocalBrowser 创建本地浏览器 func createLocalBrowser(headless, showImage bool, proxyAddr, downloadPath string) *Browser { baseCtx, _ := chromedp.NewContext(context.Background()) chromeOptions := []chromedp.ExecAllocatorOption{ chromedp.NoFirstRun, chromedp.NoDefaultBrowserCheck, chromedp.DisableGPU, chromedp.NoSandbox, chromedp.WindowSize(1920, 1080), chromedp.Flag("enable-automation", false), // 防止监测webdriver chromedp.Flag("disable-blink-features", "AutomationControlled"), //禁用 blink 特征 chromedp.Flag("lang", "zh-CN"), chromedp.Flag("mixed-forms-disable-autofill", true), //从https转http不再检查 chromedp.Flag("ignore-certificate-errors", true), //忽略错误 chromedp.Flag("ignore-urlfetcher-cert-requests", true), chromedp.Flag("force-dev-mode-highlighting", true), chromedp.Flag("disable-extensions", true), //是否禁用扩展 chromedp.Flag("headless", headless), chromedp.Flag("user-agent", "Chrome 9 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/93.0.4577.58 Safari/537.36 Edg/93.0.961.33"), chromedp.Flag("disable-keep-alive", true), chromedp.Flag("disable-dev-shm-usage", true), chromedp.Flag("disable-web-security", true), //禁用网络安全标志 chromedp.Flag("mute-audio", true), chromedp.Flag("https-upgrades", "disabled"), chromedp.Flag("accept-language", `zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6`), //chromedp.Flag("blink-settings", "imagesEnabled=true"), //chromedp.Flag("incognito", true), //隐私模式 chromedp.Flag("disable-cache", true), //不用缓存 } if proxyAddr != "" { chromeOptions = append(chromeOptions, chromedp.ProxyServer(fmt.Sprintf("socks5://%s", proxyAddr))) } if downloadPath != "" { chromeOptions = append(chromeOptions, chromedp.Flag("download-path", downloadPath)) } if showImage { chromeOptions = append(chromeOptions, chromedp.Flag("blink-settings", "imagesEnabled=true"), ) } else { chromeOptions = append(chromeOptions, chromedp.Flag("blink-settings", "imagesEnabled=false"), ) } allocCtx, _ := chromedp.NewExecAllocator(baseCtx, chromeOptions...) // 创建一个浏览器实例 incCtx, incCancelFn := chromedp.NewContext(allocCtx, chromedp.WithLogf(log.Printf)) return &Browser{ incCtx, incCancelFn, make(chan bool, 1), } } // 重置浏览器 func (vm *VM) ResetBrowser() { if vm.B != nil && vm.B.CancelFn != nil { // err := chromedp.Cancel(vm.B.Ctx) // log.Println("重置啦") vm.B.CancelFn() vm.B.Ctx = nil vm.B.CancelFn = nil // log.Println("vmb:", vm.B == nil, vm.B) } var b *Browser if vm.RunMode == run_on_device_local { b = createLocalBrowser(vm.Headless, vm.ShowImage, vm.ProxyAddr, vm.DownloadPath) // log.Println("新创建") } else { b = createRemoteBrowser(vm.WsAddr) } if vm.B == nil { vm.B = b // log.Println("重置了就不该进来了") } else { vm.B.Ctx, vm.B.CancelFn = b.Ctx, b.CancelFn // log.Println("重新赋值") } } var Sum = 0 // BindLuaState 绑定虚拟机函数 func (vm *VM) BindLuaState(state *lua.LState) { state.SetGlobal("browser_reset", state.NewFunction(func(l *lua.LState) int { vm.ResetBrowser() log.Println("重置浏览器执行完毕..") return 0 })) state.SetGlobal("browser_save", state.NewFunction(func(l *lua.LState) int { siteName := l.ToString(-5) spiderCode := l.ToString(-4) siteChannelName := l.ToString(-3) siteChannelUrl := l.ToString(-2) data_table := l.ToTable(-1) data := TableToMap(data_table) data["site"] = siteName data["spidercode"] = spiderCode data["channel"] = siteChannelName data["channelurl"] = siteChannelUrl data["comeintime"] = time.Now().Add(time.Duration(1) * time.Hour).Unix() content := qu.ObjToString(data["content"]) // data["content"] = spider_util.NewCut().ClearHtml(content) data["detail"] = spider_util.NewCut().ClearHtml(content) data["contenthtml"] = content data["s_title"] = data["title"] if data["title"] == nil || data["href"] == nil || data["content"] == nil || qu.ObjToString(data["title"]) == "" || qu.ObjToString(data["href"]) == "" || qu.ObjToString(data["content"]) == "" { Mgo.Save("data_bak_commmon_err", map[string]interface{}{ "createtime": time.Now().Unix(), "spidercode": spiderCode, "err": "缺失字段", "data": data, "href": qu.ObjToString(data["href"]) == "", "title": qu.ObjToString(data["title"]) == "", "content": qu.ObjToString(data["content"]) == "", }) log.Println(fmt.Sprint("%s字段缺失,地址:%s", spiderCode, qu.ObjToString(data["href"])), qu.ObjToString(data["title"]) == "", qu.ObjToString(data["href"]) == "", qu.ObjToString(data["content"]) == "") return 0 } href := qu.ObjToString(data["href"]) //TODO 测试关闭 // if ok := GetRedisKeyExists(href); ok { // //存在 过滤掉该条信息 // return 0 // } PutRedisKey(href) //清洗 t := ParseTimeStr(qu.ObjToString(data["publishtime"])) if t > 0 { data["l_np_publishtime"] = t } data["sendflag"] = "false" data["_d"] = "comeintime" data["dataging"] = 0 data["publishdept"] = "" data["create_time"] = time.Now().Unix() // lua中处理过了 // data["infoformat"] = "infoformat" // data["iscompete"] = "iscompete" // data["T"] = "bidding" //TODO id := MgoCommon.Save("data_bak_commmon", data) log.Println("id:", id) return 0 })) // state.SetGlobal("browser_url_last_segs", state.NewFunction(func(l *lua.LState) int { // segs := l.ToInt(-2) // href := l.ToString(-1) // if segs == 0 { // segs = 2 // } // s := urlLastSegs(href, segs) // l.Push(lua.LString(s)) // return 1 // })) //最多传10个string参数,不支持其他类型 state.SetGlobal("browser_log", state.NewFunction(func(l *lua.LState) int { params := []string{} for i := -10; i < 0; i++ { p := l.ToString(i) if p != "" { params = append(params, p) } } if sp_config.Sl != nil { sp_config.Sl.Log(params...) } return 0 })) //browser_getTask 获取任务 state.SetGlobal("browser_getTask", state.NewFunction(func(l *lua.LState) int { log.Println("开始获取任务") Sum++ if Sum%50 == 0 { log.Println("已运行", Sum) } if Sum == 600 { log.Println("运行600个停止程序") os.Exit(0) } now := time.Now() // 获得分钟 minute := now.Minute() // 判断分钟是否在 00 到 05 之间 if minute >= 0 && minute < 15 { // l.Push(lua.LString("stop")) // os.Exit(0) // return 2 } resp, err := http.Get(sp_config.Sysconfig.GetTaskUrl) if err != nil { log.Println("get task url err:", err) } defer resp.Body.Close() // 确保在函数结束时关闭响应体 // 读取响应体 body, err := ioutil.ReadAll(resp.Body) if err != nil { log.Println("get body err:", err) } // 将响应体转换为 map var data map[string]interface{} json.Unmarshal(body, &data) //q := qu.ObjToMap(data["data"]) //附件 //(*q)["ChannelUrl"] = "http://jnggzy.jnzbtb.cn/JiNing/Bulletins?CategoryCode=553001" //(*q)["ChannelUrl"] = "http://www.guangxibid.com.cn/zbcg/002002/list.html" //(*q)["ChannelUrl"] = "https://jtt.nx.gov.cn/zfxxgk/zfxxgkml/zdlygk/jtjsxmztb/zbgg/" //未取到href //(*q)["ChannelUrl"] = "http://changs.ccgp-hunan.gov.cn/gp/newsSerach.html?categoryId=170&basicArea=changsha" //(*q)["ChannelUrl"] = "https://ggzy.hebi.gov.cn:8060/jyxx/006001/006001003/transaction_infos.html?cnum=006001003" //超时的 //(*q)["ChannelUrl"] = "http://60.166.52.108:8090/xxgkweb/blue/index.jsp?unit=002986280" // (*q)["ChannelUrl"] = "http://old.zmzb.com/bggghw/index.jhtml" //打不开页面的 //(*q)["ChannelUrl"] = "http://www.bigdatahefei.com/index.php/index/category/index?id=92" //(*q)["ChannelUrl"] = "https://www.ahwxgt.com/a/tzgg/hangye/" //(*q)["ChannelUrl"] = "http://www.gxnd.gov.cn/xxgk/zdlyxxgk/ggzypz/ggzyjy/" //(*q)["ChannelUrl"] = "http://www.cqhysl.net/" //(*q)["ChannelUrl"] = "http://ggzyjy.dl.gov.cn/TPFront/jyxx/071002/071002003/?COLLCC=993736632&" // log.Println("---", data) l.Push(MapToTable(data)) l.Push(lua.LString("ok")) return 2 })) //browser_setTask 改变任务状态 state.SetGlobal("browser_setTask", state.NewFunction(func(l *lua.LState) int { status := l.ToInt(-1) spidercode := l.ToString(-2) site := l.ToString(-3) channelurl := l.ToString(-4) channel := l.ToString(-5) ret := l.ToString(-6) listSelector := l.ToString(-7) titleSelector := l.ToString(-8) publishtimeSelector := l.ToString(-9) contentSelector := l.ToString(-10) count := l.ToInt(-11) // parameter := "?status=%v&spidercode=%s&channel=%s&site=%s&channelurl=%s&spidersource=%s" // parameter = fmt.Sprintf(parameter, status, spidercode, channel, site, channelurl, config.Name) // resp, err := http.Get(config.CompletionTaskUrl + parameter) // if err != nil { // log.Println("get task url err:", err) // } // defer resp.Body.Close() // 确保在函数结束时关闭响应体 queryParams := url.Values{} queryParams.Add("status", fmt.Sprintf("%v", status)) queryParams.Add("spidercode", spidercode) queryParams.Add("channel", channel) queryParams.Add("site", site) queryParams.Add("channelurl", channelurl) dockerid := GetDockerId() if dockerid == "" { dockerid = sp_config.Sysconfig.Name } queryParams.Add("spidersource", dockerid) queryParams.Add("count", fmt.Sprintf("%v", count)) // 构建完整的 URL baseURL, err := url.Parse(sp_config.Sysconfig.CompletionTaskUrl) if err != nil { log.Fatalf("Failed to parse base URL: %v", err) } baseURL.RawQuery = queryParams.Encode() resp, err := http.Get(baseURL.String()) if err != nil { log.Fatalf("HTTP request failed: %v", err) } defer resp.Body.Close() if status == -1 { //失败了存库 MgoCommon.Save("data_bak_commmon_err", map[string]interface{}{ "createtime": time.Now().Unix(), "spidercode": spidercode, "err": ret, }) } setmap := map[string]interface{}{} if listSelector != "" { setmap["listSelector"] = listSelector } if titleSelector != "" { setmap["titleSelector"] = titleSelector } if publishtimeSelector != "" { setmap["publishtimeSelector"] = publishtimeSelector } if contentSelector != "" { setmap["contentSelector"] = contentSelector } queryParams.Add("channel", channel) queryParams.Add("site", site) queryParams.Add("channelurl", channelurl) if len(setmap) > 0 { setmap["channel_common"] = channel setmap["site_common"] = site setmap["channelurl_common"] = channelurl Mgo.Update("luaconfig", map[string]interface{}{ "code": spidercode, }, map[string]interface{}{ "$set": setmap, }, true, false) } l.Push(lua.LString("ok")) return 1 })) //browser_autocheck 自动检测 state.SetGlobal("browser_autocheck", state.NewFunction(func(l *lua.LState) int { sess := Mgo.GetMgoConn() defer Mgo.DestoryMongoConn(sess) data := []map[string]interface{}{} it := sess.DB("zxl").C("luaconfig").Find(nil).Sort("-_id").Limit(8000).Select(map[string]interface{}{ "code": 1, "param_common": 1, "site": 1, }).Iter() total := 0 for tmp := make(map[string]interface{}); it.Next(&tmp); total++ { if total%500 == 0 { log.Println("cur index ", total) } code := qu.ObjToString(tmp["code"]) site := qu.ObjToString(tmp["site"]) param_common, _ := tmp["param_common"].([]interface{}) if len(param_common) < 11 { continue } href := qu.ObjToString(param_common[11]) name := qu.ObjToString(param_common[3]) tab, _ := ExtractDomain(href) data = append(data, map[string]interface{}{ "url": href, "tab": tab, "name": name, "site": site, "code": code, }) tmp = make(map[string]interface{}) } l.Push(MapToTable(map[string]interface{}{ "data": data, })) l.Push(lua.LString("ok")) return 2 })) }