123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424 |
- /**
- * 虚拟机
- */
- package browser
- import (
- sp_config "KeyWebsiteMonitor/spider/config"
- . "KeyWebsiteMonitor/spider/util"
- spider_util "KeyWebsiteMonitor/util"
- "context"
- "encoding/json"
- "fmt"
- "io/ioutil"
- "log"
- "net/http"
- "net/url"
- "os"
- "time"
- . "KeyWebsiteMonitor/spider/db"
- qu "app.yhyue.com/moapp/jybase/common"
- "github.com/chromedp/chromedp"
- "github.com/yuin/gopher-lua"
- )
// Run modes accepted by VM.RunMode. ResetBrowser creates a local browser
// when the mode equals run_on_device_local and a remote one for any other
// value.
const (
	// run_on_device_remote selects a browser reached through VM.WsAddr.
	run_on_device_remote = 0
	// run_on_device_local selects a browser launched by this process.
	run_on_device_local = 1
)
// VM holds the settings used to (re)create a browser and the browser that is
// currently bound to it.
type VM struct {
	WsAddr       string   // address handed to the remote allocator
	Headless     bool     // value of the "headless" flag for a local browser
	ShowImage    bool     // toggles imagesEnabled in "blink-settings" for a local browser
	ProxyAddr    string   // host:port of a socks5 proxy; empty means no proxy
	RunMode      int      // run_on_device_local or run_on_device_remote
	DownloadPath string   // value of the "download-path" flag; empty means unset
	B            *Browser // browser currently in use; nil until ResetBrowser runs
}

// Browser is a lightweight handle on one chromedp context (used instead of
// the older, heavier wrapper).
type Browser struct {
	Ctx      context.Context    // chromedp context actions are run against
	CancelFn context.CancelFunc // cancel function stored by the constructor
	// ExecuteJSChan is created with capacity 1 by the constructors.
	// NOTE(review): its consumers are outside this file — confirm intended use.
	ExecuteJSChan chan bool
}
- // createRemoteBrowser 创建远程浏览器
- func createRemoteBrowser(wsAddr string) *Browser {
- allocCtx, cancelFn := chromedp.NewRemoteAllocator(context.TODO(),
- wsAddr)
- incCtx, _ := chromedp.NewContext(allocCtx)
- return &Browser{
- incCtx, cancelFn, make(chan bool, 1),
- }
- }
- // createLocalBrowser 创建本地浏览器
- func createLocalBrowser(headless,
- showImage bool,
- proxyAddr, downloadPath string) *Browser {
- baseCtx, _ := chromedp.NewContext(context.Background())
- chromeOptions := []chromedp.ExecAllocatorOption{
- chromedp.NoFirstRun,
- chromedp.NoDefaultBrowserCheck,
- chromedp.DisableGPU,
- chromedp.NoSandbox,
- chromedp.WindowSize(1920, 1080),
- chromedp.Flag("enable-automation", false), // 防止监测webdriver
- chromedp.Flag("disable-blink-features", "AutomationControlled"), //禁用 blink 特征
- chromedp.Flag("lang", "zh-CN"),
- chromedp.Flag("mixed-forms-disable-autofill", true), //从https转http不再检查
- chromedp.Flag("ignore-certificate-errors", true), //忽略错误
- chromedp.Flag("ignore-urlfetcher-cert-requests", true),
- chromedp.Flag("force-dev-mode-highlighting", true),
- chromedp.Flag("disable-extensions", true), //是否禁用扩展
- chromedp.Flag("headless", headless),
- chromedp.Flag("user-agent", "Chrome 9 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/93.0.4577.58 Safari/537.36 Edg/93.0.961.33"),
- chromedp.Flag("disable-keep-alive", true),
- chromedp.Flag("disable-dev-shm-usage", true),
- chromedp.Flag("disable-web-security", true), //禁用网络安全标志
- chromedp.Flag("mute-audio", true),
- chromedp.Flag("https-upgrades", "disabled"),
- chromedp.Flag("accept-language", `zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6`),
- //chromedp.Flag("blink-settings", "imagesEnabled=true"),
- //chromedp.Flag("incognito", true), //隐私模式
- chromedp.Flag("disable-cache", true), //不用缓存
- }
- if proxyAddr != "" {
- chromeOptions = append(chromeOptions,
- chromedp.ProxyServer(fmt.Sprintf("socks5://%s", proxyAddr)))
- }
- if downloadPath != "" {
- chromeOptions = append(chromeOptions,
- chromedp.Flag("download-path", downloadPath))
- }
- if showImage {
- chromeOptions = append(chromeOptions,
- chromedp.Flag("blink-settings", "imagesEnabled=true"),
- )
- } else {
- chromeOptions = append(chromeOptions,
- chromedp.Flag("blink-settings", "imagesEnabled=false"),
- )
- }
- allocCtx, _ := chromedp.NewExecAllocator(baseCtx, chromeOptions...)
- // 创建一个浏览器实例
- incCtx, incCancelFn := chromedp.NewContext(allocCtx,
- chromedp.WithLogf(log.Printf))
- return &Browser{
- incCtx, incCancelFn, make(chan bool, 1),
- }
- }
- // 重置浏览器
- func (vm *VM) ResetBrowser() {
- if vm.B != nil && vm.B.CancelFn != nil {
- // err := chromedp.Cancel(vm.B.Ctx)
- // log.Println("重置啦")
- vm.B.CancelFn()
- vm.B.Ctx = nil
- vm.B.CancelFn = nil
- // log.Println("vmb:", vm.B == nil, vm.B)
- }
- var b *Browser
- if vm.RunMode == run_on_device_local {
- b = createLocalBrowser(vm.Headless, vm.ShowImage, vm.ProxyAddr, vm.DownloadPath)
- // log.Println("新创建")
- } else {
- b = createRemoteBrowser(vm.WsAddr)
- }
- if vm.B == nil {
- vm.B = b
- // log.Println("重置了就不该进来了")
- } else {
- vm.B.Ctx, vm.B.CancelFn = b.Ctx, b.CancelFn
- // log.Println("重新赋值")
- }
- }
// Sum counts how many times the Lua function browser_getTask has been called;
// the process exits once it reaches 600.
var Sum int
- // BindLuaState 绑定虚拟机函数
- func (vm *VM) BindLuaState(state *lua.LState) {
- state.SetGlobal("browser_reset", state.NewFunction(func(l *lua.LState) int {
- vm.ResetBrowser()
- log.Println("重置浏览器执行完毕..")
- return 0
- }))
- state.SetGlobal("browser_save", state.NewFunction(func(l *lua.LState) int {
- siteName := l.ToString(-5)
- spiderCode := l.ToString(-4)
- siteChannelName := l.ToString(-3)
- siteChannelUrl := l.ToString(-2)
- data_table := l.ToTable(-1)
- data := TableToMap(data_table)
- data["site"] = siteName
- data["spidercode"] = spiderCode
- data["channel"] = siteChannelName
- data["channelurl"] = siteChannelUrl
- data["comeintime"] = time.Now().Add(time.Duration(1) * time.Hour).Unix()
- content := qu.ObjToString(data["content"])
- // data["content"] = spider_util.NewCut().ClearHtml(content)
- data["detail"] = spider_util.NewCut().ClearHtml(content)
- data["contenthtml"] = content
- data["s_title"] = data["title"]
- if data["title"] == nil || data["href"] == nil || data["content"] == nil ||
- qu.ObjToString(data["title"]) == "" ||
- qu.ObjToString(data["href"]) == "" ||
- qu.ObjToString(data["content"]) == "" {
- Mgo.Save("data_bak_commmon_err", map[string]interface{}{
- "createtime": time.Now().Unix(),
- "spidercode": spiderCode,
- "err": "缺失字段",
- "data": data,
- "href": qu.ObjToString(data["href"]) == "",
- "title": qu.ObjToString(data["title"]) == "",
- "content": qu.ObjToString(data["content"]) == "",
- })
- log.Println(fmt.Sprint("%s字段缺失,地址:%s", spiderCode, qu.ObjToString(data["href"])), qu.ObjToString(data["title"]) == "", qu.ObjToString(data["href"]) == "", qu.ObjToString(data["content"]) == "")
- return 0
- }
- href := qu.ObjToString(data["href"])
- //TODO 测试关闭
- // if ok := GetRedisKeyExists(href); ok {
- // //存在 过滤掉该条信息
- // return 0
- // }
- PutRedisKey(href)
- //清洗
- t := ParseTimeStr(qu.ObjToString(data["publishtime"]))
- if t > 0 {
- data["l_np_publishtime"] = t
- }
- data["sendflag"] = "false"
- data["_d"] = "comeintime"
- data["dataging"] = 0
- data["publishdept"] = ""
- data["create_time"] = time.Now().Unix()
- // lua中处理过了
- // data["infoformat"] = "infoformat"
- // data["iscompete"] = "iscompete"
- // data["T"] = "bidding" //TODO
- id := MgoCommon.Save("data_bak_commmon", data)
- log.Println("id:", id)
- return 0
- }))
- // state.SetGlobal("browser_url_last_segs", state.NewFunction(func(l *lua.LState) int {
- // segs := l.ToInt(-2)
- // href := l.ToString(-1)
- // if segs == 0 {
- // segs = 2
- // }
- // s := urlLastSegs(href, segs)
- // l.Push(lua.LString(s))
- // return 1
- // }))
- //最多传10个string参数,不支持其他类型
- state.SetGlobal("browser_log", state.NewFunction(func(l *lua.LState) int {
- params := []string{}
- for i := -10; i < 0; i++ {
- p := l.ToString(i)
- if p != "" {
- params = append(params, p)
- }
- }
- if sp_config.Sl != nil {
- sp_config.Sl.Log(params...)
- }
- return 0
- }))
- //browser_getTask 获取任务
- state.SetGlobal("browser_getTask", state.NewFunction(func(l *lua.LState) int {
- log.Println("开始获取任务")
- Sum++
- if Sum%50 == 0 {
- log.Println("已运行", Sum)
- }
- if Sum == 600 {
- log.Println("运行600个停止程序")
- os.Exit(0)
- }
- now := time.Now()
- // 获得分钟
- minute := now.Minute()
- // 判断分钟是否在 00 到 05 之间
- if minute >= 0 && minute < 15 {
- // l.Push(lua.LString("stop"))
- // os.Exit(0)
- // return 2
- }
- resp, err := http.Get(sp_config.Sysconfig.GetTaskUrl)
- if err != nil {
- log.Println("get task url err:", err)
- }
- defer resp.Body.Close() // 确保在函数结束时关闭响应体
- // 读取响应体
- body, err := ioutil.ReadAll(resp.Body)
- if err != nil {
- log.Println("get body err:", err)
- }
- // 将响应体转换为 map
- var data map[string]interface{}
- json.Unmarshal(body, &data)
- //q := qu.ObjToMap(data["data"])
- //附件
- //(*q)["ChannelUrl"] = "http://jnggzy.jnzbtb.cn/JiNing/Bulletins?CategoryCode=553001"
- //(*q)["ChannelUrl"] = "http://www.guangxibid.com.cn/zbcg/002002/list.html"
- //(*q)["ChannelUrl"] = "https://jtt.nx.gov.cn/zfxxgk/zfxxgkml/zdlygk/jtjsxmztb/zbgg/"
- //未取到href
- //(*q)["ChannelUrl"] = "http://changs.ccgp-hunan.gov.cn/gp/newsSerach.html?categoryId=170&basicArea=changsha"
- //(*q)["ChannelUrl"] = "https://ggzy.hebi.gov.cn:8060/jyxx/006001/006001003/transaction_infos.html?cnum=006001003"
- //超时的
- //(*q)["ChannelUrl"] = "http://60.166.52.108:8090/xxgkweb/blue/index.jsp?unit=002986280"
- // (*q)["ChannelUrl"] = "http://old.zmzb.com/bggghw/index.jhtml" //打不开页面的
- //(*q)["ChannelUrl"] = "http://www.bigdatahefei.com/index.php/index/category/index?id=92"
- //(*q)["ChannelUrl"] = "https://www.ahwxgt.com/a/tzgg/hangye/"
- //(*q)["ChannelUrl"] = "http://www.gxnd.gov.cn/xxgk/zdlyxxgk/ggzypz/ggzyjy/"
- //(*q)["ChannelUrl"] = "http://www.cqhysl.net/"
- //(*q)["ChannelUrl"] = "http://ggzyjy.dl.gov.cn/TPFront/jyxx/071002/071002003/?COLLCC=993736632&"
- // log.Println("---", data)
- l.Push(MapToTable(data))
- l.Push(lua.LString("ok"))
- return 2
- }))
- //browser_setTask 改变任务状态
- state.SetGlobal("browser_setTask", state.NewFunction(func(l *lua.LState) int {
- status := l.ToInt(-1)
- spidercode := l.ToString(-2)
- site := l.ToString(-3)
- channelurl := l.ToString(-4)
- channel := l.ToString(-5)
- ret := l.ToString(-6)
- listSelector := l.ToString(-7)
- titleSelector := l.ToString(-8)
- publishtimeSelector := l.ToString(-9)
- contentSelector := l.ToString(-10)
- count := l.ToInt(-11)
- // parameter := "?status=%v&spidercode=%s&channel=%s&site=%s&channelurl=%s&spidersource=%s"
- // parameter = fmt.Sprintf(parameter, status, spidercode, channel, site, channelurl, config.Name)
- // resp, err := http.Get(config.CompletionTaskUrl + parameter)
- // if err != nil {
- // log.Println("get task url err:", err)
- // }
- // defer resp.Body.Close() // 确保在函数结束时关闭响应体
- queryParams := url.Values{}
- queryParams.Add("status", fmt.Sprintf("%v", status))
- queryParams.Add("spidercode", spidercode)
- queryParams.Add("channel", channel)
- queryParams.Add("site", site)
- queryParams.Add("channelurl", channelurl)
- dockerid := GetDockerId()
- if dockerid == "" {
- dockerid = sp_config.Sysconfig.Name
- }
- queryParams.Add("spidersource", dockerid)
- queryParams.Add("count", fmt.Sprintf("%v", count))
- // 构建完整的 URL
- baseURL, err := url.Parse(sp_config.Sysconfig.CompletionTaskUrl)
- if err != nil {
- log.Fatalf("Failed to parse base URL: %v", err)
- }
- baseURL.RawQuery = queryParams.Encode()
- resp, err := http.Get(baseURL.String())
- if err != nil {
- log.Fatalf("HTTP request failed: %v", err)
- }
- defer resp.Body.Close()
- if status == -1 {
- //失败了存库
- MgoCommon.Save("data_bak_commmon_err", map[string]interface{}{
- "createtime": time.Now().Unix(),
- "spidercode": spidercode,
- "err": ret,
- })
- }
- setmap := map[string]interface{}{}
- if listSelector != "" {
- setmap["listSelector"] = listSelector
- }
- if titleSelector != "" {
- setmap["titleSelector"] = titleSelector
- }
- if publishtimeSelector != "" {
- setmap["publishtimeSelector"] = publishtimeSelector
- }
- if contentSelector != "" {
- setmap["contentSelector"] = contentSelector
- }
- queryParams.Add("channel", channel)
- queryParams.Add("site", site)
- queryParams.Add("channelurl", channelurl)
- if len(setmap) > 0 {
- setmap["channel_common"] = channel
- setmap["site_common"] = site
- setmap["channelurl_common"] = channelurl
- Mgo.Update("luaconfig", map[string]interface{}{
- "code": spidercode,
- }, map[string]interface{}{
- "$set": setmap,
- }, true, false)
- }
- l.Push(lua.LString("ok"))
- return 1
- }))
- //browser_autocheck 自动检测
- state.SetGlobal("browser_autocheck", state.NewFunction(func(l *lua.LState) int {
- sess := Mgo.GetMgoConn()
- defer Mgo.DestoryMongoConn(sess)
- data := []map[string]interface{}{}
- it := sess.DB("zxl").C("luaconfig").Find(nil).Sort("-_id").Limit(8000).Select(map[string]interface{}{
- "code": 1,
- "param_common": 1,
- "site": 1,
- }).Iter()
- total := 0
- for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
- if total%500 == 0 {
- log.Println("cur index ", total)
- }
- code := qu.ObjToString(tmp["code"])
- site := qu.ObjToString(tmp["site"])
- param_common, _ := tmp["param_common"].([]interface{})
- if len(param_common) < 11 {
- continue
- }
- href := qu.ObjToString(param_common[11])
- name := qu.ObjToString(param_common[3])
- tab, _ := ExtractDomain(href)
- data = append(data, map[string]interface{}{
- "url": href,
- "tab": tab,
- "name": name,
- "site": site,
- "code": code,
- })
- tmp = make(map[string]interface{})
- }
- l.Push(MapToTable(map[string]interface{}{
- "data": data,
- }))
- l.Push(lua.LString("ok"))
- return 2
- }))
- }
|