/** 脚本加载+调用 封装, 前期走文件系统加载 后期走数据库配置, LUA中公共的方法需要抽出来,主脚本文件加载LUA公共文件 */ package spider import ( "bytes" "compress/gzip" "crypto/aes" "encoding/base64" "encoding/json" "io/ioutil" mu "mfw/util" "net/http" "net/url" "path" qu "qfw/util" "regexp" util "spiderutil" "strconv" "strings" "time" "golang.org/x/text/encoding/simplifiedchinese" "golang.org/x/text/transform" "github.com/cjoudrey/gluahttp" lujson "github.com/yuin/gopher-json" "github.com/yuin/gopher-lua" ) //脚本 type Script struct { SCode, ScriptFile string Encoding string Downloader string //下载器 Timeout int64 //超时时间秒 L *lua.LState Test_luareqcount int //脚本请求次数 Test_goreqtime int //go发起次数(时间) Test_goreqlist int //go发起次数(列表) Test_goreqcon int //go发起次数(正文) } //加载文件 func (s *Script) LoadScript(downloadnode, script string, isfile ...string) { s.ScriptFile = script options := lua.Options{ RegistrySize: 256 * 20, CallStackSize: 256, IncludeGoStackTrace: false, } s.L = lua.NewState(options) //s.L.ScriptFileName = s.SCode s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader) s.L.PreloadModule("json", lujson.Loader) if len(isfile) > 0 { if err := s.L.DoFile(script); err != nil { panic("加载lua脚本错误" + err.Error()) } } else { if err := s.L.DoString(script); err != nil { panic("加载lua脚本错误" + err.Error()) } } s.Encoding = s.GetVar("spiderPageEncoding") //暴露go方法 //download(url,head) 普通下载 s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int { head := S.ToTable(-1) url := S.ToString(-2) ishttps := S.ToBool(-3) charset := S.ToString(-4) if charset == "" { charset = s.Encoding } ret := Download(downloadnode, s.Downloader, url, "get", util.GetTable(head), charset, false, ishttps, "", s.Timeout) S.Push(lua.LString(ret)) s.Test_luareqcount++ return 1 })) s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int { gpath := S.ToString(-2) content := S.ToString(-1) ret := util.FindContentText(gpath, content) S.Push(ret) return 1 })) //高级下载download(url,method,param,head,cookie) s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int { cookie := S.ToString(-1) head := S.ToTable(-2) param := S.ToTable(-3) method := S.ToString(-4) url := S.ToString(-5) ishttps := S.ToBool(-6) charset := S.ToString(-7) if charset == "" { charset = s.Encoding } var mycookie []*http.Cookie json.Unmarshal([]byte(cookie), &mycookie) var ret string var retcookie []*http.Cookie if param == nil { ptext := map[string]interface{}{"text": S.ToString(-3)} ret, retcookie = DownloadAdv(downloadnode, s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, false, ishttps, "", s.Timeout) } else { ret, retcookie = DownloadAdv(downloadnode, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, false, ishttps, "", s.Timeout) } S.Push(lua.LString(ret)) scookie, _ := json.Marshal(retcookie) S.Push(lua.LString(scookie)) s.Test_luareqcount++ return 2 })) s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int { nodetype := S.ToString(-3) gpath := S.ToString(-2) content := S.ToString(-1) ret := util.FindOneText(gpath, content, nodetype) S.Push(ret) return 1 })) s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int { nodetype := S.ToString(-3) gpath := S.ToString(-2) content := S.ToString(-1) ret := util.FindOneHtml(gpath, content, nodetype) S.Push(ret) return 1 })) s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int { gpath := S.ToString(-2) content := S.ToString(-1) ret := s.L.NewTable() util.FindListText(gpath, content, ret) S.Push(ret) return 1 })) s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int { gpath := S.ToString(-2) content := S.ToString(-1) ret := s.L.NewTable() util.FindListHtml(gpath, content, ret) S.Push(ret) return 1 })) s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int { qmap := S.ToTable(-2) content := S.ToString(-1) ret := s.L.NewTable() util.FindMap(qmap, content, ret) S.Push(ret) return 1 })) //调用jsvm s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int { js := S.ToString(-1) ret := s.L.NewTable() if js == "" { ret.RawSet(lua.LString("val"), lua.LString("")) ret.RawSet(lua.LString("err"), lua.LString("js is null")) } else { rep := util.JsVmPost(util.Config.JsVmUrl, js) ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"]))) ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"]))) } S.Push(ret) return 1 })) //指定下载器 s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int { s.Downloader = GetOneDownloader() S.Push(lua.LString(s.Downloader)) return 1 })) //手工延时 s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int { time.Sleep(1 * time.Second) return 0 })) //编码解码 s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int { codeType := strings.ToLower(S.ToString(-2)) str := S.CheckString(-1) switch codeType { case "unicode": str = strings.Replace(str, "%u", "\\u", -1) str = transUnic(str) case "urlencode_gbk": data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder())) l, _ := url.Parse("http://a.com/?" + string(data)) tmpstr := l.Query().Encode() if len(tmpstr) > 1 { str = tmpstr[0 : len(tmpstr)-1] } else { str = "" } case "urlencode_utf8": l, _ := url.Parse("http://a.com/?" + str) tmpstr := l.Query().Encode() if len(tmpstr) > 1 { str = tmpstr[0 : len(tmpstr)-1] } else { str = "" } case "urldecode_utf8": str, _ = url.QueryUnescape(str) case "decode64": str = util.DecodeB64(str) case "encodemd5": str = qu.GetMd5String(str) case "htmldecode": //html实体码 //txt := `
太阳岛特勤消防站、松浦特勤消防站建设项目设计中标公示
` str = S.ToString(-1) reg, _ := regexp.Compile("&#\\d+;") str = reg.ReplaceAllStringFunc(str, func(src string) string { v, _ := strconv.Atoi(src[2 : len(src)-1]) return string(rune(v)) }) } S.Push(lua.LString(str)) return 1 })) //保存错误日志 s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int { return 0 })) //添加改版日志 s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int { return 0 })) //如果服务端返回的html是gzip压缩过格式的 这里需要转一下 s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int { html := S.ToString(-1) bs := []byte(html) gzipreader, _ := gzip.NewReader(bytes.NewReader(bs)) bs, _ = ioutil.ReadAll(gzipreader) S.Push(lua.LString(bs)) return 1 })) s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int { bResult := false S.Push(lua.LBool(bResult)) return 1 })) //解析附件中的word、pdf s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int { ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)} str := S.ToString(-2) extension := S.ToString(-1) bs, _ := base64.StdEncoding.DecodeString(str) bs = append([]byte{ext[extension]}, bs...) msgid := mu.UUID(8) Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60) return 1 })) //下载附件download(url,method,param,head,cookie,fileName) s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int { cookie := S.ToString(-1) head := S.ToTable(-2) param := S.ToTable(-3) method := S.ToString(-4) url := S.ToString(-5) fileName := S.ToString(-6) ishttps := strings.Contains(url, "https") var mycookie []*http.Cookie if cookie != "{}" { json.Unmarshal([]byte(cookie), &mycookie) } else { mycookie = make([]*http.Cookie, 0) } fileName = strings.TrimSpace(fileName) url = strings.TrimSpace(url) ret := DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, false, ishttps, "", s.Timeout) name, size, ftype, fid := "", "", "", "" qu.Debug(GarbledCodeReg.FindAllString(string(ret), -1), len(ret)) if ret == nil || len(ret) < 1024*5 { qu.Debug("下载文件出错!") } else { ftype = qu.GetFileType(ret) if (ftype == "docx" || ftype == "doc") && len(GarbledCodeReg.FindAllString(string(ret), -1)) > 10 { url, name, size, ftype, fid = "附件中含有乱码", "附件中含有乱码", "", "", "" } else { url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret) } } if strings.TrimSpace(ftype) == "" { if len(path.Ext(name)) > 0 { ftype = path.Ext(name)[1:] } } S.Push(lua.LString(url)) S.Push(lua.LString(name)) S.Push(lua.LString(size)) S.Push(lua.LString(ftype)) S.Push(lua.LString(fid)) return 5 })) //支持正则 s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int { index := int(S.ToNumber(-1)) regstr := S.ToString(-2) text := S.ToString(-3) reg := regexp.MustCompile(regstr) reps := reg.FindAllStringSubmatchIndex(text, -1) ret := s.L.NewTable() number := 0 for _, v := range reps { number++ ret.Insert(number, lua.LString(text[v[index]:v[index+1]])) } S.Push(ret) return 1 })) //支持替换 s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int { text := S.ToString(-3) old := S.ToString(-2) repl := S.ToString(-1) text = strings.Replace(text, old, repl, -1) S.Push(lua.LString(text)) return 1 })) //标题的关键词、排除词过滤 s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int { keyWordReg := regexp.MustCompile(util.Config.Word["keyword"]) notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"]) data := S.ToTable(-1) dataMap := util.TableToMap(data) ret := s.L.NewTable() num := 1 for _, v := range dataMap { tmp := v.(map[string]interface{}) isOk := false if title := qu.ObjToString(tmp["title"]); title != "" { if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) { isOk = true } } if isOk { ret.Insert(num, util.MapToLuaTable(S, tmp)) num++ } } S.Push(ret) return 1 })) //标题的关键词、排除词过滤 s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int { keyWordReg := regexp.MustCompile(util.Config.Word["keyword"]) notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"]) data := S.ToTable(-1) dataMap := util.TableToMap(data) if title := qu.ObjToString(dataMap["title"]); title != "" { if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) { S.Push(lua.LBool(true)) return 1 } else { qu.Debug(s.SCode, dataMap["href"], " title error") } } else { qu.Debug(s.SCode, dataMap["href"], " title error") } S.Push(lua.LBool(false)) return 1 })) //detail过滤 s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int { /* 1.长度判断 (特殊处理:详情请访问原网页!;详见原网页;见原网页;无;无相关内容;无正文内容) 2.是否含汉字 */ reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|详见附件|见附件)") reg2 := regexp.MustCompile("[\u4e00-\u9fa5]") detail := S.ToString(-1) if reg1.MatchString(detail) { S.Push(lua.LBool(true)) return 1 } if len([]rune(detail)) < 50 || !reg2.MatchString(detail) { S.Push(lua.LBool(false)) return 1 } S.Push(lua.LBool(false)) return 1 })) //匹配汉字 s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int { reg1 := regexp.MustCompile("(见附件|详见附件)") reg2 := regexp.MustCompile("[\u4e00-\u9fa5]") detail := S.ToString(-1) detail = reg1.ReplaceAllString(detail, "") ok := reg2.MatchString(detail) S.Push(lua.LBool(ok)) return 1 })) //aes ecb模式加密 s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int { origData := S.ToString(-2) key := S.ToString(-1) bytekey := []byte(key) byteorigData := []byte(origData) cipher, _ := aes.NewCipher(generateKey([]byte(bytekey))) length := (len(byteorigData) + aes.BlockSize) / aes.BlockSize plain := make([]byte, length*aes.BlockSize) copy(plain, byteorigData) pad := byte(len(plain) - len(byteorigData)) for i := len(byteorigData); i < len(plain); i++ { plain[i] = pad } encrypted := make([]byte, len(plain)) // 分组分块加密 for bs, be := 0, cipher.BlockSize(); bs <= len(byteorigData); bs, be = bs+cipher.BlockSize(), be+cipher.BlockSize() { cipher.Encrypt(encrypted[bs:be], plain[bs:be]) } result := base64.StdEncoding.EncodeToString(encrypted) S.Push(lua.LString(result)) return 1 })) } // func (s *Script) Reload() { s.L.Close() s.LoadScript("", s.ScriptFile) } //unicode转码 func transUnic(str string) string { buf := bytes.NewBuffer(nil) i, j := 0, len(str) for i < j { x := i + 6 if x > j { buf.WriteString(str[i:]) break } if str[i] == '\\' && str[i+1] == 'u' { hex := str[i+2 : x] r, err := strconv.ParseUint(hex, 16, 64) if err == nil { buf.WriteRune(rune(r)) } else { buf.WriteString(str[i:x]) } i = x } else { buf.WriteByte(str[i]) i++ } } return buf.String() } //取得变量 func (s *Script) GetVar(key string) string { return s.L.GetGlobal(key).String() } // func (s *Script) GetIntVar(key string) int { lv := s.L.GetGlobal(key) if v, ok := lv.(lua.LNumber); ok { return int(v) } return -1 } // func (s *Script) GetBoolVar(key string) bool { lv := s.L.GetGlobal(key) if v, ok := lv.(lua.LBool); ok { return bool(v) } return false } func generateKey(key []byte) (genKey []byte) { genKey = make([]byte, 16) copy(genKey, key) for i := 16; i < len(key); { for j := 0; j < 16 && i < len(key); j, i = j+1, i+1 { genKey[j] ^= key[i] } } return genKey }