/*
Package spider wraps the loading and invocation of Lua spider scripts.

For now scripts are loaded from the file system; later they are meant to come
from database configuration. Functions shared between Lua scripts should be
factored out into common Lua files that the main script file loads.
*/
package spider
- import (
- "bytes"
- "compress/gzip"
- "crypto/aes"
- "encoding/base64"
- "encoding/json"
- "io/ioutil"
- mu "mfw/util"
- "net/http"
- "net/url"
- "path"
- qu "qfw/util"
- "regexp"
- util "spiderutil"
- "strconv"
- "strings"
- "time"
- "golang.org/x/text/encoding/simplifiedchinese"
- "golang.org/x/text/transform"
- "github.com/cjoudrey/gluahttp"
- lujson "github.com/yuin/gopher-json"
- "github.com/yuin/gopher-lua"
- )
- //脚本
- type Script struct {
- SCode, ScriptFile string
- Encoding string
- Downloader string //下载器
- Timeout int64 //超时时间秒
- L *lua.LState
- Test_luareqcount int //脚本请求次数
- Test_goreqtime int //go发起次数(时间)
- Test_goreqlist int //go发起次数(列表)
- Test_goreqcon int //go发起次数(正文)
- }
- //加载文件
- func (s *Script) LoadScript(downloadnode, script string, isfile ...string) {
- s.ScriptFile = script
- options := lua.Options{
- RegistrySize: 256 * 20,
- CallStackSize: 256,
- IncludeGoStackTrace: false,
- }
- s.L = lua.NewState(options)
- //s.L.ScriptFileName = s.SCode
- s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader)
- s.L.PreloadModule("json", lujson.Loader)
- if len(isfile) > 0 {
- if err := s.L.DoFile(script); err != nil {
- panic("加载lua脚本错误" + err.Error())
- }
- } else {
- if err := s.L.DoString(script); err != nil {
- panic("加载lua脚本错误" + err.Error())
- }
- }
- s.Encoding = s.GetVar("spiderPageEncoding")
- //暴露go方法
- //download(url,head) 普通下载
- s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int {
- head := S.ToTable(-1)
- url := S.ToString(-2)
- ishttps := S.ToBool(-3)
- charset := S.ToString(-4)
- if charset == "" {
- charset = s.Encoding
- }
- ret := Download(downloadnode, s.Downloader, url, "get", util.GetTable(head), charset, false, ishttps, "", s.Timeout)
- S.Push(lua.LString(ret))
- s.Test_luareqcount++
- return 1
- }))
- s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int {
- gpath := S.ToString(-2)
- content := S.ToString(-1)
- ret := util.FindContentText(gpath, content)
- S.Push(ret)
- return 1
- }))
- //高级下载download(url,method,param,head,cookie)
- s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int {
- cookie := S.ToString(-1)
- head := S.ToTable(-2)
- param := S.ToTable(-3)
- method := S.ToString(-4)
- url := S.ToString(-5)
- ishttps := S.ToBool(-6)
- charset := S.ToString(-7)
- if charset == "" {
- charset = s.Encoding
- }
- var mycookie []*http.Cookie
- json.Unmarshal([]byte(cookie), &mycookie)
- var ret string
- var retcookie []*http.Cookie
- if param == nil {
- ptext := map[string]interface{}{"text": S.ToString(-3)}
- ret, retcookie = DownloadAdv(downloadnode, s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, false, ishttps, "", s.Timeout)
- } else {
- ret, retcookie = DownloadAdv(downloadnode, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, false, ishttps, "", s.Timeout)
- }
- S.Push(lua.LString(ret))
- scookie, _ := json.Marshal(retcookie)
- S.Push(lua.LString(scookie))
- s.Test_luareqcount++
- return 2
- }))
- s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int {
- nodetype := S.ToString(-3)
- gpath := S.ToString(-2)
- content := S.ToString(-1)
- ret := util.FindOneText(gpath, content, nodetype)
- S.Push(ret)
- return 1
- }))
- s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int {
- nodetype := S.ToString(-3)
- gpath := S.ToString(-2)
- content := S.ToString(-1)
- ret := util.FindOneHtml(gpath, content, nodetype)
- S.Push(ret)
- return 1
- }))
- s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int {
- gpath := S.ToString(-2)
- content := S.ToString(-1)
- ret := s.L.NewTable()
- util.FindListText(gpath, content, ret)
- S.Push(ret)
- return 1
- }))
- s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int {
- gpath := S.ToString(-2)
- content := S.ToString(-1)
- ret := s.L.NewTable()
- util.FindListHtml(gpath, content, ret)
- S.Push(ret)
- return 1
- }))
- s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int {
- qmap := S.ToTable(-2)
- content := S.ToString(-1)
- ret := s.L.NewTable()
- util.FindMap(qmap, content, ret)
- S.Push(ret)
- return 1
- }))
- //调用jsvm
- s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int {
- js := S.ToString(-1)
- ret := s.L.NewTable()
- if js == "" {
- ret.RawSet(lua.LString("val"), lua.LString(""))
- ret.RawSet(lua.LString("err"), lua.LString("js is null"))
- } else {
- rep := util.JsVmPost(util.Config.JsVmUrl, js)
- ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"])))
- ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"])))
- }
- S.Push(ret)
- return 1
- }))
- //指定下载器
- s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int {
- s.Downloader = GetOneDownloader()
- S.Push(lua.LString(s.Downloader))
- return 1
- }))
- //手工延时
- s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int {
- time.Sleep(1 * time.Second)
- return 0
- }))
- //编码解码
- s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int {
- codeType := strings.ToLower(S.ToString(-2))
- str := S.CheckString(-1)
- switch codeType {
- case "unicode":
- str = strings.Replace(str, "%u", "\\u", -1)
- str = transUnic(str)
- case "urlencode_gbk":
- data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
- l, _ := url.Parse("http://a.com/?" + string(data))
- tmpstr := l.Query().Encode()
- if len(tmpstr) > 1 {
- str = tmpstr[0 : len(tmpstr)-1]
- } else {
- str = ""
- }
- case "urlencode_utf8":
- l, _ := url.Parse("http://a.com/?" + str)
- tmpstr := l.Query().Encode()
- if len(tmpstr) > 1 {
- str = tmpstr[0 : len(tmpstr)-1]
- } else {
- str = ""
- }
- case "urldecode_utf8":
- str, _ = url.QueryUnescape(str)
- case "decode64":
- str = util.DecodeB64(str)
- case "encodemd5":
- str = qu.GetMd5String(str)
- case "htmldecode": //html实体码
- //txt := `<div align="left" style="margin-left: 0pt;"><span style='font-family:; font-size:13px; color:#000000'>太阳岛特勤消防站、松浦特勤消防站建设项目设计中标公示</span></div>`
- str = S.ToString(-1)
- reg, _ := regexp.Compile("&#\\d+;")
- str = reg.ReplaceAllStringFunc(str, func(src string) string {
- v, _ := strconv.Atoi(src[2 : len(src)-1])
- return string(rune(v))
- })
- }
- S.Push(lua.LString(str))
- return 1
- }))
- //保存错误日志
- s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
- return 0
- }))
- //添加改版日志
- s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int {
- return 0
- }))
- //如果服务端返回的html是gzip压缩过格式的 这里需要转一下
- s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int {
- html := S.ToString(-1)
- bs := []byte(html)
- gzipreader, _ := gzip.NewReader(bytes.NewReader(bs))
- bs, _ = ioutil.ReadAll(gzipreader)
- S.Push(lua.LString(bs))
- return 1
- }))
- s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
- bResult := false
- S.Push(lua.LBool(bResult))
- return 1
- }))
- //解析附件中的word、pdf
- s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int {
- ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)}
- str := S.ToString(-2)
- extension := S.ToString(-1)
- bs, _ := base64.StdEncoding.DecodeString(str)
- bs = append([]byte{ext[extension]}, bs...)
- msgid := mu.UUID(8)
- Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60)
- return 1
- }))
- //下载附件download(url,method,param,head,cookie,fileName)
- s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
- cookie := S.ToString(-1)
- head := S.ToTable(-2)
- param := S.ToTable(-3)
- method := S.ToString(-4)
- url := S.ToString(-5)
- fileName := S.ToString(-6)
- ishttps := strings.Contains(url, "https")
- var mycookie []*http.Cookie
- if cookie != "{}" {
- json.Unmarshal([]byte(cookie), &mycookie)
- } else {
- mycookie = make([]*http.Cookie, 0)
- }
- fileName = strings.TrimSpace(fileName)
- url = strings.TrimSpace(url)
- ret := DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, false, ishttps, "", s.Timeout)
- name, size, ftype, fid := "", "", "", ""
- qu.Debug(GarbledCodeReg.FindAllString(string(ret), -1), len(ret))
- if ret == nil || len(ret) < 1024*5 {
- qu.Debug("下载文件出错!")
- } else {
- ftype = qu.GetFileType(ret)
- if (ftype == "docx" || ftype == "doc") && len(GarbledCodeReg.FindAllString(string(ret), -1)) > 10 {
- url, name, size, ftype, fid = "附件中含有乱码", "附件中含有乱码", "", "", ""
- } else {
- url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
- }
- }
- if strings.TrimSpace(ftype) == "" {
- if len(path.Ext(name)) > 0 {
- ftype = path.Ext(name)[1:]
- }
- }
- S.Push(lua.LString(url))
- S.Push(lua.LString(name))
- S.Push(lua.LString(size))
- S.Push(lua.LString(ftype))
- S.Push(lua.LString(fid))
- return 5
- }))
- //支持正则
- s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int {
- index := int(S.ToNumber(-1))
- regstr := S.ToString(-2)
- text := S.ToString(-3)
- reg := regexp.MustCompile(regstr)
- reps := reg.FindAllStringSubmatchIndex(text, -1)
- ret := s.L.NewTable()
- number := 0
- for _, v := range reps {
- number++
- ret.Insert(number, lua.LString(text[v[index]:v[index+1]]))
- }
- S.Push(ret)
- return 1
- }))
- //支持替换
- s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int {
- text := S.ToString(-3)
- old := S.ToString(-2)
- repl := S.ToString(-1)
- text = strings.Replace(text, old, repl, -1)
- S.Push(lua.LString(text))
- return 1
- }))
- //标题的关键词、排除词过滤
- s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int {
- keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
- notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
- data := S.ToTable(-1)
- dataMap := util.TableToMap(data)
- ret := s.L.NewTable()
- num := 1
- for _, v := range dataMap {
- tmp := v.(map[string]interface{})
- isOk := false
- if title := qu.ObjToString(tmp["title"]); title != "" {
- if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
- isOk = true
- }
- }
- if isOk {
- ret.Insert(num, util.MapToLuaTable(S, tmp))
- num++
- }
- }
- S.Push(ret)
- return 1
- }))
- //标题的关键词、排除词过滤
- s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int {
- keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
- notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
- data := S.ToTable(-1)
- dataMap := util.TableToMap(data)
- if title := qu.ObjToString(dataMap["title"]); title != "" {
- if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
- S.Push(lua.LBool(true))
- return 1
- } else {
- qu.Debug(s.SCode, dataMap["href"], " title error")
- }
- } else {
- qu.Debug(s.SCode, dataMap["href"], " title error")
- }
- S.Push(lua.LBool(false))
- return 1
- }))
- //detail过滤
- s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int {
- /*
- 1.长度判断 (特殊处理:详情请访问原网页!;详见原网页;见原网页;无;无相关内容;无正文内容)
- 2.是否含汉字
- */
- reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|详见附件|见附件)")
- reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
- detail := S.ToString(-1)
- if reg1.MatchString(detail) {
- S.Push(lua.LBool(true))
- return 1
- }
- if len([]rune(detail)) < 50 || !reg2.MatchString(detail) {
- S.Push(lua.LBool(false))
- return 1
- }
- S.Push(lua.LBool(false))
- return 1
- }))
- //匹配汉字
- s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int {
- reg1 := regexp.MustCompile("(见附件|详见附件)")
- reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
- detail := S.ToString(-1)
- detail = reg1.ReplaceAllString(detail, "")
- ok := reg2.MatchString(detail)
- S.Push(lua.LBool(ok))
- return 1
- }))
- //aes ecb模式加密
- s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
- origData := S.ToString(-2)
- key := S.ToString(-1)
- bytekey := []byte(key)
- byteorigData := []byte(origData)
- cipher, _ := aes.NewCipher(generateKey([]byte(bytekey)))
- length := (len(byteorigData) + aes.BlockSize) / aes.BlockSize
- plain := make([]byte, length*aes.BlockSize)
- copy(plain, byteorigData)
- pad := byte(len(plain) - len(byteorigData))
- for i := len(byteorigData); i < len(plain); i++ {
- plain[i] = pad
- }
- encrypted := make([]byte, len(plain))
- // 分组分块加密
- for bs, be := 0, cipher.BlockSize(); bs <= len(byteorigData); bs, be = bs+cipher.BlockSize(), be+cipher.BlockSize() {
- cipher.Encrypt(encrypted[bs:be], plain[bs:be])
- }
- result := base64.StdEncoding.EncodeToString(encrypted)
- S.Push(lua.LString(result))
- return 1
- }))
- }
- //
- func (s *Script) Reload() {
- s.L.Close()
- s.LoadScript("", s.ScriptFile)
- }
// transUnic decodes \uXXXX escapes (four hex digits) in str and returns the
// result; all other bytes are copied through unchanged.
//
// A high surrogate escape directly followed by a low surrogate escape is
// combined into the single code point the pair encodes. A lone surrogate is
// written as U+FFFD. A backslash-u that is not followed by four hex digits is
// copied literally, one byte at a time, so a valid escape that starts shortly
// after it is still decoded.
func transUnic(str string) string {
	// escapeAt decodes the escape starting at byte offset i, if there is one.
	escapeAt := func(i int) (rune, bool) {
		if i+6 > len(str) || str[i] != '\\' || str[i+1] != 'u' {
			return 0, false
		}
		v, err := strconv.ParseUint(str[i+2:i+6], 16, 32)
		if err != nil {
			return 0, false
		}
		return rune(v), true
	}

	var buf bytes.Buffer
	for i := 0; i < len(str); {
		r, ok := escapeAt(i)
		if !ok {
			buf.WriteByte(str[i])
			i++
			continue
		}
		i += 6
		if r >= 0xD800 && r < 0xDC00 {
			// High surrogate: merge with a following low surrogate escape.
			if lo, ok := escapeAt(i); ok && lo >= 0xDC00 && lo < 0xE000 {
				r = 0x10000 + ((r - 0xD800) << 10) + (lo - 0xDC00)
				i += 6
			}
		}
		buf.WriteRune(r)
	}
	return buf.String()
}
- //取得变量
- func (s *Script) GetVar(key string) string {
- return s.L.GetGlobal(key).String()
- }
- //
- func (s *Script) GetIntVar(key string) int {
- lv := s.L.GetGlobal(key)
- if v, ok := lv.(lua.LNumber); ok {
- return int(v)
- }
- return -1
- }
- //
- func (s *Script) GetBoolVar(key string) bool {
- lv := s.L.GetGlobal(key)
- if v, ok := lv.(lua.LBool); ok {
- return bool(v)
- }
- return false
- }
// generateKey folds key into a 16-byte key: byte i of the input is XORed into
// position i mod 16 of a zero-filled result. Keys shorter than 16 bytes are
// therefore zero-padded, and longer keys wrap around.
func generateKey(key []byte) (genKey []byte) {
	const size = 16
	genKey = make([]byte, size)
	for idx, b := range key {
		genKey[idx%size] ^= b
	}
	return genKey
}
|