/**
脚本加载+调用 封装,
前期走文件系统加载
后期走数据库配置,
LUA中公共的方法需要抽出来,主脚本文件加载LUA公共文件
*/
package spider
import (
"bytes"
"compress/gzip"
"crypto/aes"
"encoding/base64"
"encoding/json"
"io/ioutil"
mu "mfw/util"
"net/http"
"net/url"
"path"
qu "qfw/util"
"regexp"
util "spiderutil"
"strconv"
"strings"
"time"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
"github.com/cjoudrey/gluahttp"
lujson "github.com/yuin/gopher-json"
"github.com/yuin/gopher-lua"
)
// Script bundles one spider Lua script with the Lua state that runs it and
// the download settings used by the Go helpers exposed to that script.
type Script struct {
	// SCode identifies the script; it appears in debug logs and is passed to
	// util.UploadFile by the downloadFile helper.
	SCode string
	// ScriptFile holds the script argument last given to LoadScript.
	ScriptFile string
	// Encoding is read from the Lua global spiderPageEncoding by LoadScript and
	// is the default charset for downloads.
	Encoding string
	// Downloader is the downloader used for requests.
	Downloader string
	// Timeout is the request timeout in seconds.
	Timeout int64
	// L is the Lua state created by LoadScript.
	L *lua.LState
	// Test_luareqcount counts requests issued by the script.
	Test_luareqcount int
	// Test_goreqtime counts requests issued by Go (time).
	Test_goreqtime int
	// Test_goreqlist counts requests issued by Go (list page).
	Test_goreqlist int
	// Test_goreqcon counts requests issued by Go (detail/body page).
	Test_goreqcon int
}
//加载文件
func (s *Script) LoadScript(downloadnode, script string, isfile ...string) {
s.ScriptFile = script
options := lua.Options{
RegistrySize: 256 * 20,
CallStackSize: 256,
IncludeGoStackTrace: false,
}
s.L = lua.NewState(options)
//s.L.ScriptFileName = s.SCode
s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader)
s.L.PreloadModule("json", lujson.Loader)
if len(isfile) > 0 {
if err := s.L.DoFile(script); err != nil {
panic("加载lua脚本错误" + err.Error())
}
} else {
if err := s.L.DoString(script); err != nil {
panic("加载lua脚本错误" + err.Error())
}
}
s.Encoding = s.GetVar("spiderPageEncoding")
//暴露go方法
//download(url,head) 普通下载
s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int {
head := S.ToTable(-1)
url := S.ToString(-2)
ishttps := S.ToBool(-3)
charset := S.ToString(-4)
if charset == "" {
charset = s.Encoding
}
ret := Download(downloadnode, s.Downloader, url, "get", util.GetTable(head), charset, false, ishttps, "", s.Timeout)
S.Push(lua.LString(ret))
s.Test_luareqcount++
return 1
}))
s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int {
gpath := S.ToString(-2)
content := S.ToString(-1)
ret := util.FindContentText(gpath, content)
S.Push(ret)
return 1
}))
//高级下载download(url,method,param,head,cookie)
s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int {
cookie := S.ToString(-1)
head := S.ToTable(-2)
param := S.ToTable(-3)
method := S.ToString(-4)
url := S.ToString(-5)
ishttps := S.ToBool(-6)
charset := S.ToString(-7)
if charset == "" {
charset = s.Encoding
}
var mycookie []*http.Cookie
json.Unmarshal([]byte(cookie), &mycookie)
var ret string
var retcookie []*http.Cookie
if param == nil {
ptext := map[string]interface{}{"text": S.ToString(-3)}
ret, retcookie = DownloadAdv(downloadnode, s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, false, ishttps, "", s.Timeout)
} else {
ret, retcookie = DownloadAdv(downloadnode, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, false, ishttps, "", s.Timeout)
}
S.Push(lua.LString(ret))
scookie, _ := json.Marshal(retcookie)
S.Push(lua.LString(scookie))
s.Test_luareqcount++
return 2
}))
s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int {
nodetype := S.ToString(-3)
gpath := S.ToString(-2)
content := S.ToString(-1)
ret := util.FindOneText(gpath, content, nodetype)
S.Push(ret)
return 1
}))
s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int {
nodetype := S.ToString(-3)
gpath := S.ToString(-2)
content := S.ToString(-1)
ret := util.FindOneHtml(gpath, content, nodetype)
S.Push(ret)
return 1
}))
s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int {
gpath := S.ToString(-2)
content := S.ToString(-1)
ret := s.L.NewTable()
util.FindListText(gpath, content, ret)
S.Push(ret)
return 1
}))
s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int {
gpath := S.ToString(-2)
content := S.ToString(-1)
ret := s.L.NewTable()
util.FindListHtml(gpath, content, ret)
S.Push(ret)
return 1
}))
s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int {
qmap := S.ToTable(-2)
content := S.ToString(-1)
ret := s.L.NewTable()
util.FindMap(qmap, content, ret)
S.Push(ret)
return 1
}))
//调用jsvm
s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int {
js := S.ToString(-1)
ret := s.L.NewTable()
if js == "" {
ret.RawSet(lua.LString("val"), lua.LString(""))
ret.RawSet(lua.LString("err"), lua.LString("js is null"))
} else {
rep := util.JsVmPost(util.Config.JsVmUrl, js)
ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"])))
ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"])))
}
S.Push(ret)
return 1
}))
//指定下载器
s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int {
s.Downloader = GetOneDownloader()
S.Push(lua.LString(s.Downloader))
return 1
}))
//手工延时
s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int {
time.Sleep(1 * time.Second)
return 0
}))
//编码解码
s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int {
codeType := strings.ToLower(S.ToString(-2))
str := S.CheckString(-1)
switch codeType {
case "unicode":
str = strings.Replace(str, "%u", "\\u", -1)
str = transUnic(str)
case "urlencode_gbk":
data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
l, _ := url.Parse("http://a.com/?" + string(data))
tmpstr := l.Query().Encode()
if len(tmpstr) > 1 {
str = tmpstr[0 : len(tmpstr)-1]
} else {
str = ""
}
case "urlencode_utf8":
l, _ := url.Parse("http://a.com/?" + str)
tmpstr := l.Query().Encode()
if len(tmpstr) > 1 {
str = tmpstr[0 : len(tmpstr)-1]
} else {
str = ""
}
case "urldecode_utf8":
str, _ = url.QueryUnescape(str)
case "decode64":
str = util.DecodeB64(str)
case "encodemd5":
str = qu.GetMd5String(str)
case "htmldecode": //html实体码
//txt := `
太阳岛特勤消防站、松浦特勤消防站建设项目设计中标公示
`
str = S.ToString(-1)
reg, _ := regexp.Compile("\\d+;")
str = reg.ReplaceAllStringFunc(str, func(src string) string {
v, _ := strconv.Atoi(src[2 : len(src)-1])
return string(rune(v))
})
}
S.Push(lua.LString(str))
return 1
}))
//保存错误日志
s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
return 0
}))
//添加改版日志
s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int {
return 0
}))
//如果服务端返回的html是gzip压缩过格式的 这里需要转一下
s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int {
html := S.ToString(-1)
bs := []byte(html)
gzipreader, _ := gzip.NewReader(bytes.NewReader(bs))
bs, _ = ioutil.ReadAll(gzipreader)
S.Push(lua.LString(bs))
return 1
}))
s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
bResult := false
S.Push(lua.LBool(bResult))
return 1
}))
//解析附件中的word、pdf
s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int {
ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)}
str := S.ToString(-2)
extension := S.ToString(-1)
bs, _ := base64.StdEncoding.DecodeString(str)
bs = append([]byte{ext[extension]}, bs...)
msgid := mu.UUID(8)
Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60)
return 1
}))
//下载附件download(url,method,param,head,cookie,fileName)
s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
cookie := S.ToString(-1)
head := S.ToTable(-2)
param := S.ToTable(-3)
method := S.ToString(-4)
url := S.ToString(-5)
fileName := S.ToString(-6)
ishttps := strings.Contains(url, "https")
var mycookie []*http.Cookie
if cookie != "{}" {
json.Unmarshal([]byte(cookie), &mycookie)
} else {
mycookie = make([]*http.Cookie, 0)
}
fileName = strings.TrimSpace(fileName)
url = strings.TrimSpace(url)
ret := DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, false, ishttps, "", s.Timeout)
name, size, ftype, fid := "", "", "", ""
qu.Debug(GarbledCodeReg.FindAllString(string(ret), -1), len(ret))
if ret == nil || len(ret) < 1024*5 {
qu.Debug("下载文件出错!")
} else {
ftype = qu.GetFileType(ret)
if (ftype == "docx" || ftype == "doc") && len(GarbledCodeReg.FindAllString(string(ret), -1)) > 10 {
url, name, size, ftype, fid = "附件中含有乱码", "附件中含有乱码", "", "", ""
} else {
url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
}
}
if strings.TrimSpace(ftype) == "" {
if len(path.Ext(name)) > 0 {
ftype = path.Ext(name)[1:]
}
}
S.Push(lua.LString(url))
S.Push(lua.LString(name))
S.Push(lua.LString(size))
S.Push(lua.LString(ftype))
S.Push(lua.LString(fid))
return 5
}))
//支持正则
s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int {
index := int(S.ToNumber(-1))
regstr := S.ToString(-2)
text := S.ToString(-3)
reg := regexp.MustCompile(regstr)
reps := reg.FindAllStringSubmatchIndex(text, -1)
ret := s.L.NewTable()
number := 0
for _, v := range reps {
number++
ret.Insert(number, lua.LString(text[v[index]:v[index+1]]))
}
S.Push(ret)
return 1
}))
//支持替换
s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int {
text := S.ToString(-3)
old := S.ToString(-2)
repl := S.ToString(-1)
text = strings.Replace(text, old, repl, -1)
S.Push(lua.LString(text))
return 1
}))
//标题的关键词、排除词过滤
s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int {
keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
data := S.ToTable(-1)
dataMap := util.TableToMap(data)
ret := s.L.NewTable()
num := 1
for _, v := range dataMap {
tmp := v.(map[string]interface{})
isOk := false
if title := qu.ObjToString(tmp["title"]); title != "" {
if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
isOk = true
}
}
if isOk {
ret.Insert(num, util.MapToLuaTable(S, tmp))
num++
}
}
S.Push(ret)
return 1
}))
//标题的关键词、排除词过滤
s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int {
keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
data := S.ToTable(-1)
dataMap := util.TableToMap(data)
if title := qu.ObjToString(dataMap["title"]); title != "" {
if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
S.Push(lua.LBool(true))
return 1
} else {
qu.Debug(s.SCode, dataMap["href"], " title error")
}
} else {
qu.Debug(s.SCode, dataMap["href"], " title error")
}
S.Push(lua.LBool(false))
return 1
}))
//detail过滤
s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int {
/*
1.长度判断 (特殊处理:详情请访问原网页!;详见原网页;见原网页;无;无相关内容;无正文内容)
2.是否含汉字
*/
reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|详见附件|见附件)")
reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
detail := S.ToString(-1)
if reg1.MatchString(detail) {
S.Push(lua.LBool(true))
return 1
}
if len([]rune(detail)) < 50 || !reg2.MatchString(detail) {
S.Push(lua.LBool(false))
return 1
}
S.Push(lua.LBool(false))
return 1
}))
//匹配汉字
s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int {
reg1 := regexp.MustCompile("(见附件|详见附件)")
reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
detail := S.ToString(-1)
detail = reg1.ReplaceAllString(detail, "")
ok := reg2.MatchString(detail)
S.Push(lua.LBool(ok))
return 1
}))
//aes ecb模式加密
s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
origData := S.ToString(-2)
key := S.ToString(-1)
bytekey := []byte(key)
byteorigData := []byte(origData)
cipher, _ := aes.NewCipher(generateKey([]byte(bytekey)))
length := (len(byteorigData) + aes.BlockSize) / aes.BlockSize
plain := make([]byte, length*aes.BlockSize)
copy(plain, byteorigData)
pad := byte(len(plain) - len(byteorigData))
for i := len(byteorigData); i < len(plain); i++ {
plain[i] = pad
}
encrypted := make([]byte, len(plain))
// 分组分块加密
for bs, be := 0, cipher.BlockSize(); bs <= len(byteorigData); bs, be = bs+cipher.BlockSize(), be+cipher.BlockSize() {
cipher.Encrypt(encrypted[bs:be], plain[bs:be])
}
result := base64.StdEncoding.EncodeToString(encrypted)
S.Push(lua.LString(result))
return 1
}))
}
// Reload closes the current Lua state and loads the stored script again.
//
// NOTE(review): the script is reloaded with an empty download node and without
// the isfile flag, so a script first loaded from a file path would presumably
// be re-run as Lua source text — confirm against callers.
func (s *Script) Reload() {
	source := s.ScriptFile
	s.L.Close()
	s.LoadScript("", source)
}
// transUnic decodes "\uXXXX" escapes (four hex digits) in str into the
// characters they name and copies every other byte through unchanged.
//
// A UTF-16 surrogate pair written as two consecutive escapes, for example
// "\ud83d\ude00", is combined into the single character it encodes. A lone
// surrogate becomes U+FFFD. A backslash-u that is not followed by four hex
// digits is copied as is, and scanning resumes at the next byte so a valid
// escape starting right after it is still decoded.
func transUnic(str string) string {
	// unit decodes the escape that starts at byte offset pos, if there is one.
	unit := func(pos int) (rune, bool) {
		if pos+6 > len(str) || str[pos] != '\\' || str[pos+1] != 'u' {
			return 0, false
		}
		v, err := strconv.ParseUint(str[pos+2:pos+6], 16, 16)
		if err != nil {
			return 0, false
		}
		return rune(v), true
	}

	var out strings.Builder
	out.Grow(len(str))
	for i := 0; i < len(str); {
		r, ok := unit(i)
		if !ok {
			out.WriteByte(str[i])
			i++
			continue
		}
		i += 6
		// High surrogate: merge it with a directly following low surrogate.
		if r >= 0xD800 && r <= 0xDBFF {
			if lo, isUnit := unit(i); isUnit && lo >= 0xDC00 && lo <= 0xDFFF {
				r = 0x10000 + ((r - 0xD800) << 10) + (lo - 0xDC00)
				i += 6
			}
		}
		out.WriteRune(r)
	}
	return out.String()
}
// GetVar returns the string form of the Lua global named key.
//
// NOTE(review): for a global the script never set, this presumably returns
// gopher-lua's rendering of nil rather than "" — confirm before comparing the
// result with the empty string.
func (s *Script) GetVar(key string) string {
	value := s.L.GetGlobal(key)
	return value.String()
}
// GetIntVar returns the Lua global named key converted to int, or -1 when the
// global is not a Lua number.
func (s *Script) GetIntVar(key string) int {
	num, isNumber := s.L.GetGlobal(key).(lua.LNumber)
	if !isNumber {
		return -1
	}
	return int(num)
}
// GetBoolVar returns the Lua global named key as a bool, or false when the
// global is not a Lua boolean.
func (s *Script) GetBoolVar(key string) bool {
	flag, isBool := s.L.GetGlobal(key).(lua.LBool)
	return isBool && bool(flag)
}
// generateKey folds key into a 16-byte key: the first 16 bytes are copied
// (zero-padded when key is shorter) and every further byte is XORed into the
// position given by its offset modulo 16.
func generateKey(key []byte) (genKey []byte) {
	const size = 16
	genKey = make([]byte, size)
	copy(genKey, key)
	for pos := size; pos < len(key); pos++ {
		genKey[pos%size] ^= key[pos]
	}
	return genKey
}