package spider
import (
codegrpc "analysiscode/client"
"bytes"
"compress/gzip"
"encoding/base64"
"encoding/json"
"github.com/shopspring/decimal"
gojs "gorunjs/client"
"io"
"io/ioutil"
mu "mfw/util"
"net/http"
"net/url"
"path"
qu "qfw/util"
"regexp"
util "spiderutil"
"strconv"
"strings"
"sync/atomic"
"time"
gq "github.com/PuerkitoBio/goquery"
"github.com/cjoudrey/gluahttp"
"github.com/donnie4w/go-logger/logger"
lujson "github.com/yuin/gopher-json"
"github.com/yuin/gopher-lua"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/transform"
)
// MAX_STEP is the maximum step width used by the list-page analysis
// (neighbour scan and parent lookup in getSimpleListPage).
const MAX_STEP = 5
// TimeSleepChan is a channel with buffer size 1 that the saveErrLog and
// timeSleep Lua helpers pass to util.TimeSleepFunc.
// NOTE(review): how TimeSleepFunc uses the channel is not visible here.
var TimeSleepChan = make(chan bool, 1)
// Script bundles one spider's Lua script with its Lua state, download
// settings and request/error counters.
type Script struct {
	// SCode is the spider code handed to LoadScript.
	SCode string
	// ScriptFile is the Lua source text handed to LoadScript.
	ScriptFile string
	// Encoding is read from the Lua global spiderPageEncoding.
	Encoding string
	// Userproxy is read from the Lua global spiderUserProxy.
	Userproxy bool
	// (An Ishttps bool field used to live here; it is disabled.)

	// ErrorNum counts errors.
	ErrorNum int32
	// Downloader is the downloader used for requests.
	Downloader string
	// TotalRequestNum is the total number of requests.
	TotalRequestNum int32
	// ToDayRequestNum is the number of requests made today.
	ToDayRequestNum int32
	// YestoDayRequestNum is the number of requests made yesterday.
	YestoDayRequestNum int32
	// Timeout is the request timeout in seconds.
	Timeout int64
	// L is the Lua state the script runs in.
	L *lua.LState
	// NoDownloadNum counts items that were not downloaded successfully.
	NoDownloadNum int32
	// LastThreeTimes keeps the durations of the most recent page downloads.
	LastThreeTimes []time.Duration
	// FileLastThreeTimes keeps the durations of the most recent attachment downloads.
	FileLastThreeTimes []time.Duration
}
// ErrFid marks an access-restricted attachment: downloadFile and
// newDownloadFile blank out size/type/fid when the uploaded file's fid
// contains this value.
var ErrFid = "a6879f0a8570256aa21fb978e6dabb50429a30dfacff697cf0b898abbc5c262e" // access-restricted attachment
// 加载文件
func (s *Script) LoadScript(site, channel, user *string, code, script_file string) string {
defer mu.Catch()
s.SCode = code
s.ScriptFile = script_file
s.L = lua.NewState(lua.Options{
RegistrySize: 256 * 20,
CallStackSize: 256,
IncludeGoStackTrace: false,
})
s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader)
s.L.PreloadModule("json", lujson.Loader)
if err := s.L.DoString(script_file); err != nil {
logger.Debug(code + ",加载lua脚本错误:" + err.Error())
return "加载lua脚本错误:" + err.Error()
//panic(code + ",加载lua脚本错误:" + err.Error())
}
s.Encoding = s.GetVar("spiderPageEncoding")
s.Userproxy = s.GetBoolVar("spiderUserProxy")
//暴露go方法
//download(url,head) 普通下载
s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int {
if s.LastThreeTimes == nil {
s.LastThreeTimes = make([]time.Duration, 4)
}
if util.Config.IsDelay {
SleepTime(1, s.LastThreeTimes) //睡眠时间
}
start := time.Now() //起始时间
head := S.ToTable(-1)
url := S.ToString(-2)
ishttps := S.ToBool(-3)
charset := S.ToString(-4)
if charset == "" {
charset = s.Encoding
}
var retLen int64
ret := Download(&retLen, s.Downloader, url, "get", util.GetTable(head), charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
//流量统计
//if retLen > 0 {
// key := Today + "+" + code
// if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
// if sfMap, ok := sf.(*SpiderFlow); ok {
// sfMap.Flow += retLen
// //sfMap.Site = *site
// //sfMap.Channel = *channel
// //sfMap.ModifyUser = *user
// SpiderFlowMap.Store(key, sfMap)
// }
// } else {
// SpiderFlowMap.Store(key, &SpiderFlow{
// //Code: code,
// Site: *site,
// Channel: *channel,
// Flow: retLen,
// ModifyUser: *user,
// })
// }
//}
S.Push(lua.LString(ret))
atomic.AddInt32(&s.ToDayRequestNum, 1)
atomic.AddInt32(&s.TotalRequestNum, 1)
end := time.Since(start)
if len(s.LastThreeTimes) >= 4 {
s.LastThreeTimes = s.LastThreeTimes[1:]
}
s.LastThreeTimes = append(s.LastThreeTimes, end)
return 1
}))
//高级下载downloadAdv(url,method,param,head,cookie)
s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int {
if s.LastThreeTimes == nil {
s.LastThreeTimes = make([]time.Duration, 4)
}
if util.Config.IsDelay {
SleepTime(1, s.LastThreeTimes) //睡眠时间
}
start := time.Now() //起始时间
cookie := S.ToString(-1)
head := S.ToTable(-2)
param := S.ToTable(-3)
method := S.ToString(-4)
url := S.ToString(-5)
ishttps := S.ToBool(-6)
charset := S.ToString(-7)
if charset == "" {
charset = s.Encoding
}
var mycookie []*http.Cookie
json.Unmarshal([]byte(cookie), &mycookie)
var ret string
var retcookie []*http.Cookie
var headers = map[string]interface{}{}
var retLen int64
if param == nil {
ptext := map[string]interface{}{"text": S.ToString(-3)}
ret, retcookie, headers = DownloadAdv(&retLen, s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
} else {
ret, retcookie, headers = DownloadAdv(&retLen, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
}
//流量统计
//if retLen > 0 {
// key := Today + "+" + code
// if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
// if sfMap, ok := sf.(*SpiderFlow); ok {
// sfMap.Flow += retLen
// //sfMap.Site = *site
// //sfMap.Channel = *channel
// //sfMap.ModifyUser = *user
// SpiderFlowMap.Store(key, sfMap)
// }
// } else {
// SpiderFlowMap.Store(key, &SpiderFlow{
// //Code: code,
// Site: *site,
// Channel: *channel,
// Flow: retLen,
// ModifyUser: *user,
// })
// }
//}
S.Push(lua.LString(ret))
scookie, _ := json.Marshal(retcookie)
S.Push(lua.LString(scookie))
hTable := util.MapToLuaTable(S, headers)
S.Push(hTable)
atomic.AddInt32(&s.ToDayRequestNum, 1)
atomic.AddInt32(&s.TotalRequestNum, 1)
end := time.Since(start)
if len(s.LastThreeTimes) >= 4 {
s.LastThreeTimes = s.LastThreeTimes[1:]
}
s.LastThreeTimes = append(s.LastThreeTimes, end)
return 3
}))
//下载附件downloadFile(url,method,param,head,cookie,fileName)
s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
if s.FileLastThreeTimes == nil {
s.FileLastThreeTimes = make([]time.Duration, 4)
}
if util.Config.IsDelay {
SleepTime(3, s.FileLastThreeTimes) //睡眠时间
}
start := time.Now() //起始时间
cookie := S.ToString(-1)
head := S.ToTable(-2)
param := S.ToTable(-3)
method := S.ToString(-4)
url := S.ToString(-5)
fileName := S.ToString(-6)
ishttps := strings.Contains(url, "https")
//base64匹配
base64UrlReg := regexp.MustCompile("data:image")
indexArr := base64UrlReg.FindStringIndex(url)
name, size, ftype, fid := "", "", "", ""
tmpUrl := ""
var ret []byte
var err error
var mycookie []*http.Cookie
if cookie != "{}" {
json.Unmarshal([]byte(cookie), &mycookie)
} else {
mycookie = make([]*http.Cookie, 0)
}
//base64 url
if len(indexArr) == 2 { //base64 http://www.mmjyjt.com/data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAqAAAAOwCAYAAAD
//截取base64
start := indexArr[0]
url = url[start:]
fileName = "文件下载.jpg"
index := strings.Index(url, ",")
dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(url[index+1:]))
ret, err = io.ReadAll(dec)
if err == nil && len(ret) > 0 {
url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, "", ret)
}
} else {
fileName = strings.TrimSpace(fileName)
url = strings.TrimSpace(url)
tmpUrl = url
ret = DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
if strings.TrimSpace(ftype) == "" {
if len(path.Ext(name)) > 0 {
ftype = path.Ext(name)[1:]
}
}
}
//特殊处理中国招标投标公共服务平台异常附件过滤
if *site == "中国招标投标公共服务平台" {
if fid != "" && strings.Contains(fid, ErrFid) { //限制访问的附件
size, ftype, fid = "", "", "" //信息置空,AnalysisProjectInfo方法将判断数据下载失败重新下载
} else if bttype := qu.GetFileType(ret); bttype != "pdf" { //由字节流解析的附件类型不是pdf
logger.Info("Error File Type:", bttype, url)
size, ftype, fid = "", "", ""
}
} else if *site == "中国政府采购网" && tmpUrl != "" { //中国政府采购网附件大小异常,限制IP所致
if size == "4.1 KB" || size == "4.2 KB" {
times := 1
for { //重试三次
if times > 3 {
break
}
//http://www.ccgp.gov.cn/cggg/dfgg/jzxcs/202302/t20230210_19437644.htm
ret = DownloadFile(s.Downloader, tmpUrl, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
bs := bytes.NewReader(ret)
bsLen := qu.ConvertFileSize(bs.Len())
if bsLen != "4.1 KB" && bsLen != "4.2 KB" && bsLen != "0 B" {
url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, tmpUrl, ret)
break
}
times++
}
if size == "4.1 KB" || size == "4.2 KB" { //重试后异常
fid = ""
ftype = ""
name = ""
}
}
}
S.Push(lua.LString(url))
S.Push(lua.LString(name))
S.Push(lua.LString(size))
S.Push(lua.LString(ftype))
S.Push(lua.LString(fid))
atomic.AddInt32(&s.ToDayRequestNum, 1)
atomic.AddInt32(&s.TotalRequestNum, 1)
end := time.Since(start)
if len(s.FileLastThreeTimes) >= 4 {
s.FileLastThreeTimes = s.FileLastThreeTimes[1:]
}
s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
return 5
}))
//下载、上传base64图片
s.L.SetGlobal("downloadBase64File", s.L.NewFunction(func(S *lua.LState) int {
url := S.ToString(-3)
fileName := S.ToString(-2)
base64Img := S.ToString(-1)
if fileName == "" {
fileName = "文件下载"
}
fileName = fileName + ".jpg"
i := strings.Index(base64Img, ",")
dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(base64Img[i+1:]))
ret, err := io.ReadAll(dec)
name, size, ftype, fid := "", "", "", ""
if err == nil && len(ret) > 0 {
url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
}
S.Push(lua.LString(url))
S.Push(lua.LString(name))
S.Push(lua.LString(size))
S.Push(lua.LString(ftype))
S.Push(lua.LString(fid))
return 5
}))
//保存验证错误日志
s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
code := S.ToString(-4)
name := S.ToString(-3)
url := S.ToString(-2)
content := S.ToString(-1)
//saveVerificationLog(code, name, url, content)
logger.Info("Error Log:", code, name, url, content)
atomic.AddInt32(&s.ErrorNum, 1)
atomic.AddInt32(&s.NoDownloadNum, 1)
//防止恶意增加日志
util.TimeSleepFunc(5*time.Second, TimeSleepChan)
return 0
}))
//添加改版日志
s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int {
url := S.ToString(-2)
str := S.ToString(-1)
logger.Error(s.SCode, url, str)
return 0
}))
//查找信息是否存在(作废)
s.L.SetGlobal("findHasExit", s.L.NewFunction(func(S *lua.LState) int {
//c := S.ToString(-2)
//q := S.ToString(-1)
//b := findHasExit(c, q)
S.Push(lua.LBool(false))
return 1
}))
s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int {
nodetype := S.ToString(-3)
gpath := S.ToString(-2)
content := S.ToString(-1)
ret := util.FindOneText(gpath, content, nodetype)
S.Push(ret)
return 1
}))
s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int {
gpath := S.ToString(-2)
content := S.ToString(-1)
ret := util.FindContentText(gpath, content)
S.Push(ret)
return 1
}))
s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int {
nodetype := S.ToString(-3)
gpath := S.ToString(-2)
content := S.ToString(-1)
ret := util.FindOneHtml(gpath, content, nodetype)
S.Push(ret)
return 1
}))
s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int {
gpath := S.ToString(-2)
content := S.ToString(-1)
ret := s.L.NewTable()
util.FindListText(gpath, content, ret)
S.Push(ret)
return 1
}))
s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int {
gpath := S.ToString(-2)
content := S.ToString(-1)
ret := s.L.NewTable()
util.FindListHtml(gpath, content, ret)
//if ret.Len() > 0 {
// UpdateHeart(site, channel, code, user, "findlist") //记录列表页实际采集数据量心跳
//}
S.Push(ret)
return 1
}))
//推送列表页下载数据量
s.L.SetGlobal("sendListNum", s.L.NewFunction(func(S *lua.LState) int {
table := S.ToTable(-1)
list := util.TableToMap(table)
logger.Info(s.SCode, len(list))
//if len(list) > 0 {
// UpdateHeart(*site, *channel, code, *user, "findlist") //记录列表页实际采集数据量心跳
//}
return 1
}))
// s.L.SetGlobal("findMgoData", s.L.NewFunction(func(S *lua.LState) int {
// update := [][]map[string]interface{}{}
// query := map[string]interface{}{"state": 0}
// data, _ := Mgo.Find(util.Config.TmpCollName, query, `{"_id":-1}`, nil, false, 0, 10)
// pageList := []interface{}{}
// for _, d := range *data {
// tmpMap := map[string]string{}
// tmpMap["title"] = qu.ObjToString(d["title"])
// tmpMap["detail"] = qu.ObjToString(d["detail"])
// tmpMap["href"] = qu.ObjToString(d["href"])
// publishtime := qu.Int64All(d["publishtime"])
// tmpMap["publishtime"] = qu.FormatDateByInt64(&publishtime, qu.Date_Full_Layout)
// tmpMap["_id"] = qu.BsonIdToSId(d["_id"])
// pageList = append(pageList, tmpMap)
// update = append(update, []map[string]interface{}{
// map[string]interface{}{"_id": d["_id"]},
// map[string]interface{}{"$set": map[string]interface{}{"state": 1}},
// })
// }
// ret := util.MapToTable(s.L, pageList)
// S.Push(ret)
// if len(update) > 0 {
// Mgo.UpdateBulk(util.Config.TmpCollName, update...)
// }
// return 1
// }))
s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int {
qmap := S.ToTable(-2)
content := S.ToString(-1)
ret := s.L.NewTable()
util.FindMap(qmap, content, ret)
S.Push(ret)
return 1
}))
//公示暴露方式
s.L.SetGlobal("getEcpsCode", s.L.NewFunction(func(S *lua.LState) int {
area := strings.ToUpper(S.ToString(-2))
content := S.ToString(-1)
code, state := util.GetEcpsCode(area, []byte(content))
if state == "wx" {
code, _ = GetCodeByWx([]byte(content))
}
S.Push(lua.LString(code))
return 1
}))
//调用jsvm
s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int {
js := S.ToString(-1)
ret := s.L.NewTable()
if js == "" {
ret.RawSet(lua.LString("val"), lua.LString(""))
ret.RawSet(lua.LString("err"), lua.LString("js is null"))
} else {
rep := util.JsVmPost(util.Config.JsVmUrl, js)
ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"])))
ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"])))
}
S.Push(ret)
return 1
}))
//指定下载器
s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int {
s.Downloader = GetOneDownloader()
S.Push(lua.LString(s.Downloader))
return 1
}))
//指定下载器file
s.L.SetGlobal("changeDownloaderFile", s.L.NewFunction(func(S *lua.LState) int {
s.Downloader = GetOneDownloaderFile()
S.Push(lua.LString(s.Downloader))
return 1
}))
//手工延时
s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int {
// if workTime {
// util.TimeSleepFunc(time.Duration(S.ToInt(-1))*time.Second, TimeSleepChan)
// } else {
// util.TimeSleepFunc(1*time.Second, TimeSleepChan)
// }
util.TimeSleepFunc(time.Second*2, TimeSleepChan)
return 0
}))
//编码解码
s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int {
codeType := strings.ToLower(S.ToString(-2))
str := S.CheckString(-1)
switch codeType {
case "unicode":
str = strings.Replace(str, "%u", "\\u", -1)
str = transUnic(str)
case "urlencode_gbk":
data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
l, _ := url.Parse("http://a.com/?" + string(data))
tmpstr := l.Query().Encode()
if len(tmpstr) > 1 {
str = tmpstr[0 : len(tmpstr)-1]
} else {
str = ""
}
case "urlencode_utf8":
l, _ := url.Parse("http://a.com/?" + str)
tmpstr := l.Query().Encode()
if len(tmpstr) > 1 {
str = tmpstr[0 : len(tmpstr)-1]
} else {
str = ""
}
case "urldecode_utf8":
str, _ = url.QueryUnescape(str)
case "decode64":
str = util.DecodeB64(str)
case "encodemd5":
str = qu.GetMd5String(str)
case "htmldecode": //html实体码
//txt := `
太阳岛特勤消防站、松浦特勤消防站建设项目设计中标公示
`
str = S.ToString(-1)
reg, _ := regexp.Compile("\\d+;")
str = reg.ReplaceAllStringFunc(str, func(src string) string {
v, _ := strconv.Atoi(src[2 : len(src)-1])
return string(rune(v))
})
}
S.Push(lua.LString(str))
return 1
}))
//如果服务端返回的html是gzip压缩过格式的 这里需要转一下
s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int {
html := S.ToString(-1)
bs := []byte(html)
gzipreader, _ := gzip.NewReader(bytes.NewReader(bs))
bs, _ = ioutil.ReadAll(gzipreader)
S.Push(lua.LString(bs))
return 1
}))
//luamaker提供的分析列表页url地址 获取列表数据公用方法
s.L.SetGlobal("getSimpleListPage", s.L.NewFunction(func(S *lua.LState) int {
html := S.ToString(-3)
date_pattern := S.ToString(-2)
pageListUrl := S.ToString(-1) //列表页url
bs := []byte(html)
tmparr := []string{}
tmpret := []int{}
re, _ := regexp.Compile(`采购|招标|公示|公告|意见|结果|通知|工程`)
doc, _ := gq.NewDocumentFromReader(bytes.NewReader(bs))
doc.Find("a").Each(func(i int, sq *gq.Selection) {
text := sq.Text()
if len(text) < 30 {
return
}
tmparr = append(tmparr, text)
if re.MatchString(text) {
tmpret = append(tmpret, 1)
//logger.Debug(text)
} else {
tmpret = append(tmpret, 0)
}
})
logger.Debug(tmpret)
//线性分析,算周边,只算周围5步的点
tmplen, thepos, themax := len(tmpret), -1, 0
for i := 0; i < tmplen; i++ {
if tmpret[i] == 0 {
continue
}
start, end := i-MAX_STEP, i+MAX_STEP
if start < 0 {
start = 0
}
if end > tmplen {
end = tmplen
}
tmp := 0
//从当前位置往左,往右找连续点
for j := i; j > start; j-- {
if tmpret[j] == 1 {
tmp++
} else {
break
}
}
for j := i; j < end; j++ {
if tmpret[j] == 1 {
tmp++
} else {
break
}
}
if tmp > themax {
themax = tmp
thepos = i
}
} //end of for...
//logger.Debug("找位置完成")
//验证
if thepos == -1 {
logger.Error("完蛋,找不到")
panic("不支持啊,失败啊")
}
//下边是找父容器
var thelink *gq.Selection
doc.Find("a").Each(func(i int, sq *gq.Selection) {
if sq.Text() == tmparr[thepos] {
thelink = sq
}
})
isfind := false
//同样Path向上找,不超过5步
for i := 0; i < MAX_STEP; i++ {
thelink = thelink.Parent()
clen := getChildrenLen(thelink)
if clen >= themax-1 {
isfind = true
break
}
//logger.Debug("TAG:::", thelink.Nodes[0].Data, clen)
}
//找到列表
pageList := []interface{}{}
if isfind {
thelink.Children().Each(func(i int, sq *gq.Selection) {
page := map[string]string{}
link_sq := sq.Find("a")
href := link_sq.AttrOr("href", "")
text := link_sq.Text()
page["title"] = text
page["href"] = dealHref(pageListUrl, href)
page["publishtime"] = dealPublishTime(strings.TrimSpace(sq.Text()), date_pattern)
//logger.Debug(i)
pageList = append(pageList, page)
})
} else {
logger.Error("完蛋,找父亲节点失败啊")
//panic("不支持啊,失败啊")
}
ret := util.MapToTable(s.L, pageList)
S.Push(ret)
return 1
}))
//招投标信息标题判重
s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
S.Push(lua.LBool(false))
return 1
}))
//招标信息判重新方法 2016-12-14 wanghuidong
s.L.SetGlobal("urlRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
S.Push(lua.LBool(false))
return 1
}))
//将url放入内存缓存 2016-12-14 wanghuidong
s.L.SetGlobal("putUrl2Redis", s.L.NewFunction(func(S *lua.LState) int {
//url := S.ToString(-1)
return 1
}))
//解析附件中的word、pdf
s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int {
ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)}
str := S.ToString(-2)
extension := S.ToString(-1)
bs, _ := base64.StdEncoding.DecodeString(str)
bs = append([]byte{ext[extension]}, bs...)
msgid := mu.UUID(8)
Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60)
return 1
}))
s.L.SetGlobal("clearMemoeryCache", s.L.NewFunction(func(S *lua.LState) int {
/*title := S.ToString(-1)
isExist, _ := redis.Exists("title_repeat_judgement", "title_repeat_"+title)
if isExist {
redis.Del("title_repeat_judgement", "title_repeat_"+title)
}*/
return 1
}))
//支持正则,提取
s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int {
index := int(S.ToNumber(-1))
regstr := S.ToString(-2)
text := S.ToString(-3)
reg := regexp.MustCompile(regstr)
reps := reg.FindAllStringSubmatchIndex(text, -1)
ret := s.L.NewTable()
number := 0
for _, v := range reps {
number++
ret.Insert(number, lua.LString(text[v[index]:v[index+1]]))
}
S.Push(ret)
return 1
}))
//支持替换
s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int {
text := S.ToString(-3)
old := S.ToString(-2)
repl := S.ToString(-1)
text = strings.Replace(text, old, repl, -1)
S.Push(lua.LString(text))
return 1
}))
//标题的关键词、排除词过滤
s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int {
keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
data := S.ToTable(-1)
dataMap := util.TableToMap(data)
ret := s.L.NewTable()
num := 1
for _, v := range dataMap {
tmp := v.(map[string]interface{})
isOk := false
if title := qu.ObjToString(tmp["title"]); title != "" {
if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
isOk = true
}
}
if isOk {
ret.Insert(num, util.MapToLuaTable(S, tmp))
num++
}
}
S.Push(ret)
return 1
}))
//标题的关键词、排除词过滤
s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int {
keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
data := S.ToTable(-1)
dataMap := util.TableToMap(data)
if title := qu.ObjToString(dataMap["title"]); title != "" {
if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
S.Push(lua.LBool(true))
return 1
} else {
qu.Debug(s.SCode, dataMap["href"], " title error")
}
} else {
qu.Debug(s.SCode, dataMap["href"], " title error")
}
S.Push(lua.LBool(false))
return 1
}))
//detail过滤
s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int {
/*
1.长度判断 (特殊处理:详情请访问原网页!;详见原网页;见原网页;无;无相关内容;无正文内容)
2.是否含汉字
*/
reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|见附件|详见附件)")
reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
detail := S.ToString(-1)
if reg1.MatchString(detail) {
S.Push(lua.LBool(true))
return 1
}
if len([]rune(detail)) < 50 || !reg2.MatchString(detail) {
S.Push(lua.LBool(false))
return 1
}
S.Push(lua.LBool(false))
return 1
}))
//匹配汉字
s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int {
reg1 := regexp.MustCompile("(见附件|详见附件)")
reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
detail := S.ToString(-1)
detail = reg1.ReplaceAllString(detail, "")
ok := reg2.MatchString(detail)
S.Push(lua.LBool(ok))
return 1
}))
//base64加密
s.L.SetGlobal("encodeBase64", s.L.NewFunction(func(S *lua.LState) int {
text := S.ToString(-1)
base64Text := base64.StdEncoding.EncodeToString([]byte(text))
S.Push(lua.LString(base64Text))
return 1
}))
//base64解密
s.L.SetGlobal("decodeBase64", s.L.NewFunction(func(S *lua.LState) int {
text := S.ToString(-1)
result := ""
byteText, err := base64.StdEncoding.DecodeString(text)
if err == nil {
result = string(byteText)
}
S.Push(lua.LString(result))
return 1
}))
//aes cbc模式加密
s.L.SetGlobal("aesEncryptCBC", s.L.NewFunction(func(S *lua.LState) int {
origData := S.ToString(-3)
key := S.ToString(-2)
iv := S.ToString(-1)
bytekey := []byte(key)
byteorigData := []byte(origData)
byteiv := []byte(iv)
encrypted := util.AesCBCEncrypt(byteorigData, bytekey, byteiv)
// 将加密后的数据和初始向量进行Base64编码
result := base64.StdEncoding.EncodeToString(encrypted)
S.Push(lua.LString(result))
return 1
}))
//aes cbc模式解密
s.L.SetGlobal("aesDecryptCBC", s.L.NewFunction(func(S *lua.LState) int {
origData := S.ToString(-3)
key := S.ToString(-2)
iv := S.ToString(-1)
bytekey := []byte(key)
byteiv := []byte(iv)
data, _ := base64.StdEncoding.DecodeString(origData)
result := util.AesCBCDecrypter(data, bytekey, byteiv)
S.Push(lua.LString(result))
return 1
}))
//aes ecb模式加密
s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
origData := S.ToString(-2)
key := S.ToString(-1)
bytekey := []byte(key)
byteorigData := []byte(origData)
encrypted := util.AesECBEncrypt(byteorigData, bytekey)
result := base64.StdEncoding.EncodeToString(encrypted)
S.Push(lua.LString(result))
return 1
}))
//aes ecb模式解密
s.L.SetGlobal("aesDecryptECB", s.L.NewFunction(func(S *lua.LState) int {
origData := S.ToString(-2)
key := S.ToString(-1)
data, _ := base64.StdEncoding.DecodeString(origData)
result := util.AesECBDecrypter(data, []byte(key))
S.Push(lua.LString(result))
return 1
}))
//des ecb模式加密
s.L.SetGlobal("desEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
origData := S.ToString(-2)
key := S.ToString(-1)
encrypted := util.DesECBEncrypt([]byte(origData), []byte(key))
result := base64.StdEncoding.EncodeToString(encrypted)
S.Push(lua.LString(result))
return 1
}))
//des ecb模式解密
s.L.SetGlobal("desDecryptECB", s.L.NewFunction(func(S *lua.LState) int {
origData := S.ToString(-2)
key := S.ToString(-1)
data, _ := base64.StdEncoding.DecodeString(origData)
result := util.DesECBDecrypter(data, []byte(key))
S.Push(lua.LString(result))
return 1
}))
//des cbc模式加密
s.L.SetGlobal("desEncryptCBC", s.L.NewFunction(func(S *lua.LState) int {
origData := S.ToString(-3)
key := S.ToString(-2)
iv := S.ToString(-1)
bytekey := []byte(key)
byteorigData := []byte(origData)
byteiv := []byte(iv)
encrypted := util.DesCBCEncrypt(byteorigData, bytekey, byteiv)
result := base64.StdEncoding.EncodeToString(encrypted)
S.Push(lua.LString(result))
return 1
}))
//des cbc模式解密
s.L.SetGlobal("desDecryptCBC", s.L.NewFunction(func(S *lua.LState) int {
origData := S.ToString(-3)
key := S.ToString(-2)
iv := S.ToString(-1)
bytekey := []byte(key)
byteiv := []byte(iv)
data, _ := base64.StdEncoding.DecodeString(origData)
result := util.DesCBCDecrypter(data, bytekey, byteiv)
S.Push(lua.LString(result))
return 1
}))
//rsa 公钥加密
s.L.SetGlobal("rsaEncrypt", s.L.NewFunction(func(S *lua.LState) int {
origData := S.ToString(-2)
key := S.ToString(-1)
encrypted := util.EncryptWithPublicKey([]byte(origData), []byte(key))
result := base64.StdEncoding.EncodeToString(encrypted)
S.Push(lua.LString(result))
return 1
}))
//rsa 私钥解密
s.L.SetGlobal("rsaDecrypt", s.L.NewFunction(func(S *lua.LState) int {
origData := S.ToString(-2)
key := S.ToString(-1)
data, _ := base64.StdEncoding.DecodeString(origData)
result := util.DecryptWithPrivateKey(data, []byte(key))
S.Push(lua.LString(result))
return 1
}))
//根据正文获取发布时间
s.L.SetGlobal("getPublishtime", s.L.NewFunction(func(S *lua.LState) int {
detail := S.ToString(-2)
contenthtml := S.ToString(-1)
publishtime := util.GetPublishtime([]string{contenthtml, detail})
S.Push(lua.LString(publishtime))
return 1
}))
//匹配
s.L.SetGlobal("stringFind", s.L.NewFunction(func(S *lua.LState) int {
regstr := S.ToString(-1)
text := S.ToString(-2)
textReg := regexp.MustCompile(regstr)
//spaceReg := regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
//text = spaceReg.ReplaceAllString(text, "")
result := textReg.FindString(text)
isMatch := false
if result != "" {
isMatch = true
}
S.Push(lua.LString(result))
S.Push(lua.LBool(isMatch))
return 2
}))
//截取
s.L.SetGlobal("stringSub", s.L.NewFunction(func(S *lua.LState) int {
text := S.ToString(-3)
start := S.ToInt(-2)
end := S.ToInt(-1)
result := ""
if len(text) > 0 {
textRune := []rune(text)
textLen := len(textRune)
if end < 0 {
if start > 0 { //正向截取到倒数第end位
result = string(textRune[start-1 : textLen+1+end])
} else if start < 0 { //反向截取 从倒数第start位截取到倒数第end位
result = string(textRune[textLen+start : textLen+1+end])
}
} else if start > 0 && end >= start && end <= textLen { //从第start个截取到第end个
result = string(textRune[start-1 : end])
}
// if end == -1 {
// if start >= 1 { //正向截取到结尾
// result = string(textRune[start-1:])
// } else if start < 0 && textLen+start >= 0 { //反向截取后缀
// result = string(textRune[textLen+start:])
// }
// } else if start >= 1 && end <= textLen { //从第start个截取到第end个
// result = string(textRune[start-1 : end])
// }
}
S.Push(lua.LString(result))
return 1
}))
//长度
s.L.SetGlobal("stringLen", s.L.NewFunction(func(S *lua.LState) int {
text := S.ToString(-1)
textLen := len([]rune(text))
S.Push(lua.LNumber(textLen))
return 1
}))
//去除特殊标签中间内容
s.L.SetGlobal("getPureContent", s.L.NewFunction(func(S *lua.LState) int {
con := S.ToString(-1)
reg := regexp.MustCompile("(?s)<(!%-%-|!--|style).*?(%-%-|--|style)>") //注释 css
con = reg.ReplaceAllString(con, "")
// indexArr := reg.FindAllStringIndex(con, -1)
// for i := len(indexArr) - 1; i >= 0; i-- {
// if index := indexArr[i]; len(index) == 2 {
// con = con[:index[0]] + con[index[1]:]
// }
// }
S.Push(lua.LString(con))
return 1
}))
//interface转string
s.L.SetGlobal("formatToString", s.L.NewFunction(func(S *lua.LState) int {
strNum := S.ToString(-1)
decimalNum, _ := decimal.NewFromString(strNum)
S.Push(lua.LString(decimalNum.String()))
return 1
}))
//获取验证码
s.L.SetGlobal("getCodeByPath", s.L.NewFunction(func(S *lua.LState) int {
cookie := S.ToString(-1)
head := S.ToTable(-2)
stype := S.ToString(-3)
path := S.ToString(-4)
proxy := S.ToBool(-5)
headMap := util.GetTable(head)
//qu.Debug("cookie----------", cookie)
//qu.Debug("headMap----------", headMap)
headJsonStr := ""
headByte, err := json.Marshal(headMap)
if err == nil {
headJsonStr = string(headByte)
}
code, respHead, respCookie := codegrpc.GetCodeByPath(path, stype, headJsonStr, cookie, proxy)
//qu.Debug("code====", code)
//qu.Debug("respHead====", respHead)
//qu.Debug("respCookie====", respCookie)
S.Push(lua.LString(code))
respHeadMap := map[string]interface{}{}
json.Unmarshal([]byte(respHead), &respHeadMap)
hTable := util.MapToLuaTable(S, respHeadMap)
S.Push(hTable)
S.Push(lua.LString(respCookie))
return 3
}))
s.L.SetGlobal("goRunJs", s.L.NewFunction(func(S *lua.LState) int {
param := S.ToString(-2) //list or detail
step := S.ToString(-1) //参数
result := gojs.GoRunJsGetResult(s.SCode, param, step)
qu.Debug("Go Run Js Result:", param, step, result)
S.Push(lua.LString(result))
return 1
}))
s.L.SetGlobal("newDownloadFile", s.L.NewFunction(func(S *lua.LState) int {
cookie := S.ToString(-1)
head := S.ToTable(-2)
param := S.ToTable(-3)
method := S.ToString(-4)
url := S.ToString(-5)
fileName := S.ToString(-6)
ishttps := strings.Contains(url, "https")
var mycookie []*http.Cookie
if cookie != "{}" {
json.Unmarshal([]byte(cookie), &mycookie)
} else {
mycookie = make([]*http.Cookie, 0)
}
fileName = strings.TrimSpace(fileName)
url = strings.TrimSpace(url)
ret := NewDownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, false, ishttps, "", s.Timeout, false)
url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
if strings.TrimSpace(ftype) == "" {
if len(path.Ext(name)) > 0 {
ftype = path.Ext(name)[1:]
}
}
//特殊处理中国招标投标公共服务平台异常附件过滤
if *site == "中国招标投标公共服务平台" {
if fid != "" && strings.Contains(fid, ErrFid) { //限制访问的附件
size, ftype, fid = "", "", "" //信息置空,AnalysisProjectInfo方法将判断数据下载失败重新下载
} else if bttype := qu.GetFileType(ret); bttype != "pdf" { //由字节流解析的附件类型不是pdf
logger.Info("Error File Type:", bttype, url)
size, ftype, fid = "", "", ""
}
}
S.Push(lua.LString(url))
S.Push(lua.LString(name))
S.Push(lua.LString(size))
S.Push(lua.LString(ftype))
S.Push(lua.LString(fid))
return 5
}))
//渲染页面下载
s.L.SetGlobal("downloadByRender", s.L.NewFunction(func(S *lua.LState) int {
href := S.ToString(-1)
contentHtml := util.DownloadByRender(href)
S.Push(lua.LString(contentHtml))
return 1
}))
//chromedp下载
s.L.SetGlobal("downloadByChrome", s.L.NewFunction(func(S *lua.LState) int {
timeout := S.ToInt64(-2)
taskStr := S.ToString(-1)
cam := util.ChromeActionMap{}
if json.Unmarshal([]byte(taskStr), &cam) == nil {
if len(cam.BaseActions) > 0 {
if len(cam.RangeActions) > 0 && cam.RangeTimes > 0 {
for times := 1; times <= cam.RangeTimes; times++ {
cam.BaseActions = append(cam.BaseActions, cam.RangeActions...)
}
}
chromeTask := util.ChromeTask{
TimeOut: timeout,
Actions: cam.BaseActions,
}
ret := DownloadByChrome(s.SCode, s.Downloader, chromeTask, s.Timeout)
S.Push(util.MapToTable(S, ret))
} else {
S.Push(S.NewTable())
}
} else {
S.Push(S.NewTable())
}
return 1
}))
//针对中国招标投标公共服务平台三级页瑞数加密下载方法
s.L.SetGlobal("downloadByDataIntercept", s.L.NewFunction(func(S *lua.LState) int {
url := S.ToString(-4)
url_regex := S.ToString(-3)
timeout := S.ToInt(-2)
proxy := S.ToBool(-1)
headers := util.DownloadByDataIntercept(url, url_regex, timeout, proxy)
table := util.MapToLuaTable(S, headers)
S.Push(table)
return 1
}))
return ""
}
// dealHrefDomainReg matches the scheme-and-host prefix of a URL, up to and
// including the first "/" after the host.
var dealHrefDomainReg = regexp.MustCompile("https?://[^/]*/")

// dealHref turns an href found on a list page into an absolute URL.
//
//   - "/path"  is appended to the scheme+host prefix of pageListUrl
//     (http and https are both recognised).
//   - "./path" is appended to pageListUrl as given.
//   - anything else, including an empty or already absolute href, yields "".
//
// Only a literal "./" prefix counts as page-relative. The previous pattern
// "^./" had an unescaped dot, so it also matched hrefs like "ab/x.html" and
// dropped their first path segment.
func dealHref(pageListUrl, href string) string {
	switch {
	case strings.HasPrefix(href, "/"):
		// Root-relative: keep only the domain part of the list page URL.
		return dealHrefDomainReg.FindString(pageListUrl) + href[1:]
	case strings.HasPrefix(href, "./"):
		return pageListUrl + href[2:]
	default:
		return ""
	}
}
// dealPublishTime returns the first substring of content that looks like a
// date in the layout named by pattern. Supported pattern names are
// "yyyy-MM-dd HH:mm:ss", "yyyy-MM-dd" and "MM-dd"; any other name, or no
// match, yields "".
func dealPublishTime(content string, pattern string) string {
	var expr string
	switch pattern {
	case "yyyy-MM-dd HH:mm:ss":
		expr = `\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2}:\d{2}`
	case "yyyy-MM-dd":
		expr = `\d{4}-\d{2}-\d{2}`
	case "MM-dd":
		expr = `\d{2}-\d{2}`
	default:
		return ""
	}
	return regexp.MustCompile(expr).FindString(content)
}
// getChildrenLen returns the index of the last child element of sq, i.e. the
// child count minus one. It returns 0 both for a single child and for no
// children at all.
func getChildrenLen(sq *gq.Selection) (ret int) {
	if n := sq.Children().Length(); n > 0 {
		ret = n - 1
	}
	return ret
}
// transUnic decodes "\uXXXX" escape sequences in str into the characters they
// denote and leaves all other bytes untouched.
//
// A high surrogate (\uD800-\uDBFF) immediately followed by a low surrogate
// escape (\uDC00-\uDFFF) is combined into a single code point, so characters
// outside the Basic Multilingual Plane decode correctly instead of turning
// into two replacement characters. An escape whose four digits are not valid
// hex is logged and copied through verbatim. A tail shorter than six bytes is
// copied through as is.
func transUnic(str string) string {
	buf := bytes.NewBuffer(nil)
	i, j := 0, len(str)
	for i < j {
		x := i + 6
		if x > j {
			// Not enough bytes left for a complete escape.
			buf.WriteString(str[i:])
			break
		}
		if str[i] != '\\' || str[i+1] != 'u' {
			buf.WriteByte(str[i])
			i++
			continue
		}
		r, err := strconv.ParseUint(str[i+2:x], 16, 64)
		if err != nil {
			logger.Warn(err.Error())
			buf.WriteString(str[i:x])
			i = x
			continue
		}
		// Try to pair a high surrogate with a following low surrogate escape.
		if r >= 0xD800 && r <= 0xDBFF && x+6 <= j && str[x] == '\\' && str[x+1] == 'u' {
			if lo, loErr := strconv.ParseUint(str[x+2:x+6], 16, 64); loErr == nil && lo >= 0xDC00 && lo <= 0xDFFF {
				buf.WriteRune(rune(0x10000 + (r-0xD800)<<10 + (lo - 0xDC00)))
				i = x + 6
				continue
			}
		}
		buf.WriteRune(rune(r))
		i = x
	}
	return buf.String()
}
// GetVar returns the string form of the Lua global named key.
func (s *Script) GetVar(key string) string {
	value := s.L.GetGlobal(key)
	return value.String()
}
// GetIntVar returns the Lua global named key as an int, or -1 when the global
// is not a Lua number.
func (s *Script) GetIntVar(key string) int {
	switch num := s.L.GetGlobal(key).(type) {
	case lua.LNumber:
		return int(num)
	default:
		return -1
	}
}
// GetBoolVar returns the Lua global named key as a bool, or false when the
// global is not a Lua boolean.
func (s *Script) GetBoolVar(key string) bool {
	flag, isBool := s.L.GetGlobal(key).(lua.LBool)
	return isBool && bool(flag)
}
// SleepTime pauses the caller when recent operations were slow.
//
// times holds the four most recent durations, oldest first. A duration counts
// as slow when it exceeds basetime*60 seconds. The pause, in minutes, is:
//
//   - 0 when the latest duration (times[3]) is not slow;
//   - 1 plus the number of slow entries among times[0] and times[1] when
//     times[2] is slow as well;
//   - 1 when times[2] is below the threshold but times[0] and times[1] are
//     both slow;
//   - 0 otherwise.
//
// A slice with fewer than four entries carries too little history and is
// ignored; it previously caused an index-out-of-range panic.
func SleepTime(basetime int, times []time.Duration) {
	if len(times) < 4 {
		return
	}
	base := float64(basetime * 60)
	slow := func(d time.Duration) bool { return d.Seconds() > base }

	st := 0 // final sleep length in minutes
	if slow(times[3]) {
		switch {
		case slow(times[2]):
			st = 1
			if slow(times[0]) {
				st++
			}
			if slow(times[1]) {
				st++
			}
		case times[2].Seconds() < base && slow(times[0]) && slow(times[1]):
			st = 1
		}
	}
	if st > 0 {
		time.Sleep(time.Duration(st) * time.Minute)
	}
}
// generateKey derives a 16-byte key from key. The first 16 bytes are copied
// (zero padded when key is shorter), and every further byte is XOR-folded
// into the position given by its index modulo 16.
func generateKey(key []byte) (genKey []byte) {
	const size = 16
	genKey = make([]byte, size)
	copy(genKey, key)
	for pos := size; pos < len(key); pos++ {
		genKey[pos%size] ^= key[pos]
	}
	return genKey
}