12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928 |
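/**
Script loading and invocation wrapper.
At first, scripts are loaded from the file system;
later they are to be loaded from database configuration.
Functions shared between Lua scripts should be extracted into a common Lua file that the main script file loads.
*/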
- package spider
- import (
- "bytes"
- "compress/gzip"
- "crypto/aes"
- "encoding/base64"
- "encoding/json"
- "io/ioutil"
- mu "mfw/util"
- "net/http"
- "net/url"
- "path"
- "github.com/shopspring/decimal"
- qu "qfw/util"
- _ "qfw/util/redis"
- "regexp"
- util "spiderutil"
- "strconv"
- "strings"
- "sync/atomic"
- "time"
- gq "github.com/PuerkitoBio/goquery"
- "github.com/cjoudrey/gluahttp"
- "github.com/donnie4w/go-logger/logger"
- lujson "github.com/yuin/gopher-json"
- "github.com/yuin/gopher-lua"
- "golang.org/x/text/encoding/simplifiedchinese"
- "golang.org/x/text/transform"
- )
- //脚本
- type Script struct {
- SCode, ScriptFile string
- Encoding string
- Userproxy bool
- //Ishttps bool
- ErrorNum int32 //错误数
- Downloader string //下载器
- TotalRequestNum int32 //总请求次数
- ToDayRequestNum int32 //今日请求次数
- YestoDayRequestNum int32 //昨日请求次数
- Timeout int64 //超时时间秒
- L *lua.LState
- NoDownloadNum int32 //未成功下载数
- LastThreeTimes []time.Duration //单条信息流程完成的时间,最后三次
- FileLastThreeTimes []time.Duration //附件下载单条信息流程完成的时间,最后三次
- }
- const (
- MAX_STEP = 5 //计算时的最大步长
- )
- var workTime = true
- //
- func init() {
- go isWorkTime()
- }
- var TimeSleepChan = make(chan bool, 1)
- //加载文件
- func (s *Script) LoadScript(code, script_file string, newstate bool) string {
- defer mu.Catch()
- s.SCode = code
- s.ScriptFile = script_file
- if util.Config.Working == 0 {
- if newstate {
- s.L = lua.NewState(lua.Options{
- RegistrySize: 256 * 20,
- CallStackSize: 256,
- IncludeGoStackTrace: false,
- })
- }
- } else { //节能模式从CC池中获取lua.LState
- if newstate { //队列模式的newstate主要区分是列表页爬虫CC还是三级页爬虫CC2
- lState := <-CC2
- s.L = lState
- } else {
- lState := <-CC
- s.L = lState
- }
- //logger.Debug("获取CC资源", script_file)
- }
- s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader)
- s.L.PreloadModule("json", lujson.Loader)
- if err := s.L.DoString(script_file); err != nil {
- logger.Debug(code + ",加载lua脚本错误:" + err.Error())
- return "加载lua脚本错误:" + err.Error()
- //panic(code + ",加载lua脚本错误:" + err.Error())
- }
- s.Encoding = s.GetVar("spiderPageEncoding")
- s.Userproxy = s.GetBoolVar("spiderUserProxy")
- //暴露go方法
- //download(url,head) 普通下载
- s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int {
- if s.LastThreeTimes == nil {
- s.LastThreeTimes = make([]time.Duration, 4)
- }
- if util.Config.IsDelay {
- SleepTime(1, s.LastThreeTimes) //睡眠时间
- }
- start := time.Now() //起始时间
- head := S.ToTable(-1)
- url := S.ToString(-2)
- ishttps := S.ToBool(-3)
- charset := S.ToString(-4)
- if charset == "" {
- charset = s.Encoding
- }
- ret := Download(s.Downloader, url, "get", util.GetTable(head), charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
- S.Push(lua.LString(ret))
- atomic.AddInt32(&s.ToDayRequestNum, 1)
- atomic.AddInt32(&s.TotalRequestNum, 1)
- end := time.Since(start)
- if len(s.LastThreeTimes) >= 4 {
- s.LastThreeTimes = s.LastThreeTimes[1:]
- }
- s.LastThreeTimes = append(s.LastThreeTimes, end)
- return 1
- }))
- //高级下载downloadAdv(url,method,param,head,cookie)
- s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int {
- if s.LastThreeTimes == nil {
- s.LastThreeTimes = make([]time.Duration, 4)
- }
- if util.Config.IsDelay {
- SleepTime(1, s.LastThreeTimes) //睡眠时间
- }
- start := time.Now() //起始时间
- cookie := S.ToString(-1)
- head := S.ToTable(-2)
- param := S.ToTable(-3)
- method := S.ToString(-4)
- url := S.ToString(-5)
- ishttps := S.ToBool(-6)
- charset := S.ToString(-7)
- if charset == "" {
- charset = s.Encoding
- }
- var mycookie []*http.Cookie
- json.Unmarshal([]byte(cookie), &mycookie)
- var ret string
- var retcookie []*http.Cookie
- if param == nil {
- ptext := map[string]interface{}{"text": S.ToString(-3)}
- ret, retcookie = DownloadAdv(s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
- } else {
- ret, retcookie = DownloadAdv(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
- }
- S.Push(lua.LString(ret))
- scookie, _ := json.Marshal(retcookie)
- S.Push(lua.LString(scookie))
- atomic.AddInt32(&s.ToDayRequestNum, 1)
- atomic.AddInt32(&s.TotalRequestNum, 1)
- end := time.Since(start)
- if len(s.LastThreeTimes) >= 4 {
- s.LastThreeTimes = s.LastThreeTimes[1:]
- }
- s.LastThreeTimes = append(s.LastThreeTimes, end)
- return 2
- }))
- //保存验证错误日志
- s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
- code := S.ToString(-4)
- name := S.ToString(-3)
- url := S.ToString(-2)
- content := S.ToString(-1)
- saveVerificationLog(code, name, url, content)
- atomic.AddInt32(&s.ErrorNum, 1)
- atomic.AddInt32(&s.NoDownloadNum, 1)
- //防止恶意增加日志
- util.TimeSleepFunc(5*time.Second, TimeSleepChan)
- return 0
- }))
- //添加改版日志
- s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int {
- url := S.ToString(-2)
- str := S.ToString(-1)
- logger.Error(s.SCode, url, str)
- return 0
- }))
- //查找信息是否存在(作废)
- s.L.SetGlobal("findHasExit", s.L.NewFunction(func(S *lua.LState) int {
- //c := S.ToString(-2)
- //q := S.ToString(-1)
- //b := findHasExit(c, q)
- S.Push(lua.LBool(false))
- return 1
- }))
- s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int {
- nodetype := S.ToString(-3)
- gpath := S.ToString(-2)
- content := S.ToString(-1)
- ret := util.FindOneText(gpath, content, nodetype)
- S.Push(ret)
- return 1
- }))
- s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int {
- gpath := S.ToString(-2)
- content := S.ToString(-1)
- ret := util.FindContentText(gpath, content)
- S.Push(ret)
- return 1
- }))
- s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int {
- nodetype := S.ToString(-3)
- gpath := S.ToString(-2)
- content := S.ToString(-1)
- ret := util.FindOneHtml(gpath, content, nodetype)
- S.Push(ret)
- return 1
- }))
- s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int {
- gpath := S.ToString(-2)
- content := S.ToString(-1)
- ret := s.L.NewTable()
- util.FindListText(gpath, content, ret)
- S.Push(ret)
- return 1
- }))
- s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int {
- gpath := S.ToString(-2)
- content := S.ToString(-1)
- ret := s.L.NewTable()
- util.FindListHtml(gpath, content, ret)
- if ret.Len() > 0 {
- UpdateHeart("", "", code, "", "findlist") //记录列表页实际采集数据量心跳
- }
- S.Push(ret)
- return 1
- }))
- // s.L.SetGlobal("findMgoData", s.L.NewFunction(func(S *lua.LState) int {
- // update := [][]map[string]interface{}{}
- // query := map[string]interface{}{"state": 0}
- // data, _ := Mgo.Find(util.Config.TmpCollName, query, `{"_id":-1}`, nil, false, 0, 10)
- // pageList := []interface{}{}
- // for _, d := range *data {
- // tmpMap := map[string]string{}
- // tmpMap["title"] = qu.ObjToString(d["title"])
- // tmpMap["detail"] = qu.ObjToString(d["detail"])
- // tmpMap["href"] = qu.ObjToString(d["href"])
- // publishtime := qu.Int64All(d["publishtime"])
- // tmpMap["publishtime"] = qu.FormatDateByInt64(&publishtime, qu.Date_Full_Layout)
- // tmpMap["_id"] = qu.BsonIdToSId(d["_id"])
- // pageList = append(pageList, tmpMap)
- // update = append(update, []map[string]interface{}{
- // map[string]interface{}{"_id": d["_id"]},
- // map[string]interface{}{"$set": map[string]interface{}{"state": 1}},
- // })
- // }
- // ret := util.MapToTable(s.L, pageList)
- // S.Push(ret)
- // if len(update) > 0 {
- // Mgo.UpdateBulk(util.Config.TmpCollName, update...)
- // }
- // return 1
- // }))
- s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int {
- qmap := S.ToTable(-2)
- content := S.ToString(-1)
- ret := s.L.NewTable()
- util.FindMap(qmap, content, ret)
- S.Push(ret)
- return 1
- }))
- //公示暴露方式
- s.L.SetGlobal("getEcpsCode", s.L.NewFunction(func(S *lua.LState) int {
- area := strings.ToUpper(S.ToString(-2))
- content := S.ToString(-1)
- code, state := util.GetEcpsCode(area, []byte(content))
- if state == "wx" {
- code, _ = GetCodeByWx([]byte(content))
- }
- S.Push(lua.LString(code))
- return 1
- }))
- //调用jsvm
- s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int {
- js := S.ToString(-1)
- ret := s.L.NewTable()
- if js == "" {
- ret.RawSet(lua.LString("val"), lua.LString(""))
- ret.RawSet(lua.LString("err"), lua.LString("js is null"))
- } else {
- rep := util.JsVmPost(util.Config.JsVmUrl, js)
- ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"])))
- ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"])))
- }
- S.Push(ret)
- return 1
- }))
- //指定下载器
- s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int {
- s.Downloader = GetOneDownloader()
- S.Push(lua.LString(s.Downloader))
- return 1
- }))
- //指定下载器file
- s.L.SetGlobal("changeDownloaderFile", s.L.NewFunction(func(S *lua.LState) int {
- s.Downloader = GetOneDownloaderFile()
- S.Push(lua.LString(s.Downloader))
- return 1
- }))
- //手工延时
- s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int {
- // if workTime {
- // util.TimeSleepFunc(time.Duration(S.ToInt(-1))*time.Second, TimeSleepChan)
- // } else {
- // util.TimeSleepFunc(1*time.Second, TimeSleepChan)
- // }
- util.TimeSleepFunc(time.Second*2, TimeSleepChan)
- return 0
- }))
- //编码解码
- s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int {
- codeType := strings.ToLower(S.ToString(-2))
- str := S.CheckString(-1)
- switch codeType {
- case "unicode":
- str = transUnic(str)
- case "urlencode_gbk":
- data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
- l, _ := url.Parse("http://a.com/?" + string(data))
- tmpstr := l.Query().Encode()
- if len(tmpstr) > 1 {
- str = tmpstr[0 : len(tmpstr)-1]
- } else {
- str = ""
- }
- case "urlencode_utf8":
- l, _ := url.Parse("http://a.com/?" + str)
- tmpstr := l.Query().Encode()
- if len(tmpstr) > 1 {
- str = tmpstr[0 : len(tmpstr)-1]
- } else {
- str = ""
- }
- case "urldecode_utf8":
- str, _ = url.QueryUnescape(str)
- case "decode64":
- str = util.DecodeB64(str)
- case "encodemd5":
- str = qu.GetMd5String(str)
- case "htmldecode": //html实体码
- //txt := `<div align="left" style="margin-left: 0pt;"><span style='font-family:; font-size:13px; color:#000000'>太阳岛特勤消防站、松浦特勤消防站建设项目设计中标公示</span></div>`
- str = S.ToString(-1)
- reg, _ := regexp.Compile("&#\\d+;")
- str = reg.ReplaceAllStringFunc(str, func(src string) string {
- v, _ := strconv.Atoi(src[2 : len(src)-1])
- return string(rune(v))
- })
- }
- S.Push(lua.LString(str))
- return 1
- }))
- //如果服务端返回的html是gzip压缩过格式的 这里需要转一下
- s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int {
- html := S.ToString(-1)
- bs := []byte(html)
- gzipreader, _ := gzip.NewReader(bytes.NewReader(bs))
- bs, _ = ioutil.ReadAll(gzipreader)
- S.Push(lua.LString(bs))
- return 1
- }))
- //luamaker提供的分析列表页url地址 获取列表数据公用方法
- s.L.SetGlobal("getSimpleListPage", s.L.NewFunction(func(S *lua.LState) int {
- html := S.ToString(-3)
- date_pattern := S.ToString(-2)
- pageListUrl := S.ToString(-1) //列表页url
- bs := []byte(html)
- tmparr := []string{}
- tmpret := []int{}
- re, _ := regexp.Compile(`采购|招标|公示|公告|意见|结果|通知|工程`)
- doc, _ := gq.NewDocumentFromReader(bytes.NewReader(bs))
- doc.Find("a").Each(func(i int, sq *gq.Selection) {
- text := sq.Text()
- if len(text) < 30 {
- return
- }
- tmparr = append(tmparr, text)
- if re.MatchString(text) {
- tmpret = append(tmpret, 1)
- //logger.Debug(text)
- } else {
- tmpret = append(tmpret, 0)
- }
- })
- logger.Debug(tmpret)
- //线性分析,算周边,只算周围5步的点
- tmplen, thepos, themax := len(tmpret), -1, 0
- for i := 0; i < tmplen; i++ {
- if tmpret[i] == 0 {
- continue
- }
- start, end := i-MAX_STEP, i+MAX_STEP
- if start < 0 {
- start = 0
- }
- if end > tmplen {
- end = tmplen
- }
- tmp := 0
- //从当前位置往左,往右找连续点
- for j := i; j > start; j-- {
- if tmpret[j] == 1 {
- tmp++
- } else {
- break
- }
- }
- for j := i; j < end; j++ {
- if tmpret[j] == 1 {
- tmp++
- } else {
- break
- }
- }
- if tmp > themax {
- themax = tmp
- thepos = i
- }
- } //end of for...
- //logger.Debug("找位置完成")
- //验证
- if thepos == -1 {
- logger.Error("完蛋,找不到")
- panic("不支持啊,失败啊")
- }
- //下边是找父容器
- var thelink *gq.Selection
- doc.Find("a").Each(func(i int, sq *gq.Selection) {
- if sq.Text() == tmparr[thepos] {
- thelink = sq
- }
- })
- isfind := false
- //同样Path向上找,不超过5步
- for i := 0; i < MAX_STEP; i++ {
- thelink = thelink.Parent()
- clen := getChildrenLen(thelink)
- if clen >= themax-1 {
- isfind = true
- break
- }
- //logger.Debug("TAG:::", thelink.Nodes[0].Data, clen)
- }
- //找到列表
- pageList := []interface{}{}
- if isfind {
- thelink.Children().Each(func(i int, sq *gq.Selection) {
- page := map[string]string{}
- link_sq := sq.Find("a")
- href := link_sq.AttrOr("href", "")
- text := link_sq.Text()
- page["title"] = text
- page["href"] = dealHref(pageListUrl, href)
- page["publishtime"] = dealPublishTime(strings.TrimSpace(sq.Text()), date_pattern)
- //logger.Debug(i)
- pageList = append(pageList, page)
- })
- } else {
- logger.Error("完蛋,找父亲节点失败啊")
- //panic("不支持啊,失败啊")
- }
- ret := util.MapToTable(s.L, pageList)
- S.Push(ret)
- return 1
- }))
- //招投标信息标题判重
- s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
- S.Push(lua.LBool(false))
- return 1
- }))
- //招标信息判重新方法 2016-12-14 wanghuidong
- s.L.SetGlobal("urlRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
- S.Push(lua.LBool(false))
- return 1
- }))
- //将url放入内存缓存 2016-12-14 wanghuidong
- s.L.SetGlobal("putUrl2Redis", s.L.NewFunction(func(S *lua.LState) int {
- //url := S.ToString(-1)
- return 1
- }))
- //解析附件中的word、pdf
- s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int {
- ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)}
- str := S.ToString(-2)
- extension := S.ToString(-1)
- bs, _ := base64.StdEncoding.DecodeString(str)
- bs = append([]byte{ext[extension]}, bs...)
- msgid := mu.UUID(8)
- Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60)
- return 1
- }))
- //下载附件download(url,method,param,head,cookie,fileName)
- s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
- if s.FileLastThreeTimes == nil {
- s.FileLastThreeTimes = make([]time.Duration, 4)
- }
- if util.Config.IsDelay {
- SleepTime(3, s.FileLastThreeTimes) //睡眠时间
- }
- start := time.Now() //起始时间
- cookie := S.ToString(-1)
- head := S.ToTable(-2)
- param := S.ToTable(-3)
- method := S.ToString(-4)
- url := S.ToString(-5)
- fileName := S.ToString(-6)
- ishttps := strings.Contains(url, "https")
- var mycookie []*http.Cookie
- if cookie != "{}" {
- json.Unmarshal([]byte(cookie), &mycookie)
- } else {
- mycookie = make([]*http.Cookie, 0)
- }
- fileName = strings.TrimSpace(fileName)
- url = strings.TrimSpace(url)
- ret := DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
- url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
- if strings.TrimSpace(ftype) == "" {
- if len(path.Ext(name)) > 0 {
- ftype = path.Ext(name)[1:]
- }
- }
- S.Push(lua.LString(url))
- S.Push(lua.LString(name))
- S.Push(lua.LString(size))
- S.Push(lua.LString(ftype))
- S.Push(lua.LString(fid))
- atomic.AddInt32(&s.ToDayRequestNum, 1)
- atomic.AddInt32(&s.TotalRequestNum, 1)
- end := time.Since(start)
- if len(s.FileLastThreeTimes) >= 4 {
- s.FileLastThreeTimes = s.FileLastThreeTimes[1:]
- }
- s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
- return 5
- }))
- s.L.SetGlobal("clearMemoeryCache", s.L.NewFunction(func(S *lua.LState) int {
- /*title := S.ToString(-1)
- isExist, _ := redis.Exists("title_repeat_judgement", "title_repeat_"+title)
- if isExist {
- redis.Del("title_repeat_judgement", "title_repeat_"+title)
- }*/
- return 1
- }))
- //支持正则,提取
- s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int {
- index := int(S.ToNumber(-1))
- regstr := S.ToString(-2)
- text := S.ToString(-3)
- reg := regexp.MustCompile(regstr)
- reps := reg.FindAllStringSubmatchIndex(text, -1)
- ret := s.L.NewTable()
- number := 0
- for _, v := range reps {
- number++
- ret.Insert(number, lua.LString(text[v[index]:v[index+1]]))
- }
- S.Push(ret)
- return 1
- }))
- //支持替换
- s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int {
- text := S.ToString(-3)
- old := S.ToString(-2)
- repl := S.ToString(-1)
- text = strings.Replace(text, old, repl, -1)
- S.Push(lua.LString(text))
- return 1
- }))
- //标题的关键词、排除词过滤
- s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int {
- keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
- notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
- data := S.ToTable(-1)
- dataMap := util.TableToMap(data)
- ret := s.L.NewTable()
- num := 1
- for _, v := range dataMap {
- tmp := v.(map[string]interface{})
- isOk := false
- if title := qu.ObjToString(tmp["title"]); title != "" {
- if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
- isOk = true
- }
- }
- if isOk {
- ret.Insert(num, util.MapToLuaTable(S, tmp))
- num++
- }
- }
- S.Push(ret)
- return 1
- }))
- //标题的关键词、排除词过滤
- s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int {
- keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
- notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
- data := S.ToTable(-1)
- dataMap := util.TableToMap(data)
- if title := qu.ObjToString(dataMap["title"]); title != "" {
- if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
- S.Push(lua.LBool(true))
- return 1
- } else {
- qu.Debug(s.SCode, dataMap["href"], " title error")
- }
- } else {
- qu.Debug(s.SCode, dataMap["href"], " title error")
- }
- S.Push(lua.LBool(false))
- return 1
- }))
- //detail过滤
- s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int {
- /*
- 1.长度判断 (特殊处理:详情请访问原网页!;详见原网页;见原网页;无;无相关内容;无正文内容)
- 2.是否含汉字
- */
- reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|见附件|详见附件)")
- reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
- detail := S.ToString(-1)
- if reg1.MatchString(detail) {
- S.Push(lua.LBool(true))
- return 1
- }
- if len([]rune(detail)) < 50 || !reg2.MatchString(detail) {
- S.Push(lua.LBool(false))
- return 1
- }
- S.Push(lua.LBool(false))
- return 1
- }))
- //匹配汉字
- s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int {
- reg1 := regexp.MustCompile("(见附件|详见附件)")
- reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
- detail := S.ToString(-1)
- detail = reg1.ReplaceAllString(detail, "")
- ok := reg2.MatchString(detail)
- S.Push(lua.LBool(ok))
- return 1
- }))
- //aes ecb模式加密
- s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
- origData := S.ToString(-2)
- key := S.ToString(-1)
- bytekey := []byte(key)
- byteorigData := []byte(origData)
- cipher, _ := aes.NewCipher(generateKey([]byte(bytekey)))
- length := (len(byteorigData) + aes.BlockSize) / aes.BlockSize
- plain := make([]byte, length*aes.BlockSize)
- copy(plain, byteorigData)
- pad := byte(len(plain) - len(byteorigData))
- for i := len(byteorigData); i < len(plain); i++ {
- plain[i] = pad
- }
- encrypted := make([]byte, len(plain))
- // 分组分块加密
- for bs, be := 0, cipher.BlockSize(); bs <= len(byteorigData); bs, be = bs+cipher.BlockSize(), be+cipher.BlockSize() {
- cipher.Encrypt(encrypted[bs:be], plain[bs:be])
- }
- result := base64.StdEncoding.EncodeToString(encrypted)
- S.Push(lua.LString(result))
- return 1
- }))
- //根据正文获取发布时间
- s.L.SetGlobal("getPublishtime", s.L.NewFunction(func(S *lua.LState) int {
- detail := S.ToString(-2)
- contenthtml := S.ToString(-1)
- publishtime := util.GetPublishtime([]string{contenthtml, detail})
- S.Push(lua.LString(publishtime))
- return 1
- }))
- //匹配
- s.L.SetGlobal("stringFind", s.L.NewFunction(func(S *lua.LState) int {
- regstr := S.ToString(-1)
- text := S.ToString(-2)
- reg := regexp.MustCompile(regstr)
- result := reg.FindString(text)
- isMatch := false
- if result != "" {
- isMatch = true
- }
- S.Push(lua.LString(result))
- S.Push(lua.LBool(isMatch))
- return 2
- }))
- //截取
- s.L.SetGlobal("stringSub", s.L.NewFunction(func(S *lua.LState) int {
- text := S.ToString(-3)
- start := S.ToInt(-2)
- end := S.ToInt(-1)
- result := ""
- if len(text) > 0 {
- textRune := []rune(text)
- textLen := len(textRune)
- if end < 0 {
- if start > 0 { //正向截取到倒数第end位
- result = string(textRune[start-1 : textLen+1+end])
- } else if start < 0 { //反向截取 从倒数第start位截取到倒数第end位
- result = string(textRune[textLen+start : textLen+1+end])
- }
- } else if start > 0 && end >= start && end <= textLen { //从第start个截取到第end个
- result = string(textRune[start-1 : end])
- }
- // if end == -1 {
- // if start >= 1 { //正向截取到结尾
- // result = string(textRune[start-1:])
- // } else if start < 0 && textLen+start >= 0 { //反向截取后缀
- // result = string(textRune[textLen+start:])
- // }
- // } else if start >= 1 && end <= textLen { //从第start个截取到第end个
- // result = string(textRune[start-1 : end])
- // }
- }
- S.Push(lua.LString(result))
- return 1
- }))
- //base64加密
- s.L.SetGlobal("encodeBase64", s.L.NewFunction(func(S *lua.LState) int {
- text := S.ToString(-1)
- base64Text := base64.StdEncoding.EncodeToString([]byte(text))
- S.Push(lua.LString(base64Text))
- return 1
- }))
- //base64解密
- s.L.SetGlobal("decodeBase64", s.L.NewFunction(func(S *lua.LState) int {
- text := S.ToString(-1)
- result := ""
- byteText, err := base64.StdEncoding.DecodeString(text)
- if err == nil {
- result = string(byteText)
- }
- S.Push(lua.LString(result))
- return 1
- }))
- //长度
- s.L.SetGlobal("stringLen", s.L.NewFunction(func(S *lua.LState) int {
- text := S.ToString(-1)
- textLen := len([]rune(text))
- S.Push(lua.LNumber(textLen))
- return 1
- }))
- //去除特殊标签中间内容
- s.L.SetGlobal("getPureContent", s.L.NewFunction(func(S *lua.LState) int {
- con := S.ToString(-1)
- reg := regexp.MustCompile("(?s)<(!%-%-|!--|style).*?(%-%-|--|style)>") //注释 css
- con = reg.ReplaceAllString(con, "")
- // indexArr := reg.FindAllStringIndex(con, -1)
- // for i := len(indexArr) - 1; i >= 0; i-- {
- // if index := indexArr[i]; len(index) == 2 {
- // con = con[:index[0]] + con[index[1]:]
- // }
- // }
- S.Push(lua.LString(con))
- return 1
- }))
- //interface转string
- s.L.SetGlobal("formatToString", s.L.NewFunction(func(S *lua.LState) int {
- strNum := S.ToString(-1)
- decimalNum, _ := decimal.NewFromString(strNum)
- S.Push(lua.LString(decimalNum.String()))
- return 1
- }))
- return ""
- }
// dealHref turns an href taken from a list page into an absolute URL.
//
//   - "./x" is appended to pageListUrl verbatim, so pageListUrl is expected
//     to end with "/".
//   - "/x" is appended to the "scheme://host/" root of pageListUrl, for http
//     and https pages.
//   - Any other href, including the empty string, yields "".
func dealHref(pageListUrl, href string) string {
	switch {
	case strings.HasPrefix(href, "./"):
		return pageListUrl + href[2:]
	case strings.HasPrefix(href, "/"):
		return hrefSiteRoot(pageListUrl) + href[1:]
	}
	return ""
}

// hrefSiteRoot returns the "scheme://host/" prefix of an http or https URL,
// adding the trailing slash when the URL has no path. Other inputs yield "".
func hrefSiteRoot(pageURL string) string {
	for _, scheme := range []string{"http://", "https://"} {
		if !strings.HasPrefix(pageURL, scheme) {
			continue
		}
		rest := pageURL[len(scheme):]
		if i := strings.IndexByte(rest, '/'); i >= 0 {
			return pageURL[:len(scheme)+i+1]
		}
		return pageURL + "/"
	}
	return ""
}
// publishTimeRegexps maps each supported date pattern name to the expression
// that extracts a matching date from text. They are compiled once at package
// initialisation instead of on every call.
var publishTimeRegexps = map[string]*regexp.Regexp{
	"yyyy-MM-dd HH:mm:ss": regexp.MustCompile(`\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2}:\d{2}`),
	"yyyy-MM-dd":          regexp.MustCompile(`\d{4}-\d{2}-\d{2}`),
	"MM-dd":               regexp.MustCompile(`\d{2}-\d{2}`),
}

// dealPublishTime returns the first substring of content that looks like a
// date in the given pattern ("yyyy-MM-dd HH:mm:ss", "yyyy-MM-dd" or "MM-dd").
// It returns "" when the pattern is not supported or nothing matches.
func dealPublishTime(content string, pattern string) string {
	re, ok := publishTimeRegexps[pattern]
	if !ok {
		return ""
	}
	return re.FindString(content)
}
- func getChildrenLen(sq *gq.Selection) (ret int) {
- sq.Children().Each(func(i int, sq2 *gq.Selection) {
- ret = i
- })
- return
- }
- //
- //func (s *Script) Reload() {
- // s.L.Close()
- // s.LoadScript(s.SCode, s.ScriptFile, false)
- //}
- //unicode转码
- func transUnic(str string) string {
- buf := bytes.NewBuffer(nil)
- i, j := 0, len(str)
- for i < j {
- x := i + 6
- if x > j {
- buf.WriteString(str[i:])
- break
- }
- if str[i] == '\\' && str[i+1] == 'u' {
- hex := str[i+2 : x]
- r, err := strconv.ParseUint(hex, 16, 64)
- if err == nil {
- buf.WriteRune(rune(r))
- } else {
- logger.Warn(err.Error())
- buf.WriteString(str[i:x])
- }
- i = x
- } else {
- buf.WriteByte(str[i])
- i++
- }
- }
- return buf.String()
- }
- //取得变量
- func (s *Script) GetVar(key string) string {
- return s.L.GetGlobal(key).String()
- }
- //
- func (s *Script) GetIntVar(key string) int {
- lv := s.L.GetGlobal(key)
- if v, ok := lv.(lua.LNumber); ok {
- return int(v)
- }
- return -1
- }
- //
- func (s *Script) GetBoolVar(key string) bool {
- lv := s.L.GetGlobal(key)
- if v, ok := lv.(lua.LBool); ok {
- return bool(v)
- }
- return false
- }
- func isWorkTime() {
- workTime = util.IsWorkTime()
- util.TimeAfterFunc(10*time.Minute, isWorkTime, TimeChan)
- }
// SleepTime pauses the caller when the most recent operations were slow.
//
// basetime is the threshold in minutes. times holds the durations of the
// four most recent operations, oldest first. The pause lasts between one and
// three minutes depending on how many of the samples exceeded the threshold.
// With fewer than four samples there is nothing to judge and the function
// returns immediately.
func SleepTime(basetime int, times []time.Duration) {
	if st := sleepMinutes(basetime, times); st > 0 {
		time.Sleep(time.Duration(st) * time.Minute)
	}
}

// sleepMinutes computes how many minutes SleepTime should pause for.
func sleepMinutes(basetime int, times []time.Duration) int {
	if len(times) < 4 {
		return 0
	}
	base := float64(basetime * 60) // threshold in seconds
	slow := func(d time.Duration) bool { return d.Seconds() > base }

	// Only throttle when the latest operation was slow.
	if !slow(times[3]) {
		return 0
	}
	if slow(times[2]) {
		// The last two were slow: one minute, plus one for each older slow sample.
		n := 1
		if slow(times[0]) {
			n++
		}
		if slow(times[1]) {
			n++
		}
		return n
	}
	// The previous one was fast, but both older samples were slow.
	if times[2].Seconds() < base && slow(times[0]) && slow(times[1]) {
		return 1
	}
	return 0
}
// generateKey folds key into a 16-byte key: byte i of key is XORed into
// position i%16 of the result. Keys shorter than 16 bytes are therefore
// zero-padded, and longer keys wrap around.
func generateKey(key []byte) (genKey []byte) {
	genKey = make([]byte, 16)
	for i, b := range key {
		genKey[i%16] ^= b
	}
	return genKey
}
|