// Package spider loads Lua crawler scripts and exposes the Go helper
// functions (download, parsing, crypto, string utilities) those scripts call.
package spider

import (
	"bytes"
	"compress/gzip"
	"encoding/base64"
	"encoding/json"
	"io"
	"io/ioutil"
	"net/http"
	"net/url"
	"path"
	"regexp"
	"strconv"
	"strings"
	"sync/atomic"
	"time"

	codegrpc "analysiscode/client"
	gojs "gorunjs/client"
	mu "mfw/util"
	qu "qfw/util"
	util "spiderutil"

	gq "github.com/PuerkitoBio/goquery"
	"github.com/cjoudrey/gluahttp"
	"github.com/donnie4w/go-logger/logger"
	"github.com/shopspring/decimal"
	lujson "github.com/yuin/gopher-json"
	"github.com/yuin/gopher-lua"
	"golang.org/x/text/encoding/simplifiedchinese"
	"golang.org/x/text/transform"
)

const (
	// MAX_STEP is the maximum step width used by getSimpleListPage, both
	// when scoring neighbouring links and when walking up parent nodes.
	MAX_STEP = 5
)

// TimeSleepChan is handed to util.TimeSleepFunc by the sleeping builtins.
var TimeSleepChan = make(chan bool, 1)

// Script is one loaded Lua spider script plus its runtime counters.
type Script struct {
	SCode, ScriptFile string
	Encoding          string
	Userproxy         bool
	//Ishttps            bool
	ErrorNum           int32          // number of errors
	Downloader         string         // downloader in use
	TotalRequestNum    int32          // total number of requests
	ToDayRequestNum    int32          // number of requests today
	YestoDayRequestNum int32          // number of requests yesterday
	Timeout            int64          // timeout in seconds
	L                  *lua.LState    // Lua state the script runs in
	NoDownloadNum      int32          // number of unsuccessful downloads
	LastThreeTimes     []time.Duration // durations of the most recent download/downloadAdv calls (rolling window)
	FileLastThreeTimes []time.Duration // durations of the most recent downloadFile calls (rolling window)
}

// ErrFid is the fid fragment that marks an attachment whose access is restricted.
var ErrFid = "a6879f0a8570256aa21fb978e6dabb50429a30dfacff697cf0b898abbc5c262e"

// hrefDomainReg extracts "scheme://host/" from a list-page URL.
var hrefDomainReg = regexp.MustCompile("https?://[^/]*/")

// publishTimeRegs maps a supported date pattern name to the regexp that
// extracts a matching timestamp from free text.
var publishTimeRegs = map[string]*regexp.Regexp{
	"yyyy-MM-dd HH:mm:ss": regexp.MustCompile("\\d{4}-\\d{2}-\\d{2}\\s*\\d{2}:\\d{2}:\\d{2}"),
	"yyyy-MM-dd":          regexp.MustCompile("\\d{4}-\\d{2}-\\d{2}"),
	"MM-dd":               regexp.MustCompile("\\d{2}-\\d{2}"),
}

// LoadScript creates a fresh Lua state, runs script_file in it and registers
// every Go builtin the Lua spiders may call.
//
// code is stored as the script code (s.SCode). site is dereferenced each time
// downloadFile / newDownloadFile runs, so later changes to the pointed-to
// value are observed; a nil site is treated as an empty site name. channel
// and user are currently only referenced by disabled (commented-out) code.
//
// It returns "" on success, or a message describing the Lua load error.
//
// All builtins read their arguments relative to the top of the Lua stack
// (-1 is the last argument passed by the script).
func (s *Script) LoadScript(site, channel, user *string, code, script_file string) string {
	defer mu.Catch()
	s.SCode = code
	s.ScriptFile = script_file
	s.L = lua.NewState(lua.Options{
		RegistrySize:        256 * 20,
		CallStackSize:       256,
		IncludeGoStackTrace: false,
	})
	// NOTE(review): this http.Client has no timeout configured.
	s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader)
	s.L.PreloadModule("json", lujson.Loader)
	if err := s.L.DoString(script_file); err != nil {
		logger.Debug(code + ",加载lua脚本错误:" + err.Error())
		return "加载lua脚本错误:" + err.Error()
		//panic(code + ",加载lua脚本错误:" + err.Error())
	}
	s.Encoding = s.GetVar("spiderPageEncoding")
	s.Userproxy = s.GetBoolVar("spiderUserProxy")

	// siteName dereferences site at call time and tolerates a nil pointer.
	siteName := func() string {
		if site == nil {
			return ""
		}
		return *site
	}
	// trackDuration appends d to a rolling window that holds at most four samples.
	trackDuration := func(window []time.Duration, d time.Duration) []time.Duration {
		if len(window) >= 4 {
			window = window[1:]
		}
		return append(window, d)
	}
	// encodeQuery parses raw as the query string of a dummy URL, re-encodes
	// it and drops the final character of the result (the trailing "=" when
	// raw is a single key). It returns "" when raw cannot be parsed or the
	// encoded form is shorter than two characters.
	encodeQuery := func(raw string) string {
		l, err := url.Parse("http://a.com/?" + raw)
		if err != nil {
			// Previously the error was ignored and the nil URL dereferenced.
			return ""
		}
		tmpstr := l.Query().Encode()
		if len(tmpstr) > 1 {
			return tmpstr[:len(tmpstr)-1]
		}
		return ""
	}

	// ---- Go functions exposed to Lua ----

	// download: plain GET download. Stack: charset(-4), ishttps(-3), url(-2), head(-1).
	// Pushes the response body.
	s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int {
		if s.LastThreeTimes == nil {
			s.LastThreeTimes = make([]time.Duration, 4)
		}
		if util.Config.IsDelay {
			SleepTime(1, s.LastThreeTimes) // throttle based on recent durations
		}
		start := time.Now()
		head := S.ToTable(-1)
		url := S.ToString(-2)
		ishttps := S.ToBool(-3)
		charset := S.ToString(-4)
		if charset == "" {
			charset = s.Encoding
		}
		var retLen int64
		ret := Download(&retLen, s.Downloader, url, "get", util.GetTable(head), charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
		// Traffic accounting (disabled)
		//if retLen > 0 {
		//	key := Today + "+" + code
		//	if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
		//		if sfMap, ok := sf.(*SpiderFlow); ok {
		//			sfMap.Flow += retLen
		//			//sfMap.Site = *site
		//			//sfMap.Channel = *channel
		//			//sfMap.ModifyUser = *user
		//			SpiderFlowMap.Store(key, sfMap)
		//		}
		//	} else {
		//		SpiderFlowMap.Store(key, &SpiderFlow{
		//			//Code: code,
		//			Site:       *site,
		//			Channel:    *channel,
		//			Flow:       retLen,
		//			ModifyUser: *user,
		//		})
		//	}
		//}
		S.Push(lua.LString(ret))
		atomic.AddInt32(&s.ToDayRequestNum, 1)
		atomic.AddInt32(&s.TotalRequestNum, 1)
		s.LastThreeTimes = trackDuration(s.LastThreeTimes, time.Since(start))
		return 1
	}))

	// downloadAdv: advanced download. Stack: charset(-7), ishttps(-6), url(-5),
	// method(-4), param(-3), head(-2), cookie(-1).
	// Pushes body, response cookies (JSON) and response headers (table).
	s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int {
		if s.LastThreeTimes == nil {
			s.LastThreeTimes = make([]time.Duration, 4)
		}
		if util.Config.IsDelay {
			SleepTime(1, s.LastThreeTimes) // throttle based on recent durations
		}
		start := time.Now()
		cookie := S.ToString(-1)
		head := S.ToTable(-2)
		param := S.ToTable(-3)
		method := S.ToString(-4)
		url := S.ToString(-5)
		ishttps := S.ToBool(-6)
		charset := S.ToString(-7)
		if charset == "" {
			charset = s.Encoding
		}
		var mycookie []*http.Cookie
		// Best effort: an unparsable cookie string leaves mycookie nil.
		json.Unmarshal([]byte(cookie), &mycookie)
		var ret string
		var retcookie []*http.Cookie
		var headers = map[string]interface{}{}
		var retLen int64
		if param == nil {
			// param was not a table: send it as raw text instead.
			ptext := map[string]interface{}{"text": S.ToString(-3)}
			ret, retcookie, headers = DownloadAdv(&retLen, s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
		} else {
			ret, retcookie, headers = DownloadAdv(&retLen, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
		}
		// Traffic accounting is disabled here as well; the commented-out code
		// is identical to the one kept in the "download" builtin.
		S.Push(lua.LString(ret))
		scookie, _ := json.Marshal(retcookie)
		S.Push(lua.LString(scookie))
		hTable := util.MapToLuaTable(S, headers)
		S.Push(hTable)
		atomic.AddInt32(&s.ToDayRequestNum, 1)
		atomic.AddInt32(&s.TotalRequestNum, 1)
		s.LastThreeTimes = trackDuration(s.LastThreeTimes, time.Since(start))
		return 3
	}))

	// downloadFile: download an attachment and upload it.
	// Stack: fileName(-6), url(-5), method(-4), param(-3), head(-2), cookie(-1).
	// Pushes url, name, size, ftype, fid.
	s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
		if s.FileLastThreeTimes == nil {
			s.FileLastThreeTimes = make([]time.Duration, 4)
		}
		if util.Config.IsDelay {
			SleepTime(3, s.FileLastThreeTimes) // throttle based on recent durations
		}
		start := time.Now()
		cookie := S.ToString(-1)
		head := S.ToTable(-2)
		param := S.ToTable(-3)
		method := S.ToString(-4)
		url := S.ToString(-5)
		fileName := S.ToString(-6)
		ishttps := strings.Contains(url, "https")
		name, size, ftype, fid := "", "", "", ""
		tmpUrl := ""
		var ret []byte
		var mycookie []*http.Cookie
		if cookie != "{}" {
			json.Unmarshal([]byte(cookie), &mycookie)
		} else {
			mycookie = make([]*http.Cookie, 0)
		}
		if pos := strings.Index(url, "data:image"); pos >= 0 {
			// Inline base64 image (seen on http://www.mmjyjt.com/): decode
			// the payload after the first comma and upload it directly.
			url = url[pos:]
			fileName = "文件下载.jpg"
			index := strings.Index(url, ",")
			dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(url[index+1:]))
			var decErr error
			ret, decErr = io.ReadAll(dec)
			if decErr == nil && len(ret) > 0 {
				url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, "", ret)
			}
		} else {
			fileName = strings.TrimSpace(fileName)
			url = strings.TrimSpace(url)
			tmpUrl = url
			ret = DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
			url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
			if strings.TrimSpace(ftype) == "" {
				// Fall back to the extension of the uploaded name.
				if ext := path.Ext(name); len(ext) > 0 {
					ftype = ext[1:]
				}
			}
		}
		switch currentSite := siteName(); {
		case currentSite == "中国招标投标公共服务平台":
			// Site-specific filtering of abnormal attachments.
			if fid != "" && strings.Contains(fid, ErrFid) {
				// Access-restricted attachment: blank the info; per the
				// original author, AnalysisProjectInfo then treats the
				// download as failed and retries it.
				size, ftype, fid = "", "", ""
			} else if bttype := qu.GetFileType(ret); bttype != "pdf" {
				// The type sniffed from the bytes is not pdf.
				logger.Info("Error File Type:", bttype, url)
				size, ftype, fid = "", "", ""
			}
		case currentSite == "中国政府采购网" && tmpUrl != "":
			// Attachments of 4.1/4.2 KB are treated as abnormal for this
			// site (attributed to IP limiting by the original author).
			if size == "4.1 KB" || size == "4.2 KB" {
				// Retry up to three times.
				// e.g. http://www.ccgp.gov.cn/cggg/dfgg/jzxcs/202302/t20230210_19437644.htm
				for attempt := 1; attempt <= 3; attempt++ {
					ret = DownloadFile(s.Downloader, tmpUrl, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
					bsLen := qu.ConvertFileSize(len(ret))
					if bsLen != "4.1 KB" && bsLen != "4.2 KB" && bsLen != "0 B" {
						url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, tmpUrl, ret)
						break
					}
				}
				if size == "4.1 KB" || size == "4.2 KB" { // still abnormal after retrying
					fid = ""
					ftype = ""
					name = ""
				}
			}
		}
		S.Push(lua.LString(url))
		S.Push(lua.LString(name))
		S.Push(lua.LString(size))
		S.Push(lua.LString(ftype))
		S.Push(lua.LString(fid))
		atomic.AddInt32(&s.ToDayRequestNum, 1)
		atomic.AddInt32(&s.TotalRequestNum, 1)
		s.FileLastThreeTimes = trackDuration(s.FileLastThreeTimes, time.Since(start))
		return 5
	}))

	// downloadBase64File: decode and upload a base64 image.
	// Stack: url(-3), fileName(-2), base64Img(-1). Pushes url, name, size, ftype, fid.
	s.L.SetGlobal("downloadBase64File", s.L.NewFunction(func(S *lua.LState) int {
		url := S.ToString(-3)
		fileName := S.ToString(-2)
		base64Img := S.ToString(-1)
		if fileName == "" {
			fileName = "文件下载"
		}
		fileName = fileName + ".jpg"
		// Everything after the first comma is the payload (whole string if none).
		i := strings.Index(base64Img, ",")
		dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(base64Img[i+1:]))
		ret, err := io.ReadAll(dec)
		name, size, ftype, fid := "", "", "", ""
		if err == nil && len(ret) > 0 {
			url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
		}
		S.Push(lua.LString(url))
		S.Push(lua.LString(name))
		S.Push(lua.LString(size))
		S.Push(lua.LString(ftype))
		S.Push(lua.LString(fid))
		return 5
	}))

	// saveErrLog: record a validation error. Stack: code(-4), name(-3), url(-2), content(-1).
	s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
		code := S.ToString(-4)
		name := S.ToString(-3)
		url := S.ToString(-2)
		content := S.ToString(-1)
		//saveVerificationLog(code, name, url, content)
		logger.Info("Error Log:", code, name, url, content)
		atomic.AddInt32(&s.ErrorNum, 1)
		atomic.AddInt32(&s.NoDownloadNum, 1)
		// Slow the script down so it cannot flood the log.
		util.TimeSleepFunc(5*time.Second, TimeSleepChan)
		return 0
	}))

	// saveRevisionLog: record a site-revision notice. Stack: url(-2), str(-1).
	s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int {
		url := S.ToString(-2)
		str := S.ToString(-1)
		logger.Error(s.SCode, url, str)
		return 0
	}))

	// findHasExit: obsolete existence check; always pushes false.
	s.L.SetGlobal("findHasExit", s.L.NewFunction(func(S *lua.LState) int {
		//c := S.ToString(-2)
		//q := S.ToString(-1)
		//b := findHasExit(c, q)
		S.Push(lua.LBool(false))
		return 1
	}))

	// findOneText. Stack: nodetype(-3), gpath(-2), content(-1).
	s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int {
		nodetype := S.ToString(-3)
		gpath := S.ToString(-2)
		content := S.ToString(-1)
		ret := util.FindOneText(gpath, content, nodetype)
		S.Push(ret)
		return 1
	}))

	// findContentText. Stack: gpath(-2), content(-1).
	s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int {
		gpath := S.ToString(-2)
		content := S.ToString(-1)
		ret := util.FindContentText(gpath, content)
		S.Push(ret)
		return 1
	}))

	// findOneHtml. Stack: nodetype(-3), gpath(-2), content(-1).
	s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int {
		nodetype := S.ToString(-3)
		gpath := S.ToString(-2)
		content := S.ToString(-1)
		ret := util.FindOneHtml(gpath, content, nodetype)
		S.Push(ret)
		return 1
	}))

	// findListText. Stack: gpath(-2), content(-1). Pushes a table filled by util.FindListText.
	s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int {
		gpath := S.ToString(-2)
		content := S.ToString(-1)
		ret := s.L.NewTable()
		util.FindListText(gpath, content, ret)
		S.Push(ret)
		return 1
	}))

	// findListHtml. Stack: gpath(-2), content(-1). Pushes a table filled by util.FindListHtml.
	s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int {
		gpath := S.ToString(-2)
		content := S.ToString(-1)
		ret := s.L.NewTable()
		util.FindListHtml(gpath, content, ret)
		//if ret.Len() > 0 {
		//	UpdateHeart(site, channel, code, user, "findlist") // heartbeat: actual number of list-page items collected
		//}
		S.Push(ret)
		return 1
	}))

	// sendListNum: report the number of list-page items downloaded. Stack: table(-1).
	// NOTE(review): returns 1 without pushing a value; what the Lua caller
	// receives depends on gopher-lua's stack handling — confirm before changing.
	s.L.SetGlobal("sendListNum", s.L.NewFunction(func(S *lua.LState) int {
		table := S.ToTable(-1)
		list := util.TableToMap(table)
		logger.Info(s.SCode, len(list))
		//if len(list) > 0 {
		//	UpdateHeart(*site, *channel, code, *user, "findlist") // heartbeat: actual number of list-page items collected
		//}
		return 1
	}))

	// s.L.SetGlobal("findMgoData", s.L.NewFunction(func(S *lua.LState) int {
	// 	update := [][]map[string]interface{}{}
	// 	query := map[string]interface{}{"state": 0}
	// 	data, _ := Mgo.Find(util.Config.TmpCollName, query, `{"_id":-1}`, nil, false, 0, 10)
	// 	pageList := []interface{}{}
	// 	for _, d := range *data {
	// 		tmpMap := map[string]string{}
	// 		tmpMap["title"] = qu.ObjToString(d["title"])
	// 		tmpMap["detail"] = qu.ObjToString(d["detail"])
	// 		tmpMap["href"] = qu.ObjToString(d["href"])
	// 		publishtime := qu.Int64All(d["publishtime"])
	// 		tmpMap["publishtime"] = qu.FormatDateByInt64(&publishtime, qu.Date_Full_Layout)
	// 		tmpMap["_id"] = qu.BsonIdToSId(d["_id"])
	// 		pageList = append(pageList, tmpMap)
	// 		update = append(update, []map[string]interface{}{
	// 			map[string]interface{}{"_id": d["_id"]},
	// 			map[string]interface{}{"$set": map[string]interface{}{"state": 1}},
	// 		})
	// 	}
	// 	ret := util.MapToTable(s.L, pageList)
	// 	S.Push(ret)
	// 	if len(update) > 0 {
	// 		Mgo.UpdateBulk(util.Config.TmpCollName, update...)
	// 	}
	// 	return 1
	// }))

	// findMap. Stack: qmap(-2), content(-1). Pushes a table filled by util.FindMap.
	s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int {
		qmap := S.ToTable(-2)
		content := S.ToString(-1)
		ret := s.L.NewTable()
		util.FindMap(qmap, content, ret)
		S.Push(ret)
		return 1
	}))

	// getEcpsCode: resolve a captcha code. Stack: area(-2), content(-1).
	// Falls back to GetCodeByWx when util.GetEcpsCode reports state "wx".
	s.L.SetGlobal("getEcpsCode", s.L.NewFunction(func(S *lua.LState) int {
		area := strings.ToUpper(S.ToString(-2))
		content := S.ToString(-1)
		code, state := util.GetEcpsCode(area, []byte(content))
		if state == "wx" {
			code, _ = GetCodeByWx([]byte(content))
		}
		S.Push(lua.LString(code))
		return 1
	}))

	// jsvm: run JavaScript through the JS VM service. Stack: js(-1).
	// Pushes a table with string fields "val" and "err".
	s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int {
		js := S.ToString(-1)
		ret := s.L.NewTable()
		if js == "" {
			ret.RawSet(lua.LString("val"), lua.LString(""))
			ret.RawSet(lua.LString("err"), lua.LString("js is null"))
		} else {
			rep := util.JsVmPost(util.Config.JsVmUrl, js)
			ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"])))
			ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"])))
		}
		S.Push(ret)
		return 1
	}))

	// changeDownloader: pick another downloader and push its name.
	s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int {
		s.Downloader = GetOneDownloader()
		S.Push(lua.LString(s.Downloader))
		return 1
	}))

	// changeDownloaderFile: pick another file downloader and push its name.
	s.L.SetGlobal("changeDownloaderFile", s.L.NewFunction(func(S *lua.LState) int {
		s.Downloader = GetOneDownloaderFile()
		S.Push(lua.LString(s.Downloader))
		return 1
	}))

	// timeSleep: manual delay; currently a fixed two seconds regardless of arguments.
	s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int {
		// if workTime {
		// 	util.TimeSleepFunc(time.Duration(S.ToInt(-1))*time.Second, TimeSleepChan)
		// } else {
		// 	util.TimeSleepFunc(1*time.Second, TimeSleepChan)
		// }
		util.TimeSleepFunc(time.Second*2, TimeSleepChan)
		return 0
	}))

	// transCode: encode/decode a string. Stack: codeType(-2), str(-1).
	// Unknown code types push str unchanged.
	s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int {
		codeType := strings.ToLower(S.ToString(-2))
		str := S.CheckString(-1)
		switch codeType {
		case "unicode":
			str = strings.Replace(str, "%u", "\\u", -1)
			str = transUnic(str)
		case "urlencode_gbk":
			// Transcode to GBK first; a transcoding error keeps the bytes read so far.
			data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
			str = encodeQuery(string(data))
		case "urlencode_utf8":
			str = encodeQuery(str)
		case "urldecode_utf8":
			str, _ = url.QueryUnescape(str)
		case "decode64":
			str = util.DecodeB64(str)
		case "encodemd5":
			str = qu.GetMd5String(str)
		case "htmldecode":
			// Decimal HTML entities, e.g. "&#22826;&#38451;" becomes "太阳".
			reg, _ := regexp.Compile("&#\\d+;")
			str = reg.ReplaceAllStringFunc(str, func(src string) string {
				v, err := strconv.Atoi(src[2 : len(src)-1])
				if err != nil {
					// Number does not fit an int: leave the entity untouched.
					return src
				}
				return string(rune(v))
			})
		}
		S.Push(lua.LString(str))
		return 1
	}))

	// unGzip: decompress a gzip-compressed HTML body. Stack: html(-1).
	// Input that is not valid gzip is pushed back unchanged.
	s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int {
		html := S.ToString(-1)
		gzipreader, err := gzip.NewReader(bytes.NewReader([]byte(html)))
		if err != nil {
			// Previously the error was ignored and the nil reader dereferenced.
			logger.Debug(s.SCode, "unGzip:", err.Error())
			S.Push(lua.LString(html))
			return 1
		}
		defer gzipreader.Close()
		// A mid-stream read error keeps whatever was decompressed so far.
		bs, _ := ioutil.ReadAll(gzipreader)
		S.Push(lua.LString(bs))
		return 1
	}))

	// getSimpleListPage: generic list extraction provided for luamaker.
	// Stack: html(-3), date_pattern(-2), pageListUrl(-1).
	// Pushes a table of {title, href, publishtime} entries.
	s.L.SetGlobal("getSimpleListPage", s.L.NewFunction(func(S *lua.LState) int {
		html := S.ToString(-3)
		date_pattern := S.ToString(-2)
		pageListUrl := S.ToString(-1) // list page URL
		bs := []byte(html)
		tmparr := []string{}
		tmpret := []int{}
		re, _ := regexp.Compile(`采购|招标|公示|公告|意见|结果|通知|工程`)
		doc, _ := gq.NewDocumentFromReader(bytes.NewReader(bs))
		// Mark every sufficiently long link text as 1 (keyword hit) or 0.
		doc.Find("a").Each(func(i int, sq *gq.Selection) {
			text := sq.Text()
			if len(text) < 30 {
				return
			}
			tmparr = append(tmparr, text)
			if re.MatchString(text) {
				tmpret = append(tmpret, 1)
				//logger.Debug(text)
			} else {
				tmpret = append(tmpret, 0)
			}
		})
		logger.Debug(tmpret)
		// Linear scan: for each hit, count the consecutive hits within
		// MAX_STEP positions on either side and remember the best one.
		tmplen, thepos, themax := len(tmpret), -1, 0
		for i := 0; i < tmplen; i++ {
			if tmpret[i] == 0 {
				continue
			}
			start, end := i-MAX_STEP, i+MAX_STEP
			if start < 0 {
				start = 0
			}
			if end > tmplen {
				end = tmplen
			}
			tmp := 0
			// Walk left, then right, from the current position.
			for j := i; j > start; j-- {
				if tmpret[j] == 1 {
					tmp++
				} else {
					break
				}
			}
			for j := i; j < end; j++ {
				if tmpret[j] == 1 {
					tmp++
				} else {
					break
				}
			}
			if tmp > themax {
				themax = tmp
				thepos = i
			}
		}
		//logger.Debug("position search finished")
		if thepos == -1 {
			// No keyword link at all: this page layout is not supported.
			logger.Error("完蛋,找不到")
			panic("不支持啊,失败啊")
		}
		// Locate the anchor with the winning text (the last one if duplicated).
		var thelink *gq.Selection
		doc.Find("a").Each(func(i int, sq *gq.Selection) {
			if sq.Text() == tmparr[thepos] {
				thelink = sq
			}
		})
		isfind := false
		// Walk up at most MAX_STEP parents looking for the list container.
		for i := 0; i < MAX_STEP; i++ {
			thelink = thelink.Parent()
			clen := getChildrenLen(thelink)
			if clen >= themax-1 {
				isfind = true
				break
			}
			//logger.Debug("TAG:::", thelink.Nodes[0].Data, clen)
		}
		pageList := []interface{}{}
		if isfind {
			thelink.Children().Each(func(i int, sq *gq.Selection) {
				page := map[string]string{}
				link_sq := sq.Find("a")
				href := link_sq.AttrOr("href", "")
				text := link_sq.Text()
				page["title"] = text
				page["href"] = dealHref(pageListUrl, href)
				page["publishtime"] = dealPublishTime(strings.TrimSpace(sq.Text()), date_pattern)
				//logger.Debug(i)
				pageList = append(pageList, page)
			})
		} else {
			logger.Error("完蛋,找父亲节点失败啊")
			//panic("不支持啊,失败啊")
		}
		ret := util.MapToTable(s.L, pageList)
		S.Push(ret)
		return 1
	}))

	// titleRepeatJudgement: title duplicate check; always pushes false.
	s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
		S.Push(lua.LBool(false))
		return 1
	}))

	// urlRepeatJudgement: URL duplicate check (2016-12-14 wanghuidong); always pushes false.
	s.L.SetGlobal("urlRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
		S.Push(lua.LBool(false))
		return 1
	}))

	// putUrl2Redis: put a URL into the memory cache (2016-12-14 wanghuidong); now a no-op.
	// NOTE(review): returns 1 without pushing a value — confirm intended result.
	s.L.SetGlobal("putUrl2Redis", s.L.NewFunction(func(S *lua.LState) int {
		//url := S.ToString(-1)
		return 1
	}))

	// officeAnalysis: send a base64 attachment (pdf/doc/docx) to the office
	// analysis service. Stack: str(-2), extension(-1). The payload is prefixed
	// with one byte encoding the extension (unknown extensions map to 0).
	// NOTE(review): returns 1 without pushing a value — confirm intended result.
	s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int {
		ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)}
		str := S.ToString(-2)
		extension := S.ToString(-1)
		bs, _ := base64.StdEncoding.DecodeString(str)
		bs = append([]byte{ext[extension]}, bs...)
		msgid := mu.UUID(8)
		Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60)
		return 1
	}))

	// clearMemoeryCache: now a no-op.
	// NOTE(review): returns 1 without pushing a value — confirm intended result.
	s.L.SetGlobal("clearMemoeryCache", s.L.NewFunction(func(S *lua.LState) int {
		/*title := S.ToString(-1)
		isExist, _ := redis.Exists("title_repeat_judgement", "title_repeat_"+title)
		if isExist {
			redis.Del("title_repeat_judgement", "title_repeat_"+title)
		}*/
		return 1
	}))

	// regexp: extract by regular expression. Stack: text(-3), regstr(-2), index(-1).
	// index is an offset into the submatch index pairs returned by
	// FindAllStringSubmatchIndex (0 selects the whole match). Matches for
	// which the requested pair is missing or unmatched are skipped.
	// An invalid pattern still panics via regexp.MustCompile, as before.
	s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int {
		index := int(S.ToNumber(-1))
		regstr := S.ToString(-2)
		text := S.ToString(-3)
		reg := regexp.MustCompile(regstr)
		reps := reg.FindAllStringSubmatchIndex(text, -1)
		ret := s.L.NewTable()
		number := 0
		for _, v := range reps {
			if index < 0 || index+1 >= len(v) || v[index] < 0 || v[index+1] < v[index] {
				continue
			}
			number++
			ret.Insert(number, lua.LString(text[v[index]:v[index+1]]))
		}
		S.Push(ret)
		return 1
	}))

	// replace: replace all occurrences. Stack: text(-3), old(-2), repl(-1).
	s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int {
		text := S.ToString(-3)
		old := S.ToString(-2)
		repl := S.ToString(-1)
		text = strings.Replace(text, old, repl, -1)
		S.Push(lua.LString(text))
		return 1
	}))

	// pagefilterword: keep list entries whose title matches the configured
	// keyword pattern and not the exclusion pattern. Stack: data(-1).
	// Entries that are not tables are skipped.
	s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int {
		keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
		notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
		data := S.ToTable(-1)
		dataMap := util.TableToMap(data)
		ret := s.L.NewTable()
		num := 1
		for _, v := range dataMap {
			tmp, ok := v.(map[string]interface{})
			if !ok {
				continue
			}
			title := qu.ObjToString(tmp["title"])
			if title == "" || !keyWordReg.MatchString(title) || notKeyWordReg.MatchString(title) {
				continue
			}
			ret.Insert(num, util.MapToLuaTable(S, tmp))
			num++
		}
		S.Push(ret)
		return 1
	}))

	// detailfilterword: same keyword/exclusion check for a single detail
	// record. Stack: data(-1). Pushes true when the title passes.
	s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int {
		keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
		notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
		data := S.ToTable(-1)
		dataMap := util.TableToMap(data)
		title := qu.ObjToString(dataMap["title"])
		if title != "" && keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
			S.Push(lua.LBool(true))
			return 1
		}
		qu.Debug(s.SCode, dataMap["href"], " title error")
		S.Push(lua.LBool(false))
		return 1
	}))

	// filterdetail: detail text check. Stack: detail(-1).
	// Original author's notes:
	//   1. length check (special cases: "see original page", "none",
	//      "no related content", "no body content", "see attachment")
	//   2. whether the text contains Chinese characters
	// NOTE(review): as written, true is pushed only when the special-case
	// pattern matches; every other path pushes false, including details that
	// are long enough and contain Chinese characters. The final false looks
	// like it may have been meant to be true — confirm with the script authors
	// before changing.
	s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int {
		reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|见附件|详见附件)")
		reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
		detail := S.ToString(-1)
		if reg1.MatchString(detail) {
			S.Push(lua.LBool(true))
			return 1
		}
		if len([]rune(detail)) < 50 || !reg2.MatchString(detail) {
			S.Push(lua.LBool(false))
			return 1
		}
		S.Push(lua.LBool(false))
		return 1
	}))

	// matchan: report whether detail still contains a Chinese character after
	// the "see attachment" phrases are removed. Stack: detail(-1).
	s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int {
		reg1 := regexp.MustCompile("(见附件|详见附件)")
		reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
		detail := S.ToString(-1)
		detail = reg1.ReplaceAllString(detail, "")
		ok := reg2.MatchString(detail)
		S.Push(lua.LBool(ok))
		return 1
	}))

	// encodeBase64: standard base64 encoding. Stack: text(-1).
	s.L.SetGlobal("encodeBase64", s.L.NewFunction(func(S *lua.LState) int {
		text := S.ToString(-1)
		base64Text := base64.StdEncoding.EncodeToString([]byte(text))
		S.Push(lua.LString(base64Text))
		return 1
	}))

	// decodeBase64: standard base64 decoding; pushes "" on invalid input. Stack: text(-1).
	s.L.SetGlobal("decodeBase64", s.L.NewFunction(func(S *lua.LState) int {
		text := S.ToString(-1)
		result := ""
		byteText, err := base64.StdEncoding.DecodeString(text)
		if err == nil {
			result = string(byteText)
		}
		S.Push(lua.LString(result))
		return 1
	}))

	// aesEncryptCBC: AES-CBC encrypt, base64-encoded result. Stack: origData(-3), key(-2), iv(-1).
	s.L.SetGlobal("aesEncryptCBC", s.L.NewFunction(func(S *lua.LState) int {
		origData := S.ToString(-3)
		key := S.ToString(-2)
		iv := S.ToString(-1)
		encrypted := util.AesCBCEncrypt([]byte(origData), []byte(key), []byte(iv))
		result := base64.StdEncoding.EncodeToString(encrypted)
		S.Push(lua.LString(result))
		return 1
	}))

	// aesDecryptCBC: AES-CBC decrypt of base64 input. Stack: origData(-3), key(-2), iv(-1).
	s.L.SetGlobal("aesDecryptCBC", s.L.NewFunction(func(S *lua.LState) int {
		origData := S.ToString(-3)
		key := S.ToString(-2)
		iv := S.ToString(-1)
		// Invalid base64 is not reported; whatever was decoded is passed on.
		data, _ := base64.StdEncoding.DecodeString(origData)
		result := util.AesCBCDecrypter(data, []byte(key), []byte(iv))
		S.Push(lua.LString(result))
		return 1
	}))

	// aesEncryptECB: AES-ECB encrypt, base64-encoded result. Stack: origData(-2), key(-1).
	s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
		origData := S.ToString(-2)
		key := S.ToString(-1)
		encrypted := util.AesECBEncrypt([]byte(origData), []byte(key))
		result := base64.StdEncoding.EncodeToString(encrypted)
		S.Push(lua.LString(result))
		return 1
	}))

	// aesDecryptECB: AES-ECB decrypt of base64 input. Stack: origData(-2), key(-1).
	s.L.SetGlobal("aesDecryptECB", s.L.NewFunction(func(S *lua.LState) int {
		origData := S.ToString(-2)
		key := S.ToString(-1)
		data, _ := base64.StdEncoding.DecodeString(origData)
		result := util.AesECBDecrypter(data, []byte(key))
		S.Push(lua.LString(result))
		return 1
	}))

	// desEncryptECB: DES-ECB encrypt, base64-encoded result. Stack: origData(-2), key(-1).
	s.L.SetGlobal("desEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
		origData := S.ToString(-2)
		key := S.ToString(-1)
		encrypted := util.DesECBEncrypt([]byte(origData), []byte(key))
		result := base64.StdEncoding.EncodeToString(encrypted)
		S.Push(lua.LString(result))
		return 1
	}))

	// desDecryptECB: DES-ECB decrypt of base64 input. Stack: origData(-2), key(-1).
	s.L.SetGlobal("desDecryptECB", s.L.NewFunction(func(S *lua.LState) int {
		origData := S.ToString(-2)
		key := S.ToString(-1)
		data, _ := base64.StdEncoding.DecodeString(origData)
		result := util.DesECBDecrypter(data, []byte(key))
		S.Push(lua.LString(result))
		return 1
	}))

	// desEncryptCBC: DES-CBC encrypt, base64-encoded result. Stack: origData(-3), key(-2), iv(-1).
	s.L.SetGlobal("desEncryptCBC", s.L.NewFunction(func(S *lua.LState) int {
		origData := S.ToString(-3)
		key := S.ToString(-2)
		iv := S.ToString(-1)
		encrypted := util.DesCBCEncrypt([]byte(origData), []byte(key), []byte(iv))
		result := base64.StdEncoding.EncodeToString(encrypted)
		S.Push(lua.LString(result))
		return 1
	}))

	// desDecryptCBC: DES-CBC decrypt of base64 input. Stack: origData(-3), key(-2), iv(-1).
	s.L.SetGlobal("desDecryptCBC", s.L.NewFunction(func(S *lua.LState) int {
		origData := S.ToString(-3)
		key := S.ToString(-2)
		iv := S.ToString(-1)
		data, _ := base64.StdEncoding.DecodeString(origData)
		result := util.DesCBCDecrypter(data, []byte(key), []byte(iv))
		S.Push(lua.LString(result))
		return 1
	}))

	// rsaEncrypt: RSA public-key encrypt, base64-encoded result. Stack: origData(-2), key(-1).
	s.L.SetGlobal("rsaEncrypt", s.L.NewFunction(func(S *lua.LState) int {
		origData := S.ToString(-2)
		key := S.ToString(-1)
		encrypted := util.EncryptWithPublicKey([]byte(origData), []byte(key))
		result := base64.StdEncoding.EncodeToString(encrypted)
		S.Push(lua.LString(result))
		return 1
	}))

	// rsaDecrypt: RSA private-key decrypt of base64 input. Stack: origData(-2), key(-1).
	s.L.SetGlobal("rsaDecrypt", s.L.NewFunction(func(S *lua.LState) int {
		origData := S.ToString(-2)
		key := S.ToString(-1)
		data, _ := base64.StdEncoding.DecodeString(origData)
		result := util.DecryptWithPrivateKey(data, []byte(key))
		S.Push(lua.LString(result))
		return 1
	}))

	// getPublishtime: derive the publish time from the body.
	// Stack: detail(-2), contenthtml(-1). contenthtml is passed first to util.GetPublishtime.
	s.L.SetGlobal("getPublishtime", s.L.NewFunction(func(S *lua.LState) int {
		detail := S.ToString(-2)
		contenthtml := S.ToString(-1)
		publishtime := util.GetPublishtime([]string{contenthtml, detail})
		S.Push(lua.LString(publishtime))
		return 1
	}))

	// stringFind: first regexp match. Stack: text(-2), regstr(-1).
	// Pushes the match and whether it is non-empty.
	s.L.SetGlobal("stringFind", s.L.NewFunction(func(S *lua.LState) int {
		regstr := S.ToString(-1)
		text := S.ToString(-2)
		textReg := regexp.MustCompile(regstr)
		//spaceReg := regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
		//text = spaceReg.ReplaceAllString(text, "")
		result := textReg.FindString(text)
		S.Push(lua.LString(result))
		S.Push(lua.LBool(result != ""))
		return 2
	}))

	// stringSub: rune-based substring with 1-based, inclusive positions.
	// Stack: text(-3), start(-2), end(-1). A negative end counts from the
	// back (-1 is the last rune); a negative start does too. Any range that
	// falls outside the text yields "" instead of panicking.
	s.L.SetGlobal("stringSub", s.L.NewFunction(func(S *lua.LState) int {
		text := S.ToString(-3)
		start := S.ToInt(-2)
		end := S.ToInt(-1)
		result := ""
		if len(text) > 0 {
			textRune := []rune(text)
			textLen := len(textRune)
			lo, hi := -1, -1
			switch {
			case end < 0 && start > 0: // from start to the end-th rune from the back
				lo, hi = start-1, textLen+1+end
			case end < 0 && start < 0: // both positions counted from the back
				lo, hi = textLen+start, textLen+1+end
			case end >= 0 && start > 0 && end >= start && end <= textLen: // start-th to end-th rune
				lo, hi = start-1, end
			}
			if lo >= 0 && lo <= hi && hi <= textLen {
				result = string(textRune[lo:hi])
			}
			// if end == -1 {
			// 	if start >= 1 { // forward slice to the end
			// 		result = string(textRune[start-1:])
			// 	} else if start < 0 && textLen+start >= 0 { // suffix counted from the back
			// 		result = string(textRune[textLen+start:])
			// 	}
			// } else if start >= 1 && end <= textLen { // start-th to end-th rune
			// 	result = string(textRune[start-1 : end])
			// }
		}
		S.Push(lua.LString(result))
		return 1
	}))

	// stringLen: length in runes. Stack: text(-1).
	s.L.SetGlobal("stringLen", s.L.NewFunction(func(S *lua.LState) int {
		text := S.ToString(-1)
		textLen := len([]rune(text))
		S.Push(lua.LNumber(textLen))
		return 1
	}))

	// getPureContent: strip comment and style spans matched by the pattern below. Stack: con(-1).
	s.L.SetGlobal("getPureContent", s.L.NewFunction(func(S *lua.LState) int {
		con := S.ToString(-1)
		reg := regexp.MustCompile("(?s)<(!%-%-|!--|style).*?(%-%-|--|style)>") // comments, css
		con = reg.ReplaceAllString(con, "")
		// indexArr := reg.FindAllStringIndex(con, -1)
		// for i := len(indexArr) - 1; i >= 0; i-- {
		// 	if index := indexArr[i]; len(index) == 2 {
		// 		con = con[:index[0]] + con[index[1]:]
		// 	}
		// }
		S.Push(lua.LString(con))
		return 1
	}))

	// formatToString: normalise a numeric string through decimal. Stack: strNum(-1).
	// The parse error is ignored, so unparsable input yields the zero decimal's string.
	s.L.SetGlobal("formatToString", s.L.NewFunction(func(S *lua.LState) int {
		strNum := S.ToString(-1)
		decimalNum, _ := decimal.NewFromString(strNum)
		S.Push(lua.LString(decimalNum.String()))
		return 1
	}))

	// getCodeByPath: fetch a captcha code through the code gRPC client.
	// Stack: proxy(-5), path(-4), stype(-3), head(-2), cookie(-1).
	// Pushes code, response headers (table) and response cookie.
	s.L.SetGlobal("getCodeByPath", s.L.NewFunction(func(S *lua.LState) int {
		cookie := S.ToString(-1)
		head := S.ToTable(-2)
		stype := S.ToString(-3)
		codePath := S.ToString(-4)
		proxy := S.ToBool(-5)
		headMap := util.GetTable(head)
		//qu.Debug("cookie----------", cookie)
		//qu.Debug("headMap----------", headMap)
		headJsonStr := ""
		if headByte, err := json.Marshal(headMap); err == nil {
			headJsonStr = string(headByte)
		}
		code, respHead, respCookie := codegrpc.GetCodeByPath(codePath, stype, headJsonStr, cookie, proxy)
		//qu.Debug("code====", code)
		//qu.Debug("respHead====", respHead)
		//qu.Debug("respCookie====", respCookie)
		S.Push(lua.LString(code))
		respHeadMap := map[string]interface{}{}
		// Best effort: unparsable headers produce an empty table.
		json.Unmarshal([]byte(respHead), &respHeadMap)
		hTable := util.MapToLuaTable(S, respHeadMap)
		S.Push(hTable)
		S.Push(lua.LString(respCookie))
		return 3
	}))

	// goRunJs: run JavaScript through the gorunjs client. Stack: param(-2), step(-1).
	s.L.SetGlobal("goRunJs", s.L.NewFunction(func(S *lua.LState) int {
		param := S.ToString(-2) // list or detail
		step := S.ToString(-1)  // argument
		result := gojs.GoRunJsGetResult(s.SCode, param, step)
		qu.Debug("Go Run Js Result:", param, step, result)
		S.Push(lua.LString(result))
		return 1
	}))

	// newDownloadFile: attachment download via NewDownloadFile.
	// Stack: fileName(-6), url(-5), method(-4), param(-3), head(-2), cookie(-1).
	// Pushes url, name, size, ftype, fid.
	s.L.SetGlobal("newDownloadFile", s.L.NewFunction(func(S *lua.LState) int {
		cookie := S.ToString(-1)
		head := S.ToTable(-2)
		param := S.ToTable(-3)
		method := S.ToString(-4)
		url := S.ToString(-5)
		fileName := S.ToString(-6)
		ishttps := strings.Contains(url, "https")
		var mycookie []*http.Cookie
		if cookie != "{}" {
			json.Unmarshal([]byte(cookie), &mycookie)
		} else {
			mycookie = make([]*http.Cookie, 0)
		}
		fileName = strings.TrimSpace(fileName)
		url = strings.TrimSpace(url)
		ret := NewDownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, false, ishttps, "", s.Timeout, false)
		url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
		if strings.TrimSpace(ftype) == "" {
			// Fall back to the extension of the uploaded name.
			if ext := path.Ext(name); len(ext) > 0 {
				ftype = ext[1:]
			}
		}
		// Site-specific filtering of abnormal attachments.
		if siteName() == "中国招标投标公共服务平台" {
			if fid != "" && strings.Contains(fid, ErrFid) {
				// Access-restricted attachment: blank the info; per the
				// original author, AnalysisProjectInfo then treats the
				// download as failed and retries it.
				size, ftype, fid = "", "", ""
			} else if bttype := qu.GetFileType(ret); bttype != "pdf" {
				// The type sniffed from the bytes is not pdf.
				logger.Info("Error File Type:", bttype, url)
				size, ftype, fid = "", "", ""
			}
		}
		S.Push(lua.LString(url))
		S.Push(lua.LString(name))
		S.Push(lua.LString(size))
		S.Push(lua.LString(ftype))
		S.Push(lua.LString(fid))
		return 5
	}))

	// downloadByRender: download through the page renderer. Stack: href(-1).
	s.L.SetGlobal("downloadByRender", s.L.NewFunction(func(S *lua.LState) int {
		href := S.ToString(-1)
		contentHtml := util.DownloadByRender(href)
		S.Push(lua.LString(contentHtml))
		return 1
	}))

	// downloadByChrome: chromedp download. Stack: timeout(-2), taskStr(-1).
	// taskStr is JSON for util.ChromeActionMap; RangeActions are appended to
	// BaseActions RangeTimes times. Pushes an empty table when the JSON is
	// invalid or contains no base actions.
	s.L.SetGlobal("downloadByChrome", s.L.NewFunction(func(S *lua.LState) int {
		timeout := S.ToInt64(-2)
		taskStr := S.ToString(-1)
		cam := util.ChromeActionMap{}
		if err := json.Unmarshal([]byte(taskStr), &cam); err != nil || len(cam.BaseActions) == 0 {
			S.Push(S.NewTable())
			return 1
		}
		if len(cam.RangeActions) > 0 && cam.RangeTimes > 0 {
			for times := 1; times <= cam.RangeTimes; times++ {
				cam.BaseActions = append(cam.BaseActions, cam.RangeActions...)
			}
		}
		chromeTask := util.ChromeTask{
			TimeOut: timeout,
			Actions: cam.BaseActions,
		}
		ret := DownloadByChrome(s.SCode, s.Downloader, chromeTask, s.Timeout)
		S.Push(util.MapToTable(S, ret))
		return 1
	}))

	// downloadByDataIntercept: download for the detail pages of
	// 中国招标投标公共服务平台 (described by the original author as protected by
	// Ruishu encryption). Stack: url(-4), url_regex(-3), timeout(-2), proxy(-1).
	s.L.SetGlobal("downloadByDataIntercept", s.L.NewFunction(func(S *lua.LState) int {
		url := S.ToString(-4)
		url_regex := S.ToString(-3)
		timeout := S.ToInt(-2)
		proxy := S.ToBool(-1)
		headers := util.DownloadByDataIntercept(url, url_regex, timeout, proxy)
		table := util.MapToLuaTable(S, headers)
		S.Push(table)
		return 1
	}))
	return ""
}

// dealHref turns a relative href found on a list page into a full URL.
//
//   - "./x" is appended to pageListUrl (after dropping the "./").
//   - "/x" is appended to the "scheme://host/" prefix of pageListUrl.
//
// Every other href, including the empty string and absolute URLs, yields "".
// NOTE(review): absolute and bare-relative hrefs are dropped, as before —
// confirm whether they should be passed through.
func dealHref(pageListUrl, href string) string {
	switch {
	case strings.HasPrefix(href, "./"):
		return pageListUrl + href[2:]
	case strings.HasPrefix(href, "/"):
		return hrefDomainReg.FindString(pageListUrl) + href[1:]
	}
	return ""
}

// dealPublishTime extracts the first timestamp in content that matches
// pattern, which must be one of "yyyy-MM-dd HH:mm:ss", "yyyy-MM-dd" or
// "MM-dd". It returns "" for any other pattern or when nothing matches.
func dealPublishTime(content string, pattern string) string {
	if r, ok := publishTimeRegs[pattern]; ok {
		return r.FindString(content)
	}
	return ""
}

// getChildrenLen returns the index of the last child of sq, i.e. the number
// of children minus one, and 0 when sq has no children. getSimpleListPage
// compares the result against themax-1.
func getChildrenLen(sq *gq.Selection) (ret int) {
	sq.Children().Each(func(i int, sq2 *gq.Selection) {
		ret = i
	})
	return
}

// transUnic replaces every `\uXXXX` escape (four hex digits) in str with the
// rune it denotes. Escapes whose digits do not parse are copied through
// unchanged and a warning is logged.
func transUnic(str string) string {
	buf := bytes.NewBuffer(nil)
	i, j := 0, len(str)
	for i < j {
		x := i + 6
		if x > j {
			// Fewer than six bytes left: cannot hold another escape.
			buf.WriteString(str[i:])
			break
		}
		if str[i] == '\\' && str[i+1] == 'u' {
			hex := str[i+2 : x]
			r, err := strconv.ParseUint(hex, 16, 64)
			if err == nil {
				buf.WriteRune(rune(r))
			} else {
				logger.Warn(err.Error())
				buf.WriteString(str[i:x])
			}
			i = x
		} else {
			buf.WriteByte(str[i])
			i++
		}
	}
	return buf.String()
}

// GetVar returns the string form of the Lua global named key.
func (s *Script) GetVar(key string) string {
	return s.L.GetGlobal(key).String()
}

// GetIntVar returns the Lua global named key as an int, or -1 when it is not a number.
func (s *Script) GetIntVar(key string) int {
	lv := s.L.GetGlobal(key)
	if v, ok := lv.(lua.LNumber); ok {
		return int(v)
	}
	return -1
}

// GetBoolVar returns the Lua global named key as a bool, or false when it is not a boolean.
func (s *Script) GetBoolVar(key string) bool {
	lv := s.L.GetGlobal(key)
	if v, ok := lv.(lua.LBool); ok {
		return bool(v)
	}
	return false
}

// SleepTime blocks for a number of minutes derived from the four most recent
// request durations in times (oldest first). A duration is "slow" when it
// exceeds basetime*60 seconds.
//
//   - Latest and previous both slow: sleep 1 minute plus 1 for each of the
//     two older samples that is also slow (1 to 3 minutes).
//   - Latest slow, previous below the threshold, both older ones slow: 1 minute.
//   - Otherwise: no sleep.
//
// times with fewer than four samples is ignored.
func SleepTime(basetime int, times []time.Duration) {
	if len(times) < 4 {
		return
	}
	st := 0 // minutes to sleep
	base := float64(basetime * 60)
	if times[3].Seconds() > base {
		if times[2].Seconds() > base {
			n := 0
			if times[0].Seconds() > base {
				n++
			}
			if times[1].Seconds() > base {
				n++
			}
			st = n + 1
		} else if times[2].Seconds() < base && times[0].Seconds() > base && times[1].Seconds() > base {
			st = 1
		}
	}
	if st > 0 {
		time.Sleep(time.Duration(st) * time.Minute)
	}
}

// generateKey folds key into a 16-byte key: the first 16 bytes are copied and
// every following byte is XORed into position (index mod 16). Shorter keys
// are zero-padded.
func generateKey(key []byte) (genKey []byte) {
	genKey = make([]byte, 16)
	copy(genKey, key)
	for i := 16; i < len(key); {
		for j := 0; j < 16 && i < len(key); j, i = j+1, i+1 {
			genKey[j] ^= key[i]
		}
	}
	return genKey
}