script.go 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930
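/*
Script loading and invocation wrapper.
In the early phase scripts are loaded from the file system;
later they are loaded from database configuration.
Functions shared between Lua scripts should be factored out, and the main
script file should load the shared Lua files.
*/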
  7. package spider
  8. import (
  9. codegrpc "analysiscode"
  10. "bytes"
  11. "compress/gzip"
  12. "crypto/aes"
  13. "encoding/base64"
  14. "encoding/json"
  15. "io/ioutil"
  16. mu "mfw/util"
  17. "net/http"
  18. "net/url"
  19. "path"
  20. "github.com/shopspring/decimal"
  21. qu "qfw/util"
  22. _ "qfw/util/redis"
  23. "regexp"
  24. util "spiderutil"
  25. "strconv"
  26. "strings"
  27. "sync/atomic"
  28. "time"
  29. gq "github.com/PuerkitoBio/goquery"
  30. "github.com/cjoudrey/gluahttp"
  31. "github.com/donnie4w/go-logger/logger"
  32. lujson "github.com/yuin/gopher-json"
  33. "github.com/yuin/gopher-lua"
  34. "golang.org/x/text/encoding/simplifiedchinese"
  35. "golang.org/x/text/transform"
  36. )
  37. //脚本
  38. type Script struct {
  39. SCode, ScriptFile string
  40. Encoding string
  41. Userproxy bool
  42. //Ishttps bool
  43. ErrorNum int32 //错误数
  44. Downloader string //下载器
  45. TotalRequestNum int32 //总请求次数
  46. ToDayRequestNum int32 //今日请求次数
  47. YestoDayRequestNum int32 //昨日请求次数
  48. Timeout int64 //超时时间秒
  49. L *lua.LState
  50. NoDownloadNum int32 //未成功下载数
  51. LastThreeTimes []time.Duration //单条信息流程完成的时间,最后三次
  52. FileLastThreeTimes []time.Duration //附件下载单条信息流程完成的时间,最后三次
  53. }
  54. const (
  55. MAX_STEP = 5 //计算时的最大步长
  56. )
  57. var workTime = true
  58. //
  59. func init() {
  60. go isWorkTime()
  61. }
  62. var TimeSleepChan = make(chan bool, 1)
  63. //加载文件
  64. func (s *Script) LoadScript(code, script_file string, newstate bool) string {
  65. defer mu.Catch()
  66. s.SCode = code
  67. s.ScriptFile = script_file
  68. if util.Config.Working == 0 {
  69. if newstate {
  70. s.L = lua.NewState(lua.Options{
  71. RegistrySize: 256 * 20,
  72. CallStackSize: 256,
  73. IncludeGoStackTrace: false,
  74. })
  75. }
  76. } else { //节能模式从CC池中获取lua.LState
  77. if newstate { //队列模式的newstate主要区分是列表页爬虫CC还是三级页爬虫CC2
  78. lState := <-CC2
  79. s.L = lState
  80. } else {
  81. lState := <-CC
  82. s.L = lState
  83. }
  84. //logger.Debug("获取CC资源", script_file)
  85. }
  86. s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader)
  87. s.L.PreloadModule("json", lujson.Loader)
  88. if err := s.L.DoString(script_file); err != nil {
  89. logger.Debug(code + ",加载lua脚本错误:" + err.Error())
  90. //panic(code + ",加载lua脚本错误:" + err.Error())
  91. }
  92. s.Encoding = s.GetVar("spiderPageEncoding")
  93. s.Userproxy = s.GetBoolVar("spiderUserProxy")
  94. //暴露go方法
  95. //download(url,head) 普通下载
  96. s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int {
  97. if s.LastThreeTimes == nil {
  98. s.LastThreeTimes = make([]time.Duration, 4)
  99. }
  100. if util.Config.IsDelay {
  101. SleepTime(1, s.LastThreeTimes) //睡眠时间
  102. }
  103. start := time.Now() //起始时间
  104. head := S.ToTable(-1)
  105. url := S.ToString(-2)
  106. ishttps := S.ToBool(-3)
  107. charset := S.ToString(-4)
  108. if charset == "" {
  109. charset = s.Encoding
  110. }
  111. ret := Download(s.Downloader, url, "get", util.GetTable(head), charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  112. S.Push(lua.LString(ret))
  113. atomic.AddInt32(&s.ToDayRequestNum, 1)
  114. atomic.AddInt32(&s.TotalRequestNum, 1)
  115. end := time.Since(start)
  116. if len(s.LastThreeTimes) >= 4 {
  117. s.LastThreeTimes = s.LastThreeTimes[1:]
  118. }
  119. s.LastThreeTimes = append(s.LastThreeTimes, end)
  120. return 1
  121. }))
  122. //高级下载downloadAdv(url,method,param,head,cookie)
  123. s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int {
  124. if s.LastThreeTimes == nil {
  125. s.LastThreeTimes = make([]time.Duration, 4)
  126. }
  127. if util.Config.IsDelay {
  128. SleepTime(1, s.LastThreeTimes) //睡眠时间
  129. }
  130. start := time.Now() //起始时间
  131. cookie := S.ToString(-1)
  132. head := S.ToTable(-2)
  133. param := S.ToTable(-3)
  134. method := S.ToString(-4)
  135. url := S.ToString(-5)
  136. ishttps := S.ToBool(-6)
  137. charset := S.ToString(-7)
  138. if charset == "" {
  139. charset = s.Encoding
  140. }
  141. var mycookie []*http.Cookie
  142. json.Unmarshal([]byte(cookie), &mycookie)
  143. var ret string
  144. var retcookie []*http.Cookie
  145. if param == nil {
  146. ptext := map[string]interface{}{"text": S.ToString(-3)}
  147. ret, retcookie = DownloadAdv(s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  148. } else {
  149. ret, retcookie = DownloadAdv(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  150. }
  151. S.Push(lua.LString(ret))
  152. scookie, _ := json.Marshal(retcookie)
  153. S.Push(lua.LString(scookie))
  154. atomic.AddInt32(&s.ToDayRequestNum, 1)
  155. atomic.AddInt32(&s.TotalRequestNum, 1)
  156. end := time.Since(start)
  157. if len(s.LastThreeTimes) >= 4 {
  158. s.LastThreeTimes = s.LastThreeTimes[1:]
  159. }
  160. s.LastThreeTimes = append(s.LastThreeTimes, end)
  161. return 2
  162. }))
  163. //保存验证错误日志
  164. s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
  165. code := S.ToString(-4)
  166. name := S.ToString(-3)
  167. url := S.ToString(-2)
  168. content := S.ToString(-1)
  169. saveVerificationLog(code, name, url, content)
  170. atomic.AddInt32(&s.ErrorNum, 1)
  171. atomic.AddInt32(&s.NoDownloadNum, 1)
  172. //防止恶意增加日志
  173. util.TimeSleepFunc(5*time.Second, TimeSleepChan)
  174. return 0
  175. }))
  176. //添加改版日志
  177. s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int {
  178. url := S.ToString(-2)
  179. str := S.ToString(-1)
  180. logger.Error(s.SCode, url, str)
  181. return 0
  182. }))
  183. //查找信息是否存在(作废)
  184. s.L.SetGlobal("findHasExit", s.L.NewFunction(func(S *lua.LState) int {
  185. //c := S.ToString(-2)
  186. //q := S.ToString(-1)
  187. //b := findHasExit(c, q)
  188. S.Push(lua.LBool(false))
  189. return 1
  190. }))
  191. s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int {
  192. nodetype := S.ToString(-3)
  193. gpath := S.ToString(-2)
  194. content := S.ToString(-1)
  195. ret := util.FindOneText(gpath, content, nodetype)
  196. S.Push(ret)
  197. return 1
  198. }))
  199. s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int {
  200. gpath := S.ToString(-2)
  201. content := S.ToString(-1)
  202. ret := util.FindContentText(gpath, content)
  203. S.Push(ret)
  204. return 1
  205. }))
  206. s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int {
  207. nodetype := S.ToString(-3)
  208. gpath := S.ToString(-2)
  209. content := S.ToString(-1)
  210. ret := util.FindOneHtml(gpath, content, nodetype)
  211. S.Push(ret)
  212. return 1
  213. }))
  214. s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int {
  215. gpath := S.ToString(-2)
  216. content := S.ToString(-1)
  217. ret := s.L.NewTable()
  218. util.FindListText(gpath, content, ret)
  219. S.Push(ret)
  220. return 1
  221. }))
  222. s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int {
  223. gpath := S.ToString(-2)
  224. content := S.ToString(-1)
  225. ret := s.L.NewTable()
  226. util.FindListHtml(gpath, content, ret)
  227. S.Push(ret)
  228. return 1
  229. }))
  230. // s.L.SetGlobal("findMgoData", s.L.NewFunction(func(S *lua.LState) int {
  231. // update := [][]map[string]interface{}{}
  232. // query := map[string]interface{}{"state": 0}
  233. // data, _ := Mgo.Find(util.Config.TmpCollName, query, `{"_id":-1}`, nil, false, 0, 10)
  234. // pageList := []interface{}{}
  235. // for _, d := range *data {
  236. // tmpMap := map[string]string{}
  237. // tmpMap["title"] = qu.ObjToString(d["title"])
  238. // tmpMap["detail"] = qu.ObjToString(d["detail"])
  239. // tmpMap["href"] = qu.ObjToString(d["href"])
  240. // publishtime := qu.Int64All(d["publishtime"])
  241. // tmpMap["publishtime"] = qu.FormatDateByInt64(&publishtime, qu.Date_Full_Layout)
  242. // tmpMap["_id"] = qu.BsonIdToSId(d["_id"])
  243. // pageList = append(pageList, tmpMap)
  244. // update = append(update, []map[string]interface{}{
  245. // map[string]interface{}{"_id": d["_id"]},
  246. // map[string]interface{}{"$set": map[string]interface{}{"state": 1}},
  247. // })
  248. // }
  249. // ret := util.MapToTable(s.L, pageList)
  250. // S.Push(ret)
  251. // if len(update) > 0 {
  252. // Mgo.UpdateBulk(util.Config.TmpCollName, update...)
  253. // }
  254. // return 1
  255. // }))
  256. s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int {
  257. qmap := S.ToTable(-2)
  258. content := S.ToString(-1)
  259. ret := s.L.NewTable()
  260. util.FindMap(qmap, content, ret)
  261. S.Push(ret)
  262. return 1
  263. }))
  264. //公示暴露方式
  265. s.L.SetGlobal("getEcpsCode", s.L.NewFunction(func(S *lua.LState) int {
  266. area := strings.ToUpper(S.ToString(-2))
  267. content := S.ToString(-1)
  268. code, state := util.GetEcpsCode(area, []byte(content))
  269. if state == "wx" {
  270. code, _ = GetCodeByWx([]byte(content))
  271. }
  272. S.Push(lua.LString(code))
  273. return 1
  274. }))
  275. //调用jsvm
  276. s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int {
  277. js := S.ToString(-1)
  278. ret := s.L.NewTable()
  279. if js == "" {
  280. ret.RawSet(lua.LString("val"), lua.LString(""))
  281. ret.RawSet(lua.LString("err"), lua.LString("js is null"))
  282. } else {
  283. rep := util.JsVmPost(util.Config.JsVmUrl, js)
  284. ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"])))
  285. ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"])))
  286. }
  287. S.Push(ret)
  288. return 1
  289. }))
  290. //指定下载器
  291. s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int {
  292. s.Downloader = GetOneDownloader()
  293. S.Push(lua.LString(s.Downloader))
  294. return 1
  295. }))
  296. //指定下载器file
  297. s.L.SetGlobal("changeDownloaderFile", s.L.NewFunction(func(S *lua.LState) int {
  298. s.Downloader = GetOneDownloaderFile()
  299. S.Push(lua.LString(s.Downloader))
  300. return 1
  301. }))
  302. //手工延时
  303. s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int {
  304. // if workTime {
  305. // util.TimeSleepFunc(time.Duration(S.ToInt(-1))*time.Second, TimeSleepChan)
  306. // } else {
  307. // util.TimeSleepFunc(1*time.Second, TimeSleepChan)
  308. // }
  309. util.TimeSleepFunc(time.Second*2, TimeSleepChan)
  310. return 0
  311. }))
  312. //编码解码
  313. s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int {
  314. codeType := strings.ToLower(S.ToString(-2))
  315. str := S.CheckString(-1)
  316. switch codeType {
  317. case "unicode":
  318. str = transUnic(str)
  319. case "urlencode_gbk":
  320. data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
  321. l, _ := url.Parse("http://a.com/?" + string(data))
  322. tmpstr := l.Query().Encode()
  323. if len(tmpstr) > 1 {
  324. str = tmpstr[0 : len(tmpstr)-1]
  325. } else {
  326. str = ""
  327. }
  328. case "urlencode_utf8":
  329. l, _ := url.Parse("http://a.com/?" + str)
  330. tmpstr := l.Query().Encode()
  331. if len(tmpstr) > 1 {
  332. str = tmpstr[0 : len(tmpstr)-1]
  333. } else {
  334. str = ""
  335. }
  336. case "urldecode_utf8":
  337. str, _ = url.QueryUnescape(str)
  338. case "decode64":
  339. str = util.DecodeB64(str)
  340. case "encodemd5":
  341. str = qu.GetMd5String(str)
  342. case "htmldecode": //html实体码
  343. //txt := `<div align="left" style="margin-left: 0pt;"><span style='font-family:; font-size:13px; color:#000000'>&#22826;&#38451;&#23707;&#29305;&#21220;&#28040;&#38450;&#31449;&#12289;&#26494;&#28006;&#29305;&#21220;&#28040;&#38450;&#31449;&#24314;&#35774;&#39033;&#30446;&#35774;&#35745;&#20013;&#26631;&#20844;&#31034;</span></div>`
  344. str = S.ToString(-1)
  345. reg, _ := regexp.Compile("&#\\d+;")
  346. str = reg.ReplaceAllStringFunc(str, func(src string) string {
  347. v, _ := strconv.Atoi(src[2 : len(src)-1])
  348. return string(rune(v))
  349. })
  350. }
  351. S.Push(lua.LString(str))
  352. return 1
  353. }))
  354. //如果服务端返回的html是gzip压缩过格式的 这里需要转一下
  355. s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int {
  356. html := S.ToString(-1)
  357. bs := []byte(html)
  358. gzipreader, _ := gzip.NewReader(bytes.NewReader(bs))
  359. bs, _ = ioutil.ReadAll(gzipreader)
  360. S.Push(lua.LString(bs))
  361. return 1
  362. }))
  363. //luamaker提供的分析列表页url地址 获取列表数据公用方法
  364. s.L.SetGlobal("getSimpleListPage", s.L.NewFunction(func(S *lua.LState) int {
  365. html := S.ToString(-3)
  366. date_pattern := S.ToString(-2)
  367. pageListUrl := S.ToString(-1) //列表页url
  368. bs := []byte(html)
  369. tmparr := []string{}
  370. tmpret := []int{}
  371. re, _ := regexp.Compile(`采购|招标|公示|公告|意见|结果|通知|工程`)
  372. doc, _ := gq.NewDocumentFromReader(bytes.NewReader(bs))
  373. doc.Find("a").Each(func(i int, sq *gq.Selection) {
  374. text := sq.Text()
  375. if len(text) < 30 {
  376. return
  377. }
  378. tmparr = append(tmparr, text)
  379. if re.MatchString(text) {
  380. tmpret = append(tmpret, 1)
  381. //logger.Debug(text)
  382. } else {
  383. tmpret = append(tmpret, 0)
  384. }
  385. })
  386. logger.Debug(tmpret)
  387. //线性分析,算周边,只算周围5步的点
  388. tmplen, thepos, themax := len(tmpret), -1, 0
  389. for i := 0; i < tmplen; i++ {
  390. if tmpret[i] == 0 {
  391. continue
  392. }
  393. start, end := i-MAX_STEP, i+MAX_STEP
  394. if start < 0 {
  395. start = 0
  396. }
  397. if end > tmplen {
  398. end = tmplen
  399. }
  400. tmp := 0
  401. //从当前位置往左,往右找连续点
  402. for j := i; j > start; j-- {
  403. if tmpret[j] == 1 {
  404. tmp++
  405. } else {
  406. break
  407. }
  408. }
  409. for j := i; j < end; j++ {
  410. if tmpret[j] == 1 {
  411. tmp++
  412. } else {
  413. break
  414. }
  415. }
  416. if tmp > themax {
  417. themax = tmp
  418. thepos = i
  419. }
  420. } //end of for...
  421. //logger.Debug("找位置完成")
  422. //验证
  423. if thepos == -1 {
  424. logger.Error("完蛋,找不到")
  425. panic("不支持啊,失败啊")
  426. }
  427. //下边是找父容器
  428. var thelink *gq.Selection
  429. doc.Find("a").Each(func(i int, sq *gq.Selection) {
  430. if sq.Text() == tmparr[thepos] {
  431. thelink = sq
  432. }
  433. })
  434. isfind := false
  435. //同样Path向上找,不超过5步
  436. for i := 0; i < MAX_STEP; i++ {
  437. thelink = thelink.Parent()
  438. clen := getChildrenLen(thelink)
  439. if clen >= themax-1 {
  440. isfind = true
  441. break
  442. }
  443. //logger.Debug("TAG:::", thelink.Nodes[0].Data, clen)
  444. }
  445. //找到列表
  446. pageList := []interface{}{}
  447. if isfind {
  448. thelink.Children().Each(func(i int, sq *gq.Selection) {
  449. page := map[string]string{}
  450. link_sq := sq.Find("a")
  451. href := link_sq.AttrOr("href", "")
  452. text := link_sq.Text()
  453. page["title"] = text
  454. page["href"] = dealHref(pageListUrl, href)
  455. page["publishtime"] = dealPublishTime(strings.TrimSpace(sq.Text()), date_pattern)
  456. //logger.Debug(i)
  457. pageList = append(pageList, page)
  458. })
  459. } else {
  460. logger.Error("完蛋,找父亲节点失败啊")
  461. //panic("不支持啊,失败啊")
  462. }
  463. ret := util.MapToTable(s.L, pageList)
  464. S.Push(ret)
  465. return 1
  466. }))
  467. //招投标信息标题判重
  468. s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
  469. S.Push(lua.LBool(false))
  470. return 1
  471. }))
  472. //招标信息判重新方法 2016-12-14 wanghuidong
  473. s.L.SetGlobal("urlRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
  474. S.Push(lua.LBool(false))
  475. return 1
  476. }))
  477. //将url放入内存缓存 2016-12-14 wanghuidong
  478. s.L.SetGlobal("putUrl2Redis", s.L.NewFunction(func(S *lua.LState) int {
  479. //url := S.ToString(-1)
  480. return 1
  481. }))
  482. //解析附件中的word、pdf
  483. s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int {
  484. ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)}
  485. str := S.ToString(-2)
  486. extension := S.ToString(-1)
  487. bs, _ := base64.StdEncoding.DecodeString(str)
  488. bs = append([]byte{ext[extension]}, bs...)
  489. msgid := mu.UUID(8)
  490. Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60)
  491. return 1
  492. }))
  493. //下载附件download(url,method,param,head,cookie,fileName)
  494. s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
  495. if s.FileLastThreeTimes == nil {
  496. s.FileLastThreeTimes = make([]time.Duration, 4)
  497. }
  498. if util.Config.IsDelay {
  499. SleepTime(3, s.FileLastThreeTimes) //睡眠时间
  500. }
  501. start := time.Now() //起始时间
  502. cookie := S.ToString(-1)
  503. head := S.ToTable(-2)
  504. param := S.ToTable(-3)
  505. method := S.ToString(-4)
  506. url := S.ToString(-5)
  507. fileName := S.ToString(-6)
  508. ishttps := strings.Contains(url, "https")
  509. var mycookie []*http.Cookie
  510. if cookie != "{}" {
  511. json.Unmarshal([]byte(cookie), &mycookie)
  512. } else {
  513. mycookie = make([]*http.Cookie, 0)
  514. }
  515. fileName = strings.TrimSpace(fileName)
  516. url = strings.TrimSpace(url)
  517. ret := DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
  518. url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
  519. if strings.TrimSpace(ftype) == "" {
  520. if len(path.Ext(name)) > 0 {
  521. ftype = path.Ext(name)[1:]
  522. }
  523. }
  524. S.Push(lua.LString(url))
  525. S.Push(lua.LString(name))
  526. S.Push(lua.LString(size))
  527. S.Push(lua.LString(ftype))
  528. S.Push(lua.LString(fid))
  529. atomic.AddInt32(&s.ToDayRequestNum, 1)
  530. atomic.AddInt32(&s.TotalRequestNum, 1)
  531. end := time.Since(start)
  532. if len(s.FileLastThreeTimes) >= 4 {
  533. s.FileLastThreeTimes = s.FileLastThreeTimes[1:]
  534. }
  535. s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
  536. return 5
  537. }))
  538. s.L.SetGlobal("clearMemoeryCache", s.L.NewFunction(func(S *lua.LState) int {
  539. /*title := S.ToString(-1)
  540. isExist, _ := redis.Exists("title_repeat_judgement", "title_repeat_"+title)
  541. if isExist {
  542. redis.Del("title_repeat_judgement", "title_repeat_"+title)
  543. }*/
  544. return 1
  545. }))
  546. //支持正则,提取
  547. s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int {
  548. index := int(S.ToNumber(-1))
  549. regstr := S.ToString(-2)
  550. text := S.ToString(-3)
  551. reg := regexp.MustCompile(regstr)
  552. reps := reg.FindAllStringSubmatchIndex(text, -1)
  553. ret := s.L.NewTable()
  554. number := 0
  555. for _, v := range reps {
  556. number++
  557. ret.Insert(number, lua.LString(text[v[index]:v[index+1]]))
  558. }
  559. S.Push(ret)
  560. return 1
  561. }))
  562. //支持替换
  563. s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int {
  564. text := S.ToString(-3)
  565. old := S.ToString(-2)
  566. repl := S.ToString(-1)
  567. text = strings.Replace(text, old, repl, -1)
  568. S.Push(lua.LString(text))
  569. return 1
  570. }))
  571. //标题的关键词、排除词过滤
  572. s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int {
  573. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  574. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  575. data := S.ToTable(-1)
  576. dataMap := util.TableToMap(data)
  577. ret := s.L.NewTable()
  578. num := 1
  579. for _, v := range dataMap {
  580. tmp := v.(map[string]interface{})
  581. isOk := false
  582. if title := qu.ObjToString(tmp["title"]); title != "" {
  583. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  584. isOk = true
  585. }
  586. }
  587. if isOk {
  588. ret.Insert(num, util.MapToLuaTable(S, tmp))
  589. num++
  590. }
  591. }
  592. S.Push(ret)
  593. return 1
  594. }))
  595. //标题的关键词、排除词过滤
  596. s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int {
  597. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  598. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  599. data := S.ToTable(-1)
  600. dataMap := util.TableToMap(data)
  601. if title := qu.ObjToString(dataMap["title"]); title != "" {
  602. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  603. S.Push(lua.LBool(true))
  604. return 1
  605. } else {
  606. qu.Debug(s.SCode, dataMap["href"], " title error")
  607. }
  608. } else {
  609. qu.Debug(s.SCode, dataMap["href"], " title error")
  610. }
  611. S.Push(lua.LBool(false))
  612. return 1
  613. }))
  614. //detail过滤
  615. s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int {
  616. /*
  617. 1.长度判断 (特殊处理:详情请访问原网页!;详见原网页;见原网页;无;无相关内容;无正文内容)
  618. 2.是否含汉字
  619. */
  620. reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|见附件|详见附件)")
  621. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  622. detail := S.ToString(-1)
  623. if reg1.MatchString(detail) {
  624. S.Push(lua.LBool(true))
  625. return 1
  626. }
  627. if len([]rune(detail)) < 50 || !reg2.MatchString(detail) {
  628. S.Push(lua.LBool(false))
  629. return 1
  630. }
  631. S.Push(lua.LBool(false))
  632. return 1
  633. }))
  634. //匹配汉字
  635. s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int {
  636. reg1 := regexp.MustCompile("(见附件|详见附件)")
  637. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  638. detail := S.ToString(-1)
  639. detail = reg1.ReplaceAllString(detail, "")
  640. ok := reg2.MatchString(detail)
  641. S.Push(lua.LBool(ok))
  642. return 1
  643. }))
  644. //aes ecb模式加密
  645. s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
  646. origData := S.ToString(-2)
  647. key := S.ToString(-1)
  648. bytekey := []byte(key)
  649. byteorigData := []byte(origData)
  650. cipher, _ := aes.NewCipher(generateKey([]byte(bytekey)))
  651. length := (len(byteorigData) + aes.BlockSize) / aes.BlockSize
  652. plain := make([]byte, length*aes.BlockSize)
  653. copy(plain, byteorigData)
  654. pad := byte(len(plain) - len(byteorigData))
  655. for i := len(byteorigData); i < len(plain); i++ {
  656. plain[i] = pad
  657. }
  658. encrypted := make([]byte, len(plain))
  659. // 分组分块加密
  660. for bs, be := 0, cipher.BlockSize(); bs <= len(byteorigData); bs, be = bs+cipher.BlockSize(), be+cipher.BlockSize() {
  661. cipher.Encrypt(encrypted[bs:be], plain[bs:be])
  662. }
  663. result := base64.StdEncoding.EncodeToString(encrypted)
  664. S.Push(lua.LString(result))
  665. return 1
  666. }))
  667. //根据正文获取发布时间
  668. s.L.SetGlobal("getPublishtime", s.L.NewFunction(func(S *lua.LState) int {
  669. detail := S.ToString(-2)
  670. contenthtml := S.ToString(-1)
  671. publishtime := util.GetPublishtime([]string{contenthtml, detail})
  672. S.Push(lua.LString(publishtime))
  673. return 1
  674. }))
  675. //匹配
  676. s.L.SetGlobal("stringFind", s.L.NewFunction(func(S *lua.LState) int {
  677. regstr := S.ToString(-1)
  678. text := S.ToString(-2)
  679. reg := regexp.MustCompile(regstr)
  680. result := reg.FindString(text)
  681. isMatch := false
  682. if result != "" {
  683. isMatch = true
  684. }
  685. S.Push(lua.LString(result))
  686. S.Push(lua.LBool(isMatch))
  687. return 2
  688. }))
  689. //截取
  690. s.L.SetGlobal("stringSub", s.L.NewFunction(func(S *lua.LState) int {
  691. text := S.ToString(-3)
  692. start := S.ToInt(-2)
  693. end := S.ToInt(-1)
  694. result := ""
  695. if len(text) > 0 {
  696. textRune := []rune(text)
  697. textLen := len(textRune)
  698. if end < 0 {
  699. if start > 0 { //正向截取到倒数第end位
  700. result = string(textRune[start-1 : textLen+1+end])
  701. } else if start < 0 { //反向截取 从倒数第start位截取到倒数第end位
  702. result = string(textRune[textLen+start : textLen+1+end])
  703. }
  704. } else if start > 0 && end >= start && end <= textLen { //从第start个截取到第end个
  705. result = string(textRune[start-1 : end])
  706. }
  707. // if end == -1 {
  708. // if start >= 1 { //正向截取到结尾
  709. // result = string(textRune[start-1:])
  710. // } else if start < 0 && textLen+start >= 0 { //反向截取后缀
  711. // result = string(textRune[textLen+start:])
  712. // }
  713. // } else if start >= 1 && end <= textLen { //从第start个截取到第end个
  714. // result = string(textRune[start-1 : end])
  715. // }
  716. }
  717. S.Push(lua.LString(result))
  718. return 1
  719. }))
  720. //base64加密
  721. s.L.SetGlobal("encodeBase64", s.L.NewFunction(func(S *lua.LState) int {
  722. text := S.ToString(-1)
  723. base64Text := base64.StdEncoding.EncodeToString([]byte(text))
  724. S.Push(lua.LString(base64Text))
  725. return 1
  726. }))
  727. //base64解密
  728. s.L.SetGlobal("decodeBase64", s.L.NewFunction(func(S *lua.LState) int {
  729. text := S.ToString(-1)
  730. result := ""
  731. byteText, err := base64.StdEncoding.DecodeString(text)
  732. if err == nil {
  733. result = string(byteText)
  734. }
  735. S.Push(lua.LString(result))
  736. return 1
  737. }))
  738. //长度
  739. s.L.SetGlobal("stringLen", s.L.NewFunction(func(S *lua.LState) int {
  740. text := S.ToString(-1)
  741. textLen := len([]rune(text))
  742. S.Push(lua.LNumber(textLen))
  743. return 1
  744. }))
  745. //去除特殊标签中间内容
  746. s.L.SetGlobal("getPureContent", s.L.NewFunction(func(S *lua.LState) int {
  747. con := S.ToString(-1)
  748. reg := regexp.MustCompile("(?s)<(!%-%-|!--|style).*?(%-%-|--|style)>") //注释 css
  749. con = reg.ReplaceAllString(con, "")
  750. // indexArr := reg.FindAllStringIndex(con, -1)
  751. // for i := len(indexArr) - 1; i >= 0; i-- {
  752. // if index := indexArr[i]; len(index) == 2 {
  753. // con = con[:index[0]] + con[index[1]:]
  754. // }
  755. // }
  756. S.Push(lua.LString(con))
  757. return 1
  758. }))
  759. //interface转string
  760. s.L.SetGlobal("formatToString", s.L.NewFunction(func(S *lua.LState) int {
  761. strNum := S.ToString(-1)
  762. decimalNum, _ := decimal.NewFromString(strNum)
  763. S.Push(lua.LString(decimalNum.String()))
  764. return 1
  765. }))
  766. //获取验证码
  767. s.L.SetGlobal("getCodeByPath", s.L.NewFunction(func(S *lua.LState) int {
  768. path := S.ToString(-1)
  769. code := codegrpc.GetCodeByPath(path)
  770. S.Push(lua.LString(code))
  771. return 1
  772. }))
  773. return ""
  774. }
  775. func dealHref(pageListUrl, href string) string {
  776. returnUrl := ""
  777. if href != "" {
  778. r, _ := regexp.Compile("^./")
  779. match := r.MatchString(href)
  780. if match {
  781. url2 := r.ReplaceAllString(href, "")
  782. returnUrl = pageListUrl + url2
  783. }
  784. r2, _ := regexp.Compile("^/")
  785. match2 := r2.MatchString(href)
  786. if match2 {
  787. r3, _ := regexp.Compile("http://[^/]*/")
  788. domain := r3.FindString(pageListUrl)
  789. //fmt.Println(domain)
  790. url2 := r2.ReplaceAllString(href, "")
  791. returnUrl = domain + url2
  792. }
  793. }
  794. return returnUrl
  795. }
  796. func dealPublishTime(content string, pattern string) string {
  797. publishTime := ""
  798. if pattern == "yyyy-MM-dd HH:mm:ss" {
  799. r, _ := regexp.Compile("\\d{4}-\\d{2}-\\d{2}\\s*\\d{2}:\\d{2}:\\d{2}")
  800. publishTime = r.FindString(content)
  801. } else if pattern == "yyyy-MM-dd" {
  802. r, _ := regexp.Compile("\\d{4}-\\d{2}-\\d{2}")
  803. publishTime = r.FindString(content)
  804. } else if pattern == "MM-dd" {
  805. r, _ := regexp.Compile("\\d{2}-\\d{2}")
  806. publishTime = r.FindString(content)
  807. }
  808. return publishTime
  809. }
  810. func getChildrenLen(sq *gq.Selection) (ret int) {
  811. sq.Children().Each(func(i int, sq2 *gq.Selection) {
  812. ret = i
  813. })
  814. return
  815. }
  816. //
  817. func (s *Script) Reload() {
  818. s.L.Close()
  819. s.LoadScript(s.SCode, s.ScriptFile, false)
  820. }
  821. //unicode转码
  822. func transUnic(str string) string {
  823. buf := bytes.NewBuffer(nil)
  824. i, j := 0, len(str)
  825. for i < j {
  826. x := i + 6
  827. if x > j {
  828. buf.WriteString(str[i:])
  829. break
  830. }
  831. if str[i] == '\\' && str[i+1] == 'u' {
  832. hex := str[i+2 : x]
  833. r, err := strconv.ParseUint(hex, 16, 64)
  834. if err == nil {
  835. buf.WriteRune(rune(r))
  836. } else {
  837. logger.Warn(err.Error())
  838. buf.WriteString(str[i:x])
  839. }
  840. i = x
  841. } else {
  842. buf.WriteByte(str[i])
  843. i++
  844. }
  845. }
  846. return buf.String()
  847. }
  848. //取得变量
  849. func (s *Script) GetVar(key string) string {
  850. return s.L.GetGlobal(key).String()
  851. }
  852. //
  853. func (s *Script) GetIntVar(key string) int {
  854. lv := s.L.GetGlobal(key)
  855. if v, ok := lv.(lua.LNumber); ok {
  856. return int(v)
  857. }
  858. return -1
  859. }
  860. //
  861. func (s *Script) GetBoolVar(key string) bool {
  862. lv := s.L.GetGlobal(key)
  863. if v, ok := lv.(lua.LBool); ok {
  864. return bool(v)
  865. }
  866. return false
  867. }
  868. func isWorkTime() {
  869. workTime = util.IsWorkTime()
  870. util.TimeAfterFunc(10*time.Minute, isWorkTime, TimeChan)
  871. }
  872. //设置睡眠时间
  873. func SleepTime(basetime int, times []time.Duration) {
  874. st := 0 //记录最后睡眠时长
  875. base := float64(basetime * 60)
  876. if times[3].Seconds() > base { //最后一次大于 basetime*60秒
  877. if times[2].Seconds() > base {
  878. n := 0
  879. if times[0].Seconds() > base {
  880. n++
  881. }
  882. if times[1].Seconds() > base {
  883. n++
  884. }
  885. st = n + 1
  886. } else if times[2].Seconds() < base && times[0].Seconds() > base && times[1].Seconds() > base {
  887. st = 1
  888. }
  889. }
  890. if st > 0 {
  891. time.Sleep(time.Duration(st) * time.Minute)
  892. }
  893. }
  894. func generateKey(key []byte) (genKey []byte) {
  895. genKey = make([]byte, 16)
  896. copy(genKey, key)
  897. for i := 16; i < len(key); {
  898. for j := 0; j < 16 && i < len(key); j, i = j+1, i+1 {
  899. genKey[j] ^= key[i]
  900. }
  901. }
  902. return genKey
  903. }