// script.go

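/**
Wrapper for loading and invoking spider scripts.
In the early phase scripts are loaded from the file system;
in the later phase they come from database configuration.
Functions shared between Lua scripts should be factored out, and the main script file should load that shared Lua file.
*/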
  7. package spider
  8. import (
  9. "bytes"
  10. "compress/gzip"
  11. "crypto/aes"
  12. "encoding/base64"
  13. "encoding/json"
  14. "io/ioutil"
  15. mu "mfw/util"
  16. "net/http"
  17. "net/url"
  18. "path"
  19. qu "qfw/util"
  20. _ "qfw/util/redis"
  21. "regexp"
  22. util "spiderutil"
  23. "strconv"
  24. "strings"
  25. "sync/atomic"
  26. "time"
  27. gq "github.com/PuerkitoBio/goquery"
  28. "github.com/cjoudrey/gluahttp"
  29. "github.com/donnie4w/go-logger/logger"
  30. lujson "github.com/yuin/gopher-json"
  31. "github.com/yuin/gopher-lua"
  32. "golang.org/x/text/encoding/simplifiedchinese"
  33. "golang.org/x/text/transform"
  34. )
  35. //脚本
  36. type Script struct {
  37. SCode, ScriptFile string
  38. Encoding string
  39. Userproxy bool
  40. //Ishttps bool
  41. ErrorNum int32 //错误数
  42. Downloader string //下载器
  43. TotalRequestNum int32 //总请求次数
  44. ToDayRequestNum int32 //今日请求次数
  45. YestoDayRequestNum int32 //昨日请求次数
  46. Timeout int64 //超时时间秒
  47. L *lua.LState
  48. NoDownloadNum int32 //未成功下载数
  49. LastThreeTimes []time.Duration //单条信息流程完成的时间,最后三次
  50. FileLastThreeTimes []time.Duration //附件下载单条信息流程完成的时间,最后三次
  51. }
  52. const (
  53. MAX_STEP = 5 //计算时的最大步长
  54. )
  55. var workTime = true
  56. //
  57. func init() {
  58. go isWorkTime()
  59. }
  60. var TimeSleepChan = make(chan bool, 1)
  61. //加载文件
  62. func (s *Script) LoadScript(code, script_file string, newstate bool) string {
  63. defer mu.Catch()
  64. s.SCode = code
  65. s.ScriptFile = script_file
  66. if util.Config.Working == 0 {
  67. if newstate {
  68. s.L = lua.NewState(lua.Options{
  69. RegistrySize: 256 * 20,
  70. CallStackSize: 256,
  71. IncludeGoStackTrace: false,
  72. })
  73. }
  74. } else { //节能模式从CC池中获取lua.LState
  75. if newstate { //队列模式的newstate主要区分是列表页爬虫CC还是三级页爬虫CC2
  76. lState := <-CC2
  77. s.L = lState
  78. } else {
  79. lState := <-CC
  80. s.L = lState
  81. }
  82. //logger.Debug("获取CC资源", script_file)
  83. }
  84. s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader)
  85. s.L.PreloadModule("json", lujson.Loader)
  86. if err := s.L.DoString(script_file); err != nil {
  87. logger.Debug(code + ",加载lua脚本错误:" + err.Error())
  88. //panic(code + ",加载lua脚本错误:" + err.Error())
  89. }
  90. s.Encoding = s.GetVar("spiderPageEncoding")
  91. s.Userproxy = s.GetBoolVar("spiderUserProxy")
  92. //暴露go方法
  93. //download(url,head) 普通下载
  94. s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int {
  95. if s.LastThreeTimes == nil {
  96. s.LastThreeTimes = make([]time.Duration, 4)
  97. }
  98. if util.Config.IsDelay {
  99. SleepTime(1, s.LastThreeTimes) //睡眠时间
  100. }
  101. start := time.Now() //起始时间
  102. head := S.ToTable(-1)
  103. url := S.ToString(-2)
  104. ishttps := S.ToBool(-3)
  105. charset := S.ToString(-4)
  106. if charset == "" {
  107. charset = s.Encoding
  108. }
  109. ret := Download(s.Downloader, url, "get", util.GetTable(head), charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  110. S.Push(lua.LString(ret))
  111. atomic.AddInt32(&s.ToDayRequestNum, 1)
  112. atomic.AddInt32(&s.TotalRequestNum, 1)
  113. end := time.Since(start)
  114. if len(s.LastThreeTimes) >= 4 {
  115. s.LastThreeTimes = s.LastThreeTimes[1:]
  116. }
  117. s.LastThreeTimes = append(s.LastThreeTimes, end)
  118. return 1
  119. }))
  120. //高级下载downloadAdv(url,method,param,head,cookie)
  121. s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int {
  122. if s.LastThreeTimes == nil {
  123. s.LastThreeTimes = make([]time.Duration, 4)
  124. }
  125. if util.Config.IsDelay {
  126. SleepTime(1, s.LastThreeTimes) //睡眠时间
  127. }
  128. start := time.Now() //起始时间
  129. cookie := S.ToString(-1)
  130. head := S.ToTable(-2)
  131. param := S.ToTable(-3)
  132. method := S.ToString(-4)
  133. url := S.ToString(-5)
  134. ishttps := S.ToBool(-6)
  135. charset := S.ToString(-7)
  136. if charset == "" {
  137. charset = s.Encoding
  138. }
  139. var mycookie []*http.Cookie
  140. json.Unmarshal([]byte(cookie), &mycookie)
  141. var ret string
  142. var retcookie []*http.Cookie
  143. if param == nil {
  144. ptext := map[string]interface{}{"text": S.ToString(-3)}
  145. ret, retcookie = DownloadAdv(s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  146. } else {
  147. ret, retcookie = DownloadAdv(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  148. }
  149. S.Push(lua.LString(ret))
  150. scookie, _ := json.Marshal(retcookie)
  151. S.Push(lua.LString(scookie))
  152. atomic.AddInt32(&s.ToDayRequestNum, 1)
  153. atomic.AddInt32(&s.TotalRequestNum, 1)
  154. end := time.Since(start)
  155. if len(s.LastThreeTimes) >= 4 {
  156. s.LastThreeTimes = s.LastThreeTimes[1:]
  157. }
  158. s.LastThreeTimes = append(s.LastThreeTimes, end)
  159. return 2
  160. }))
  161. //保存验证错误日志
  162. s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
  163. code := S.ToString(-4)
  164. name := S.ToString(-3)
  165. url := S.ToString(-2)
  166. content := S.ToString(-1)
  167. saveVerificationLog(code, name, url, content)
  168. atomic.AddInt32(&s.ErrorNum, 1)
  169. atomic.AddInt32(&s.NoDownloadNum, 1)
  170. //防止恶意增加日志
  171. util.TimeSleepFunc(5*time.Second, TimeSleepChan)
  172. return 0
  173. }))
  174. //添加改版日志
  175. s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int {
  176. url := S.ToString(-2)
  177. str := S.ToString(-1)
  178. logger.Error(s.SCode, url, str)
  179. return 0
  180. }))
  181. //查找信息是否存在(作废)
  182. s.L.SetGlobal("findHasExit", s.L.NewFunction(func(S *lua.LState) int {
  183. //c := S.ToString(-2)
  184. //q := S.ToString(-1)
  185. //b := findHasExit(c, q)
  186. S.Push(lua.LBool(false))
  187. return 1
  188. }))
  189. s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int {
  190. nodetype := S.ToString(-3)
  191. gpath := S.ToString(-2)
  192. content := S.ToString(-1)
  193. ret := util.FindOneText(gpath, content, nodetype)
  194. S.Push(ret)
  195. return 1
  196. }))
  197. s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int {
  198. gpath := S.ToString(-2)
  199. content := S.ToString(-1)
  200. ret := util.FindContentText(gpath, content)
  201. S.Push(ret)
  202. return 1
  203. }))
  204. s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int {
  205. nodetype := S.ToString(-3)
  206. gpath := S.ToString(-2)
  207. content := S.ToString(-1)
  208. ret := util.FindOneHtml(gpath, content, nodetype)
  209. S.Push(ret)
  210. return 1
  211. }))
  212. s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int {
  213. gpath := S.ToString(-2)
  214. content := S.ToString(-1)
  215. ret := s.L.NewTable()
  216. util.FindListText(gpath, content, ret)
  217. S.Push(ret)
  218. return 1
  219. }))
  220. s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int {
  221. gpath := S.ToString(-2)
  222. content := S.ToString(-1)
  223. ret := s.L.NewTable()
  224. util.FindListHtml(gpath, content, ret)
  225. S.Push(ret)
  226. return 1
  227. }))
  228. // s.L.SetGlobal("findMgoData", s.L.NewFunction(func(S *lua.LState) int {
  229. // update := [][]map[string]interface{}{}
  230. // query := map[string]interface{}{"state": 0}
  231. // data, _ := Mgo.Find(util.Config.TmpCollName, query, `{"_id":-1}`, nil, false, 0, 10)
  232. // pageList := []interface{}{}
  233. // for _, d := range *data {
  234. // tmpMap := map[string]string{}
  235. // tmpMap["title"] = qu.ObjToString(d["title"])
  236. // tmpMap["detail"] = qu.ObjToString(d["detail"])
  237. // tmpMap["href"] = qu.ObjToString(d["href"])
  238. // publishtime := qu.Int64All(d["publishtime"])
  239. // tmpMap["publishtime"] = qu.FormatDateByInt64(&publishtime, qu.Date_Full_Layout)
  240. // tmpMap["_id"] = qu.BsonIdToSId(d["_id"])
  241. // pageList = append(pageList, tmpMap)
  242. // update = append(update, []map[string]interface{}{
  243. // map[string]interface{}{"_id": d["_id"]},
  244. // map[string]interface{}{"$set": map[string]interface{}{"state": 1}},
  245. // })
  246. // }
  247. // ret := util.MapToTable(s.L, pageList)
  248. // S.Push(ret)
  249. // if len(update) > 0 {
  250. // Mgo.UpdateBulk(util.Config.TmpCollName, update...)
  251. // }
  252. // return 1
  253. // }))
  254. s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int {
  255. qmap := S.ToTable(-2)
  256. content := S.ToString(-1)
  257. ret := s.L.NewTable()
  258. util.FindMap(qmap, content, ret)
  259. S.Push(ret)
  260. return 1
  261. }))
  262. //公示暴露方式
  263. s.L.SetGlobal("getEcpsCode", s.L.NewFunction(func(S *lua.LState) int {
  264. area := strings.ToUpper(S.ToString(-2))
  265. content := S.ToString(-1)
  266. code, state := util.GetEcpsCode(area, []byte(content))
  267. if state == "wx" {
  268. code, _ = GetCodeByWx([]byte(content))
  269. }
  270. S.Push(lua.LString(code))
  271. return 1
  272. }))
  273. //调用jsvm
  274. s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int {
  275. js := S.ToString(-1)
  276. ret := s.L.NewTable()
  277. if js == "" {
  278. ret.RawSet(lua.LString("val"), lua.LString(""))
  279. ret.RawSet(lua.LString("err"), lua.LString("js is null"))
  280. } else {
  281. rep := util.JsVmPost(util.Config.JsVmUrl, js)
  282. ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"])))
  283. ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"])))
  284. }
  285. S.Push(ret)
  286. return 1
  287. }))
  288. //指定下载器
  289. s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int {
  290. s.Downloader = GetOneDownloader()
  291. S.Push(lua.LString(s.Downloader))
  292. return 1
  293. }))
  294. //指定下载器file
  295. s.L.SetGlobal("changeDownloaderFile", s.L.NewFunction(func(S *lua.LState) int {
  296. s.Downloader = GetOneDownloaderFile()
  297. S.Push(lua.LString(s.Downloader))
  298. return 1
  299. }))
  300. //手工延时
  301. s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int {
  302. // if workTime {
  303. // util.TimeSleepFunc(time.Duration(S.ToInt(-1))*time.Second, TimeSleepChan)
  304. // } else {
  305. // util.TimeSleepFunc(1*time.Second, TimeSleepChan)
  306. // }
  307. util.TimeSleepFunc(time.Second*2, TimeSleepChan)
  308. return 0
  309. }))
  310. //编码解码
  311. s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int {
  312. codeType := strings.ToLower(S.ToString(-2))
  313. str := S.CheckString(-1)
  314. switch codeType {
  315. case "unicode":
  316. str = transUnic(str)
  317. case "urlencode_gbk":
  318. data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
  319. l, _ := url.Parse("http://a.com/?" + string(data))
  320. tmpstr := l.Query().Encode()
  321. if len(tmpstr) > 1 {
  322. str = tmpstr[0 : len(tmpstr)-1]
  323. } else {
  324. str = ""
  325. }
  326. case "urlencode_utf8":
  327. l, _ := url.Parse("http://a.com/?" + str)
  328. tmpstr := l.Query().Encode()
  329. if len(tmpstr) > 1 {
  330. str = tmpstr[0 : len(tmpstr)-1]
  331. } else {
  332. str = ""
  333. }
  334. case "urldecode_utf8":
  335. str, _ = url.QueryUnescape(str)
  336. case "decode64":
  337. str = util.DecodeB64(str)
  338. case "encodemd5":
  339. str = qu.GetMd5String(str)
  340. case "htmldecode": //html实体码
  341. //txt := `<div align="left" style="margin-left: 0pt;"><span style='font-family:; font-size:13px; color:#000000'>&#22826;&#38451;&#23707;&#29305;&#21220;&#28040;&#38450;&#31449;&#12289;&#26494;&#28006;&#29305;&#21220;&#28040;&#38450;&#31449;&#24314;&#35774;&#39033;&#30446;&#35774;&#35745;&#20013;&#26631;&#20844;&#31034;</span></div>`
  342. str = S.ToString(-1)
  343. reg, _ := regexp.Compile("&#\\d+;")
  344. str = reg.ReplaceAllStringFunc(str, func(src string) string {
  345. v, _ := strconv.Atoi(src[2 : len(src)-1])
  346. return string(rune(v))
  347. })
  348. }
  349. S.Push(lua.LString(str))
  350. return 1
  351. }))
  352. //如果服务端返回的html是gzip压缩过格式的 这里需要转一下
  353. s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int {
  354. html := S.ToString(-1)
  355. bs := []byte(html)
  356. gzipreader, _ := gzip.NewReader(bytes.NewReader(bs))
  357. bs, _ = ioutil.ReadAll(gzipreader)
  358. S.Push(lua.LString(bs))
  359. return 1
  360. }))
  361. //luamaker提供的分析列表页url地址 获取列表数据公用方法
  362. s.L.SetGlobal("getSimpleListPage", s.L.NewFunction(func(S *lua.LState) int {
  363. html := S.ToString(-3)
  364. date_pattern := S.ToString(-2)
  365. pageListUrl := S.ToString(-1) //列表页url
  366. bs := []byte(html)
  367. tmparr := []string{}
  368. tmpret := []int{}
  369. re, _ := regexp.Compile(`采购|招标|公示|公告|意见|结果|通知|工程`)
  370. doc, _ := gq.NewDocumentFromReader(bytes.NewReader(bs))
  371. doc.Find("a").Each(func(i int, sq *gq.Selection) {
  372. text := sq.Text()
  373. if len(text) < 30 {
  374. return
  375. }
  376. tmparr = append(tmparr, text)
  377. if re.MatchString(text) {
  378. tmpret = append(tmpret, 1)
  379. //logger.Debug(text)
  380. } else {
  381. tmpret = append(tmpret, 0)
  382. }
  383. })
  384. logger.Debug(tmpret)
  385. //线性分析,算周边,只算周围5步的点
  386. tmplen, thepos, themax := len(tmpret), -1, 0
  387. for i := 0; i < tmplen; i++ {
  388. if tmpret[i] == 0 {
  389. continue
  390. }
  391. start, end := i-MAX_STEP, i+MAX_STEP
  392. if start < 0 {
  393. start = 0
  394. }
  395. if end > tmplen {
  396. end = tmplen
  397. }
  398. tmp := 0
  399. //从当前位置往左,往右找连续点
  400. for j := i; j > start; j-- {
  401. if tmpret[j] == 1 {
  402. tmp++
  403. } else {
  404. break
  405. }
  406. }
  407. for j := i; j < end; j++ {
  408. if tmpret[j] == 1 {
  409. tmp++
  410. } else {
  411. break
  412. }
  413. }
  414. if tmp > themax {
  415. themax = tmp
  416. thepos = i
  417. }
  418. } //end of for...
  419. //logger.Debug("找位置完成")
  420. //验证
  421. if thepos == -1 {
  422. logger.Error("完蛋,找不到")
  423. panic("不支持啊,失败啊")
  424. }
  425. //下边是找父容器
  426. var thelink *gq.Selection
  427. doc.Find("a").Each(func(i int, sq *gq.Selection) {
  428. if sq.Text() == tmparr[thepos] {
  429. thelink = sq
  430. }
  431. })
  432. isfind := false
  433. //同样Path向上找,不超过5步
  434. for i := 0; i < MAX_STEP; i++ {
  435. thelink = thelink.Parent()
  436. clen := getChildrenLen(thelink)
  437. if clen >= themax-1 {
  438. isfind = true
  439. break
  440. }
  441. //logger.Debug("TAG:::", thelink.Nodes[0].Data, clen)
  442. }
  443. //找到列表
  444. pageList := []interface{}{}
  445. if isfind {
  446. thelink.Children().Each(func(i int, sq *gq.Selection) {
  447. page := map[string]string{}
  448. link_sq := sq.Find("a")
  449. href := link_sq.AttrOr("href", "")
  450. text := link_sq.Text()
  451. page["title"] = text
  452. page["href"] = dealHref(pageListUrl, href)
  453. page["publishtime"] = dealPublishTime(strings.TrimSpace(sq.Text()), date_pattern)
  454. //logger.Debug(i)
  455. pageList = append(pageList, page)
  456. })
  457. } else {
  458. logger.Error("完蛋,找父亲节点失败啊")
  459. //panic("不支持啊,失败啊")
  460. }
  461. ret := util.MapToTable(s.L, pageList)
  462. S.Push(ret)
  463. return 1
  464. }))
  465. //招投标信息标题判重
  466. s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
  467. S.Push(lua.LBool(false))
  468. return 1
  469. }))
  470. //招标信息判重新方法 2016-12-14 wanghuidong
  471. s.L.SetGlobal("urlRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
  472. S.Push(lua.LBool(false))
  473. return 1
  474. }))
  475. //将url放入内存缓存 2016-12-14 wanghuidong
  476. s.L.SetGlobal("putUrl2Redis", s.L.NewFunction(func(S *lua.LState) int {
  477. //url := S.ToString(-1)
  478. return 1
  479. }))
  480. //解析附件中的word、pdf
  481. s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int {
  482. ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)}
  483. str := S.ToString(-2)
  484. extension := S.ToString(-1)
  485. bs, _ := base64.StdEncoding.DecodeString(str)
  486. bs = append([]byte{ext[extension]}, bs...)
  487. msgid := mu.UUID(8)
  488. Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60)
  489. return 1
  490. }))
  491. //下载附件download(url,method,param,head,cookie,fileName)
  492. s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
  493. if s.FileLastThreeTimes == nil {
  494. s.FileLastThreeTimes = make([]time.Duration, 4)
  495. }
  496. if util.Config.IsDelay {
  497. SleepTime(3, s.FileLastThreeTimes) //睡眠时间
  498. }
  499. start := time.Now() //起始时间
  500. cookie := S.ToString(-1)
  501. head := S.ToTable(-2)
  502. param := S.ToTable(-3)
  503. method := S.ToString(-4)
  504. url := S.ToString(-5)
  505. fileName := S.ToString(-6)
  506. ishttps := strings.Contains(url, "https")
  507. var mycookie []*http.Cookie
  508. if cookie != "{}" {
  509. json.Unmarshal([]byte(cookie), &mycookie)
  510. } else {
  511. mycookie = make([]*http.Cookie, 0)
  512. }
  513. fileName = strings.TrimSpace(fileName)
  514. url = strings.TrimSpace(url)
  515. ret := DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
  516. url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
  517. if strings.TrimSpace(ftype) == "" {
  518. if len(path.Ext(name)) > 0 {
  519. ftype = path.Ext(name)[1:]
  520. }
  521. }
  522. S.Push(lua.LString(url))
  523. S.Push(lua.LString(name))
  524. S.Push(lua.LString(size))
  525. S.Push(lua.LString(ftype))
  526. S.Push(lua.LString(fid))
  527. atomic.AddInt32(&s.ToDayRequestNum, 1)
  528. atomic.AddInt32(&s.TotalRequestNum, 1)
  529. end := time.Since(start)
  530. if len(s.FileLastThreeTimes) >= 4 {
  531. s.FileLastThreeTimes = s.FileLastThreeTimes[1:]
  532. }
  533. s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
  534. return 5
  535. }))
  536. s.L.SetGlobal("clearMemoeryCache", s.L.NewFunction(func(S *lua.LState) int {
  537. /*title := S.ToString(-1)
  538. isExist, _ := redis.Exists("title_repeat_judgement", "title_repeat_"+title)
  539. if isExist {
  540. redis.Del("title_repeat_judgement", "title_repeat_"+title)
  541. }*/
  542. return 1
  543. }))
  544. //支持正则,提取
  545. s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int {
  546. index := int(S.ToNumber(-1))
  547. regstr := S.ToString(-2)
  548. text := S.ToString(-3)
  549. reg := regexp.MustCompile(regstr)
  550. reps := reg.FindAllStringSubmatchIndex(text, -1)
  551. ret := s.L.NewTable()
  552. number := 0
  553. for _, v := range reps {
  554. number++
  555. ret.Insert(number, lua.LString(text[v[index]:v[index+1]]))
  556. }
  557. S.Push(ret)
  558. return 1
  559. }))
  560. //支持替换
  561. s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int {
  562. text := S.ToString(-3)
  563. old := S.ToString(-2)
  564. repl := S.ToString(-1)
  565. text = strings.Replace(text, old, repl, -1)
  566. S.Push(lua.LString(text))
  567. return 1
  568. }))
  569. //标题的关键词、排除词过滤
  570. s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int {
  571. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  572. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  573. data := S.ToTable(-1)
  574. dataMap := util.TableToMap(data)
  575. ret := s.L.NewTable()
  576. num := 1
  577. for _, v := range dataMap {
  578. tmp := v.(map[string]interface{})
  579. isOk := false
  580. if title := qu.ObjToString(tmp["title"]); title != "" {
  581. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  582. isOk = true
  583. }
  584. }
  585. if isOk {
  586. ret.Insert(num, util.MapToLuaTable(S, tmp))
  587. num++
  588. }
  589. }
  590. S.Push(ret)
  591. return 1
  592. }))
  593. //标题的关键词、排除词过滤
  594. s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int {
  595. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  596. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  597. data := S.ToTable(-1)
  598. dataMap := util.TableToMap(data)
  599. if title := qu.ObjToString(dataMap["title"]); title != "" {
  600. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  601. S.Push(lua.LBool(true))
  602. return 1
  603. } else {
  604. qu.Debug(s.SCode, dataMap["href"], " title error")
  605. }
  606. } else {
  607. qu.Debug(s.SCode, dataMap["href"], " title error")
  608. }
  609. S.Push(lua.LBool(false))
  610. return 1
  611. }))
  612. //detail过滤
  613. s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int {
  614. /*
  615. 1.长度判断 (特殊处理:详情请访问原网页!;详见原网页;见原网页;无;无相关内容;无正文内容)
  616. 2.是否含汉字
  617. */
  618. reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|见附件|详见附件)")
  619. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  620. detail := S.ToString(-1)
  621. if reg1.MatchString(detail) {
  622. S.Push(lua.LBool(true))
  623. return 1
  624. }
  625. if len([]rune(detail)) < 50 || !reg2.MatchString(detail) {
  626. S.Push(lua.LBool(false))
  627. return 1
  628. }
  629. S.Push(lua.LBool(false))
  630. return 1
  631. }))
  632. //匹配汉字
  633. s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int {
  634. reg1 := regexp.MustCompile("(见附件|详见附件)")
  635. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  636. detail := S.ToString(-1)
  637. detail = reg1.ReplaceAllString(detail, "")
  638. ok := reg2.MatchString(detail)
  639. S.Push(lua.LBool(ok))
  640. return 1
  641. }))
  642. //aes ecb模式加密
  643. s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
  644. origData := S.ToString(-2)
  645. key := S.ToString(-1)
  646. bytekey := []byte(key)
  647. byteorigData := []byte(origData)
  648. cipher, _ := aes.NewCipher(generateKey([]byte(bytekey)))
  649. length := (len(byteorigData) + aes.BlockSize) / aes.BlockSize
  650. plain := make([]byte, length*aes.BlockSize)
  651. copy(plain, byteorigData)
  652. pad := byte(len(plain) - len(byteorigData))
  653. for i := len(byteorigData); i < len(plain); i++ {
  654. plain[i] = pad
  655. }
  656. encrypted := make([]byte, len(plain))
  657. // 分组分块加密
  658. for bs, be := 0, cipher.BlockSize(); bs <= len(byteorigData); bs, be = bs+cipher.BlockSize(), be+cipher.BlockSize() {
  659. cipher.Encrypt(encrypted[bs:be], plain[bs:be])
  660. }
  661. result := base64.StdEncoding.EncodeToString(encrypted)
  662. S.Push(lua.LString(result))
  663. return 1
  664. }))
  665. //根据正文获取发布时间
  666. s.L.SetGlobal("getPublishtime", s.L.NewFunction(func(S *lua.LState) int {
  667. detail := S.ToString(-2)
  668. contenthtml := S.ToString(-1)
  669. publishtime := util.GetPublishtime([]string{contenthtml, detail})
  670. S.Push(lua.LString(publishtime))
  671. return 1
  672. }))
  673. //匹配
  674. s.L.SetGlobal("stringFind", s.L.NewFunction(func(S *lua.LState) int {
  675. regstr := S.ToString(-1)
  676. text := S.ToString(-2)
  677. reg := regexp.MustCompile(regstr)
  678. result := reg.FindString(text)
  679. isMatch := false
  680. if result != "" {
  681. isMatch = true
  682. }
  683. S.Push(lua.LString(result))
  684. S.Push(lua.LBool(isMatch))
  685. return 2
  686. }))
  687. //截取
  688. s.L.SetGlobal("stringSub", s.L.NewFunction(func(S *lua.LState) int {
  689. text := S.ToString(-3)
  690. start := S.ToInt(-2)
  691. end := S.ToInt(-1)
  692. result := ""
  693. if len(text) > 0 {
  694. textRune := []rune(text)
  695. textLen := len(textRune)
  696. if end == -1 {
  697. if start >= 1 { //正向截取到结尾
  698. result = string(textRune[start-1:])
  699. } else if start < 0 && textLen+start >= 0 { //反向截取后缀
  700. result = string(textRune[textLen+start:])
  701. }
  702. } else if start >= 1 && end <= textLen { //从第start个截取到第end个
  703. result = string(textRune[start-1 : end])
  704. }
  705. }
  706. S.Push(lua.LString(result))
  707. return 1
  708. }))
  709. //长度
  710. s.L.SetGlobal("stringLen", s.L.NewFunction(func(S *lua.LState) int {
  711. text := S.ToString(-1)
  712. textLen := len([]rune(text))
  713. S.Push(lua.LNumber(textLen))
  714. return 1
  715. }))
  716. //去除特殊标签中间内容
  717. s.L.SetGlobal("getPureContent", s.L.NewFunction(func(S *lua.LState) int {
  718. con := S.ToString(-1)
  719. reg := regexp.MustCompile("(?s)<(!%-%-|!--|style).*?(%-%-|--|style)>") //注释 css
  720. con = reg.ReplaceAllString(con, "")
  721. // indexArr := reg.FindAllStringIndex(con, -1)
  722. // for i := len(indexArr) - 1; i >= 0; i-- {
  723. // if index := indexArr[i]; len(index) == 2 {
  724. // con = con[:index[0]] + con[index[1]:]
  725. // }
  726. // }
  727. S.Push(lua.LString(con))
  728. return 1
  729. }))
  730. return ""
  731. }
  732. func dealHref(pageListUrl, href string) string {
  733. returnUrl := ""
  734. if href != "" {
  735. r, _ := regexp.Compile("^./")
  736. match := r.MatchString(href)
  737. if match {
  738. url2 := r.ReplaceAllString(href, "")
  739. returnUrl = pageListUrl + url2
  740. }
  741. r2, _ := regexp.Compile("^/")
  742. match2 := r2.MatchString(href)
  743. if match2 {
  744. r3, _ := regexp.Compile("http://[^/]*/")
  745. domain := r3.FindString(pageListUrl)
  746. //fmt.Println(domain)
  747. url2 := r2.ReplaceAllString(href, "")
  748. returnUrl = domain + url2
  749. }
  750. }
  751. return returnUrl
  752. }
  753. func dealPublishTime(content string, pattern string) string {
  754. publishTime := ""
  755. if pattern == "yyyy-MM-dd HH:mm:ss" {
  756. r, _ := regexp.Compile("\\d{4}-\\d{2}-\\d{2}\\s*\\d{2}:\\d{2}:\\d{2}")
  757. publishTime = r.FindString(content)
  758. } else if pattern == "yyyy-MM-dd" {
  759. r, _ := regexp.Compile("\\d{4}-\\d{2}-\\d{2}")
  760. publishTime = r.FindString(content)
  761. } else if pattern == "MM-dd" {
  762. r, _ := regexp.Compile("\\d{2}-\\d{2}")
  763. publishTime = r.FindString(content)
  764. }
  765. return publishTime
  766. }
  767. func getChildrenLen(sq *gq.Selection) (ret int) {
  768. sq.Children().Each(func(i int, sq2 *gq.Selection) {
  769. ret = i
  770. })
  771. return
  772. }
  773. //
  774. func (s *Script) Reload() {
  775. s.L.Close()
  776. s.LoadScript(s.SCode, s.ScriptFile, false)
  777. }
  778. //unicode转码
  779. func transUnic(str string) string {
  780. buf := bytes.NewBuffer(nil)
  781. i, j := 0, len(str)
  782. for i < j {
  783. x := i + 6
  784. if x > j {
  785. buf.WriteString(str[i:])
  786. break
  787. }
  788. if str[i] == '\\' && str[i+1] == 'u' {
  789. hex := str[i+2 : x]
  790. r, err := strconv.ParseUint(hex, 16, 64)
  791. if err == nil {
  792. buf.WriteRune(rune(r))
  793. } else {
  794. logger.Warn(err.Error())
  795. buf.WriteString(str[i:x])
  796. }
  797. i = x
  798. } else {
  799. buf.WriteByte(str[i])
  800. i++
  801. }
  802. }
  803. return buf.String()
  804. }
  805. //取得变量
  806. func (s *Script) GetVar(key string) string {
  807. return s.L.GetGlobal(key).String()
  808. }
  809. //
  810. func (s *Script) GetIntVar(key string) int {
  811. lv := s.L.GetGlobal(key)
  812. if v, ok := lv.(lua.LNumber); ok {
  813. return int(v)
  814. }
  815. return -1
  816. }
  817. //
  818. func (s *Script) GetBoolVar(key string) bool {
  819. lv := s.L.GetGlobal(key)
  820. if v, ok := lv.(lua.LBool); ok {
  821. return bool(v)
  822. }
  823. return false
  824. }
  825. func isWorkTime() {
  826. workTime = util.IsWorkTime()
  827. util.TimeAfterFunc(10*time.Minute, isWorkTime, TimeChan)
  828. }
  829. //设置睡眠时间
  830. func SleepTime(basetime int, times []time.Duration) {
  831. st := 0 //记录最后睡眠时长
  832. base := float64(basetime * 60)
  833. if times[3].Seconds() > base { //最后一次大于 basetime*60秒
  834. if times[2].Seconds() > base {
  835. n := 0
  836. if times[0].Seconds() > base {
  837. n++
  838. }
  839. if times[1].Seconds() > base {
  840. n++
  841. }
  842. st = n + 1
  843. } else if times[2].Seconds() < base && times[0].Seconds() > base && times[1].Seconds() > base {
  844. st = 1
  845. }
  846. }
  847. if st > 0 {
  848. time.Sleep(time.Duration(st) * time.Minute)
  849. }
  850. }
  851. func generateKey(key []byte) (genKey []byte) {
  852. genKey = make([]byte, 16)
  853. copy(genKey, key)
  854. for i := 16; i < len(key); {
  855. for j := 0; j < 16 && i < len(key); j, i = j+1, i+1 {
  856. genKey[j] ^= key[i]
  857. }
  858. }
  859. return genKey
  860. }