script.go 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911
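  /**
  Script loading and invocation wrapper.
  For now scripts are loaded from the file system;
  later they will be loaded from database configuration.
  Functions shared by several Lua scripts should be extracted into a common
  Lua file that each main script loads.
  */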
  7. package spider
  8. import (
  9. codegrpc "analysiscode"
  10. "bytes"
  11. "compress/gzip"
  12. "crypto/aes"
  13. "encoding/base64"
  14. "encoding/json"
  15. "io/ioutil"
  16. mu "mfw/util"
  17. "net/http"
  18. "net/url"
  19. "path"
  20. qu "qfw/util"
  21. _ "qfw/util/redis"
  22. "regexp"
  23. util "spiderutil"
  24. "strconv"
  25. "strings"
  26. "sync/atomic"
  27. "time"
  28. "github.com/shopspring/decimal"
  29. gq "github.com/PuerkitoBio/goquery"
  30. "github.com/cjoudrey/gluahttp"
  31. "github.com/donnie4w/go-logger/logger"
  32. lujson "github.com/yuin/gopher-json"
  33. "github.com/yuin/gopher-lua"
  34. "golang.org/x/text/encoding/simplifiedchinese"
  35. "golang.org/x/text/transform"
  36. )
  37. //脚本
  38. type Script struct {
  39. SCode, ScriptFile string
  40. Encoding string
  41. Userproxy bool
  42. //Ishttps bool
  43. ErrorNum int32 //错误数
  44. Downloader string //下载器
  45. TotalRequestNum int32 //总请求次数
  46. ToDayRequestNum int32 //今日请求次数
  47. YestoDayRequestNum int32 //昨日请求次数
  48. Timeout int64 //超时时间秒
  49. L *lua.LState
  50. NoDownloadNum int32 //未成功下载数
  51. LastThreeTimes []time.Duration //单条信息流程完成的时间,最后三次
  52. FileLastThreeTimes []time.Duration //附件下载单条信息流程完成的时间,最后三次
  53. }
  // MAX_STEP is the maximum step width used by the list-page heuristics: how
  // far to scan around a candidate link and how many parents to climb.
  const MAX_STEP = 5
  var (
  	// workTime starts as true. The init hook that used to refresh it
  	// (go isWorkTime()) is disabled, so nothing in this file changes it.
  	workTime = true

  	// TimeSleepChan is the one-slot channel handed to util.TimeSleepFunc by
  	// the Lua helpers that pause a script.
  	TimeSleepChan = make(chan bool, 1)
  )
  63. //加载文件
  64. func (s *Script) LoadScript(code, script_file string, newstate bool) string {
  65. defer mu.Catch()
  66. s.SCode = code
  67. s.ScriptFile = script_file
  68. s.L = lua.NewState(lua.Options{
  69. RegistrySize: 256 * 20,
  70. CallStackSize: 256,
  71. IncludeGoStackTrace: false,
  72. })
  73. s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader)
  74. s.L.PreloadModule("json", lujson.Loader)
  75. if err := s.L.DoString(script_file); err != nil {
  76. logger.Debug(code + ",加载lua脚本错误:" + err.Error())
  77. //panic(code + ",加载lua脚本错误:" + err.Error())
  78. }
  79. s.Encoding = s.GetVar("spiderPageEncoding")
  80. s.Userproxy = s.GetBoolVar("spiderUserProxy")
  81. //暴露go方法
  82. //download(url,head) 普通下载
  83. s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int {
  84. head := S.ToTable(-1)
  85. url := S.ToString(-2)
  86. ishttps := S.ToBool(-3)
  87. charset := S.ToString(-4)
  88. if charset == "" {
  89. charset = s.Encoding
  90. }
  91. ret := Download(s.Downloader, url, "get", util.GetTable(head), charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  92. S.Push(lua.LString(ret))
  93. atomic.AddInt32(&s.ToDayRequestNum, 1)
  94. atomic.AddInt32(&s.TotalRequestNum, 1)
  95. return 1
  96. }))
  97. //高级下载downloadAdv(url,method,param,head,cookie)
  98. s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int {
  99. cookie := S.ToString(-1)
  100. head := S.ToTable(-2)
  101. param := S.ToTable(-3)
  102. method := S.ToString(-4)
  103. url := S.ToString(-5)
  104. ishttps := S.ToBool(-6)
  105. charset := S.ToString(-7)
  106. if charset == "" {
  107. charset = s.Encoding
  108. }
  109. var mycookie []*http.Cookie
  110. json.Unmarshal([]byte(cookie), &mycookie)
  111. var ret string
  112. var retcookie []*http.Cookie
  113. var headers = map[string]interface{}{}
  114. if param == nil {
  115. ptext := map[string]interface{}{"text": S.ToString(-3)}
  116. ret, retcookie, headers = DownloadAdv(s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  117. } else {
  118. ret, retcookie, headers = DownloadAdv(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  119. }
  120. S.Push(lua.LString(ret))
  121. scookie, _ := json.Marshal(retcookie)
  122. S.Push(lua.LString(scookie))
  123. hTable := util.MapToLuaTable(S, headers)
  124. S.Push(hTable)
  125. atomic.AddInt32(&s.ToDayRequestNum, 1)
  126. atomic.AddInt32(&s.TotalRequestNum, 1)
  127. return 3
  128. }))
  129. //保存验证错误日志
  130. s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
  131. code := S.ToString(-4)
  132. name := S.ToString(-3)
  133. url := S.ToString(-2)
  134. content := S.ToString(-1)
  135. saveVerificationLog(code, name, url, content)
  136. atomic.AddInt32(&s.ErrorNum, 1)
  137. atomic.AddInt32(&s.NoDownloadNum, 1)
  138. //防止恶意增加日志
  139. util.TimeSleepFunc(5*time.Second, TimeSleepChan)
  140. return 0
  141. }))
  142. //添加改版日志
  143. s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int {
  144. url := S.ToString(-2)
  145. str := S.ToString(-1)
  146. logger.Error(s.SCode, url, str)
  147. return 0
  148. }))
  149. //查找信息是否存在(作废)
  150. s.L.SetGlobal("findHasExit", s.L.NewFunction(func(S *lua.LState) int {
  151. //c := S.ToString(-2)
  152. //q := S.ToString(-1)
  153. //b := findHasExit(c, q)
  154. S.Push(lua.LBool(false))
  155. return 1
  156. }))
  157. s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int {
  158. nodetype := S.ToString(-3)
  159. gpath := S.ToString(-2)
  160. content := S.ToString(-1)
  161. ret := util.FindOneText(gpath, content, nodetype)
  162. S.Push(ret)
  163. return 1
  164. }))
  165. s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int {
  166. gpath := S.ToString(-2)
  167. content := S.ToString(-1)
  168. ret := util.FindContentText(gpath, content)
  169. S.Push(ret)
  170. return 1
  171. }))
  172. s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int {
  173. nodetype := S.ToString(-3)
  174. gpath := S.ToString(-2)
  175. content := S.ToString(-1)
  176. ret := util.FindOneHtml(gpath, content, nodetype)
  177. S.Push(ret)
  178. return 1
  179. }))
  180. s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int {
  181. gpath := S.ToString(-2)
  182. content := S.ToString(-1)
  183. ret := s.L.NewTable()
  184. util.FindListText(gpath, content, ret)
  185. S.Push(ret)
  186. return 1
  187. }))
  188. s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int {
  189. gpath := S.ToString(-2)
  190. content := S.ToString(-1)
  191. ret := s.L.NewTable()
  192. util.FindListHtml(gpath, content, ret)
  193. S.Push(ret)
  194. return 1
  195. }))
  196. //推送列表页下载数据量
  197. s.L.SetGlobal("sendListNum", s.L.NewFunction(func(S *lua.LState) int {
  198. //table := S.ToTable(-1)
  199. //list := util.TableToMap(table)
  200. return 1
  201. }))
  202. s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int {
  203. qmap := S.ToTable(-2)
  204. content := S.ToString(-1)
  205. ret := s.L.NewTable()
  206. util.FindMap(qmap, content, ret)
  207. S.Push(ret)
  208. return 1
  209. }))
  210. //公示暴露方式
  211. s.L.SetGlobal("getEcpsCode", s.L.NewFunction(func(S *lua.LState) int {
  212. area := strings.ToUpper(S.ToString(-2))
  213. content := S.ToString(-1)
  214. code, state := util.GetEcpsCode(area, []byte(content))
  215. if state == "wx" {
  216. code, _ = GetCodeByWx([]byte(content))
  217. }
  218. S.Push(lua.LString(code))
  219. return 1
  220. }))
  221. //调用jsvm
  222. s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int {
  223. js := S.ToString(-1)
  224. ret := s.L.NewTable()
  225. if js == "" {
  226. ret.RawSet(lua.LString("val"), lua.LString(""))
  227. ret.RawSet(lua.LString("err"), lua.LString("js is null"))
  228. } else {
  229. rep := util.JsVmPost(util.Config.JsVmUrl, js)
  230. ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"])))
  231. ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"])))
  232. }
  233. S.Push(ret)
  234. return 1
  235. }))
  236. //指定下载器
  237. s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int {
  238. s.Downloader = GetOneDownloader()
  239. S.Push(lua.LString(s.Downloader))
  240. return 1
  241. }))
  242. //指定下载器file
  243. s.L.SetGlobal("changeDownloaderFile", s.L.NewFunction(func(S *lua.LState) int {
  244. s.Downloader = GetOneDownloaderFile()
  245. S.Push(lua.LString(s.Downloader))
  246. return 1
  247. }))
  248. //手工延时
  249. s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int {
  250. // if workTime {
  251. // util.TimeSleepFunc(time.Duration(S.ToInt(-1))*time.Second, TimeSleepChan)
  252. // } else {
  253. // util.TimeSleepFunc(1*time.Second, TimeSleepChan)
  254. // }
  255. util.TimeSleepFunc(time.Second*2, TimeSleepChan)
  256. return 0
  257. }))
  258. //编码解码
  259. s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int {
  260. codeType := strings.ToLower(S.ToString(-2))
  261. str := S.CheckString(-1)
  262. switch codeType {
  263. case "unicode":
  264. str = strings.Replace(str, "%u", "\\u", -1)
  265. str = transUnic(str)
  266. case "urlencode_gbk":
  267. data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
  268. l, _ := url.Parse("http://a.com/?" + string(data))
  269. tmpstr := l.Query().Encode()
  270. if len(tmpstr) > 1 {
  271. str = tmpstr[0 : len(tmpstr)-1]
  272. } else {
  273. str = ""
  274. }
  275. case "urlencode_utf8":
  276. l, _ := url.Parse("http://a.com/?" + str)
  277. tmpstr := l.Query().Encode()
  278. if len(tmpstr) > 1 {
  279. str = tmpstr[0 : len(tmpstr)-1]
  280. } else {
  281. str = ""
  282. }
  283. case "urldecode_utf8":
  284. str, _ = url.QueryUnescape(str)
  285. case "decode64":
  286. str = util.DecodeB64(str)
  287. case "encodemd5":
  288. str = qu.GetMd5String(str)
  289. case "htmldecode": //html实体码
  290. //txt := `<div align="left" style="margin-left: 0pt;"><span style='font-family:; font-size:13px; color:#000000'>&#22826;&#38451;&#23707;&#29305;&#21220;&#28040;&#38450;&#31449;&#12289;&#26494;&#28006;&#29305;&#21220;&#28040;&#38450;&#31449;&#24314;&#35774;&#39033;&#30446;&#35774;&#35745;&#20013;&#26631;&#20844;&#31034;</span></div>`
  291. str = S.ToString(-1)
  292. reg, _ := regexp.Compile("&#\\d+;")
  293. str = reg.ReplaceAllStringFunc(str, func(src string) string {
  294. v, _ := strconv.Atoi(src[2 : len(src)-1])
  295. return string(rune(v))
  296. })
  297. }
  298. S.Push(lua.LString(str))
  299. return 1
  300. }))
  301. //如果服务端返回的html是gzip压缩过格式的 这里需要转一下
  302. s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int {
  303. html := S.ToString(-1)
  304. bs := []byte(html)
  305. gzipreader, _ := gzip.NewReader(bytes.NewReader(bs))
  306. bs, _ = ioutil.ReadAll(gzipreader)
  307. S.Push(lua.LString(bs))
  308. return 1
  309. }))
  310. //luamaker提供的分析列表页url地址 获取列表数据公用方法
  311. s.L.SetGlobal("getSimpleListPage", s.L.NewFunction(func(S *lua.LState) int {
  312. html := S.ToString(-3)
  313. date_pattern := S.ToString(-2)
  314. pageListUrl := S.ToString(-1) //列表页url
  315. bs := []byte(html)
  316. tmparr := []string{}
  317. tmpret := []int{}
  318. re, _ := regexp.Compile(`采购|招标|公示|公告|意见|结果|通知|工程`)
  319. doc, _ := gq.NewDocumentFromReader(bytes.NewReader(bs))
  320. doc.Find("a").Each(func(i int, sq *gq.Selection) {
  321. text := sq.Text()
  322. if len(text) < 30 {
  323. return
  324. }
  325. tmparr = append(tmparr, text)
  326. if re.MatchString(text) {
  327. tmpret = append(tmpret, 1)
  328. //logger.Debug(text)
  329. } else {
  330. tmpret = append(tmpret, 0)
  331. }
  332. })
  333. logger.Debug(tmpret)
  334. //线性分析,算周边,只算周围5步的点
  335. tmplen, thepos, themax := len(tmpret), -1, 0
  336. for i := 0; i < tmplen; i++ {
  337. if tmpret[i] == 0 {
  338. continue
  339. }
  340. start, end := i-MAX_STEP, i+MAX_STEP
  341. if start < 0 {
  342. start = 0
  343. }
  344. if end > tmplen {
  345. end = tmplen
  346. }
  347. tmp := 0
  348. //从当前位置往左,往右找连续点
  349. for j := i; j > start; j-- {
  350. if tmpret[j] == 1 {
  351. tmp++
  352. } else {
  353. break
  354. }
  355. }
  356. for j := i; j < end; j++ {
  357. if tmpret[j] == 1 {
  358. tmp++
  359. } else {
  360. break
  361. }
  362. }
  363. if tmp > themax {
  364. themax = tmp
  365. thepos = i
  366. }
  367. } //end of for...
  368. //logger.Debug("找位置完成")
  369. //验证
  370. if thepos == -1 {
  371. logger.Error("完蛋,找不到")
  372. panic("不支持啊,失败啊")
  373. }
  374. //下边是找父容器
  375. var thelink *gq.Selection
  376. doc.Find("a").Each(func(i int, sq *gq.Selection) {
  377. if sq.Text() == tmparr[thepos] {
  378. thelink = sq
  379. }
  380. })
  381. isfind := false
  382. //同样Path向上找,不超过5步
  383. for i := 0; i < MAX_STEP; i++ {
  384. thelink = thelink.Parent()
  385. clen := getChildrenLen(thelink)
  386. if clen >= themax-1 {
  387. isfind = true
  388. break
  389. }
  390. //logger.Debug("TAG:::", thelink.Nodes[0].Data, clen)
  391. }
  392. //找到列表
  393. pageList := []interface{}{}
  394. if isfind {
  395. thelink.Children().Each(func(i int, sq *gq.Selection) {
  396. page := map[string]string{}
  397. link_sq := sq.Find("a")
  398. href := link_sq.AttrOr("href", "")
  399. text := link_sq.Text()
  400. page["title"] = text
  401. page["href"] = dealHref(pageListUrl, href)
  402. page["publishtime"] = dealPublishTime(strings.TrimSpace(sq.Text()), date_pattern)
  403. //logger.Debug(i)
  404. pageList = append(pageList, page)
  405. })
  406. } else {
  407. logger.Error("完蛋,找父亲节点失败啊")
  408. //panic("不支持啊,失败啊")
  409. }
  410. ret := util.MapToTable(s.L, pageList)
  411. S.Push(ret)
  412. return 1
  413. }))
  414. //招投标信息标题判重
  415. s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
  416. S.Push(lua.LBool(false))
  417. return 1
  418. }))
  419. //招标信息判重新方法 2016-12-14 wanghuidong
  420. s.L.SetGlobal("urlRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
  421. S.Push(lua.LBool(false))
  422. return 1
  423. }))
  424. //将url放入内存缓存 2016-12-14 wanghuidong
  425. s.L.SetGlobal("putUrl2Redis", s.L.NewFunction(func(S *lua.LState) int {
  426. //url := S.ToString(-1)
  427. return 1
  428. }))
  429. //解析附件中的word、pdf
  430. s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int {
  431. ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)}
  432. str := S.ToString(-2)
  433. extension := S.ToString(-1)
  434. bs, _ := base64.StdEncoding.DecodeString(str)
  435. bs = append([]byte{ext[extension]}, bs...)
  436. msgid := mu.UUID(8)
  437. Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60)
  438. return 1
  439. }))
  440. //下载附件download(url,method,param,head,cookie,fileName)
  441. s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
  442. cookie := S.ToString(-1)
  443. head := S.ToTable(-2)
  444. param := S.ToTable(-3)
  445. method := S.ToString(-4)
  446. url := S.ToString(-5)
  447. fileName := S.ToString(-6)
  448. ishttps := strings.Contains(url, "https")
  449. var mycookie []*http.Cookie
  450. if cookie != "{}" {
  451. json.Unmarshal([]byte(cookie), &mycookie)
  452. } else {
  453. mycookie = make([]*http.Cookie, 0)
  454. }
  455. fileName = strings.TrimSpace(fileName)
  456. url = strings.TrimSpace(url)
  457. ret := DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
  458. url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
  459. if strings.TrimSpace(ftype) == "" {
  460. if len(path.Ext(name)) > 0 {
  461. ftype = path.Ext(name)[1:]
  462. }
  463. }
  464. S.Push(lua.LString(url))
  465. S.Push(lua.LString(name))
  466. S.Push(lua.LString(size))
  467. S.Push(lua.LString(ftype))
  468. S.Push(lua.LString(fid))
  469. return 5
  470. }))
  471. s.L.SetGlobal("clearMemoeryCache", s.L.NewFunction(func(S *lua.LState) int {
  472. /*title := S.ToString(-1)
  473. isExist, _ := redis.Exists("title_repeat_judgement", "title_repeat_"+title)
  474. if isExist {
  475. redis.Del("title_repeat_judgement", "title_repeat_"+title)
  476. }*/
  477. return 1
  478. }))
  479. //支持正则,提取
  480. s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int {
  481. index := int(S.ToNumber(-1))
  482. regstr := S.ToString(-2)
  483. text := S.ToString(-3)
  484. reg := regexp.MustCompile(regstr)
  485. reps := reg.FindAllStringSubmatchIndex(text, -1)
  486. ret := s.L.NewTable()
  487. number := 0
  488. for _, v := range reps {
  489. number++
  490. ret.Insert(number, lua.LString(text[v[index]:v[index+1]]))
  491. }
  492. S.Push(ret)
  493. return 1
  494. }))
  495. //支持替换
  496. s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int {
  497. text := S.ToString(-3)
  498. old := S.ToString(-2)
  499. repl := S.ToString(-1)
  500. text = strings.Replace(text, old, repl, -1)
  501. S.Push(lua.LString(text))
  502. return 1
  503. }))
  504. //列表页标题的关键词、排除词过滤
  505. s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int {
  506. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  507. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  508. data := S.ToTable(-1)
  509. dataMap := util.TableToMap(data)
  510. ret := s.L.NewTable()
  511. num := 1
  512. for _, v := range dataMap {
  513. tmp := v.(map[string]interface{})
  514. isOk := false
  515. if title := qu.ObjToString(tmp["title"]); title != "" {
  516. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  517. isOk = true
  518. }
  519. }
  520. if isOk {
  521. ret.Insert(num, util.MapToLuaTable(S, tmp))
  522. num++
  523. }
  524. }
  525. S.Push(ret)
  526. return 1
  527. }))
  528. // 三级页标题的关键词、排除词过滤
  529. s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int {
  530. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  531. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  532. data := S.ToTable(-1)
  533. dataMap := util.TableToMap(data)
  534. if title := qu.ObjToString(dataMap["title"]); title != "" {
  535. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  536. S.Push(lua.LBool(true))
  537. return 1
  538. } else {
  539. logger.Debug(s.SCode, dataMap["href"], " title error")
  540. }
  541. } else {
  542. logger.Debug(s.SCode, dataMap["href"], " title error")
  543. }
  544. S.Push(lua.LBool(false))
  545. return 1
  546. }))
  547. //detail过滤
  548. s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int {
  549. /*
  550. 1.长度判断 (特殊处理:详情请访问原网页!;详见原网页;见原网页;无;无相关内容;无正文内容)
  551. 2.是否含汉字
  552. */
  553. reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|详见附件|见附件)")
  554. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  555. detail := S.ToString(-1)
  556. if reg1.MatchString(detail) {
  557. S.Push(lua.LBool(true))
  558. return 1
  559. }
  560. if len([]rune(detail)) < 50 || !reg2.MatchString(detail) {
  561. S.Push(lua.LBool(false))
  562. return 1
  563. }
  564. S.Push(lua.LBool(false))
  565. return 1
  566. }))
  567. //匹配汉字
  568. s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int {
  569. reg1 := regexp.MustCompile("(见附件|详见附件)")
  570. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  571. detail := S.ToString(-1)
  572. detail = reg1.ReplaceAllString(detail, "")
  573. ok := reg2.MatchString(detail)
  574. S.Push(lua.LBool(ok))
  575. return 1
  576. }))
  577. //aes ecb模式加密
  578. s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
  579. origData := S.ToString(-2)
  580. key := S.ToString(-1)
  581. bytekey := []byte(key)
  582. byteorigData := []byte(origData)
  583. cipher, _ := aes.NewCipher(generateKey([]byte(bytekey)))
  584. length := (len(byteorigData) + aes.BlockSize) / aes.BlockSize
  585. plain := make([]byte, length*aes.BlockSize)
  586. copy(plain, byteorigData)
  587. pad := byte(len(plain) - len(byteorigData))
  588. for i := len(byteorigData); i < len(plain); i++ {
  589. plain[i] = pad
  590. }
  591. encrypted := make([]byte, len(plain))
  592. // 分组分块加密
  593. for bs, be := 0, cipher.BlockSize(); bs <= len(byteorigData); bs, be = bs+cipher.BlockSize(), be+cipher.BlockSize() {
  594. cipher.Encrypt(encrypted[bs:be], plain[bs:be])
  595. }
  596. result := base64.StdEncoding.EncodeToString(encrypted)
  597. S.Push(lua.LString(result))
  598. return 1
  599. }))
  600. //根据正文获取发布时间
  601. s.L.SetGlobal("getPublishtime", s.L.NewFunction(func(S *lua.LState) int {
  602. detail := S.ToString(-2)
  603. contenthtml := S.ToString(-1)
  604. publishtime := util.GetPublishtime([]string{contenthtml, detail})
  605. S.Push(lua.LString(publishtime))
  606. return 1
  607. }))
  608. //匹配
  609. s.L.SetGlobal("stringFind", s.L.NewFunction(func(S *lua.LState) int {
  610. regstr := S.ToString(-1)
  611. text := S.ToString(-2)
  612. reg := regexp.MustCompile(regstr)
  613. result := reg.FindString(text)
  614. isMatch := false
  615. if result != "" {
  616. isMatch = true
  617. }
  618. S.Push(lua.LString(result))
  619. S.Push(lua.LBool(isMatch))
  620. return 2
  621. }))
  622. //截取
  623. s.L.SetGlobal("stringSub", s.L.NewFunction(func(S *lua.LState) int {
  624. text := S.ToString(-3)
  625. start := S.ToInt(-2)
  626. end := S.ToInt(-1)
  627. result := ""
  628. if len(text) > 0 {
  629. textRune := []rune(text)
  630. textLen := len(textRune)
  631. if end < 0 {
  632. if start > 0 { //正向截取到倒数第end位
  633. result = string(textRune[start-1 : textLen+1+end])
  634. } else if start < 0 { //反向截取 从倒数第start位截取到倒数第end位
  635. result = string(textRune[textLen+start : textLen+1+end])
  636. }
  637. } else if start > 0 && end >= start && end <= textLen { //从第start个截取到第end个
  638. result = string(textRune[start-1 : end])
  639. }
  640. // if end == -1 {
  641. // if start >= 1 { //正向截取到结尾
  642. // result = string(textRune[start-1:])
  643. // } else if start < 0 && textLen+start >= 0 { //反向截取后缀
  644. // result = string(textRune[textLen+start:])
  645. // }
  646. // } else if start >= 1 && end <= textLen { //从第start个截取到第end个
  647. // result = string(textRune[start-1 : end])
  648. // }
  649. }
  650. S.Push(lua.LString(result))
  651. return 1
  652. }))
  653. //base64加密
  654. s.L.SetGlobal("encodeBase64", s.L.NewFunction(func(S *lua.LState) int {
  655. text := S.ToString(-1)
  656. base64Text := base64.StdEncoding.EncodeToString([]byte(text))
  657. S.Push(lua.LString(base64Text))
  658. return 1
  659. }))
  660. //base64解密
  661. s.L.SetGlobal("decodeBase64", s.L.NewFunction(func(S *lua.LState) int {
  662. text := S.ToString(-1)
  663. result := ""
  664. byteText, err := base64.StdEncoding.DecodeString(text)
  665. if err == nil {
  666. result = string(byteText)
  667. }
  668. S.Push(lua.LString(result))
  669. return 1
  670. }))
  671. //长度
  672. s.L.SetGlobal("stringLen", s.L.NewFunction(func(S *lua.LState) int {
  673. text := S.ToString(-1)
  674. textLen := len([]rune(text))
  675. S.Push(lua.LNumber(textLen))
  676. return 1
  677. }))
  678. //去除特殊标签中间内容
  679. s.L.SetGlobal("getPureContent", s.L.NewFunction(func(S *lua.LState) int {
  680. con := S.ToString(-1)
  681. reg := regexp.MustCompile("(?s)<(!%-%-|!--|style).*?(%-%-|--|style)>") //注释 css
  682. con = reg.ReplaceAllString(con, "")
  683. // indexArr := reg.FindAllStringIndex(con, -1)
  684. // for i := len(indexArr) - 1; i >= 0; i-- {
  685. // if index := indexArr[i]; len(index) == 2 {
  686. // con = con[:index[0]] + con[index[1]:]
  687. // }
  688. // }
  689. S.Push(lua.LString(con))
  690. return 1
  691. }))
  692. //interface转string
  693. s.L.SetGlobal("formatToString", s.L.NewFunction(func(S *lua.LState) int {
  694. strNum := S.ToString(-1)
  695. decimalNum, _ := decimal.NewFromString(strNum)
  696. S.Push(lua.LString(decimalNum.String()))
  697. return 1
  698. }))
  699. //获取验证码
  700. s.L.SetGlobal("getCodeByPath", s.L.NewFunction(func(S *lua.LState) int {
  701. cookie := S.ToString(-1)
  702. head := S.ToTable(-2)
  703. stype := S.ToString(-3)
  704. path := S.ToString(-4)
  705. headMap := util.GetTable(head)
  706. //qu.Debug("cookie----------", cookie)
  707. //qu.Debug("headMap----------", headMap)
  708. headJsonStr := ""
  709. headByte, err := json.Marshal(headMap)
  710. if err == nil {
  711. headJsonStr = string(headByte)
  712. }
  713. code, respHead, respCookie := codegrpc.GetCodeByPath(path, stype, headJsonStr, cookie)
  714. //qu.Debug("code====", code)
  715. //qu.Debug("respHead====", respHead)
  716. //qu.Debug("respCookie====", respCookie)
  717. S.Push(lua.LString(code))
  718. respHeadMap := map[string]interface{}{}
  719. json.Unmarshal([]byte(respHead), &respHeadMap)
  720. hTable := util.MapToLuaTable(S, respHeadMap)
  721. S.Push(hTable)
  722. S.Push(lua.LString(respCookie))
  723. return 3
  724. }))
  725. s.L.SetGlobal("newDownloadFile", s.L.NewFunction(func(S *lua.LState) int {
  726. cookie := S.ToString(-1)
  727. head := S.ToTable(-2)
  728. param := S.ToTable(-3)
  729. method := S.ToString(-4)
  730. url := S.ToString(-5)
  731. fileName := S.ToString(-6)
  732. ishttps := strings.Contains(url, "https")
  733. var mycookie []*http.Cookie
  734. if cookie != "{}" {
  735. json.Unmarshal([]byte(cookie), &mycookie)
  736. } else {
  737. mycookie = make([]*http.Cookie, 0)
  738. }
  739. fileName = strings.TrimSpace(fileName)
  740. url = strings.TrimSpace(url)
  741. ret := NewDownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, false, ishttps, "", s.Timeout, false)
  742. url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
  743. if strings.TrimSpace(ftype) == "" {
  744. if len(path.Ext(name)) > 0 {
  745. ftype = path.Ext(name)[1:]
  746. }
  747. }
  748. S.Push(lua.LString(url))
  749. S.Push(lua.LString(name))
  750. S.Push(lua.LString(size))
  751. S.Push(lua.LString(ftype))
  752. S.Push(lua.LString(fid))
  753. return 5
  754. }))
  755. return ""
  756. }
  // dealHref turns href, as found on the list page, into an absolute URL by
  // resolving it against pageListUrl.
  //
  // Resolution is done by net/url, so "./x", "../x", "/x", "dir/x", "//host/x"
  // and already-absolute links are all handled, for http and https bases alike.
  // The earlier hand-written version used the pattern "^./" with an unescaped
  // dot (so "a/x.html" lost its first segment), only recognised http:// bases
  // and returned "" for absolute and plain relative links.
  //
  // Surrounding whitespace in href is ignored. The result is "" when href is
  // empty, when either argument cannot be parsed, or when the resolved URL is
  // not http/https (for example javascript: or mailto: links).
  func dealHref(pageListUrl, href string) string {
  	href = strings.TrimSpace(href)
  	if href == "" {
  		return ""
  	}
  	base, err := url.Parse(pageListUrl)
  	if err != nil {
  		return ""
  	}
  	ref, err := url.Parse(href)
  	if err != nil {
  		return ""
  	}
  	abs := base.ResolveReference(ref)
  	if abs.Scheme != "http" && abs.Scheme != "https" {
  		return ""
  	}
  	return abs.String()
  }
  // publishTimeRegexps maps each supported layout name to the expression that
  // extracts a date of that layout. Compiled once instead of on every call.
  var publishTimeRegexps = map[string]*regexp.Regexp{
  	"yyyy-MM-dd HH:mm:ss": regexp.MustCompile("\\d{4}-\\d{2}-\\d{2}\\s*\\d{2}:\\d{2}:\\d{2}"),
  	"yyyy-MM-dd":          regexp.MustCompile("\\d{4}-\\d{2}-\\d{2}"),
  	"MM-dd":               regexp.MustCompile("\\d{2}-\\d{2}"),
  }

  // dealPublishTime returns the first substring of content that looks like a
  // date in the given layout. pattern must be one of "yyyy-MM-dd HH:mm:ss",
  // "yyyy-MM-dd" or "MM-dd"; any other pattern, or no match, yields "".
  func dealPublishTime(content string, pattern string) string {
  	re, ok := publishTimeRegexps[pattern]
  	if !ok {
  		return ""
  	}
  	return re.FindString(content)
  }
  792. func getChildrenLen(sq *gq.Selection) (ret int) {
  793. sq.Children().Each(func(i int, sq2 *gq.Selection) {
  794. ret = i
  795. })
  796. return
  797. }
  798. //
  799. // func (s *Script) Reload() {
  800. // s.L.Close()
  801. // s.LoadScript(s.SCode, s.ScriptFile, false)
  802. // }
  803. //unicode转码
  804. func transUnic(str string) string {
  805. buf := bytes.NewBuffer(nil)
  806. i, j := 0, len(str)
  807. for i < j {
  808. x := i + 6
  809. if x > j {
  810. buf.WriteString(str[i:])
  811. break
  812. }
  813. if str[i] == '\\' && str[i+1] == 'u' {
  814. hex := str[i+2 : x]
  815. r, err := strconv.ParseUint(hex, 16, 64)
  816. if err == nil {
  817. buf.WriteRune(rune(r))
  818. } else {
  819. logger.Warn(err.Error())
  820. buf.WriteString(str[i:x])
  821. }
  822. i = x
  823. } else {
  824. buf.WriteByte(str[i])
  825. i++
  826. }
  827. }
  828. return buf.String()
  829. }
  830. //取得变量
  831. func (s *Script) GetVar(key string) string {
  832. return s.L.GetGlobal(key).String()
  833. }
  834. //
  835. func (s *Script) GetIntVar(key string) int {
  836. lv := s.L.GetGlobal(key)
  837. if v, ok := lv.(lua.LNumber); ok {
  838. return int(v)
  839. }
  840. return -1
  841. }
  842. //
  843. func (s *Script) GetBoolVar(key string) bool {
  844. lv := s.L.GetGlobal(key)
  845. if v, ok := lv.(lua.LBool); ok {
  846. return bool(v)
  847. }
  848. return false
  849. }
  // generateKey folds key into a 16-byte key: byte i of the input is XOR-ed
  // into position i%16 of the result. Keys shorter than 16 bytes are therefore
  // zero-padded, and longer keys wrap around.
  func generateKey(key []byte) (genKey []byte) {
  	const size = 16
  	genKey = make([]byte, size)
  	for i, b := range key {
  		// The first 16 bytes land on zeroes, so XOR acts as a plain copy.
  		genKey[i%size] ^= b
  	}
  	return genKey
  }
  860. // func isWorkTime() {
  861. // workTime = util.IsWorkTime()
  862. // util.TimeAfterFunc(10*time.Minute, isWorkTime, TimeChan)
  863. // }
  864. // //设置睡眠时间
  865. // func SleepTime(basetime int, times []time.Duration) {
  866. // st := 0 //记录最后睡眠时长
  867. // base := float64(basetime * 60)
  868. // if times[3].Seconds() > base { //最后一次大于 basetime*60秒
  869. // if times[2].Seconds() > base {
  870. // n := 0
  871. // if times[0].Seconds() > base {
  872. // n++
  873. // }
  874. // if times[1].Seconds() > base {
  875. // n++
  876. // }
  877. // st = n + 1
  878. // } else if times[2].Seconds() < base && times[0].Seconds() > base && times[1].Seconds() > base {
  879. // st = 1
  880. // }
  881. // }
  882. // if st > 0 {
  883. // time.Sleep(time.Duration(st) * time.Minute)
  884. // }
  885. // }