script.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503
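/**
Wrapper around script loading and invocation.
In the early stage scripts are loaded from the file system;
later they are to be loaded from database configuration.
Functions shared between Lua scripts should be extracted into a common Lua
file that the main script file loads.
*/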
  7. package spider
  8. import (
  9. "bytes"
  10. "compress/gzip"
  11. "crypto/aes"
  12. "encoding/base64"
  13. "encoding/json"
  14. "io/ioutil"
  15. mu "mfw/util"
  16. "net/http"
  17. "net/url"
  18. "path"
  19. qu "qfw/util"
  20. "regexp"
  21. util "spiderutil"
  22. "strconv"
  23. "strings"
  24. "time"
  25. "golang.org/x/text/encoding/simplifiedchinese"
  26. "golang.org/x/text/transform"
  27. "github.com/cjoudrey/gluahttp"
  28. lujson "github.com/yuin/gopher-json"
  29. "github.com/yuin/gopher-lua"
  30. )
  31. //脚本
  32. type Script struct {
  33. SCode, ScriptFile string
  34. Encoding string
  35. Downloader string //下载器
  36. Timeout int64 //超时时间秒
  37. L *lua.LState
  38. Test_luareqcount int //脚本请求次数
  39. Test_goreqtime int //go发起次数(时间)
  40. Test_goreqlist int //go发起次数(列表)
  41. Test_goreqcon int //go发起次数(正文)
  42. }
  43. //加载文件
  44. func (s *Script) LoadScript(downloadnode, script string, isfile ...string) {
  45. s.ScriptFile = script
  46. options := lua.Options{
  47. RegistrySize: 256 * 20,
  48. CallStackSize: 256,
  49. IncludeGoStackTrace: false,
  50. }
  51. s.L = lua.NewState(options)
  52. //s.L.ScriptFileName = s.SCode
  53. s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader)
  54. s.L.PreloadModule("json", lujson.Loader)
  55. if len(isfile) > 0 {
  56. if err := s.L.DoFile(script); err != nil {
  57. panic("加载lua脚本错误" + err.Error())
  58. }
  59. } else {
  60. if err := s.L.DoString(script); err != nil {
  61. panic("加载lua脚本错误" + err.Error())
  62. }
  63. }
  64. s.Encoding = s.GetVar("spiderPageEncoding")
  65. //暴露go方法
  66. //download(url,head) 普通下载
  67. s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int {
  68. head := S.ToTable(-1)
  69. url := S.ToString(-2)
  70. ishttps := S.ToBool(-3)
  71. charset := S.ToString(-4)
  72. if charset == "" {
  73. charset = s.Encoding
  74. }
  75. ret := Download(downloadnode, s.Downloader, url, "get", util.GetTable(head), charset, false, ishttps, "", s.Timeout)
  76. S.Push(lua.LString(ret))
  77. s.Test_luareqcount++
  78. return 1
  79. }))
  80. s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int {
  81. gpath := S.ToString(-2)
  82. content := S.ToString(-1)
  83. ret := util.FindContentText(gpath, content)
  84. S.Push(ret)
  85. return 1
  86. }))
  87. //高级下载download(url,method,param,head,cookie)
  88. s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int {
  89. cookie := S.ToString(-1)
  90. head := S.ToTable(-2)
  91. param := S.ToTable(-3)
  92. method := S.ToString(-4)
  93. url := S.ToString(-5)
  94. ishttps := S.ToBool(-6)
  95. charset := S.ToString(-7)
  96. if charset == "" {
  97. charset = s.Encoding
  98. }
  99. var mycookie []*http.Cookie
  100. json.Unmarshal([]byte(cookie), &mycookie)
  101. var ret string
  102. var retcookie []*http.Cookie
  103. if param == nil {
  104. ptext := map[string]interface{}{"text": S.ToString(-3)}
  105. ret, retcookie = DownloadAdv(downloadnode, s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, false, ishttps, "", s.Timeout)
  106. } else {
  107. ret, retcookie = DownloadAdv(downloadnode, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, false, ishttps, "", s.Timeout)
  108. }
  109. S.Push(lua.LString(ret))
  110. scookie, _ := json.Marshal(retcookie)
  111. S.Push(lua.LString(scookie))
  112. s.Test_luareqcount++
  113. return 2
  114. }))
  115. s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int {
  116. nodetype := S.ToString(-3)
  117. gpath := S.ToString(-2)
  118. content := S.ToString(-1)
  119. ret := util.FindOneText(gpath, content, nodetype)
  120. S.Push(ret)
  121. return 1
  122. }))
  123. s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int {
  124. nodetype := S.ToString(-3)
  125. gpath := S.ToString(-2)
  126. content := S.ToString(-1)
  127. ret := util.FindOneHtml(gpath, content, nodetype)
  128. S.Push(ret)
  129. return 1
  130. }))
  131. s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int {
  132. gpath := S.ToString(-2)
  133. content := S.ToString(-1)
  134. ret := s.L.NewTable()
  135. util.FindListText(gpath, content, ret)
  136. S.Push(ret)
  137. return 1
  138. }))
  139. s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int {
  140. gpath := S.ToString(-2)
  141. content := S.ToString(-1)
  142. ret := s.L.NewTable()
  143. util.FindListHtml(gpath, content, ret)
  144. S.Push(ret)
  145. return 1
  146. }))
  147. s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int {
  148. qmap := S.ToTable(-2)
  149. content := S.ToString(-1)
  150. ret := s.L.NewTable()
  151. util.FindMap(qmap, content, ret)
  152. S.Push(ret)
  153. return 1
  154. }))
  155. //调用jsvm
  156. s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int {
  157. js := S.ToString(-1)
  158. ret := s.L.NewTable()
  159. if js == "" {
  160. ret.RawSet(lua.LString("val"), lua.LString(""))
  161. ret.RawSet(lua.LString("err"), lua.LString("js is null"))
  162. } else {
  163. rep := util.JsVmPost(util.Config.JsVmUrl, js)
  164. ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"])))
  165. ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"])))
  166. }
  167. S.Push(ret)
  168. return 1
  169. }))
  170. //指定下载器
  171. s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int {
  172. s.Downloader = GetOneDownloader()
  173. S.Push(lua.LString(s.Downloader))
  174. return 1
  175. }))
  176. //手工延时
  177. s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int {
  178. time.Sleep(1 * time.Second)
  179. return 0
  180. }))
  181. //编码解码
  182. s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int {
  183. codeType := strings.ToLower(S.ToString(-2))
  184. str := S.CheckString(-1)
  185. switch codeType {
  186. case "unicode":
  187. str = strings.Replace(str, "%u", "\\u", -1)
  188. str = transUnic(str)
  189. case "urlencode_gbk":
  190. data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
  191. l, _ := url.Parse("http://a.com/?" + string(data))
  192. tmpstr := l.Query().Encode()
  193. if len(tmpstr) > 1 {
  194. str = tmpstr[0 : len(tmpstr)-1]
  195. } else {
  196. str = ""
  197. }
  198. case "urlencode_utf8":
  199. l, _ := url.Parse("http://a.com/?" + str)
  200. tmpstr := l.Query().Encode()
  201. if len(tmpstr) > 1 {
  202. str = tmpstr[0 : len(tmpstr)-1]
  203. } else {
  204. str = ""
  205. }
  206. case "urldecode_utf8":
  207. str, _ = url.QueryUnescape(str)
  208. case "decode64":
  209. str = util.DecodeB64(str)
  210. case "encodemd5":
  211. str = qu.GetMd5String(str)
  212. case "htmldecode": //html实体码
  213. //txt := `<div align="left" style="margin-left: 0pt;"><span style='font-family:; font-size:13px; color:#000000'>&#22826;&#38451;&#23707;&#29305;&#21220;&#28040;&#38450;&#31449;&#12289;&#26494;&#28006;&#29305;&#21220;&#28040;&#38450;&#31449;&#24314;&#35774;&#39033;&#30446;&#35774;&#35745;&#20013;&#26631;&#20844;&#31034;</span></div>`
  214. str = S.ToString(-1)
  215. reg, _ := regexp.Compile("&#\\d+;")
  216. str = reg.ReplaceAllStringFunc(str, func(src string) string {
  217. v, _ := strconv.Atoi(src[2 : len(src)-1])
  218. return string(rune(v))
  219. })
  220. }
  221. S.Push(lua.LString(str))
  222. return 1
  223. }))
  224. //保存错误日志
  225. s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
  226. return 0
  227. }))
  228. //添加改版日志
  229. s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int {
  230. return 0
  231. }))
  232. //如果服务端返回的html是gzip压缩过格式的 这里需要转一下
  233. s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int {
  234. html := S.ToString(-1)
  235. bs := []byte(html)
  236. gzipreader, _ := gzip.NewReader(bytes.NewReader(bs))
  237. bs, _ = ioutil.ReadAll(gzipreader)
  238. S.Push(lua.LString(bs))
  239. return 1
  240. }))
  241. s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
  242. bResult := false
  243. S.Push(lua.LBool(bResult))
  244. return 1
  245. }))
  246. //解析附件中的word、pdf
  247. s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int {
  248. ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)}
  249. str := S.ToString(-2)
  250. extension := S.ToString(-1)
  251. bs, _ := base64.StdEncoding.DecodeString(str)
  252. bs = append([]byte{ext[extension]}, bs...)
  253. msgid := mu.UUID(8)
  254. Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60)
  255. return 1
  256. }))
  257. //下载附件download(url,method,param,head,cookie,fileName)
  258. s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
  259. cookie := S.ToString(-1)
  260. head := S.ToTable(-2)
  261. param := S.ToTable(-3)
  262. method := S.ToString(-4)
  263. url := S.ToString(-5)
  264. fileName := S.ToString(-6)
  265. ishttps := strings.Contains(url, "https")
  266. var mycookie []*http.Cookie
  267. if cookie != "{}" {
  268. json.Unmarshal([]byte(cookie), &mycookie)
  269. } else {
  270. mycookie = make([]*http.Cookie, 0)
  271. }
  272. fileName = strings.TrimSpace(fileName)
  273. url = strings.TrimSpace(url)
  274. ret := DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, false, ishttps, "", s.Timeout)
  275. name, size, ftype, fid := "", "", "", ""
  276. qu.Debug(GarbledCodeReg.FindAllString(string(ret), -1), len(ret))
  277. if ret == nil || len(ret) < 1024*5 {
  278. qu.Debug("下载文件出错!")
  279. } else {
  280. ftype = qu.GetFileType(ret)
  281. if (ftype == "docx" || ftype == "doc") && len(GarbledCodeReg.FindAllString(string(ret), -1)) > 10 {
  282. url, name, size, ftype, fid = "附件中含有乱码", "附件中含有乱码", "", "", ""
  283. } else {
  284. url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
  285. }
  286. }
  287. if strings.TrimSpace(ftype) == "" {
  288. if len(path.Ext(name)) > 0 {
  289. ftype = path.Ext(name)[1:]
  290. }
  291. }
  292. S.Push(lua.LString(url))
  293. S.Push(lua.LString(name))
  294. S.Push(lua.LString(size))
  295. S.Push(lua.LString(ftype))
  296. S.Push(lua.LString(fid))
  297. return 5
  298. }))
  299. //支持正则
  300. s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int {
  301. index := int(S.ToNumber(-1))
  302. regstr := S.ToString(-2)
  303. text := S.ToString(-3)
  304. reg := regexp.MustCompile(regstr)
  305. reps := reg.FindAllStringSubmatchIndex(text, -1)
  306. ret := s.L.NewTable()
  307. number := 0
  308. for _, v := range reps {
  309. number++
  310. ret.Insert(number, lua.LString(text[v[index]:v[index+1]]))
  311. }
  312. S.Push(ret)
  313. return 1
  314. }))
  315. //支持替换
  316. s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int {
  317. text := S.ToString(-3)
  318. old := S.ToString(-2)
  319. repl := S.ToString(-1)
  320. text = strings.Replace(text, old, repl, -1)
  321. S.Push(lua.LString(text))
  322. return 1
  323. }))
  324. //标题的关键词、排除词过滤
  325. s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int {
  326. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  327. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  328. data := S.ToTable(-1)
  329. dataMap := util.TableToMap(data)
  330. ret := s.L.NewTable()
  331. num := 1
  332. for _, v := range dataMap {
  333. tmp := v.(map[string]interface{})
  334. isOk := false
  335. if title := qu.ObjToString(tmp["title"]); title != "" {
  336. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  337. isOk = true
  338. }
  339. }
  340. if isOk {
  341. ret.Insert(num, util.MapToLuaTable(S, tmp))
  342. num++
  343. }
  344. }
  345. S.Push(ret)
  346. return 1
  347. }))
  348. //标题的关键词、排除词过滤
  349. s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int {
  350. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  351. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  352. data := S.ToTable(-1)
  353. dataMap := util.TableToMap(data)
  354. if title := qu.ObjToString(dataMap["title"]); title != "" {
  355. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  356. S.Push(lua.LBool(true))
  357. return 1
  358. } else {
  359. qu.Debug(s.SCode, dataMap["href"], " title error")
  360. }
  361. } else {
  362. qu.Debug(s.SCode, dataMap["href"], " title error")
  363. }
  364. S.Push(lua.LBool(false))
  365. return 1
  366. }))
  367. //detail过滤
  368. s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int {
  369. /*
  370. 1.长度判断 (特殊处理:详情请访问原网页!;详见原网页;见原网页;无;无相关内容;无正文内容)
  371. 2.是否含汉字
  372. */
  373. reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|详见附件|见附件)")
  374. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  375. detail := S.ToString(-1)
  376. if reg1.MatchString(detail) {
  377. S.Push(lua.LBool(true))
  378. return 1
  379. }
  380. if len([]rune(detail)) < 50 || !reg2.MatchString(detail) {
  381. S.Push(lua.LBool(false))
  382. return 1
  383. }
  384. S.Push(lua.LBool(false))
  385. return 1
  386. }))
  387. //匹配汉字
  388. s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int {
  389. reg1 := regexp.MustCompile("(见附件|详见附件)")
  390. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  391. detail := S.ToString(-1)
  392. detail = reg1.ReplaceAllString(detail, "")
  393. ok := reg2.MatchString(detail)
  394. S.Push(lua.LBool(ok))
  395. return 1
  396. }))
  397. //aes ecb模式加密
  398. s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
  399. origData := S.ToString(-2)
  400. key := S.ToString(-1)
  401. bytekey := []byte(key)
  402. byteorigData := []byte(origData)
  403. cipher, _ := aes.NewCipher(generateKey([]byte(bytekey)))
  404. length := (len(byteorigData) + aes.BlockSize) / aes.BlockSize
  405. plain := make([]byte, length*aes.BlockSize)
  406. copy(plain, byteorigData)
  407. pad := byte(len(plain) - len(byteorigData))
  408. for i := len(byteorigData); i < len(plain); i++ {
  409. plain[i] = pad
  410. }
  411. encrypted := make([]byte, len(plain))
  412. // 分组分块加密
  413. for bs, be := 0, cipher.BlockSize(); bs <= len(byteorigData); bs, be = bs+cipher.BlockSize(), be+cipher.BlockSize() {
  414. cipher.Encrypt(encrypted[bs:be], plain[bs:be])
  415. }
  416. result := base64.StdEncoding.EncodeToString(encrypted)
  417. S.Push(lua.LString(result))
  418. return 1
  419. }))
  420. }
  421. //
  422. func (s *Script) Reload() {
  423. s.L.Close()
  424. s.LoadScript("", s.ScriptFile)
  425. }
  426. //unicode转码
  427. func transUnic(str string) string {
  428. buf := bytes.NewBuffer(nil)
  429. i, j := 0, len(str)
  430. for i < j {
  431. x := i + 6
  432. if x > j {
  433. buf.WriteString(str[i:])
  434. break
  435. }
  436. if str[i] == '\\' && str[i+1] == 'u' {
  437. hex := str[i+2 : x]
  438. r, err := strconv.ParseUint(hex, 16, 64)
  439. if err == nil {
  440. buf.WriteRune(rune(r))
  441. } else {
  442. buf.WriteString(str[i:x])
  443. }
  444. i = x
  445. } else {
  446. buf.WriteByte(str[i])
  447. i++
  448. }
  449. }
  450. return buf.String()
  451. }
  452. //取得变量
  453. func (s *Script) GetVar(key string) string {
  454. return s.L.GetGlobal(key).String()
  455. }
  456. //
  457. func (s *Script) GetIntVar(key string) int {
  458. lv := s.L.GetGlobal(key)
  459. if v, ok := lv.(lua.LNumber); ok {
  460. return int(v)
  461. }
  462. return -1
  463. }
  464. //
  465. func (s *Script) GetBoolVar(key string) bool {
  466. lv := s.L.GetGlobal(key)
  467. if v, ok := lv.(lua.LBool); ok {
  468. return bool(v)
  469. }
  470. return false
  471. }
  472. func generateKey(key []byte) (genKey []byte) {
  473. genKey = make([]byte, 16)
  474. copy(genKey, key)
  475. for i := 16; i < len(key); {
  476. for j := 0; j < 16 && i < len(key); j, i = j+1, i+1 {
  477. genKey[j] ^= key[i]
  478. }
  479. }
  480. return genKey
  481. }