script.go 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928
  1. /**
  2. 脚本加载+调用 封装,
  3. 前期走文件系统加载
  4. 后期走数据库配置,
  5. LUA中公共的方法需要抽出来,主脚本文件加载LUA公共文件
  6. */
  7. package spider
  8. import (
  9. "bytes"
  10. "compress/gzip"
  11. "crypto/aes"
  12. "encoding/base64"
  13. "encoding/json"
  14. "io/ioutil"
  15. mu "mfw/util"
  16. "net/http"
  17. "net/url"
  18. "path"
  19. "github.com/shopspring/decimal"
  20. qu "qfw/util"
  21. _ "qfw/util/redis"
  22. "regexp"
  23. util "spiderutil"
  24. "strconv"
  25. "strings"
  26. "sync/atomic"
  27. "time"
  28. gq "github.com/PuerkitoBio/goquery"
  29. "github.com/cjoudrey/gluahttp"
  30. "github.com/donnie4w/go-logger/logger"
  31. lujson "github.com/yuin/gopher-json"
  32. "github.com/yuin/gopher-lua"
  33. "golang.org/x/text/encoding/simplifiedchinese"
  34. "golang.org/x/text/transform"
  35. )
  36. //脚本
  37. type Script struct {
  38. SCode, ScriptFile string
  39. Encoding string
  40. Userproxy bool
  41. //Ishttps bool
  42. ErrorNum int32 //错误数
  43. Downloader string //下载器
  44. TotalRequestNum int32 //总请求次数
  45. ToDayRequestNum int32 //今日请求次数
  46. YestoDayRequestNum int32 //昨日请求次数
  47. Timeout int64 //超时时间秒
  48. L *lua.LState
  49. NoDownloadNum int32 //未成功下载数
  50. LastThreeTimes []time.Duration //单条信息流程完成的时间,最后三次
  51. FileLastThreeTimes []time.Duration //附件下载单条信息流程完成的时间,最后三次
  52. }
  53. const (
  54. MAX_STEP = 5 //计算时的最大步长
  55. )
  56. var workTime = true
  57. //
  58. func init() {
  59. go isWorkTime()
  60. }
  61. var TimeSleepChan = make(chan bool, 1)
  62. //加载文件
  63. func (s *Script) LoadScript(code, script_file string, newstate bool) string {
  64. defer mu.Catch()
  65. s.SCode = code
  66. s.ScriptFile = script_file
  67. if util.Config.Working == 0 {
  68. if newstate {
  69. s.L = lua.NewState(lua.Options{
  70. RegistrySize: 256 * 20,
  71. CallStackSize: 256,
  72. IncludeGoStackTrace: false,
  73. })
  74. }
  75. } else { //节能模式从CC池中获取lua.LState
  76. if newstate { //队列模式的newstate主要区分是列表页爬虫CC还是三级页爬虫CC2
  77. lState := <-CC2
  78. s.L = lState
  79. } else {
  80. lState := <-CC
  81. s.L = lState
  82. }
  83. //logger.Debug("获取CC资源", script_file)
  84. }
  85. s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader)
  86. s.L.PreloadModule("json", lujson.Loader)
  87. if err := s.L.DoString(script_file); err != nil {
  88. logger.Debug(code + ",加载lua脚本错误:" + err.Error())
  89. return "加载lua脚本错误:" + err.Error()
  90. //panic(code + ",加载lua脚本错误:" + err.Error())
  91. }
  92. s.Encoding = s.GetVar("spiderPageEncoding")
  93. s.Userproxy = s.GetBoolVar("spiderUserProxy")
  94. //暴露go方法
  95. //download(url,head) 普通下载
  96. s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int {
  97. if s.LastThreeTimes == nil {
  98. s.LastThreeTimes = make([]time.Duration, 4)
  99. }
  100. if util.Config.IsDelay {
  101. SleepTime(1, s.LastThreeTimes) //睡眠时间
  102. }
  103. start := time.Now() //起始时间
  104. head := S.ToTable(-1)
  105. url := S.ToString(-2)
  106. ishttps := S.ToBool(-3)
  107. charset := S.ToString(-4)
  108. if charset == "" {
  109. charset = s.Encoding
  110. }
  111. ret := Download(s.Downloader, url, "get", util.GetTable(head), charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  112. S.Push(lua.LString(ret))
  113. atomic.AddInt32(&s.ToDayRequestNum, 1)
  114. atomic.AddInt32(&s.TotalRequestNum, 1)
  115. end := time.Since(start)
  116. if len(s.LastThreeTimes) >= 4 {
  117. s.LastThreeTimes = s.LastThreeTimes[1:]
  118. }
  119. s.LastThreeTimes = append(s.LastThreeTimes, end)
  120. return 1
  121. }))
  122. //高级下载downloadAdv(url,method,param,head,cookie)
  123. s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int {
  124. if s.LastThreeTimes == nil {
  125. s.LastThreeTimes = make([]time.Duration, 4)
  126. }
  127. if util.Config.IsDelay {
  128. SleepTime(1, s.LastThreeTimes) //睡眠时间
  129. }
  130. start := time.Now() //起始时间
  131. cookie := S.ToString(-1)
  132. head := S.ToTable(-2)
  133. param := S.ToTable(-3)
  134. method := S.ToString(-4)
  135. url := S.ToString(-5)
  136. ishttps := S.ToBool(-6)
  137. charset := S.ToString(-7)
  138. if charset == "" {
  139. charset = s.Encoding
  140. }
  141. var mycookie []*http.Cookie
  142. json.Unmarshal([]byte(cookie), &mycookie)
  143. var ret string
  144. var retcookie []*http.Cookie
  145. if param == nil {
  146. ptext := map[string]interface{}{"text": S.ToString(-3)}
  147. ret, retcookie = DownloadAdv(s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  148. } else {
  149. ret, retcookie = DownloadAdv(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  150. }
  151. S.Push(lua.LString(ret))
  152. scookie, _ := json.Marshal(retcookie)
  153. S.Push(lua.LString(scookie))
  154. atomic.AddInt32(&s.ToDayRequestNum, 1)
  155. atomic.AddInt32(&s.TotalRequestNum, 1)
  156. end := time.Since(start)
  157. if len(s.LastThreeTimes) >= 4 {
  158. s.LastThreeTimes = s.LastThreeTimes[1:]
  159. }
  160. s.LastThreeTimes = append(s.LastThreeTimes, end)
  161. return 2
  162. }))
  163. //保存验证错误日志
  164. s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
  165. code := S.ToString(-4)
  166. name := S.ToString(-3)
  167. url := S.ToString(-2)
  168. content := S.ToString(-1)
  169. saveVerificationLog(code, name, url, content)
  170. atomic.AddInt32(&s.ErrorNum, 1)
  171. atomic.AddInt32(&s.NoDownloadNum, 1)
  172. //防止恶意增加日志
  173. util.TimeSleepFunc(5*time.Second, TimeSleepChan)
  174. return 0
  175. }))
  176. //添加改版日志
  177. s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int {
  178. url := S.ToString(-2)
  179. str := S.ToString(-1)
  180. logger.Error(s.SCode, url, str)
  181. return 0
  182. }))
  183. //查找信息是否存在(作废)
  184. s.L.SetGlobal("findHasExit", s.L.NewFunction(func(S *lua.LState) int {
  185. //c := S.ToString(-2)
  186. //q := S.ToString(-1)
  187. //b := findHasExit(c, q)
  188. S.Push(lua.LBool(false))
  189. return 1
  190. }))
  191. s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int {
  192. nodetype := S.ToString(-3)
  193. gpath := S.ToString(-2)
  194. content := S.ToString(-1)
  195. ret := util.FindOneText(gpath, content, nodetype)
  196. S.Push(ret)
  197. return 1
  198. }))
  199. s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int {
  200. gpath := S.ToString(-2)
  201. content := S.ToString(-1)
  202. ret := util.FindContentText(gpath, content)
  203. S.Push(ret)
  204. return 1
  205. }))
  206. s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int {
  207. nodetype := S.ToString(-3)
  208. gpath := S.ToString(-2)
  209. content := S.ToString(-1)
  210. ret := util.FindOneHtml(gpath, content, nodetype)
  211. S.Push(ret)
  212. return 1
  213. }))
  214. s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int {
  215. gpath := S.ToString(-2)
  216. content := S.ToString(-1)
  217. ret := s.L.NewTable()
  218. util.FindListText(gpath, content, ret)
  219. S.Push(ret)
  220. return 1
  221. }))
  222. s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int {
  223. gpath := S.ToString(-2)
  224. content := S.ToString(-1)
  225. ret := s.L.NewTable()
  226. util.FindListHtml(gpath, content, ret)
  227. if ret.Len() > 0 {
  228. UpdateHeart("", "", code, "", "findlist") //记录列表页实际采集数据量心跳
  229. }
  230. S.Push(ret)
  231. return 1
  232. }))
  233. // s.L.SetGlobal("findMgoData", s.L.NewFunction(func(S *lua.LState) int {
  234. // update := [][]map[string]interface{}{}
  235. // query := map[string]interface{}{"state": 0}
  236. // data, _ := Mgo.Find(util.Config.TmpCollName, query, `{"_id":-1}`, nil, false, 0, 10)
  237. // pageList := []interface{}{}
  238. // for _, d := range *data {
  239. // tmpMap := map[string]string{}
  240. // tmpMap["title"] = qu.ObjToString(d["title"])
  241. // tmpMap["detail"] = qu.ObjToString(d["detail"])
  242. // tmpMap["href"] = qu.ObjToString(d["href"])
  243. // publishtime := qu.Int64All(d["publishtime"])
  244. // tmpMap["publishtime"] = qu.FormatDateByInt64(&publishtime, qu.Date_Full_Layout)
  245. // tmpMap["_id"] = qu.BsonIdToSId(d["_id"])
  246. // pageList = append(pageList, tmpMap)
  247. // update = append(update, []map[string]interface{}{
  248. // map[string]interface{}{"_id": d["_id"]},
  249. // map[string]interface{}{"$set": map[string]interface{}{"state": 1}},
  250. // })
  251. // }
  252. // ret := util.MapToTable(s.L, pageList)
  253. // S.Push(ret)
  254. // if len(update) > 0 {
  255. // Mgo.UpdateBulk(util.Config.TmpCollName, update...)
  256. // }
  257. // return 1
  258. // }))
  259. s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int {
  260. qmap := S.ToTable(-2)
  261. content := S.ToString(-1)
  262. ret := s.L.NewTable()
  263. util.FindMap(qmap, content, ret)
  264. S.Push(ret)
  265. return 1
  266. }))
  267. //公示暴露方式
  268. s.L.SetGlobal("getEcpsCode", s.L.NewFunction(func(S *lua.LState) int {
  269. area := strings.ToUpper(S.ToString(-2))
  270. content := S.ToString(-1)
  271. code, state := util.GetEcpsCode(area, []byte(content))
  272. if state == "wx" {
  273. code, _ = GetCodeByWx([]byte(content))
  274. }
  275. S.Push(lua.LString(code))
  276. return 1
  277. }))
  278. //调用jsvm
  279. s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int {
  280. js := S.ToString(-1)
  281. ret := s.L.NewTable()
  282. if js == "" {
  283. ret.RawSet(lua.LString("val"), lua.LString(""))
  284. ret.RawSet(lua.LString("err"), lua.LString("js is null"))
  285. } else {
  286. rep := util.JsVmPost(util.Config.JsVmUrl, js)
  287. ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"])))
  288. ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"])))
  289. }
  290. S.Push(ret)
  291. return 1
  292. }))
  293. //指定下载器
  294. s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int {
  295. s.Downloader = GetOneDownloader()
  296. S.Push(lua.LString(s.Downloader))
  297. return 1
  298. }))
  299. //指定下载器file
  300. s.L.SetGlobal("changeDownloaderFile", s.L.NewFunction(func(S *lua.LState) int {
  301. s.Downloader = GetOneDownloaderFile()
  302. S.Push(lua.LString(s.Downloader))
  303. return 1
  304. }))
  305. //手工延时
  306. s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int {
  307. // if workTime {
  308. // util.TimeSleepFunc(time.Duration(S.ToInt(-1))*time.Second, TimeSleepChan)
  309. // } else {
  310. // util.TimeSleepFunc(1*time.Second, TimeSleepChan)
  311. // }
  312. util.TimeSleepFunc(time.Second*2, TimeSleepChan)
  313. return 0
  314. }))
  315. //编码解码
  316. s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int {
  317. codeType := strings.ToLower(S.ToString(-2))
  318. str := S.CheckString(-1)
  319. switch codeType {
  320. case "unicode":
  321. str = transUnic(str)
  322. case "urlencode_gbk":
  323. data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
  324. l, _ := url.Parse("http://a.com/?" + string(data))
  325. tmpstr := l.Query().Encode()
  326. if len(tmpstr) > 1 {
  327. str = tmpstr[0 : len(tmpstr)-1]
  328. } else {
  329. str = ""
  330. }
  331. case "urlencode_utf8":
  332. l, _ := url.Parse("http://a.com/?" + str)
  333. tmpstr := l.Query().Encode()
  334. if len(tmpstr) > 1 {
  335. str = tmpstr[0 : len(tmpstr)-1]
  336. } else {
  337. str = ""
  338. }
  339. case "urldecode_utf8":
  340. str, _ = url.QueryUnescape(str)
  341. case "decode64":
  342. str = util.DecodeB64(str)
  343. case "encodemd5":
  344. str = qu.GetMd5String(str)
  345. case "htmldecode": //html实体码
  346. //txt := `<div align="left" style="margin-left: 0pt;"><span style='font-family:; font-size:13px; color:#000000'>&#22826;&#38451;&#23707;&#29305;&#21220;&#28040;&#38450;&#31449;&#12289;&#26494;&#28006;&#29305;&#21220;&#28040;&#38450;&#31449;&#24314;&#35774;&#39033;&#30446;&#35774;&#35745;&#20013;&#26631;&#20844;&#31034;</span></div>`
  347. str = S.ToString(-1)
  348. reg, _ := regexp.Compile("&#\\d+;")
  349. str = reg.ReplaceAllStringFunc(str, func(src string) string {
  350. v, _ := strconv.Atoi(src[2 : len(src)-1])
  351. return string(rune(v))
  352. })
  353. }
  354. S.Push(lua.LString(str))
  355. return 1
  356. }))
  357. //如果服务端返回的html是gzip压缩过格式的 这里需要转一下
  358. s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int {
  359. html := S.ToString(-1)
  360. bs := []byte(html)
  361. gzipreader, _ := gzip.NewReader(bytes.NewReader(bs))
  362. bs, _ = ioutil.ReadAll(gzipreader)
  363. S.Push(lua.LString(bs))
  364. return 1
  365. }))
  366. //luamaker提供的分析列表页url地址 获取列表数据公用方法
  367. s.L.SetGlobal("getSimpleListPage", s.L.NewFunction(func(S *lua.LState) int {
  368. html := S.ToString(-3)
  369. date_pattern := S.ToString(-2)
  370. pageListUrl := S.ToString(-1) //列表页url
  371. bs := []byte(html)
  372. tmparr := []string{}
  373. tmpret := []int{}
  374. re, _ := regexp.Compile(`采购|招标|公示|公告|意见|结果|通知|工程`)
  375. doc, _ := gq.NewDocumentFromReader(bytes.NewReader(bs))
  376. doc.Find("a").Each(func(i int, sq *gq.Selection) {
  377. text := sq.Text()
  378. if len(text) < 30 {
  379. return
  380. }
  381. tmparr = append(tmparr, text)
  382. if re.MatchString(text) {
  383. tmpret = append(tmpret, 1)
  384. //logger.Debug(text)
  385. } else {
  386. tmpret = append(tmpret, 0)
  387. }
  388. })
  389. logger.Debug(tmpret)
  390. //线性分析,算周边,只算周围5步的点
  391. tmplen, thepos, themax := len(tmpret), -1, 0
  392. for i := 0; i < tmplen; i++ {
  393. if tmpret[i] == 0 {
  394. continue
  395. }
  396. start, end := i-MAX_STEP, i+MAX_STEP
  397. if start < 0 {
  398. start = 0
  399. }
  400. if end > tmplen {
  401. end = tmplen
  402. }
  403. tmp := 0
  404. //从当前位置往左,往右找连续点
  405. for j := i; j > start; j-- {
  406. if tmpret[j] == 1 {
  407. tmp++
  408. } else {
  409. break
  410. }
  411. }
  412. for j := i; j < end; j++ {
  413. if tmpret[j] == 1 {
  414. tmp++
  415. } else {
  416. break
  417. }
  418. }
  419. if tmp > themax {
  420. themax = tmp
  421. thepos = i
  422. }
  423. } //end of for...
  424. //logger.Debug("找位置完成")
  425. //验证
  426. if thepos == -1 {
  427. logger.Error("完蛋,找不到")
  428. panic("不支持啊,失败啊")
  429. }
  430. //下边是找父容器
  431. var thelink *gq.Selection
  432. doc.Find("a").Each(func(i int, sq *gq.Selection) {
  433. if sq.Text() == tmparr[thepos] {
  434. thelink = sq
  435. }
  436. })
  437. isfind := false
  438. //同样Path向上找,不超过5步
  439. for i := 0; i < MAX_STEP; i++ {
  440. thelink = thelink.Parent()
  441. clen := getChildrenLen(thelink)
  442. if clen >= themax-1 {
  443. isfind = true
  444. break
  445. }
  446. //logger.Debug("TAG:::", thelink.Nodes[0].Data, clen)
  447. }
  448. //找到列表
  449. pageList := []interface{}{}
  450. if isfind {
  451. thelink.Children().Each(func(i int, sq *gq.Selection) {
  452. page := map[string]string{}
  453. link_sq := sq.Find("a")
  454. href := link_sq.AttrOr("href", "")
  455. text := link_sq.Text()
  456. page["title"] = text
  457. page["href"] = dealHref(pageListUrl, href)
  458. page["publishtime"] = dealPublishTime(strings.TrimSpace(sq.Text()), date_pattern)
  459. //logger.Debug(i)
  460. pageList = append(pageList, page)
  461. })
  462. } else {
  463. logger.Error("完蛋,找父亲节点失败啊")
  464. //panic("不支持啊,失败啊")
  465. }
  466. ret := util.MapToTable(s.L, pageList)
  467. S.Push(ret)
  468. return 1
  469. }))
  470. //招投标信息标题判重
  471. s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
  472. S.Push(lua.LBool(false))
  473. return 1
  474. }))
  475. //招标信息判重新方法 2016-12-14 wanghuidong
  476. s.L.SetGlobal("urlRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
  477. S.Push(lua.LBool(false))
  478. return 1
  479. }))
  480. //将url放入内存缓存 2016-12-14 wanghuidong
  481. s.L.SetGlobal("putUrl2Redis", s.L.NewFunction(func(S *lua.LState) int {
  482. //url := S.ToString(-1)
  483. return 1
  484. }))
  485. //解析附件中的word、pdf
  486. s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int {
  487. ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)}
  488. str := S.ToString(-2)
  489. extension := S.ToString(-1)
  490. bs, _ := base64.StdEncoding.DecodeString(str)
  491. bs = append([]byte{ext[extension]}, bs...)
  492. msgid := mu.UUID(8)
  493. Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60)
  494. return 1
  495. }))
  496. //下载附件download(url,method,param,head,cookie,fileName)
  497. s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
  498. if s.FileLastThreeTimes == nil {
  499. s.FileLastThreeTimes = make([]time.Duration, 4)
  500. }
  501. if util.Config.IsDelay {
  502. SleepTime(3, s.FileLastThreeTimes) //睡眠时间
  503. }
  504. start := time.Now() //起始时间
  505. cookie := S.ToString(-1)
  506. head := S.ToTable(-2)
  507. param := S.ToTable(-3)
  508. method := S.ToString(-4)
  509. url := S.ToString(-5)
  510. fileName := S.ToString(-6)
  511. ishttps := strings.Contains(url, "https")
  512. var mycookie []*http.Cookie
  513. if cookie != "{}" {
  514. json.Unmarshal([]byte(cookie), &mycookie)
  515. } else {
  516. mycookie = make([]*http.Cookie, 0)
  517. }
  518. fileName = strings.TrimSpace(fileName)
  519. url = strings.TrimSpace(url)
  520. ret := DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
  521. url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
  522. if strings.TrimSpace(ftype) == "" {
  523. if len(path.Ext(name)) > 0 {
  524. ftype = path.Ext(name)[1:]
  525. }
  526. }
  527. S.Push(lua.LString(url))
  528. S.Push(lua.LString(name))
  529. S.Push(lua.LString(size))
  530. S.Push(lua.LString(ftype))
  531. S.Push(lua.LString(fid))
  532. atomic.AddInt32(&s.ToDayRequestNum, 1)
  533. atomic.AddInt32(&s.TotalRequestNum, 1)
  534. end := time.Since(start)
  535. if len(s.FileLastThreeTimes) >= 4 {
  536. s.FileLastThreeTimes = s.FileLastThreeTimes[1:]
  537. }
  538. s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
  539. return 5
  540. }))
  541. s.L.SetGlobal("clearMemoeryCache", s.L.NewFunction(func(S *lua.LState) int {
  542. /*title := S.ToString(-1)
  543. isExist, _ := redis.Exists("title_repeat_judgement", "title_repeat_"+title)
  544. if isExist {
  545. redis.Del("title_repeat_judgement", "title_repeat_"+title)
  546. }*/
  547. return 1
  548. }))
  549. //支持正则,提取
  550. s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int {
  551. index := int(S.ToNumber(-1))
  552. regstr := S.ToString(-2)
  553. text := S.ToString(-3)
  554. reg := regexp.MustCompile(regstr)
  555. reps := reg.FindAllStringSubmatchIndex(text, -1)
  556. ret := s.L.NewTable()
  557. number := 0
  558. for _, v := range reps {
  559. number++
  560. ret.Insert(number, lua.LString(text[v[index]:v[index+1]]))
  561. }
  562. S.Push(ret)
  563. return 1
  564. }))
  565. //支持替换
  566. s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int {
  567. text := S.ToString(-3)
  568. old := S.ToString(-2)
  569. repl := S.ToString(-1)
  570. text = strings.Replace(text, old, repl, -1)
  571. S.Push(lua.LString(text))
  572. return 1
  573. }))
  574. //标题的关键词、排除词过滤
  575. s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int {
  576. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  577. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  578. data := S.ToTable(-1)
  579. dataMap := util.TableToMap(data)
  580. ret := s.L.NewTable()
  581. num := 1
  582. for _, v := range dataMap {
  583. tmp := v.(map[string]interface{})
  584. isOk := false
  585. if title := qu.ObjToString(tmp["title"]); title != "" {
  586. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  587. isOk = true
  588. }
  589. }
  590. if isOk {
  591. ret.Insert(num, util.MapToLuaTable(S, tmp))
  592. num++
  593. }
  594. }
  595. S.Push(ret)
  596. return 1
  597. }))
  598. //标题的关键词、排除词过滤
  599. s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int {
  600. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  601. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  602. data := S.ToTable(-1)
  603. dataMap := util.TableToMap(data)
  604. if title := qu.ObjToString(dataMap["title"]); title != "" {
  605. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  606. S.Push(lua.LBool(true))
  607. return 1
  608. } else {
  609. qu.Debug(s.SCode, dataMap["href"], " title error")
  610. }
  611. } else {
  612. qu.Debug(s.SCode, dataMap["href"], " title error")
  613. }
  614. S.Push(lua.LBool(false))
  615. return 1
  616. }))
  617. //detail过滤
  618. s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int {
  619. /*
  620. 1.长度判断 (特殊处理:详情请访问原网页!;详见原网页;见原网页;无;无相关内容;无正文内容)
  621. 2.是否含汉字
  622. */
  623. reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|见附件|详见附件)")
  624. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  625. detail := S.ToString(-1)
  626. if reg1.MatchString(detail) {
  627. S.Push(lua.LBool(true))
  628. return 1
  629. }
  630. if len([]rune(detail)) < 50 || !reg2.MatchString(detail) {
  631. S.Push(lua.LBool(false))
  632. return 1
  633. }
  634. S.Push(lua.LBool(false))
  635. return 1
  636. }))
  637. //匹配汉字
  638. s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int {
  639. reg1 := regexp.MustCompile("(见附件|详见附件)")
  640. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  641. detail := S.ToString(-1)
  642. detail = reg1.ReplaceAllString(detail, "")
  643. ok := reg2.MatchString(detail)
  644. S.Push(lua.LBool(ok))
  645. return 1
  646. }))
  647. //aes ecb模式加密
  648. s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
  649. origData := S.ToString(-2)
  650. key := S.ToString(-1)
  651. bytekey := []byte(key)
  652. byteorigData := []byte(origData)
  653. cipher, _ := aes.NewCipher(generateKey([]byte(bytekey)))
  654. length := (len(byteorigData) + aes.BlockSize) / aes.BlockSize
  655. plain := make([]byte, length*aes.BlockSize)
  656. copy(plain, byteorigData)
  657. pad := byte(len(plain) - len(byteorigData))
  658. for i := len(byteorigData); i < len(plain); i++ {
  659. plain[i] = pad
  660. }
  661. encrypted := make([]byte, len(plain))
  662. // 分组分块加密
  663. for bs, be := 0, cipher.BlockSize(); bs <= len(byteorigData); bs, be = bs+cipher.BlockSize(), be+cipher.BlockSize() {
  664. cipher.Encrypt(encrypted[bs:be], plain[bs:be])
  665. }
  666. result := base64.StdEncoding.EncodeToString(encrypted)
  667. S.Push(lua.LString(result))
  668. return 1
  669. }))
  670. //根据正文获取发布时间
  671. s.L.SetGlobal("getPublishtime", s.L.NewFunction(func(S *lua.LState) int {
  672. detail := S.ToString(-2)
  673. contenthtml := S.ToString(-1)
  674. publishtime := util.GetPublishtime([]string{contenthtml, detail})
  675. S.Push(lua.LString(publishtime))
  676. return 1
  677. }))
  678. //匹配
  679. s.L.SetGlobal("stringFind", s.L.NewFunction(func(S *lua.LState) int {
  680. regstr := S.ToString(-1)
  681. text := S.ToString(-2)
  682. reg := regexp.MustCompile(regstr)
  683. result := reg.FindString(text)
  684. isMatch := false
  685. if result != "" {
  686. isMatch = true
  687. }
  688. S.Push(lua.LString(result))
  689. S.Push(lua.LBool(isMatch))
  690. return 2
  691. }))
  692. //截取
  693. s.L.SetGlobal("stringSub", s.L.NewFunction(func(S *lua.LState) int {
  694. text := S.ToString(-3)
  695. start := S.ToInt(-2)
  696. end := S.ToInt(-1)
  697. result := ""
  698. if len(text) > 0 {
  699. textRune := []rune(text)
  700. textLen := len(textRune)
  701. if end < 0 {
  702. if start > 0 { //正向截取到倒数第end位
  703. result = string(textRune[start-1 : textLen+1+end])
  704. } else if start < 0 { //反向截取 从倒数第start位截取到倒数第end位
  705. result = string(textRune[textLen+start : textLen+1+end])
  706. }
  707. } else if start > 0 && end >= start && end <= textLen { //从第start个截取到第end个
  708. result = string(textRune[start-1 : end])
  709. }
  710. // if end == -1 {
  711. // if start >= 1 { //正向截取到结尾
  712. // result = string(textRune[start-1:])
  713. // } else if start < 0 && textLen+start >= 0 { //反向截取后缀
  714. // result = string(textRune[textLen+start:])
  715. // }
  716. // } else if start >= 1 && end <= textLen { //从第start个截取到第end个
  717. // result = string(textRune[start-1 : end])
  718. // }
  719. }
  720. S.Push(lua.LString(result))
  721. return 1
  722. }))
  723. //base64加密
  724. s.L.SetGlobal("encodeBase64", s.L.NewFunction(func(S *lua.LState) int {
  725. text := S.ToString(-1)
  726. base64Text := base64.StdEncoding.EncodeToString([]byte(text))
  727. S.Push(lua.LString(base64Text))
  728. return 1
  729. }))
  730. //base64解密
  731. s.L.SetGlobal("decodeBase64", s.L.NewFunction(func(S *lua.LState) int {
  732. text := S.ToString(-1)
  733. result := ""
  734. byteText, err := base64.StdEncoding.DecodeString(text)
  735. if err == nil {
  736. result = string(byteText)
  737. }
  738. S.Push(lua.LString(result))
  739. return 1
  740. }))
  741. //长度
  742. s.L.SetGlobal("stringLen", s.L.NewFunction(func(S *lua.LState) int {
  743. text := S.ToString(-1)
  744. textLen := len([]rune(text))
  745. S.Push(lua.LNumber(textLen))
  746. return 1
  747. }))
  748. //去除特殊标签中间内容
  749. s.L.SetGlobal("getPureContent", s.L.NewFunction(func(S *lua.LState) int {
  750. con := S.ToString(-1)
  751. reg := regexp.MustCompile("(?s)<(!%-%-|!--|style).*?(%-%-|--|style)>") //注释 css
  752. con = reg.ReplaceAllString(con, "")
  753. // indexArr := reg.FindAllStringIndex(con, -1)
  754. // for i := len(indexArr) - 1; i >= 0; i-- {
  755. // if index := indexArr[i]; len(index) == 2 {
  756. // con = con[:index[0]] + con[index[1]:]
  757. // }
  758. // }
  759. S.Push(lua.LString(con))
  760. return 1
  761. }))
  762. //interface转string
  763. s.L.SetGlobal("formatToString", s.L.NewFunction(func(S *lua.LState) int {
  764. strNum := S.ToString(-1)
  765. decimalNum, _ := decimal.NewFromString(strNum)
  766. S.Push(lua.LString(decimalNum.String()))
  767. return 1
  768. }))
  769. return ""
  770. }
  771. func dealHref(pageListUrl, href string) string {
  772. returnUrl := ""
  773. if href != "" {
  774. r, _ := regexp.Compile("^./")
  775. match := r.MatchString(href)
  776. if match {
  777. url2 := r.ReplaceAllString(href, "")
  778. returnUrl = pageListUrl + url2
  779. }
  780. r2, _ := regexp.Compile("^/")
  781. match2 := r2.MatchString(href)
  782. if match2 {
  783. r3, _ := regexp.Compile("http://[^/]*/")
  784. domain := r3.FindString(pageListUrl)
  785. //fmt.Println(domain)
  786. url2 := r2.ReplaceAllString(href, "")
  787. returnUrl = domain + url2
  788. }
  789. }
  790. return returnUrl
  791. }
  792. func dealPublishTime(content string, pattern string) string {
  793. publishTime := ""
  794. if pattern == "yyyy-MM-dd HH:mm:ss" {
  795. r, _ := regexp.Compile("\\d{4}-\\d{2}-\\d{2}\\s*\\d{2}:\\d{2}:\\d{2}")
  796. publishTime = r.FindString(content)
  797. } else if pattern == "yyyy-MM-dd" {
  798. r, _ := regexp.Compile("\\d{4}-\\d{2}-\\d{2}")
  799. publishTime = r.FindString(content)
  800. } else if pattern == "MM-dd" {
  801. r, _ := regexp.Compile("\\d{2}-\\d{2}")
  802. publishTime = r.FindString(content)
  803. }
  804. return publishTime
  805. }
  806. func getChildrenLen(sq *gq.Selection) (ret int) {
  807. sq.Children().Each(func(i int, sq2 *gq.Selection) {
  808. ret = i
  809. })
  810. return
  811. }
  812. //
  813. //func (s *Script) Reload() {
  814. // s.L.Close()
  815. // s.LoadScript(s.SCode, s.ScriptFile, false)
  816. //}
  817. //unicode转码
  818. func transUnic(str string) string {
  819. buf := bytes.NewBuffer(nil)
  820. i, j := 0, len(str)
  821. for i < j {
  822. x := i + 6
  823. if x > j {
  824. buf.WriteString(str[i:])
  825. break
  826. }
  827. if str[i] == '\\' && str[i+1] == 'u' {
  828. hex := str[i+2 : x]
  829. r, err := strconv.ParseUint(hex, 16, 64)
  830. if err == nil {
  831. buf.WriteRune(rune(r))
  832. } else {
  833. logger.Warn(err.Error())
  834. buf.WriteString(str[i:x])
  835. }
  836. i = x
  837. } else {
  838. buf.WriteByte(str[i])
  839. i++
  840. }
  841. }
  842. return buf.String()
  843. }
  844. //取得变量
  845. func (s *Script) GetVar(key string) string {
  846. return s.L.GetGlobal(key).String()
  847. }
  848. //
  849. func (s *Script) GetIntVar(key string) int {
  850. lv := s.L.GetGlobal(key)
  851. if v, ok := lv.(lua.LNumber); ok {
  852. return int(v)
  853. }
  854. return -1
  855. }
  856. //
  857. func (s *Script) GetBoolVar(key string) bool {
  858. lv := s.L.GetGlobal(key)
  859. if v, ok := lv.(lua.LBool); ok {
  860. return bool(v)
  861. }
  862. return false
  863. }
  864. func isWorkTime() {
  865. workTime = util.IsWorkTime()
  866. util.TimeAfterFunc(10*time.Minute, isWorkTime, TimeChan)
  867. }
  868. //设置睡眠时间
  869. func SleepTime(basetime int, times []time.Duration) {
  870. st := 0 //记录最后睡眠时长
  871. base := float64(basetime * 60)
  872. if times[3].Seconds() > base { //最后一次大于 basetime*60秒
  873. if times[2].Seconds() > base {
  874. n := 0
  875. if times[0].Seconds() > base {
  876. n++
  877. }
  878. if times[1].Seconds() > base {
  879. n++
  880. }
  881. st = n + 1
  882. } else if times[2].Seconds() < base && times[0].Seconds() > base && times[1].Seconds() > base {
  883. st = 1
  884. }
  885. }
  886. if st > 0 {
  887. time.Sleep(time.Duration(st) * time.Minute)
  888. }
  889. }
  890. func generateKey(key []byte) (genKey []byte) {
  891. genKey = make([]byte, 16)
  892. copy(genKey, key)
  893. for i := 16; i < len(key); {
  894. for j := 0; j < 16 && i < len(key); j, i = j+1, i+1 {
  895. genKey[j] ^= key[i]
  896. }
  897. }
  898. return genKey
  899. }