script.go 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231
  1. package spider
  2. import (
  3. codegrpc "analysiscode/client"
  4. "bytes"
  5. "compress/gzip"
  6. "encoding/base64"
  7. "encoding/json"
  8. "github.com/shopspring/decimal"
  9. gojs "gorunjs/client"
  10. "io"
  11. "io/ioutil"
  12. mu "mfw/util"
  13. "net/http"
  14. "net/url"
  15. "path"
  16. qu "qfw/util"
  17. "regexp"
  18. util "spiderutil"
  19. "strconv"
  20. "strings"
  21. "sync/atomic"
  22. "time"
  23. gq "github.com/PuerkitoBio/goquery"
  24. "github.com/cjoudrey/gluahttp"
  25. "github.com/donnie4w/go-logger/logger"
  26. lujson "github.com/yuin/gopher-json"
  27. "github.com/yuin/gopher-lua"
  28. "golang.org/x/text/encoding/simplifiedchinese"
  29. "golang.org/x/text/transform"
  30. )
  // MAX_STEP is the maximum step count used by the list-page analysis: both the
  // window examined around a candidate link and the number of parent hops.
  const MAX_STEP = 5

  // TimeSleepChan is a bool channel with a buffer of one element; the script
  // callbacks hand it to util.TimeSleepFunc when they need to pause.
  var TimeSleepChan = make(chan bool, 1)
  35. // 脚本
  36. type Script struct {
  37. SCode, ScriptFile string
  38. Encoding string
  39. Userproxy bool
  40. //Ishttps bool
  41. ErrorNum int32 //错误数
  42. Downloader string //下载器
  43. TotalRequestNum int32 //总请求次数
  44. ToDayRequestNum int32 //今日请求次数
  45. YestoDayRequestNum int32 //昨日请求次数
  46. Timeout int64 //超时时间秒
  47. L *lua.LState
  48. NoDownloadNum int32 //未成功下载数
  49. LastThreeTimes []time.Duration //单条信息流程完成的时间,最后三次
  50. FileLastThreeTimes []time.Duration //附件下载单条信息流程完成的时间,最后三次
  51. }
  // ErrFid is the file id that marks an access-restricted attachment; an
  // uploaded attachment whose fid contains it is treated as a failed download.
  var ErrFid = "a6879f0a8570256aa21fb978e6dabb50429a30dfacff697cf0b898abbc5c262e"
  53. // 加载文件
  54. func (s *Script) LoadScript(site, channel, user *string, code, script_file string) string {
  55. defer mu.Catch()
  56. s.SCode = code
  57. s.ScriptFile = script_file
  58. s.L = lua.NewState(lua.Options{
  59. RegistrySize: 256 * 20,
  60. CallStackSize: 256,
  61. IncludeGoStackTrace: false,
  62. })
  63. s.L.PreloadModule("http", gluahttp.NewHttpModule(&http.Client{}).Loader)
  64. s.L.PreloadModule("json", lujson.Loader)
  65. if err := s.L.DoString(script_file); err != nil {
  66. logger.Debug(code + ",加载lua脚本错误:" + err.Error())
  67. return "加载lua脚本错误:" + err.Error()
  68. //panic(code + ",加载lua脚本错误:" + err.Error())
  69. }
  70. s.Encoding = s.GetVar("spiderPageEncoding")
  71. s.Userproxy = s.GetBoolVar("spiderUserProxy")
  72. //暴露go方法
  73. //download(url,head) 普通下载
  74. s.L.SetGlobal("download", s.L.NewFunction(func(S *lua.LState) int {
  75. if s.LastThreeTimes == nil {
  76. s.LastThreeTimes = make([]time.Duration, 4)
  77. }
  78. if util.Config.IsDelay {
  79. SleepTime(1, s.LastThreeTimes) //睡眠时间
  80. }
  81. start := time.Now() //起始时间
  82. head := S.ToTable(-1)
  83. url := S.ToString(-2)
  84. ishttps := S.ToBool(-3)
  85. charset := S.ToString(-4)
  86. if charset == "" {
  87. charset = s.Encoding
  88. }
  89. var retLen int64
  90. ret := Download(&retLen, s.Downloader, url, "get", util.GetTable(head), charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  91. //流量统计
  92. //if retLen > 0 {
  93. // key := Today + "+" + code
  94. // if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
  95. // if sfMap, ok := sf.(*SpiderFlow); ok {
  96. // sfMap.Flow += retLen
  97. // //sfMap.Site = *site
  98. // //sfMap.Channel = *channel
  99. // //sfMap.ModifyUser = *user
  100. // SpiderFlowMap.Store(key, sfMap)
  101. // }
  102. // } else {
  103. // SpiderFlowMap.Store(key, &SpiderFlow{
  104. // //Code: code,
  105. // Site: *site,
  106. // Channel: *channel,
  107. // Flow: retLen,
  108. // ModifyUser: *user,
  109. // })
  110. // }
  111. //}
  112. S.Push(lua.LString(ret))
  113. atomic.AddInt32(&s.ToDayRequestNum, 1)
  114. atomic.AddInt32(&s.TotalRequestNum, 1)
  115. end := time.Since(start)
  116. if len(s.LastThreeTimes) >= 4 {
  117. s.LastThreeTimes = s.LastThreeTimes[1:]
  118. }
  119. s.LastThreeTimes = append(s.LastThreeTimes, end)
  120. return 1
  121. }))
  122. //高级下载downloadAdv(url,method,param,head,cookie)
  123. s.L.SetGlobal("downloadAdv", s.L.NewFunction(func(S *lua.LState) int {
  124. if s.LastThreeTimes == nil {
  125. s.LastThreeTimes = make([]time.Duration, 4)
  126. }
  127. if util.Config.IsDelay {
  128. SleepTime(1, s.LastThreeTimes) //睡眠时间
  129. }
  130. start := time.Now() //起始时间
  131. cookie := S.ToString(-1)
  132. head := S.ToTable(-2)
  133. param := S.ToTable(-3)
  134. method := S.ToString(-4)
  135. url := S.ToString(-5)
  136. ishttps := S.ToBool(-6)
  137. charset := S.ToString(-7)
  138. if charset == "" {
  139. charset = s.Encoding
  140. }
  141. var mycookie []*http.Cookie
  142. json.Unmarshal([]byte(cookie), &mycookie)
  143. var ret string
  144. var retcookie []*http.Cookie
  145. var headers = map[string]interface{}{}
  146. var retLen int64
  147. if param == nil {
  148. ptext := map[string]interface{}{"text": S.ToString(-3)}
  149. ret, retcookie, headers = DownloadAdv(&retLen, s.Downloader, url, method, ptext, util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  150. } else {
  151. ret, retcookie, headers = DownloadAdv(&retLen, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
  152. }
  153. //流量统计
  154. //if retLen > 0 {
  155. // key := Today + "+" + code
  156. // if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
  157. // if sfMap, ok := sf.(*SpiderFlow); ok {
  158. // sfMap.Flow += retLen
  159. // //sfMap.Site = *site
  160. // //sfMap.Channel = *channel
  161. // //sfMap.ModifyUser = *user
  162. // SpiderFlowMap.Store(key, sfMap)
  163. // }
  164. // } else {
  165. // SpiderFlowMap.Store(key, &SpiderFlow{
  166. // //Code: code,
  167. // Site: *site,
  168. // Channel: *channel,
  169. // Flow: retLen,
  170. // ModifyUser: *user,
  171. // })
  172. // }
  173. //}
  174. S.Push(lua.LString(ret))
  175. scookie, _ := json.Marshal(retcookie)
  176. S.Push(lua.LString(scookie))
  177. hTable := util.MapToLuaTable(S, headers)
  178. S.Push(hTable)
  179. atomic.AddInt32(&s.ToDayRequestNum, 1)
  180. atomic.AddInt32(&s.TotalRequestNum, 1)
  181. end := time.Since(start)
  182. if len(s.LastThreeTimes) >= 4 {
  183. s.LastThreeTimes = s.LastThreeTimes[1:]
  184. }
  185. s.LastThreeTimes = append(s.LastThreeTimes, end)
  186. return 3
  187. }))
  188. //下载附件downloadFile(url,method,param,head,cookie,fileName)
  189. s.L.SetGlobal("downloadFile", s.L.NewFunction(func(S *lua.LState) int {
  190. if s.FileLastThreeTimes == nil {
  191. s.FileLastThreeTimes = make([]time.Duration, 4)
  192. }
  193. if util.Config.IsDelay {
  194. SleepTime(3, s.FileLastThreeTimes) //睡眠时间
  195. }
  196. start := time.Now() //起始时间
  197. cookie := S.ToString(-1)
  198. head := S.ToTable(-2)
  199. param := S.ToTable(-3)
  200. method := S.ToString(-4)
  201. url := S.ToString(-5)
  202. fileName := S.ToString(-6)
  203. ishttps := strings.Contains(url, "https")
  204. //base64匹配
  205. base64UrlReg := regexp.MustCompile("data:image")
  206. indexArr := base64UrlReg.FindStringIndex(url)
  207. name, size, ftype, fid := "", "", "", ""
  208. tmpUrl := ""
  209. var ret []byte
  210. var err error
  211. var mycookie []*http.Cookie
  212. if cookie != "{}" {
  213. json.Unmarshal([]byte(cookie), &mycookie)
  214. } else {
  215. mycookie = make([]*http.Cookie, 0)
  216. }
  217. //base64 url
  218. if len(indexArr) == 2 { //base64 http://www.mmjyjt.com/data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAqAAAAOwCAYAAAD
  219. //截取base64
  220. start := indexArr[0]
  221. url = url[start:]
  222. fileName = "文件下载.jpg"
  223. index := strings.Index(url, ",")
  224. dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(url[index+1:]))
  225. ret, err = io.ReadAll(dec)
  226. if err == nil && len(ret) > 0 {
  227. url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, "", ret)
  228. }
  229. } else {
  230. fileName = strings.TrimSpace(fileName)
  231. url = strings.TrimSpace(url)
  232. tmpUrl = url
  233. ret = DownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
  234. url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
  235. if strings.TrimSpace(ftype) == "" {
  236. if len(path.Ext(name)) > 0 {
  237. ftype = path.Ext(name)[1:]
  238. }
  239. }
  240. }
  241. //特殊处理中国招标投标公共服务平台异常附件过滤
  242. if *site == "中国招标投标公共服务平台" {
  243. if fid != "" && strings.Contains(fid, ErrFid) { //限制访问的附件
  244. size, ftype, fid = "", "", "" //信息置空,AnalysisProjectInfo方法将判断数据下载失败重新下载
  245. } else if bttype := qu.GetFileType(ret); bttype != "pdf" { //由字节流解析的附件类型不是pdf
  246. logger.Info("Error File Type:", bttype, url)
  247. size, ftype, fid = "", "", ""
  248. }
  249. } else if *site == "中国政府采购网" && tmpUrl != "" { //中国政府采购网附件大小异常,限制IP所致
  250. if size == "4.1 KB" || size == "4.2 KB" {
  251. times := 1
  252. for { //重试三次
  253. if times > 3 {
  254. break
  255. }
  256. //http://www.ccgp.gov.cn/cggg/dfgg/jzxcs/202302/t20230210_19437644.htm
  257. ret = DownloadFile(s.Downloader, tmpUrl, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
  258. bs := bytes.NewReader(ret)
  259. bsLen := qu.ConvertFileSize(bs.Len())
  260. if bsLen != "4.1 KB" && bsLen != "4.2 KB" && bsLen != "0 B" {
  261. url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, tmpUrl, ret)
  262. break
  263. }
  264. times++
  265. }
  266. if size == "4.1 KB" || size == "4.2 KB" { //重试后异常
  267. fid = ""
  268. ftype = ""
  269. name = ""
  270. }
  271. }
  272. }
  273. S.Push(lua.LString(url))
  274. S.Push(lua.LString(name))
  275. S.Push(lua.LString(size))
  276. S.Push(lua.LString(ftype))
  277. S.Push(lua.LString(fid))
  278. atomic.AddInt32(&s.ToDayRequestNum, 1)
  279. atomic.AddInt32(&s.TotalRequestNum, 1)
  280. end := time.Since(start)
  281. if len(s.FileLastThreeTimes) >= 4 {
  282. s.FileLastThreeTimes = s.FileLastThreeTimes[1:]
  283. }
  284. s.FileLastThreeTimes = append(s.FileLastThreeTimes, end)
  285. return 5
  286. }))
  287. //下载、上传base64图片
  288. s.L.SetGlobal("downloadBase64File", s.L.NewFunction(func(S *lua.LState) int {
  289. url := S.ToString(-3)
  290. fileName := S.ToString(-2)
  291. base64Img := S.ToString(-1)
  292. if fileName == "" {
  293. fileName = "文件下载"
  294. }
  295. fileName = fileName + ".jpg"
  296. i := strings.Index(base64Img, ",")
  297. dec := base64.NewDecoder(base64.StdEncoding, strings.NewReader(base64Img[i+1:]))
  298. ret, err := io.ReadAll(dec)
  299. name, size, ftype, fid := "", "", "", ""
  300. if err == nil && len(ret) > 0 {
  301. url, name, size, ftype, fid = util.UploadFile(s.SCode, fileName, url, ret)
  302. }
  303. S.Push(lua.LString(url))
  304. S.Push(lua.LString(name))
  305. S.Push(lua.LString(size))
  306. S.Push(lua.LString(ftype))
  307. S.Push(lua.LString(fid))
  308. return 5
  309. }))
  310. //保存验证错误日志
  311. s.L.SetGlobal("saveErrLog", s.L.NewFunction(func(S *lua.LState) int {
  312. code := S.ToString(-4)
  313. name := S.ToString(-3)
  314. url := S.ToString(-2)
  315. content := S.ToString(-1)
  316. //saveVerificationLog(code, name, url, content)
  317. logger.Info("Error Log:", code, name, url, content)
  318. atomic.AddInt32(&s.ErrorNum, 1)
  319. atomic.AddInt32(&s.NoDownloadNum, 1)
  320. //防止恶意增加日志
  321. util.TimeSleepFunc(5*time.Second, TimeSleepChan)
  322. return 0
  323. }))
  324. //添加改版日志
  325. s.L.SetGlobal("saveRevisionLog", s.L.NewFunction(func(S *lua.LState) int {
  326. url := S.ToString(-2)
  327. str := S.ToString(-1)
  328. logger.Error(s.SCode, url, str)
  329. return 0
  330. }))
  331. //查找信息是否存在(作废)
  332. s.L.SetGlobal("findHasExit", s.L.NewFunction(func(S *lua.LState) int {
  333. //c := S.ToString(-2)
  334. //q := S.ToString(-1)
  335. //b := findHasExit(c, q)
  336. S.Push(lua.LBool(false))
  337. return 1
  338. }))
  339. s.L.SetGlobal("findOneText", s.L.NewFunction(func(S *lua.LState) int {
  340. nodetype := S.ToString(-3)
  341. gpath := S.ToString(-2)
  342. content := S.ToString(-1)
  343. ret := util.FindOneText(gpath, content, nodetype)
  344. S.Push(ret)
  345. return 1
  346. }))
  347. s.L.SetGlobal("findContentText", s.L.NewFunction(func(S *lua.LState) int {
  348. gpath := S.ToString(-2)
  349. content := S.ToString(-1)
  350. ret := util.FindContentText(gpath, content)
  351. S.Push(ret)
  352. return 1
  353. }))
  354. s.L.SetGlobal("findOneHtml", s.L.NewFunction(func(S *lua.LState) int {
  355. nodetype := S.ToString(-3)
  356. gpath := S.ToString(-2)
  357. content := S.ToString(-1)
  358. ret := util.FindOneHtml(gpath, content, nodetype)
  359. S.Push(ret)
  360. return 1
  361. }))
  362. s.L.SetGlobal("findListText", s.L.NewFunction(func(S *lua.LState) int {
  363. gpath := S.ToString(-2)
  364. content := S.ToString(-1)
  365. ret := s.L.NewTable()
  366. util.FindListText(gpath, content, ret)
  367. S.Push(ret)
  368. return 1
  369. }))
  370. s.L.SetGlobal("findListHtml", s.L.NewFunction(func(S *lua.LState) int {
  371. gpath := S.ToString(-2)
  372. content := S.ToString(-1)
  373. ret := s.L.NewTable()
  374. util.FindListHtml(gpath, content, ret)
  375. //if ret.Len() > 0 {
  376. // UpdateHeart(site, channel, code, user, "findlist") //记录列表页实际采集数据量心跳
  377. //}
  378. S.Push(ret)
  379. return 1
  380. }))
  381. //推送列表页下载数据量
  382. s.L.SetGlobal("sendListNum", s.L.NewFunction(func(S *lua.LState) int {
  383. table := S.ToTable(-1)
  384. list := util.TableToMap(table)
  385. logger.Info(s.SCode, len(list))
  386. //if len(list) > 0 {
  387. // UpdateHeart(*site, *channel, code, *user, "findlist") //记录列表页实际采集数据量心跳
  388. //}
  389. return 1
  390. }))
  391. // s.L.SetGlobal("findMgoData", s.L.NewFunction(func(S *lua.LState) int {
  392. // update := [][]map[string]interface{}{}
  393. // query := map[string]interface{}{"state": 0}
  394. // data, _ := Mgo.Find(util.Config.TmpCollName, query, `{"_id":-1}`, nil, false, 0, 10)
  395. // pageList := []interface{}{}
  396. // for _, d := range *data {
  397. // tmpMap := map[string]string{}
  398. // tmpMap["title"] = qu.ObjToString(d["title"])
  399. // tmpMap["detail"] = qu.ObjToString(d["detail"])
  400. // tmpMap["href"] = qu.ObjToString(d["href"])
  401. // publishtime := qu.Int64All(d["publishtime"])
  402. // tmpMap["publishtime"] = qu.FormatDateByInt64(&publishtime, qu.Date_Full_Layout)
  403. // tmpMap["_id"] = qu.BsonIdToSId(d["_id"])
  404. // pageList = append(pageList, tmpMap)
  405. // update = append(update, []map[string]interface{}{
  406. // map[string]interface{}{"_id": d["_id"]},
  407. // map[string]interface{}{"$set": map[string]interface{}{"state": 1}},
  408. // })
  409. // }
  410. // ret := util.MapToTable(s.L, pageList)
  411. // S.Push(ret)
  412. // if len(update) > 0 {
  413. // Mgo.UpdateBulk(util.Config.TmpCollName, update...)
  414. // }
  415. // return 1
  416. // }))
  417. s.L.SetGlobal("findMap", s.L.NewFunction(func(S *lua.LState) int {
  418. qmap := S.ToTable(-2)
  419. content := S.ToString(-1)
  420. ret := s.L.NewTable()
  421. util.FindMap(qmap, content, ret)
  422. S.Push(ret)
  423. return 1
  424. }))
  425. //公示暴露方式
  426. s.L.SetGlobal("getEcpsCode", s.L.NewFunction(func(S *lua.LState) int {
  427. area := strings.ToUpper(S.ToString(-2))
  428. content := S.ToString(-1)
  429. code, state := util.GetEcpsCode(area, []byte(content))
  430. if state == "wx" {
  431. code, _ = GetCodeByWx([]byte(content))
  432. }
  433. S.Push(lua.LString(code))
  434. return 1
  435. }))
  436. //调用jsvm
  437. s.L.SetGlobal("jsvm", s.L.NewFunction(func(S *lua.LState) int {
  438. js := S.ToString(-1)
  439. ret := s.L.NewTable()
  440. if js == "" {
  441. ret.RawSet(lua.LString("val"), lua.LString(""))
  442. ret.RawSet(lua.LString("err"), lua.LString("js is null"))
  443. } else {
  444. rep := util.JsVmPost(util.Config.JsVmUrl, js)
  445. ret.RawSet(lua.LString("val"), lua.LString(qu.ObjToString(rep["val"])))
  446. ret.RawSet(lua.LString("err"), lua.LString(qu.ObjToString(rep["err"])))
  447. }
  448. S.Push(ret)
  449. return 1
  450. }))
  451. //指定下载器
  452. s.L.SetGlobal("changeDownloader", s.L.NewFunction(func(S *lua.LState) int {
  453. s.Downloader = GetOneDownloader()
  454. S.Push(lua.LString(s.Downloader))
  455. return 1
  456. }))
  457. //指定下载器file
  458. s.L.SetGlobal("changeDownloaderFile", s.L.NewFunction(func(S *lua.LState) int {
  459. s.Downloader = GetOneDownloaderFile()
  460. S.Push(lua.LString(s.Downloader))
  461. return 1
  462. }))
  463. //手工延时
  464. s.L.SetGlobal("timeSleep", s.L.NewFunction(func(S *lua.LState) int {
  465. // if workTime {
  466. // util.TimeSleepFunc(time.Duration(S.ToInt(-1))*time.Second, TimeSleepChan)
  467. // } else {
  468. // util.TimeSleepFunc(1*time.Second, TimeSleepChan)
  469. // }
  470. util.TimeSleepFunc(time.Second*2, TimeSleepChan)
  471. return 0
  472. }))
  473. //编码解码
  474. s.L.SetGlobal("transCode", s.L.NewFunction(func(S *lua.LState) int {
  475. codeType := strings.ToLower(S.ToString(-2))
  476. str := S.CheckString(-1)
  477. switch codeType {
  478. case "unicode":
  479. str = strings.Replace(str, "%u", "\\u", -1)
  480. str = transUnic(str)
  481. case "urlencode_gbk":
  482. data, _ := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(str)), simplifiedchinese.GBK.NewEncoder()))
  483. l, _ := url.Parse("http://a.com/?" + string(data))
  484. tmpstr := l.Query().Encode()
  485. if len(tmpstr) > 1 {
  486. str = tmpstr[0 : len(tmpstr)-1]
  487. } else {
  488. str = ""
  489. }
  490. case "urlencode_utf8":
  491. l, _ := url.Parse("http://a.com/?" + str)
  492. tmpstr := l.Query().Encode()
  493. if len(tmpstr) > 1 {
  494. str = tmpstr[0 : len(tmpstr)-1]
  495. } else {
  496. str = ""
  497. }
  498. case "urldecode_utf8":
  499. str, _ = url.QueryUnescape(str)
  500. case "decode64":
  501. str = util.DecodeB64(str)
  502. case "encodemd5":
  503. str = qu.GetMd5String(str)
  504. case "htmldecode": //html实体码
  505. //txt := `<div align="left" style="margin-left: 0pt;"><span style='font-family:; font-size:13px; color:#000000'>&#22826;&#38451;&#23707;&#29305;&#21220;&#28040;&#38450;&#31449;&#12289;&#26494;&#28006;&#29305;&#21220;&#28040;&#38450;&#31449;&#24314;&#35774;&#39033;&#30446;&#35774;&#35745;&#20013;&#26631;&#20844;&#31034;</span></div>`
  506. str = S.ToString(-1)
  507. reg, _ := regexp.Compile("&#\\d+;")
  508. str = reg.ReplaceAllStringFunc(str, func(src string) string {
  509. v, _ := strconv.Atoi(src[2 : len(src)-1])
  510. return string(rune(v))
  511. })
  512. }
  513. S.Push(lua.LString(str))
  514. return 1
  515. }))
  516. //如果服务端返回的html是gzip压缩过格式的 这里需要转一下
  517. s.L.SetGlobal("unGzip", s.L.NewFunction(func(S *lua.LState) int {
  518. html := S.ToString(-1)
  519. bs := []byte(html)
  520. gzipreader, _ := gzip.NewReader(bytes.NewReader(bs))
  521. bs, _ = ioutil.ReadAll(gzipreader)
  522. S.Push(lua.LString(bs))
  523. return 1
  524. }))
  525. //luamaker提供的分析列表页url地址 获取列表数据公用方法
  526. s.L.SetGlobal("getSimpleListPage", s.L.NewFunction(func(S *lua.LState) int {
  527. html := S.ToString(-3)
  528. date_pattern := S.ToString(-2)
  529. pageListUrl := S.ToString(-1) //列表页url
  530. bs := []byte(html)
  531. tmparr := []string{}
  532. tmpret := []int{}
  533. re, _ := regexp.Compile(`采购|招标|公示|公告|意见|结果|通知|工程`)
  534. doc, _ := gq.NewDocumentFromReader(bytes.NewReader(bs))
  535. doc.Find("a").Each(func(i int, sq *gq.Selection) {
  536. text := sq.Text()
  537. if len(text) < 30 {
  538. return
  539. }
  540. tmparr = append(tmparr, text)
  541. if re.MatchString(text) {
  542. tmpret = append(tmpret, 1)
  543. //logger.Debug(text)
  544. } else {
  545. tmpret = append(tmpret, 0)
  546. }
  547. })
  548. logger.Debug(tmpret)
  549. //线性分析,算周边,只算周围5步的点
  550. tmplen, thepos, themax := len(tmpret), -1, 0
  551. for i := 0; i < tmplen; i++ {
  552. if tmpret[i] == 0 {
  553. continue
  554. }
  555. start, end := i-MAX_STEP, i+MAX_STEP
  556. if start < 0 {
  557. start = 0
  558. }
  559. if end > tmplen {
  560. end = tmplen
  561. }
  562. tmp := 0
  563. //从当前位置往左,往右找连续点
  564. for j := i; j > start; j-- {
  565. if tmpret[j] == 1 {
  566. tmp++
  567. } else {
  568. break
  569. }
  570. }
  571. for j := i; j < end; j++ {
  572. if tmpret[j] == 1 {
  573. tmp++
  574. } else {
  575. break
  576. }
  577. }
  578. if tmp > themax {
  579. themax = tmp
  580. thepos = i
  581. }
  582. } //end of for...
  583. //logger.Debug("找位置完成")
  584. //验证
  585. if thepos == -1 {
  586. logger.Error("完蛋,找不到")
  587. panic("不支持啊,失败啊")
  588. }
  589. //下边是找父容器
  590. var thelink *gq.Selection
  591. doc.Find("a").Each(func(i int, sq *gq.Selection) {
  592. if sq.Text() == tmparr[thepos] {
  593. thelink = sq
  594. }
  595. })
  596. isfind := false
  597. //同样Path向上找,不超过5步
  598. for i := 0; i < MAX_STEP; i++ {
  599. thelink = thelink.Parent()
  600. clen := getChildrenLen(thelink)
  601. if clen >= themax-1 {
  602. isfind = true
  603. break
  604. }
  605. //logger.Debug("TAG:::", thelink.Nodes[0].Data, clen)
  606. }
  607. //找到列表
  608. pageList := []interface{}{}
  609. if isfind {
  610. thelink.Children().Each(func(i int, sq *gq.Selection) {
  611. page := map[string]string{}
  612. link_sq := sq.Find("a")
  613. href := link_sq.AttrOr("href", "")
  614. text := link_sq.Text()
  615. page["title"] = text
  616. page["href"] = dealHref(pageListUrl, href)
  617. page["publishtime"] = dealPublishTime(strings.TrimSpace(sq.Text()), date_pattern)
  618. //logger.Debug(i)
  619. pageList = append(pageList, page)
  620. })
  621. } else {
  622. logger.Error("完蛋,找父亲节点失败啊")
  623. //panic("不支持啊,失败啊")
  624. }
  625. ret := util.MapToTable(s.L, pageList)
  626. S.Push(ret)
  627. return 1
  628. }))
  629. //招投标信息标题判重
  630. s.L.SetGlobal("titleRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
  631. S.Push(lua.LBool(false))
  632. return 1
  633. }))
  634. //招标信息判重新方法 2016-12-14 wanghuidong
  635. s.L.SetGlobal("urlRepeatJudgement", s.L.NewFunction(func(S *lua.LState) int {
  636. S.Push(lua.LBool(false))
  637. return 1
  638. }))
  639. //将url放入内存缓存 2016-12-14 wanghuidong
  640. s.L.SetGlobal("putUrl2Redis", s.L.NewFunction(func(S *lua.LState) int {
  641. //url := S.ToString(-1)
  642. return 1
  643. }))
  644. //解析附件中的word、pdf
  645. s.L.SetGlobal("officeAnalysis", s.L.NewFunction(func(S *lua.LState) int {
  646. ext := map[string]byte{"pdf": byte(0), "doc": byte(1), "docx": byte(2)}
  647. str := S.ToString(-2)
  648. extension := S.ToString(-1)
  649. bs, _ := base64.StdEncoding.DecodeString(str)
  650. bs = append([]byte{ext[extension]}, bs...)
  651. msgid := mu.UUID(8)
  652. Msclient.Call("", msgid, mu.SERVICE_OFFICE_ANALYSIS, mu.SENDTO_TYPE_ALL_RECIVER, bs, 60)
  653. return 1
  654. }))
  655. s.L.SetGlobal("clearMemoeryCache", s.L.NewFunction(func(S *lua.LState) int {
  656. /*title := S.ToString(-1)
  657. isExist, _ := redis.Exists("title_repeat_judgement", "title_repeat_"+title)
  658. if isExist {
  659. redis.Del("title_repeat_judgement", "title_repeat_"+title)
  660. }*/
  661. return 1
  662. }))
  663. //支持正则,提取
  664. s.L.SetGlobal("regexp", s.L.NewFunction(func(S *lua.LState) int {
  665. index := int(S.ToNumber(-1))
  666. regstr := S.ToString(-2)
  667. text := S.ToString(-3)
  668. reg := regexp.MustCompile(regstr)
  669. reps := reg.FindAllStringSubmatchIndex(text, -1)
  670. ret := s.L.NewTable()
  671. number := 0
  672. for _, v := range reps {
  673. number++
  674. ret.Insert(number, lua.LString(text[v[index]:v[index+1]]))
  675. }
  676. S.Push(ret)
  677. return 1
  678. }))
  679. //支持替换
  680. s.L.SetGlobal("replace", s.L.NewFunction(func(S *lua.LState) int {
  681. text := S.ToString(-3)
  682. old := S.ToString(-2)
  683. repl := S.ToString(-1)
  684. text = strings.Replace(text, old, repl, -1)
  685. S.Push(lua.LString(text))
  686. return 1
  687. }))
  688. //标题的关键词、排除词过滤
  689. s.L.SetGlobal("pagefilterword", s.L.NewFunction(func(S *lua.LState) int {
  690. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  691. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  692. data := S.ToTable(-1)
  693. dataMap := util.TableToMap(data)
  694. ret := s.L.NewTable()
  695. num := 1
  696. for _, v := range dataMap {
  697. tmp := v.(map[string]interface{})
  698. isOk := false
  699. if title := qu.ObjToString(tmp["title"]); title != "" {
  700. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  701. isOk = true
  702. }
  703. }
  704. if isOk {
  705. ret.Insert(num, util.MapToLuaTable(S, tmp))
  706. num++
  707. }
  708. }
  709. S.Push(ret)
  710. return 1
  711. }))
  712. //标题的关键词、排除词过滤
  713. s.L.SetGlobal("detailfilterword", s.L.NewFunction(func(S *lua.LState) int {
  714. keyWordReg := regexp.MustCompile(util.Config.Word["keyword"])
  715. notKeyWordReg := regexp.MustCompile(util.Config.Word["notkeyword"])
  716. data := S.ToTable(-1)
  717. dataMap := util.TableToMap(data)
  718. if title := qu.ObjToString(dataMap["title"]); title != "" {
  719. if keyWordReg.MatchString(title) && !notKeyWordReg.MatchString(title) {
  720. S.Push(lua.LBool(true))
  721. return 1
  722. } else {
  723. qu.Debug(s.SCode, dataMap["href"], " title error")
  724. }
  725. } else {
  726. qu.Debug(s.SCode, dataMap["href"], " title error")
  727. }
  728. S.Push(lua.LBool(false))
  729. return 1
  730. }))
  731. //detail过滤
  732. s.L.SetGlobal("filterdetail", s.L.NewFunction(func(S *lua.LState) int {
  733. /*
  734. 1.长度判断 (特殊处理:详情请访问原网页!;详见原网页;见原网页;无;无相关内容;无正文内容)
  735. 2.是否含汉字
  736. */
  737. reg1 := regexp.MustCompile("(原网页|无|无相关内容|无正文内容|见附件|详见附件)")
  738. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  739. detail := S.ToString(-1)
  740. if reg1.MatchString(detail) {
  741. S.Push(lua.LBool(true))
  742. return 1
  743. }
  744. if len([]rune(detail)) < 50 || !reg2.MatchString(detail) {
  745. S.Push(lua.LBool(false))
  746. return 1
  747. }
  748. S.Push(lua.LBool(false))
  749. return 1
  750. }))
  751. //匹配汉字
  752. s.L.SetGlobal("matchan", s.L.NewFunction(func(S *lua.LState) int {
  753. reg1 := regexp.MustCompile("(见附件|详见附件)")
  754. reg2 := regexp.MustCompile("[\u4e00-\u9fa5]")
  755. detail := S.ToString(-1)
  756. detail = reg1.ReplaceAllString(detail, "")
  757. ok := reg2.MatchString(detail)
  758. S.Push(lua.LBool(ok))
  759. return 1
  760. }))
  761. //base64加密
  762. s.L.SetGlobal("encodeBase64", s.L.NewFunction(func(S *lua.LState) int {
  763. text := S.ToString(-1)
  764. base64Text := base64.StdEncoding.EncodeToString([]byte(text))
  765. S.Push(lua.LString(base64Text))
  766. return 1
  767. }))
  768. //base64解密
  769. s.L.SetGlobal("decodeBase64", s.L.NewFunction(func(S *lua.LState) int {
  770. text := S.ToString(-1)
  771. result := ""
  772. byteText, err := base64.StdEncoding.DecodeString(text)
  773. if err == nil {
  774. result = string(byteText)
  775. }
  776. S.Push(lua.LString(result))
  777. return 1
  778. }))
  779. //aes cbc模式加密
  780. s.L.SetGlobal("aesEncryptCBC", s.L.NewFunction(func(S *lua.LState) int {
  781. origData := S.ToString(-3)
  782. key := S.ToString(-2)
  783. iv := S.ToString(-1)
  784. bytekey := []byte(key)
  785. byteorigData := []byte(origData)
  786. byteiv := []byte(iv)
  787. encrypted := util.AesCBCEncrypt(byteorigData, bytekey, byteiv)
  788. // 将加密后的数据和初始向量进行Base64编码
  789. result := base64.StdEncoding.EncodeToString(encrypted)
  790. S.Push(lua.LString(result))
  791. return 1
  792. }))
  793. //aes cbc模式解密
  794. s.L.SetGlobal("aesDecryptCBC", s.L.NewFunction(func(S *lua.LState) int {
  795. origData := S.ToString(-3)
  796. key := S.ToString(-2)
  797. iv := S.ToString(-1)
  798. bytekey := []byte(key)
  799. byteiv := []byte(iv)
  800. data, _ := base64.StdEncoding.DecodeString(origData)
  801. result := util.AesCBCDecrypter(data, bytekey, byteiv)
  802. S.Push(lua.LString(result))
  803. return 1
  804. }))
  805. //aes ecb模式加密
  806. s.L.SetGlobal("aesEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
  807. origData := S.ToString(-2)
  808. key := S.ToString(-1)
  809. bytekey := []byte(key)
  810. byteorigData := []byte(origData)
  811. encrypted := util.AesECBEncrypt(byteorigData, bytekey)
  812. result := base64.StdEncoding.EncodeToString(encrypted)
  813. S.Push(lua.LString(result))
  814. return 1
  815. }))
  816. //aes ecb模式解密
  817. s.L.SetGlobal("aesDecryptECB", s.L.NewFunction(func(S *lua.LState) int {
  818. origData := S.ToString(-2)
  819. key := S.ToString(-1)
  820. data, _ := base64.StdEncoding.DecodeString(origData)
  821. result := util.AesECBDecrypter(data, []byte(key))
  822. S.Push(lua.LString(result))
  823. return 1
  824. }))
  825. //des ecb模式加密
  826. s.L.SetGlobal("desEncryptECB", s.L.NewFunction(func(S *lua.LState) int {
  827. origData := S.ToString(-2)
  828. key := S.ToString(-1)
  829. encrypted := util.DesECBEncrypt([]byte(origData), []byte(key))
  830. result := base64.StdEncoding.EncodeToString(encrypted)
  831. S.Push(lua.LString(result))
  832. return 1
  833. }))
  834. //des ecb模式解密
  835. s.L.SetGlobal("desDecryptECB", s.L.NewFunction(func(S *lua.LState) int {
  836. origData := S.ToString(-2)
  837. key := S.ToString(-1)
  838. data, _ := base64.StdEncoding.DecodeString(origData)
  839. result := util.DesECBDecrypter(data, []byte(key))
  840. S.Push(lua.LString(result))
  841. return 1
  842. }))
  843. //des cbc模式加密
  844. s.L.SetGlobal("desEncryptCBC", s.L.NewFunction(func(S *lua.LState) int {
  845. origData := S.ToString(-3)
  846. key := S.ToString(-2)
  847. iv := S.ToString(-1)
  848. bytekey := []byte(key)
  849. byteorigData := []byte(origData)
  850. byteiv := []byte(iv)
  851. encrypted := util.DesCBCEncrypt(byteorigData, bytekey, byteiv)
  852. result := base64.StdEncoding.EncodeToString(encrypted)
  853. S.Push(lua.LString(result))
  854. return 1
  855. }))
  856. //des cbc模式解密
  857. s.L.SetGlobal("desDecryptCBC", s.L.NewFunction(func(S *lua.LState) int {
  858. origData := S.ToString(-3)
  859. key := S.ToString(-2)
  860. iv := S.ToString(-1)
  861. bytekey := []byte(key)
  862. byteiv := []byte(iv)
  863. data, _ := base64.StdEncoding.DecodeString(origData)
  864. result := util.DesCBCDecrypter(data, bytekey, byteiv)
  865. S.Push(lua.LString(result))
  866. return 1
  867. }))
  868. //rsa 公钥加密
  869. s.L.SetGlobal("rsaEncrypt", s.L.NewFunction(func(S *lua.LState) int {
  870. origData := S.ToString(-2)
  871. key := S.ToString(-1)
  872. encrypted := util.EncryptWithPublicKey([]byte(origData), []byte(key))
  873. result := base64.StdEncoding.EncodeToString(encrypted)
  874. S.Push(lua.LString(result))
  875. return 1
  876. }))
  877. //rsa 私钥解密
  878. s.L.SetGlobal("rsaDecrypt", s.L.NewFunction(func(S *lua.LState) int {
  879. origData := S.ToString(-2)
  880. key := S.ToString(-1)
  881. data, _ := base64.StdEncoding.DecodeString(origData)
  882. result := util.DecryptWithPrivateKey(data, []byte(key))
  883. S.Push(lua.LString(result))
  884. return 1
  885. }))
  886. //根据正文获取发布时间
  887. s.L.SetGlobal("getPublishtime", s.L.NewFunction(func(S *lua.LState) int {
  888. detail := S.ToString(-2)
  889. contenthtml := S.ToString(-1)
  890. publishtime := util.GetPublishtime([]string{contenthtml, detail})
  891. S.Push(lua.LString(publishtime))
  892. return 1
  893. }))
  894. //匹配
  895. s.L.SetGlobal("stringFind", s.L.NewFunction(func(S *lua.LState) int {
  896. regstr := S.ToString(-1)
  897. text := S.ToString(-2)
  898. textReg := regexp.MustCompile(regstr)
  899. //spaceReg := regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
  900. //text = spaceReg.ReplaceAllString(text, "")
  901. result := textReg.FindString(text)
  902. isMatch := false
  903. if result != "" {
  904. isMatch = true
  905. }
  906. S.Push(lua.LString(result))
  907. S.Push(lua.LBool(isMatch))
  908. return 2
  909. }))
  910. //截取
  911. s.L.SetGlobal("stringSub", s.L.NewFunction(func(S *lua.LState) int {
  912. text := S.ToString(-3)
  913. start := S.ToInt(-2)
  914. end := S.ToInt(-1)
  915. result := ""
  916. if len(text) > 0 {
  917. textRune := []rune(text)
  918. textLen := len(textRune)
  919. if end < 0 {
  920. if start > 0 { //正向截取到倒数第end位
  921. result = string(textRune[start-1 : textLen+1+end])
  922. } else if start < 0 { //反向截取 从倒数第start位截取到倒数第end位
  923. result = string(textRune[textLen+start : textLen+1+end])
  924. }
  925. } else if start > 0 && end >= start && end <= textLen { //从第start个截取到第end个
  926. result = string(textRune[start-1 : end])
  927. }
  928. // if end == -1 {
  929. // if start >= 1 { //正向截取到结尾
  930. // result = string(textRune[start-1:])
  931. // } else if start < 0 && textLen+start >= 0 { //反向截取后缀
  932. // result = string(textRune[textLen+start:])
  933. // }
  934. // } else if start >= 1 && end <= textLen { //从第start个截取到第end个
  935. // result = string(textRune[start-1 : end])
  936. // }
  937. }
  938. S.Push(lua.LString(result))
  939. return 1
  940. }))
  941. //长度
  942. s.L.SetGlobal("stringLen", s.L.NewFunction(func(S *lua.LState) int {
  943. text := S.ToString(-1)
  944. textLen := len([]rune(text))
  945. S.Push(lua.LNumber(textLen))
  946. return 1
  947. }))
  948. //去除特殊标签中间内容
  949. s.L.SetGlobal("getPureContent", s.L.NewFunction(func(S *lua.LState) int {
  950. con := S.ToString(-1)
  951. reg := regexp.MustCompile("(?s)<(!%-%-|!--|style).*?(%-%-|--|style)>") //注释 css
  952. con = reg.ReplaceAllString(con, "")
  953. // indexArr := reg.FindAllStringIndex(con, -1)
  954. // for i := len(indexArr) - 1; i >= 0; i-- {
  955. // if index := indexArr[i]; len(index) == 2 {
  956. // con = con[:index[0]] + con[index[1]:]
  957. // }
  958. // }
  959. S.Push(lua.LString(con))
  960. return 1
  961. }))
  962. //interface转string
  963. s.L.SetGlobal("formatToString", s.L.NewFunction(func(S *lua.LState) int {
  964. strNum := S.ToString(-1)
  965. decimalNum, _ := decimal.NewFromString(strNum)
  966. S.Push(lua.LString(decimalNum.String()))
  967. return 1
  968. }))
  969. //获取验证码
  970. s.L.SetGlobal("getCodeByPath", s.L.NewFunction(func(S *lua.LState) int {
  971. cookie := S.ToString(-1)
  972. head := S.ToTable(-2)
  973. stype := S.ToString(-3)
  974. path := S.ToString(-4)
  975. proxy := S.ToBool(-5)
  976. headMap := util.GetTable(head)
  977. //qu.Debug("cookie----------", cookie)
  978. //qu.Debug("headMap----------", headMap)
  979. headJsonStr := ""
  980. headByte, err := json.Marshal(headMap)
  981. if err == nil {
  982. headJsonStr = string(headByte)
  983. }
  984. code, respHead, respCookie := codegrpc.GetCodeByPath(path, stype, headJsonStr, cookie, proxy)
  985. //qu.Debug("code====", code)
  986. //qu.Debug("respHead====", respHead)
  987. //qu.Debug("respCookie====", respCookie)
  988. S.Push(lua.LString(code))
  989. respHeadMap := map[string]interface{}{}
  990. json.Unmarshal([]byte(respHead), &respHeadMap)
  991. hTable := util.MapToLuaTable(S, respHeadMap)
  992. S.Push(hTable)
  993. S.Push(lua.LString(respCookie))
  994. return 3
  995. }))
  996. s.L.SetGlobal("goRunJs", s.L.NewFunction(func(S *lua.LState) int {
  997. param := S.ToString(-2) //list or detail
  998. step := S.ToString(-1) //参数
  999. result := gojs.GoRunJsGetResult(s.SCode, param, step)
  1000. qu.Debug("Go Run Js Result:", param, step, result)
  1001. S.Push(lua.LString(result))
  1002. return 1
  1003. }))
  1004. s.L.SetGlobal("newDownloadFile", s.L.NewFunction(func(S *lua.LState) int {
  1005. cookie := S.ToString(-1)
  1006. head := S.ToTable(-2)
  1007. param := S.ToTable(-3)
  1008. method := S.ToString(-4)
  1009. url := S.ToString(-5)
  1010. fileName := S.ToString(-6)
  1011. ishttps := strings.Contains(url, "https")
  1012. var mycookie []*http.Cookie
  1013. if cookie != "{}" {
  1014. json.Unmarshal([]byte(cookie), &mycookie)
  1015. } else {
  1016. mycookie = make([]*http.Cookie, 0)
  1017. }
  1018. fileName = strings.TrimSpace(fileName)
  1019. url = strings.TrimSpace(url)
  1020. ret := NewDownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, false, ishttps, "", s.Timeout, false)
  1021. url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
  1022. if strings.TrimSpace(ftype) == "" {
  1023. if len(path.Ext(name)) > 0 {
  1024. ftype = path.Ext(name)[1:]
  1025. }
  1026. }
  1027. //特殊处理中国招标投标公共服务平台异常附件过滤
  1028. if *site == "中国招标投标公共服务平台" {
  1029. if fid != "" && strings.Contains(fid, ErrFid) { //限制访问的附件
  1030. size, ftype, fid = "", "", "" //信息置空,AnalysisProjectInfo方法将判断数据下载失败重新下载
  1031. } else if bttype := qu.GetFileType(ret); bttype != "pdf" { //由字节流解析的附件类型不是pdf
  1032. logger.Info("Error File Type:", bttype, url)
  1033. size, ftype, fid = "", "", ""
  1034. }
  1035. }
  1036. S.Push(lua.LString(url))
  1037. S.Push(lua.LString(name))
  1038. S.Push(lua.LString(size))
  1039. S.Push(lua.LString(ftype))
  1040. S.Push(lua.LString(fid))
  1041. return 5
  1042. }))
  1043. //渲染页面下载
  1044. s.L.SetGlobal("downloadByRender", s.L.NewFunction(func(S *lua.LState) int {
  1045. href := S.ToString(-1)
  1046. contentHtml := util.DownloadByRender(href)
  1047. S.Push(lua.LString(contentHtml))
  1048. return 1
  1049. }))
  1050. //chromedp下载
  1051. s.L.SetGlobal("downloadByChrome", s.L.NewFunction(func(S *lua.LState) int {
  1052. timeout := S.ToInt64(-2)
  1053. taskStr := S.ToString(-1)
  1054. cam := util.ChromeActionMap{}
  1055. if json.Unmarshal([]byte(taskStr), &cam) == nil {
  1056. if len(cam.BaseActions) > 0 {
  1057. if len(cam.RangeActions) > 0 && cam.RangeTimes > 0 {
  1058. for times := 1; times <= cam.RangeTimes; times++ {
  1059. cam.BaseActions = append(cam.BaseActions, cam.RangeActions...)
  1060. }
  1061. }
  1062. chromeTask := util.ChromeTask{
  1063. TimeOut: timeout,
  1064. Actions: cam.BaseActions,
  1065. }
  1066. ret := DownloadByChrome(s.SCode, s.Downloader, chromeTask, s.Timeout)
  1067. S.Push(util.MapToTable(S, ret))
  1068. } else {
  1069. S.Push(S.NewTable())
  1070. }
  1071. } else {
  1072. S.Push(S.NewTable())
  1073. }
  1074. return 1
  1075. }))
  1076. //针对中国招标投标公共服务平台三级页瑞数加密下载方法
  1077. s.L.SetGlobal("downloadByDataIntercept", s.L.NewFunction(func(S *lua.LState) int {
  1078. url := S.ToString(-4)
  1079. url_regex := S.ToString(-3)
  1080. timeout := S.ToInt(-2)
  1081. proxy := S.ToBool(-1)
  1082. headers := util.DownloadByDataIntercept(url, url_regex, timeout, proxy)
  1083. table := util.MapToLuaTable(S, headers)
  1084. S.Push(table)
  1085. return 1
  1086. }))
  1087. return ""
  1088. }
  1089. func dealHref(pageListUrl, href string) string {
  1090. returnUrl := ""
  1091. if href != "" {
  1092. r, _ := regexp.Compile("^./")
  1093. match := r.MatchString(href)
  1094. if match {
  1095. url2 := r.ReplaceAllString(href, "")
  1096. returnUrl = pageListUrl + url2
  1097. }
  1098. r2, _ := regexp.Compile("^/")
  1099. match2 := r2.MatchString(href)
  1100. if match2 {
  1101. r3, _ := regexp.Compile("http://[^/]*/")
  1102. domain := r3.FindString(pageListUrl)
  1103. //fmt.Println(domain)
  1104. url2 := r2.ReplaceAllString(href, "")
  1105. returnUrl = domain + url2
  1106. }
  1107. }
  1108. return returnUrl
  1109. }
  1110. func dealPublishTime(content string, pattern string) string {
  1111. publishTime := ""
  1112. if pattern == "yyyy-MM-dd HH:mm:ss" {
  1113. r, _ := regexp.Compile("\\d{4}-\\d{2}-\\d{2}\\s*\\d{2}:\\d{2}:\\d{2}")
  1114. publishTime = r.FindString(content)
  1115. } else if pattern == "yyyy-MM-dd" {
  1116. r, _ := regexp.Compile("\\d{4}-\\d{2}-\\d{2}")
  1117. publishTime = r.FindString(content)
  1118. } else if pattern == "MM-dd" {
  1119. r, _ := regexp.Compile("\\d{2}-\\d{2}")
  1120. publishTime = r.FindString(content)
  1121. }
  1122. return publishTime
  1123. }
  1124. func getChildrenLen(sq *gq.Selection) (ret int) {
  1125. sq.Children().Each(func(i int, sq2 *gq.Selection) {
  1126. ret = i
  1127. })
  1128. return
  1129. }
  1130. // unicode转码
  1131. func transUnic(str string) string {
  1132. buf := bytes.NewBuffer(nil)
  1133. i, j := 0, len(str)
  1134. for i < j {
  1135. x := i + 6
  1136. if x > j {
  1137. buf.WriteString(str[i:])
  1138. break
  1139. }
  1140. if str[i] == '\\' && str[i+1] == 'u' {
  1141. hex := str[i+2 : x]
  1142. r, err := strconv.ParseUint(hex, 16, 64)
  1143. if err == nil {
  1144. buf.WriteRune(rune(r))
  1145. } else {
  1146. logger.Warn(err.Error())
  1147. buf.WriteString(str[i:x])
  1148. }
  1149. i = x
  1150. } else {
  1151. buf.WriteByte(str[i])
  1152. i++
  1153. }
  1154. }
  1155. return buf.String()
  1156. }
  1157. // 取得变量
  1158. func (s *Script) GetVar(key string) string {
  1159. return s.L.GetGlobal(key).String()
  1160. }
  1161. func (s *Script) GetIntVar(key string) int {
  1162. lv := s.L.GetGlobal(key)
  1163. if v, ok := lv.(lua.LNumber); ok {
  1164. return int(v)
  1165. }
  1166. return -1
  1167. }
  1168. func (s *Script) GetBoolVar(key string) bool {
  1169. lv := s.L.GetGlobal(key)
  1170. if v, ok := lv.(lua.LBool); ok {
  1171. return bool(v)
  1172. }
  1173. return false
  1174. }
  1175. // 设置睡眠时间
  1176. func SleepTime(basetime int, times []time.Duration) {
  1177. st := 0 //记录最后睡眠时长
  1178. base := float64(basetime * 60)
  1179. if times[3].Seconds() > base { //最后一次大于 basetime*60秒
  1180. if times[2].Seconds() > base {
  1181. n := 0
  1182. if times[0].Seconds() > base {
  1183. n++
  1184. }
  1185. if times[1].Seconds() > base {
  1186. n++
  1187. }
  1188. st = n + 1
  1189. } else if times[2].Seconds() < base && times[0].Seconds() > base && times[1].Seconds() > base {
  1190. st = 1
  1191. }
  1192. }
  1193. if st > 0 {
  1194. time.Sleep(time.Duration(st) * time.Minute)
  1195. }
  1196. }
  1197. func generateKey(key []byte) (genKey []byte) {
  1198. genKey = make([]byte, 16)
  1199. copy(genKey, key)
  1200. for i := 16; i < len(key); {
  1201. for j := 0; j < 16 && i < len(key); j, i = j+1, i+1 {
  1202. genKey[j] ^= key[i]
  1203. }
  1204. }
  1205. return genKey
  1206. }