template.go 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. // model
  2. package spiderutil
  const (
    // Tmp_Other is a Lua fragment that assigns four spider globals.
    // Format verbs, in order:
    //   %s spiderType, %d spiderHistoryMaxPage, %s spiderMoveEvent, %v spiderIsCompete.
    // NOTE(review): presumably filled with fmt.Sprintf by the caller — confirm.
    Tmp_Other = `spiderType="%s";
spiderHistoryMaxPage=%d;
spiderMoveEvent="%s";
spiderIsCompete=%v;
`
    // Tmp_common is the common configuration preamble of a generated script.
    // It requires the "json" and "res.util.comm" Lua modules and assigns the
    // spider* globals. Format verbs, in order:
    //   %s spiderCode, %s spiderName, %s spiderChannel, %v spiderDownDetailPage,
    //   %d spiderStartPage, %d spiderMaxPage, %d spiderRunRate,
    //   %s spider2Collection, %s spiderPageEncoding, %d spiderStoreMode,
    //   %d spiderStoreToMsgEvent, %s spiderTargetChannelUrl,
    //   %s spiderLastDownloadTime, %v spiderIsHistoricalMend,
    //   %v spiderIsMustDownload, %s spiderUserName, %s spiderUserEmail,
    //   %s spiderUploadTime.
    // spiderCoverAttr, spiderSleepBase, spiderSleepRand and spiderTimeout are
    // hard-coded in the template ("title", 1000, 5000, 150).
    Tmp_common = `
local json=require "json"
local com=require "res.util.comm"
spiderCode="%s";
spiderName="%s";
spiderChannel="%s";
spiderDownDetailPage=%v;
spiderStartPage=%d;
spiderMaxPage=%d;
spiderRunRate=%d;
spider2Collection="%s";
spiderPageEncoding="%s";
spiderStoreMode=%d;
spiderStoreToMsgEvent=%d;
spiderTargetChannelUrl="%s";
spiderLastDownloadTime="%s";
spiderIsHistoricalMend=%v;
spiderIsMustDownload=%v;
spiderUserName="%s";
spiderUserEmail="%s";
spiderUploadTime="%s";
spiderCoverAttr="title";
spiderSleepBase=1000;
spiderSleepRand=5000;
spiderTimeout=150;
`
    // Tmp_pubtime defines the Lua function getLastPublishTime(), which returns
    // the latest publish time. When the time type is "yyyyMMdd" or "MMdd" it
    // returns com.nowDate() without downloading anything; otherwise it
    // downloads a page, extracts one text value and parses it with
    // com.parseDate. Format verbs, in order:
    //   %s time type, %s URL passed to download, %s rule passed to findOneText.
    Tmp_pubtime = `
function getLastPublishTime()
local timeType="%s"
if timeType=="yyyyMMdd" or timeType=="MMdd" then
return com.nowDate()
end
local content = download("%s",{})
local tmp = findOneText("%s",content)
local lastpushtime=com.parseDate(tmp,timeType)
return lastpushtime
end
`
    // Tmp_pagelist defines the Lua function downloadAndParseListPage(pageno),
    // which fetches the list page. It tries up to 5 times and returns the
    // parsed items as soon as one attempt yields a non-empty table; if every
    // attempt comes back empty the function falls off the end (returns nil).
    // Format verbs, in order:
    //   %s update, %s href (list page URL), %v hrefs (fixed list entries),
    //   %s list-block rule for findListHtml, %s item href rule,
    //   %s item title rule, %s item publishtime rule, %s time type.
    // The update and hrefs locals are not referenced elsewhere in the template
    // text but still consume format arguments, so they must stay in place.
    // NOTE(review): the "--Common--" and "--Model--" marker comments look like
    // substitution points for extra field code — confirm against the caller
    // before changing their spelling.
    Tmp_pagelist = `
local lastRoundTagId = ""
local currRoundTagId = ""
local firstStart = true
function downloadAndParseListPage(pageno)
for i=1,5 do --5次下载任务不成功,退出
local update="%s"
local page={}
local href="%s"--列表页通用地址
local hrefs={%v}--固定列表配置
local content = download(href,{})
local list = findListHtml("%s",content)--信息块规则
if list~=nil then
sendListNum(list)--推送下载量
for k, v in pairs(list) do
local item={}
item["href"]="%s"--信息地址
item["title"]="%s"--信息标题
item["publishtime"]="%s"--信息时间
item=findMap(item,"<table><tr>"..v.."</tr></table>")
if item["title"]~=nil and item["title"]~="" then
item["title"]=com.trim(item["title"])
local timeType="%s"
item["publishtime"]=com.parseDate(item["publishtime"],timeType)--时间格式
item["href"]=com.gethref(spiderTargetChannelUrl,item["href"])
--Common-- --通用数据补充
--Model-- --模型数据补充
table.insert(page,item)
end
end
end
if table.getn(page)>0 then
return page
end
end
end
`
    // Tmp_content defines the Lua function downloadDetailPage(data), which
    // fetches the detail (third-level) page. Each of up to 3 attempts downloads
    // data["href"], fills detail/contenthtml/l_np_publishtime/_d and validates
    // the required attributes with com.checkData; on success data is returned.
    // On failure it sleeps via timeSleep(60) and, on the last attempt, records
    // the error with saveErrLog. The last-attempt check compares against 3 to
    // match the loop bound; comparing against 5 could never be true inside a
    // 3-iteration loop, so the error log was never written.
    // Format verbs, in order:
    //   %s update, %s rule for findContentText, %s rule for findOneHtml.
    Tmp_content = `
function downloadDetailPage(data)
local update="%s"
for i=1,3 do --3次下载任务不成功,退出
local content = download(data["href"],{})
data["detail"]=findContentText("%s",content)
data["contenthtml"]=findOneHtml("%s",content)
data["l_np_publishtime"]=com.strToTimestamp(data["publishtime"])
data["_d"]="comeintime"
local checkAttr={"title","href","publishtime","detail","contenthtml"}
local b,err=com.checkData(checkAttr,data)
if b then
return data
else
timeSleep(60)--延时60秒再次请求
if i==3 then
--print("下载失败")
saveErrLog(spiderCode,spiderName,data["href"],err)
end
end
end
end
`
  )