template.go 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. // model
  2. package spiderutil
  3. const (
  4. //其他参数
  5. Tmp_Other = `spiderType="%s";
  6. spiderHistoryMaxPage=%d;
  7. spiderMoveEvent="%s";
  8. spiderIsCompete=%v;
  9. `
  10. //通用配置
  11. Tmp_common = `
  12. local json=require "json"
  13. local com=require "res.util.comm"
  14. spiderCode="%s";
  15. spiderName="%s";
  16. spiderChannel="%s";
  17. spiderDownDetailPage=%v;
  18. spiderStartPage=%d;
  19. spiderMaxPage=%d;
  20. spiderRunRate=%d;
  21. spider2Collection="%s";
  22. spiderPageEncoding="%s";
  23. spiderStoreMode=%d;
  24. spiderStoreToMsgEvent=%d;
  25. spiderTargetChannelUrl="%s";
  26. spiderLastDownloadTime="%s";
  27. spiderIsHistoricalMend=%v;
  28. spiderIsMustDownload=%v;
  29. spiderUserName="%s";
  30. spiderUserEmail="%s";
  31. spiderUploadTime="%s";
  32. spiderCoverAttr="title";
  33. spiderSleepBase=1000;
  34. spiderSleepRand=5000;
  35. spiderTimeout=150;
  36. `
  37. //获取最新时间
  38. Tmp_pubtime = `
  39. function getLastPublishTime()
  40. local timeType="%s"
  41. if timeType=="yyyyMMdd" or timeType=="MMdd" then
  42. return com.nowDate()
  43. end
  44. local content = download("%s",{})
  45. local tmp = findOneText("%s",content)
  46. local lastpushtime=com.parseDate(tmp,timeType)
  47. return lastpushtime
  48. end
  49. `
  50. //获取列表页
  51. Tmp_pagelist = `
  52. local lastRoundTagId = ""
  53. local currRoundTagId = ""
  54. local firstStart = true
  55. function downloadAndParseListPage(pageno)
  56. for i=1,5 do --5次下载任务不成功,退出
  57. local update="%s"
  58. local page={}
  59. local href="%s"--列表页通用地址
  60. local hrefs={%v}--固定列表配置
  61. local content = download(href,{})
  62. local list = findListHtml("%s",content)--信息块规则
  63. if list~=nil then
  64. for k, v in pairs(list) do
  65. local item={}
  66. item["href"]="%s"--信息地址
  67. item["title"]="%s"--信息标题
  68. item["publishtime"]="%s"--信息时间
  69. item=findMap(item,"<table><tr>"..v.."</tr></table>")
  70. if item["title"]~=nil and item["title"]~="" then --title校验
  71. item["title"]=com.trim(item["title"])
  72. sendListNum(k,list) --推送下载量
  73. local timeType="%s"
  74. item["publishtime"]=com.parseDate(item["publishtime"],timeType)--时间格式
  75. item["href"]=com.gethref(spiderTargetChannelUrl,item["href"])
  76. --Common-- --通用数据补充
  77. --Model-- --模型数据补充
  78. table.insert(page,item)
  79. end
  80. end
  81. end
  82. if table.getn(page)>0 then
  83. return page
  84. end
  85. end
  86. end
  87. `
  88. //获取三级页
  89. Tmp_content = `
  90. function downloadDetailPage(data)
  91. local update="%s"
  92. for i=1,3 do --3次下载任务不成功,退出
  93. local content = download(data["href"],{})
  94. data["s_title"]=com.trim(findOneText("",content))
  95. data["detail"]=findContentText("%s",content)
  96. data["contenthtml"]=findOneHtml("%s",content)
  97. data["l_np_publishtime"]=com.strToTimestamp(data["publishtime"])
  98. data["_d"]="comeintime"
  99. local checkAttr={"title","href","publishtime","detail","contenthtml"}
  100. local b,err=com.checkData(checkAttr,data)
  101. if b then
  102. local attachments = com.getFileAttachmentsArrayWithTag(data["href"],"dl","<dl>"..data["contenthtml"].."</dl>",false)
  103. if table.getn(attachments)>0 then
  104. data["projectinfo"]={
  105. ["attachments"]=attachments
  106. }
  107. end
  108. return data
  109. else
  110. timeSleep(60)--延时60秒再次请求
  111. if i==5 then
  112. --print("下载失败")
  113. saveErrLog(spiderCode,spiderName,data["href"],err)
  114. end
  115. end
  116. end
  117. end
  118. `
  119. )