// model package spiderutil const ( //其他参数 Tmp_Other = `spiderType="%s"; spiderHistoryMaxPage=%d; spiderMoveEvent="%s"; spiderIsCompete=%v; ` //通用配置 Tmp_common = ` local json=require "json" local com=require "res.util.comm" spiderCode="%s"; spiderName="%s"; spiderChannel="%s"; spiderDownDetailPage=%v; spiderStartPage=%d; spiderMaxPage=%d; spiderRunRate=%d; spider2Collection="%s"; spiderPageEncoding="%s"; spiderStoreMode=%d; spiderStoreToMsgEvent=%d; spiderTargetChannelUrl="%s"; spiderLastDownloadTime="%s"; spiderIsHistoricalMend=%v; spiderIsMustDownload=%v; spiderUserName="%s"; spiderUserEmail="%s"; spiderUploadTime="%s"; spiderCoverAttr="title"; spiderSleepBase=1000; spiderSleepRand=5000; spiderTimeout=150; ` //获取最新时间 Tmp_pubtime = ` function getLastPublishTime() local timeType="%s" if timeType=="yyyyMMdd" or timeType=="MMdd" then return com.nowDate() end local content = download("%s",{}) local tmp = findOneText("%s",content) local lastpushtime=com.parseDate(tmp,timeType) return lastpushtime end ` //获取列表页 Tmp_pagelist = ` local lastRoundTagId = "" local currRoundTagId = "" local firstStart = true function downloadAndParseListPage(pageno) for i=1,5 do --5次下载任务不成功,退出 local update="%s" local page={} local href="%s"--列表页通用地址 local hrefs={%v}--固定列表配置 local content = download(href,{}) local list = findListHtml("%s",content)--信息块规则 if list~=nil then for k, v in pairs(list) do local item={} item["href"]="%s"--信息地址 item["title"]="%s"--信息标题 item["publishtime"]="%s"--信息时间 item=findMap(item,""..v.."
") if item["title"]~=nil and item["title"]~="" then --title校验 item["title"]=com.trim(item["title"]) sendListNum(k,list) --推送下载量 local timeType="%s" item["publishtime"]=com.parseDate(item["publishtime"],timeType)--时间格式 item["href"]=com.gethref(spiderTargetChannelUrl,item["href"]) --Common-- --通用数据补充 --Model-- --模型数据补充 table.insert(page,item) end end end if table.getn(page)>0 then return page end end end ` //获取三级页 Tmp_content = ` function downloadDetailPage(data) local update="%s" for i=1,3 do --3次下载任务不成功,退出 local content = download(data["href"],{}) data["s_title"]=com.trim(findOneText("",content)) data["detail"]=findContentText("%s",content) data["contenthtml"]=findOneHtml("%s",content) data["l_np_publishtime"]=com.strToTimestamp(data["publishtime"]) data["_d"]="comeintime" local checkAttr={"title","href","publishtime","detail","contenthtml"} local b,err=com.checkData(checkAttr,data) if b then local attachments = com.getFileAttachmentsArrayWithTag(data["href"],"dl","
"..data["contenthtml"].."
",false) if table.getn(attachments)>0 then data["projectinfo"]={ ["attachments"]=attachments } end return data else timeSleep(60)--延时60秒再次请求 if i==5 then --print("下载失败") saveErrLog(spiderCode,spiderName,data["href"],err) end end end end ` )