123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- // model
- package spiderutil
// Lua script templates used to assemble a spider script. Each template is a
// raw string containing printf-style verbs; the verbs must be supplied in the
// order in which they appear in the template text.
// NOTE(review): presumably these are filled in with fmt.Sprintf by the code
// that generates the final Lua script — confirm against the caller.
const (
	// Tmp_Other holds additional spider parameters.
	// Verbs, in order: spiderType (%s), spiderHistoryMaxPage (%d),
	// spiderMoveEvent (%s), spiderIsCompete (%v).
	Tmp_Other = `spiderType="%s";
spiderHistoryMaxPage=%d;
spiderMoveEvent="%s";
spiderIsCompete=%v;
`
	// Tmp_common is the common configuration header. It requires the "json"
	// and "res.util.comm" Lua modules and assigns the spider's global settings.
	// Verbs, in order: spiderCode (%s), spiderName (%s), spiderChannel (%s),
	// spiderDownDetailPage (%v), spiderStartPage (%d), spiderMaxPage (%d),
	// spiderRunRate (%d), spider2Collection (%s), spiderPageEncoding (%s),
	// spiderStoreMode (%d), spiderStoreToMsgEvent (%d),
	// spiderTargetChannelUrl (%s), spiderLastDownloadTime (%s),
	// spiderIsHistoricalMend (%v), spiderIsMustDownload (%v),
	// spiderUserName (%s), spiderUserEmail (%s), spiderUploadTime (%s).
	// spiderCoverAttr, spiderSleepBase, spiderSleepRand and spiderTimeout are
	// fixed values in the template.
	Tmp_common = `
local json=require "json"
local com=require "res.util.comm"
spiderCode="%s";
spiderName="%s";
spiderChannel="%s";
spiderDownDetailPage=%v;
spiderStartPage=%d;
spiderMaxPage=%d;
spiderRunRate=%d;
spider2Collection="%s";
spiderPageEncoding="%s";
spiderStoreMode=%d;
spiderStoreToMsgEvent=%d;
spiderTargetChannelUrl="%s";
spiderLastDownloadTime="%s";
spiderIsHistoricalMend=%v;
spiderIsMustDownload=%v;
spiderUserName="%s";
spiderUserEmail="%s";
spiderUploadTime="%s";
spiderCoverAttr="title";
spiderSleepBase=1000;
spiderSleepRand=5000;
spiderTimeout=150;
`
	// Tmp_pubtime defines the Lua function getLastPublishTime.
	// Verbs, in order: time format (%s), URL passed to download (%s),
	// rule passed to findOneText (%s).
	// When the time format is "yyyyMMdd" or "MMdd" the function returns
	// com.nowDate() without downloading anything.
	Tmp_pubtime = `
function getLastPublishTime()
local timeType="%s"
if timeType=="yyyyMMdd" or timeType=="MMdd" then
return com.nowDate()
end
local content = download("%s",{})
local tmp = findOneText("%s",content)
local lastpushtime=com.parseDate(tmp,timeType)
return lastpushtime
end
`
	// Tmp_pagelist defines the Lua function downloadAndParseListPage(pageno),
	// which makes up to five attempts and returns the first non-empty list of
	// parsed items (nothing is returned if every attempt yields no items).
	// Verbs, in order: update (%s), list page URL (%s), fixed list entries
	// (%v), list block rule (%s), item href rule (%s), item title rule (%s),
	// item publish-time rule (%s), time format (%s).
	// NOTE(review): the --Common-- and --Model-- lines look like substitution
	// markers for extra generated code — confirm against the caller before
	// touching them.
	Tmp_pagelist = `
local lastRoundTagId = ""
local currRoundTagId = ""
local firstStart = true
function downloadAndParseListPage(pageno)
for i=1,5 do --5次下载任务不成功,退出
local update="%s"
local page={}
local href="%s"--列表页通用地址
local hrefs={%v}--固定列表配置

local content = download(href,{})

local list = findListHtml("%s",content)--信息块规则
if list~=nil then
sendListNum(list)--推送下载量
for k, v in pairs(list) do
local item={}
item["href"]="%s"--信息地址
item["title"]="%s"--信息标题
item["publishtime"]="%s"--信息时间
item=findMap(item,"<table><tr>"..v.."</tr></table>")
if item["title"]~=nil and item["title"]~="" then
item["title"]=com.trim(item["title"])
local timeType="%s"
item["publishtime"]=com.parseDate(item["publishtime"],timeType)--时间格式
item["href"]=com.gethref(spiderTargetChannelUrl,item["href"])
--Common-- --通用数据补充
--Model-- --模型数据补充
table.insert(page,item)
end
end
end
if table.getn(page)>0 then
return page
end
end
end
`
	// Tmp_content defines the Lua function downloadDetailPage(data), which
	// makes up to three attempts to download and extract the detail page and
	// returns data once com.checkData accepts the required attributes.
	// Verbs, in order: update (%s), rule passed to findContentText (%s),
	// rule passed to findOneHtml (%s).
	// The error log is written on the third (final) failed attempt. The guard
	// previously compared against 5, which the 1..3 loop never reaches, so
	// saveErrLog was never called.
	Tmp_content = `
function downloadDetailPage(data)
local update="%s"
for i=1,3 do --3次下载任务不成功,退出
local content = download(data["href"],{})

data["detail"]=findContentText("%s",content)
data["contenthtml"]=findOneHtml("%s",content)
data["l_np_publishtime"]=com.strToTimestamp(data["publishtime"])
data["_d"]="comeintime"
local checkAttr={"title","href","publishtime","detail","contenthtml"}
local b,err=com.checkData(checkAttr,data)
if b then
return data
else
timeSleep(60)--延时60秒再次请求
if i==3 then
--print("下载失败")
saveErrLog(spiderCode,spiderName,data["href"],err)
end
end
end
end
`
)
|