--[[
  Qimingxing crawler system: shared helper module.
  Author: a7
  Date:   2016/4/7

  Several functions call globals that are not defined in this file
  (findListHtml, findOneHtml, findOneText, findMap, downloadFile,
  saveErrLog, transCode). They are presumably injected by the crawler
  host -- TODO confirm their exact contracts against the host.
]]
local json = require "json"

-- `common` is assigned as a global and is also returned at the end of the
-- file, so both `common.xxx` and `local c = require(...)` keep working.
common = {}

-- ---------------------------------------------------------------------------
-- File-local helpers
-- ---------------------------------------------------------------------------

-- Suffix patterns that mark a link as a downloadable attachment.
local ATTACHMENT_SUFFIX_PATTERNS = {
    "%.jpg$", "%.JPG$", "%.bid$", "%.pdf$", "%.doc$",
    "%.docx$", "%.xls$", "%.xlsx$", "%.zip$", "%.rar$",
}

-- Returns true when `text` ends with one of the attachment suffixes.
local function has_attachment_suffix(text)
    for _, pattern in ipairs(ATTACHMENT_SUFFIX_PATTERNS) do
        if string.find(text, pattern) ~= nil then
            return true
        end
    end
    return false
end

-- Joins path segments so that every segment is followed by "/".
local function join_with_trailing_slash(segments)
    if #segments == 0 then
        return ""
    end
    return table.concat(segments, "/") .. "/"
end

-- Shared implementation of getEnclosureHref / getEnclosureHrefByList.
-- When `match_on_html` is true the suffix test runs on the link's HTML as
-- returned by findListHtml, otherwise on its href attribute.
-- The result is keyed by the link's index in findListHtml's result, so it
-- can be sparse.
local function collect_attachment_links(href, content, match_on_html)
    local links = {}
    for k, html in pairs(findListHtml("a", content)) do
        local selector = "a:eq(" .. tostring(k - 1) .. "):attr(href)"
        local attr = nil
        local candidate = html
        if not match_on_html then
            attr = findOneText(selector, content)
            candidate = attr
        end
        if has_attachment_suffix(candidate) then
            local link = attr
            if link == nil then
                link = findOneText(selector, content)
            end
            -- Anything not containing "http" is treated as relative.
            if string.find(link, "http") == nil then
                link = common.gethref(href, link)
            end
            links[k] = (string.gsub(link, "\\", "/"))
        end
    end
    return links
end

-- ---------------------------------------------------------------------------
-- Evaluation and debugging
-- ---------------------------------------------------------------------------

--- Evaluates a JSON-like string as a Lua table constructor.
-- The text is first rewritten by common.clearJson and then compiled as
-- `return <text>`.
-- SECURITY: this executes the input as Lua code. Never feed it data from an
-- untrusted source; prefer a real JSON decoder for external data.
-- @param script JSON-like source text
-- @return the evaluated value, or nil when the text does not compile
function common.eval(script)
    local source = "return " .. common.clearJson(script)
    -- loadstring exists in Lua 5.1; later versions only provide load.
    local compile = loadstring or load
    local chunk = compile(source)
    if chunk == nil then
        return nil
    end
    return chunk()
end

--- Prints the dump() representation of a value.
function printf(obj)
    print(dump(obj))
end

--- Serialises a value into an indented, Lua-like textual form.
-- Tables whose keys are exactly 1..n are printed as arrays, all other
-- tables as `[key] = value` lists. Non-table values are returned through
-- the same wrapping as table members (numbers are returned unchanged).
-- @param obj value to serialise
-- @return the textual representation
function dump(obj)
    local dump_table

    local function indent(level)
        return string.rep("\t", level)
    end

    local function quote(str)
        str = string.gsub(str, "[%c\\\"]", {
            ["\t"] = "\\t",
            ["\r"] = "\\r",
            ["\n"] = "\\n",
            ["\""] = "\\\"",
            ["\\"] = "\\\\",
        })
        return '"' .. str .. '"'
    end

    local function wrap_key(key)
        if type(key) == "number" then
            return "[" .. key .. "]"
        elseif type(key) == "string" then
            return "[" .. quote(key) .. "]"
        end
        return "[" .. tostring(key) .. "]"
    end

    local function wrap_value(value, level)
        if type(value) == "table" then
            return dump_table(value, level)
        elseif type(value) == "number" then
            return value
        elseif type(value) == "string" then
            return quote(value)
        end
        return tostring(value)
    end

    -- True (plus the element count) when the keys are exactly 1..count.
    local function is_array(tbl)
        local count = 0
        for _ in pairs(tbl) do
            count = count + 1
        end
        for i = 1, count do
            if tbl[i] == nil then
                return false
            end
        end
        return true, count
    end

    dump_table = function(value, level)
        if type(value) ~= "table" then
            return wrap_value(value)
        end
        level = level + 1
        local tokens = { "{" }
        local array_like, count = is_array(value)
        if array_like then
            for i = 1, count do
                tokens[#tokens + 1] = indent(level) .. wrap_value(value[i], level) .. ","
            end
        else
            for k, v in pairs(value) do
                tokens[#tokens + 1] = indent(level) .. wrap_key(k) .. " = " .. wrap_value(v, level) .. ","
            end
        end
        tokens[#tokens + 1] = indent(level - 1) .. "}"
        return table.concat(tokens, "\n")
    end

    return dump_table(obj, 0)
end

-- ---------------------------------------------------------------------------
-- Text clean-up
-- ---------------------------------------------------------------------------

--- Rewrites JSON text into Lua table-constructor syntax.
-- Square brackets become braces and `"key":` becomes `key=`. The rewrite is
-- purely textual, so brackets inside string values are replaced as well.
-- @param text JSON text
-- @return the rewritten text
function common.clearJson(text)
    -- Square brackets -> braces.
    text = string.gsub(text, "%[", "{")
    text = string.gsub(text, "%]", "}")
    -- Quoted key followed by a colon -> bare key followed by "=".
    text = string.gsub(text, "\"([^\"]*)\":", "%1=")
    return text
end

-- HTML entities and their replacements, applied in order. The entity names
-- are assembled by concatenation so that an HTML-unescaping step applied to
-- this source file cannot silently collapse them into their replacements.
-- NOTE(review): the entity literals of the previous revision were lost to
-- exactly that kind of unescaping; this list is a reconstruction (standard
-- HTML entities, "&amp;" last to avoid double decoding) -- confirm against
-- the original mapping.
local HTML_ENTITIES = {
    { "&" .. "lt;", "<" },
    { "&" .. "gt;", ">" },
    { "&" .. "#39;", "'" },
    { "&" .. "apos;", "'" },
    { "&" .. "quot;", "\"" },
    { "&" .. "amp;", "&" },
}

--- Replaces common HTML entities with the characters they stand for.
-- @param c text containing entities
-- @return the unescaped text
function common.replaceEscString(c)
    for _, entity in ipairs(HTML_ENTITIES) do
        c = string.gsub(c, entity[1], entity[2])
    end
    return c
end

-- ---------------------------------------------------------------------------
-- Dates and times
-- ---------------------------------------------------------------------------

--- Returns the current local date-time as "YYYY-MM-DD HH:MM:SS".
function common.nowDate()
    return os.date("%Y-%m-%d %H:%M:%S", os.time())
end

-- English month abbreviations -> two-digit month numbers (global).
monthmap = {
    ["Jan"] = "01", ["Feb"] = "02", ["Mar"] = "03", ["Apr"] = "04",
    ["May"] = "05", ["June"] = "06", ["Jun"] = "06", ["July"] = "07",
    ["Jul"] = "07", ["Aug"] = "08", ["Sept"] = "09", ["Sep"] = "09",
    ["Oct"] = "10", ["Nov"] = "11", ["Dec"] = "12",
}

--- Converts a space-separated date string to "YYYY-MM-DD HH:MM:SS".
-- Field 2 is the month name, field 3 the day, field 4 the time and field 6
-- the year (e.g. "Thu Apr 07 10:20:30 CST 2016").
-- @param strtime the source string
-- @return the formatted date, or "0" (the same failure value parseDate
--         uses) when a field is missing or the month is unknown
function common.timeStrByCST(strtime)
    local fields = common.split(strtime, " ")
    local month = monthmap[fields[2]]
    if month == nil or fields[3] == nil or fields[4] == nil or fields[6] == nil then
        return "0"
    end
    return fields[6] .. "-" .. month .. "-" .. fields[3] .. " " .. fields[4]
end

--- Parses the digit groups of a date string into "YYYY-MM-DD HH:MM:SS".
-- Separators between the digit groups are ignored. Supported `datetype`s:
--   "MMdd"           month, day (current year, time 00:00:00)
--   "yyyyMMdd"       year, month, day (current wall-clock time appended)
--   "yyyyMMddHHmm"   year .. minute (seconds set to 00)
--   "yyyyMMddHHmmss" year .. second
-- Examples:
--   common.parseDate("2016年05月12日22:05:04", "yyyyMMddHHmm")
--     --> "2016-05-12 22:05:00"
--   common.parseDate("4月5日", "MMdd") --> "<current year>-04-05 00:00:00"
-- @param datestr  source text
-- @param datetype one of the format names above
-- @return the formatted date, or "0" when the text cannot be parsed
function common.parseDate(datestr, datetype)
    if datestr == nil then
        return "0"
    end

    local parts, count = {}, 0
    for digits in string.gmatch(datestr, "(%d+)") do
        count = count + 1
        parts[count] = digits
    end
    -- Every supported format needs at least two digit groups.
    if count < 2 then
        return "0"
    end

    local pad = common.padDigital
    if datetype == "MMdd" then
        return tostring(os.date("%Y", os.time())) .. "-" .. pad(parts[1]) .. "-" .. pad(parts[2]) .. " 00:00:00"
    end

    if count < 3 then
        return "0"
    end
    local ymd = parts[1] .. "-" .. pad(parts[2]) .. "-" .. pad(parts[3])
    if datetype == "yyyyMMdd" then
        return ymd .. os.date(" %H:%M:%S", os.time())
    end

    if count < 5 then
        return "0"
    end
    -- Minutes and seconds are padded too, so the result always has the
    -- fixed-width layout that common.strToTimestamp expects.
    local hm = pad(parts[4]) .. ":" .. pad(parts[5])
    if datetype == "yyyyMMddHHmm" then
        return ymd .. " " .. hm .. ":00"
    end
    if count >= 6 and datetype == "yyyyMMddHHmmss" then
        return ymd .. " " .. hm .. ":" .. pad(parts[6])
    end
    return "0"
end

--- Left-pads a value shorter than two characters with "0".
-- @param src digit string
-- @return the padded string, or `src` unchanged when it is long enough
function common.padDigital(src)
    if string.len(src) < 2 then
        return "0" .. src
    end
    return src
end

--- Converts "yyyy-MM-dd HH:mm:ss" into a Unix timestamp.
-- @param str date string in exactly that fixed-width layout
-- @return the timestamp, or 0 when `str` is not a string, is shorter than
--         19 characters, or has a non-numeric component
function common.strToTimestamp(str)
    if type(str) ~= "string" or string.len(str) < 19 then
        return 0
    end
    local year = tonumber(string.sub(str, 1, 4))
    local month = tonumber(string.sub(str, 6, 7))
    local day = tonumber(string.sub(str, 9, 10))
    local hour = tonumber(string.sub(str, 12, 13))
    local min = tonumber(string.sub(str, 15, 16))
    local sec = tonumber(string.sub(str, 18, 19))
    -- os.time raises on missing fields, so reject malformed input here.
    if not (year and month and day and hour and min and sec) then
        return 0
    end
    return os.time { year = year, month = month, day = day, hour = hour, min = min, sec = sec }
end

-- ---------------------------------------------------------------------------
-- Strings
-- ---------------------------------------------------------------------------

--- Removes ALL carriage returns, newlines, spaces, tabs and "|" characters
-- from a string (not only at its ends).
-- NOTE(review): "|" is part of the character class, so pipes are stripped
-- too. That looks like a regex-alternation leftover, but it is kept because
-- callers may depend on the current output -- confirm before changing.
-- @param s text, may be nil
-- @return the stripped text ("" for nil); exactly one value is returned
function common.trim(s)
    if s == nil then
        return ""
    end
    -- Parentheses drop gsub's second result (the substitution count).
    return (string.gsub(s, "[\r|\n| |\t]+", ""))
end

--- Splits a string on a delimiter.
-- `delimiter` is used as a Lua pattern, and it is also appended verbatim to
-- the input to terminate the last piece. For a delimiter containing pattern
-- escapes (e.g. "%./") the last piece therefore keeps the unmatched part of
-- that text; common.gethref relies on this.
-- @param str       text to split
-- @param delimiter Lua pattern
-- @return array of pieces; empty table for nil/empty input or nil delimiter
function common.split(str, delimiter)
    local result = {}
    if str == nil or str == '' or delimiter == nil then
        return result
    end
    for piece in (str .. delimiter):gmatch("(.-)" .. delimiter) do
        result[#result + 1] = piece
    end
    return result
end

--- string.match wrapper that returns "" instead of nil.
-- Only the first capture (or the whole match) is returned.
-- @param con text to search
-- @param reg Lua pattern
function common.regTab(con, reg)
    local found = string.match(con, reg)
    if found == nil then
        return ""
    end
    return found
end

--- Checks that the listed fields are present and non-empty.
-- @param tab1 list of field names to check
-- @param tab2 object to validate
-- @return ok (boolean), message listing every empty field
function common.checkData(tab1, tab2)
    local ok = true
    local message = ""
    for _, field in pairs(tab1) do
        if tab2[field] == nil or tab2[field] == "" then
            message = message .. field .. ":值空" .. ","
            ok = false
        end
    end
    return ok, message
end

--- Decodes %XX escapes. "+" is left untouched.
-- @param s encoded text, may be nil
-- @return decoded text ("" for nil)
function common.decodeURI(s)
    if s == nil then
        return ""
    end
    return (string.gsub(s, '%%(%x%x)', function(hex)
        return string.char(tonumber(hex, 16))
    end))
end

--- Percent-encodes everything except alphanumerics, ".", "-" and space,
-- then turns spaces into "+".
-- @param s text, may be nil
-- @return encoded text ("" for nil); exactly one value is returned
function common.encodeURI(s)
    if s == nil then
        return ""
    end
    s = string.gsub(s, "([^%w%.%- ])", function(c)
        return string.format("%%%02X", string.byte(c))
    end)
    -- Parentheses drop gsub's second result (the substitution count).
    return (string.gsub(s, " ", "+"))
end

-- ---------------------------------------------------------------------------
-- URLs
-- ---------------------------------------------------------------------------

--- Resolves a link found on page `channel` into an absolute URL.
-- Links starting with "http" are returned unchanged. "../", "./", bare and
-- root-relative ("/x") links are resolved against the channel URL. The
-- scheme of the result is https only when the channel starts with "https".
-- @param channel URL of the page the link was found on
-- @param href    the link
-- @return absolute URL
function common.gethref(channel, href)
    local scheme = "http://"
    if string.lower(string.sub(channel, 1, 5)) == "https" then
        scheme = "https://"
    end
    if string.lower(string.sub(href, 1, 4)) == "http" then
        return href
    end

    -- Host and path of the channel; a channel without scheme is used as is.
    local location = string.match(channel, "https?://(.*)$") or channel
    local segments = common.split(location, "/")

    local first = string.sub(href, 1, 1)
    if first ~= "." and first ~= "/" then
        href = "./" .. href
    end

    local lead = string.sub(href, 1, 2)
    if lead == ".." then
        -- One piece per "../" plus the final file piece; each piece drops
        -- one channel segment (file name first), never the host itself.
        local pieces = common.split(href, "%./")
        for _ = 1, #pieces do
            if #segments == 1 then
                break
            end
            -- table.remove(t) pops the last element on every Lua version;
            -- a position of -1 is a no-op or an error on stock Lua.
            table.remove(segments)
        end
        -- split() appended the delimiter text "%./", so the last piece ends
        -- with a stray "%" that is cut off here.
        local tail = pieces[#pieces]
        return scheme .. join_with_trailing_slash(segments) .. string.sub(tail, 1, string.len(tail) - 1)
    end

    if lead == "./" then
        -- Same directory: replace the channel's file name.
        table.remove(segments)
        return scheme .. join_with_trailing_slash(segments) .. string.sub(href, 3)
    end

    if string.sub(href, 1, 1) == "/" then
        return scheme .. segments[1] .. href
    end
    return scheme .. segments[1] .. "/" .. href
end

--- Like common.split, but returns nil (not an empty table) for nil/empty
-- input or a nil delimiter.
function common.splitf(str, delimiter)
    if str == nil or str == '' or delimiter == nil then
        return nil
    end
    local result = {}
    for piece in (str .. delimiter):gmatch("(.-)" .. delimiter) do
        result[#result + 1] = piece
    end
    return result
end

--- Verifies "selector==expected" rules (one per line) against `content`.
-- Each selector is evaluated with findMap. Blank lines and lines without
-- "==" are ignored.
-- @param content page content handed to findMap
-- @param update  newline-separated rules, may be nil or ""
-- @return -1 when at least one rule does not match, otherwise 0
function common.checkUpdate(content, update)
    if update == "" or update == nil then
        return 0
    end
    local mismatch = false
    for _, rule in pairs(common.splitf(update, "\n")) do
        -- splitf returns nil for an empty line (e.g. a trailing newline).
        local pair = common.splitf(rule, "==")
        if pair ~= nil and #pair > 1 then
            local found = findMap({ ["tmp"] = pair[1] }, content)["tmp"]
            if found ~= pair[2] then
                mismatch = true
            end
        end
    end
    if mismatch then
        return -1
    end
    return 0
end

--- Tells whether a (possibly relative) link stays on the channel's site.
-- @param href     URL of the current page
-- @param itemHref link to test
-- @return the absolute link, and true when its host occurs in `href`;
--         "" and false when `itemHref` is nil
function common.hrefInThisWeb(href, itemHref)
    if itemHref == nil then
        return "", false
    end
    itemHref = common.gethref(href, itemHref) -- normalise to absolute
    if itemHref == "" or itemHref == nil then
        return "", false
    end
    -- "https://" is 8 characters, so the host starts at index 9.
    if string.find(itemHref, "https") == 1 then
        return itemHref, common.isThisWeb(href, itemHref, 9)
    end
    -- "http://" is 7 characters, so the host starts at index 8.
    if string.find(itemHref, "http") == 1 then
        return itemHref, common.isThisWeb(href, itemHref, 8)
    end
    return itemHref, false
end

--- Checks whether the host of `itemHref` occurs inside `href`.
-- @param href     URL of the current page
-- @param itemHref absolute link
-- @param i        index of the first host character in `itemHref`
-- @return boolean
function common.isThisWeb(href, itemHref, i)
    local remainder = string.sub(itemHref, i, string.len(itemHref))
    local domain = common.split(remainder, "/")[1]
    if domain ~= nil and domain ~= "" then
        -- Plain search: "." and "-" in host names are pattern magic.
        if string.find(href, domain, 1, true) ~= nil then
            return true
        end
    end
    return false
end

-- ---------------------------------------------------------------------------
-- Attachments
-- ---------------------------------------------------------------------------

--- Collects the titles of all attachment links in `content`.
-- A link counts as an attachment when its href ends with one of the known
-- file suffixes.
-- @param href    page URL (unused here, kept for a uniform signature)
-- @param content page HTML
-- @return table of titles keyed by link index (can be sparse)
function common.getEnclosureTitle(href, content)
    local titles = {}
    for k in pairs(findListHtml("a", content)) do
        local index = tostring(k - 1)
        local link = findOneText("a:eq(" .. index .. "):attr(href)", content)
        if has_attachment_suffix(link) then
            titles[k] = findOneText("a:eq(" .. index .. ")", content)
        end
    end
    return titles
end

--- Collects absolute attachment URLs; the suffix test runs on each link's
-- HTML as returned by findListHtml.
-- @param href    page URL used to resolve relative links
-- @param content page HTML
-- @return table of URLs keyed by link index (can be sparse)
function common.getEnclosureHref(href, content)
    return collect_attachment_links(href, content, true)
end

--- Collects absolute attachment URLs; the suffix test runs on each link's
-- href attribute.
-- @param href    page URL used to resolve relative links
-- @param content page HTML
-- @return table of URLs keyed by link index (can be sparse)
function common.getEnclosureHrefByList(href, content)
    return collect_attachment_links(href, content, false)
end

--- Downloads several attachments.
-- Each link is tried up to six times while downloadFile returns an empty
-- url; the final failure is reported through saveErrLog.
-- @param fileNameArray file names, keyed like `fileLinkArray`
-- @param fileLinkArray URLs to download
-- @return array of {url, filename, size, ftype, fid} records for the
--         downloads that produced both a url and a name
function common.getFileAttachmentsArray(fileNameArray, fileLinkArray)
    local attachments = {}
    for i, link in pairs(fileLinkArray) do
        local url, name, size, ftype, fid = downloadFile(fileNameArray[i], link, "get", {}, {}, "")
        local attempts = 1
        while url == "" and attempts < 6 do
            url, name, size, ftype, fid = downloadFile(fileNameArray[i], link, "get", {}, {}, "")
            attempts = attempts + 1
            if attempts == 6 and url == "" then
                saveErrLog(link, "comm附件下载失败")
            end
        end
        if url ~= nil and url ~= "" and name ~= nil and name ~= "" then
            attachments[#attachments + 1] = {
                ["url"] = url,
                ["filename"] = name,
                ["size"] = size,
                ["ftype"] = ftype,
                ["fid"] = fid,
            }
        end
    end
    return attachments
end

--- Finds and downloads every attachment linked from `content`.
-- @param href    page URL used to resolve relative links
-- @param content page HTML
-- @return array of attachment records (see getFileAttachmentsArray)
function common.getFileAttachmentsArrayByHrefAndContent(href, content)
    local titles = common.getEnclosureTitle(href, content)
    local links = common.getEnclosureHrefByList(href, content)
    -- Both tables are keyed by link index and can be sparse, so emptiness
    -- is tested with next() rather than with the length operator.
    if next(links) == nil then
        links = common.getEnclosureHref(href, content)
    end
    -- Drop untitled entries by key; this keeps titles and links aligned
    -- (clearing existing keys during a pairs traversal is allowed).
    for k, title in pairs(titles) do
        if title == "" then
            titles[k] = nil
            links[k] = nil
        end
    end
    return common.getFileAttachmentsArray(titles, links)
end

-- File suffixes recognised by getFilesLinkByTag (global). Longer suffixes
-- come before their prefixes ("docx" before "doc") so the first hit wins.
filetype = {
    "jpg", "JPG", "bid", "pdf", "png", "PDF", "docx", "doc", "xlsx", "xls",
    "zip", "rar", "swf", "DOCX", "DOC", "PDF", "XLSX", "XLS", "ZIP", "RAR",
    "SWF",
}

--- Lists the attachment links inside the element selected by `tags`.
-- @param href    page URL used to resolve relative links
-- @param tags    selector of the containing element
-- @param content page HTML
-- @param withend true: the suffix must end the href/title; false: it may
--                occur anywhere in them
-- @return array of {href, title, ftype} items; `ftype` is the matched
--         suffix as a Lua pattern such as "%.doc"
function common.getFilesLinkByTag(href, tags, content, withend)
    local scope = findOneHtml(tags, content)
    local anchors = findListHtml(tags .. " a", content)
    local anchor_suffix = withend and "$" or ""
    local found = {}

    for k in pairs(anchors) do
        local index = tostring(k - 1)
        local item = findMap({
            ["href"] = "a:eq(" .. index .. "):attr(href)",
            ["title"] = "a:eq(" .. index .. ")",
        }, scope)
        item["title"] = common.trim(tostring(item["title"]))
        item["href"] = common.gethref(href, tostring(item["href"]))
        item["href"] = string.gsub(item["href"], "\\", "/")
        if string.find(item["href"], "http") == nil then
            item["href"] = transCode("utf8", item["href"])
        end

        local hit
        for _, ftype in ipairs(filetype) do
            local pattern = "%." .. ftype .. anchor_suffix
            -- The href is tested first, the title is the fallback.
            hit = string.find(item["href"], pattern) or string.find(item["title"], pattern)
            item["ftype"] = "%." .. ftype
            if hit then
                break
            end
        end

        if hit ~= nil and item["title"] ~= "" then
            found[#found + 1] = item
        end
    end
    return found
end

--- Downloads every attachment linked inside the element selected by `tags`.
-- A failed first download is retried twice; the final failure is reported
-- through saveErrLog and yields a record with only filename and org_url.
-- @param href    page URL used to resolve relative links
-- @param tags    selector of the containing element
-- @param content page HTML
-- @param withend see common.getFilesLinkByTag
-- @param param   request parameters for downloadFile (default {})
-- @param head    request headers for downloadFile (default {})
-- @param ck      cookie string for downloadFile (default "")
-- @return array of attachment records; successful ones carry url, filename,
--         size, ftype, fid and org_url
function common.getFileAttachmentsArrayWithTag(href, tags, content, withend, param, head, ck)
    -- When either table is missing all three request options are reset.
    if param == nil or head == nil then
        param = {}
        head = {}
        ck = ""
    end

    local attachments = {}
    for _, item in ipairs(common.getFilesLinkByTag(href, tags, content, withend)) do
        -- Use the title as file name, cut after the suffix when the title
        -- carries trailing text.
        -- NOTE(review): when the suffix occurs only in the href, the match
        -- yields nil and downloadFile receives a nil name -- confirm that
        -- the downloader handles this.
        local file_name
        if string.find(item["title"], item["ftype"] .. "$") == nil then
            file_name = string.match(item["title"], "(.+" .. item["ftype"] .. ")")
        else
            file_name = item["title"]
        end

        -- Original address of the attachment.
        local init_url = item["href"]
        local url, name, size, ftype, fid = downloadFile(file_name, init_url, "get", param, head, ck)
        if url == "" then
            for attempt = 1, 2 do
                url, name, size, ftype, fid = downloadFile(file_name, init_url, "get", param, head, ck)
                if url ~= "" and size ~= "" then
                    break -- downloaded correctly
                end
                if attempt == 2 then
                    saveErrLog(init_url, "comm附件下载失败")
                end
            end
        end

        if url == "" and size == "" then
            name = file_name
        end
        if type(url) ~= "string" then
            url = ""
        end

        if url ~= "" and name ~= nil and name ~= "" and size ~= "" then
            -- Download succeeded.
            attachments[#attachments + 1] = {
                ["url"] = url,
                ["filename"] = name,
                ["size"] = size,
                ["ftype"] = ftype,
                ["fid"] = fid,
                ["org_url"] = init_url,
            }
        else
            -- Download failed: keep name and source address only.
            attachments[#attachments + 1] = {
                ["filename"] = name,
                ["org_url"] = init_url,
            }
        end
    end
    return attachments
end

-- ---------------------------------------------------------------------------
-- Content helpers
-- ---------------------------------------------------------------------------

-- HTML comment delimiters, assembled by concatenation so that an HTML
-- processing step applied to this source file cannot strip them.
local HTML_COMMENT_OPEN = "<!" .. "--"
local HTML_COMMENT_CLOSE = "--" .. ">"

--- Removes HTML comments from `content`.
-- NOTE(review): the loop condition and the position look-ups of the
-- previous revision were missing from the source (apparently stripped as an
-- HTML comment); this is a reconstruction based on the remaining code and
-- the variable names -- confirm against the original.
-- An unterminated comment is left in place.
-- @param content HTML text
-- @return the text without comments
function common.getPureContent(content)
    local result = content
    while true do
        -- Plain searches: "-" is a pattern magic character.
        local start_pos = string.find(result, HTML_COMMENT_OPEN, 1, true)
        if start_pos == nil then
            break
        end
        local _, end_pos = string.find(result, HTML_COMMENT_CLOSE, start_pos, true)
        if end_pos == nil then
            break
        end
        result = string.sub(result, 1, start_pos - 1) .. string.sub(result, end_pos + 1, string.len(result))
    end
    return result
end

-- Characters removed before an amount is parsed: ASCII parentheses and
-- comma, followed by their full-width forms U+FF08, U+FF09 and U+FF0C
-- (written as UTF-8 byte escapes).
local MONEY_NOISE_PATTERNS = {
    "%(", "%)", ",",
    "\239\188\136", "\239\188\137", "\239\188\140",
}

--- Extracts an amount and its currency from free text.
-- Amounts marked with "万" are multiplied by 10000. The currency is "美元"
-- only when that word occurs in the text (and "人民币" does not), otherwise
-- "人民币".
-- @param orgStr source text, may be nil
-- @return amount rounded to two decimals (0 when no number is found),
--         currency name
function common.getMoneyAndType(orgStr)
    local text = common.trim(orgStr)
    for _, pattern in ipairs(MONEY_NOISE_PATTERNS) do
        text = string.gsub(text, pattern, "")
    end

    local multiplier = 1
    if string.find(text, "万") ~= nil then
        text = string.gsub(text, "万元", "")
        text = string.gsub(text, "万", "")
        multiplier = 10000
    end

    local moneyType = "人民币"
    if string.find(text, "人民币") ~= nil then
        text = string.gsub(text, "人民币", "")
        text = string.gsub(text, "\194\165", "")     -- yen/yuan sign U+00A5
        text = string.gsub(text, "\239\191\165", "") -- full-width U+FFE5
    elseif string.find(text, "美元") ~= nil then
        text = string.gsub(text, "美元", "")
        text = string.gsub(text, "%$", "") -- "$" must be escaped in patterns
        moneyType = "美元"
    end

    local num = 0
    local digits = string.match(text, "%d+%.?%d*")
    if digits ~= nil then
        num = tonumber(digits) * multiplier
    end
    return tonumber(string.format("%.2f", num)), moneyType
end

--- Fills the mandatory record fields with defaults, in place.
-- Missing fields become "" ("{}" for jsondata); a table-valued jsondata is
-- JSON-encoded ("{}" when the table is empty).
-- @param data record table
-- @return the same table
function common.dataNil(data)
    local required = { "jsondata", "href", "title", "publishtime", "detail", "contenthtml" }
    for _, name in pairs(required) do
        local value = data[name]
        if value == nil then
            if name == "jsondata" then
                data[name] = "{}"
            else
                data[name] = ""
            end
        elseif name == "jsondata" and type(value) == "table" then
            if next(value) ~= nil then
                data[name] = json.encode(value)
            else
                data[name] = "{}"
            end
        end
    end
    return data
end

-- End of shared helpers.
return common;