Quellcode durchsuchen

清理html空格

fengweiqiang vor 5 Jahren
Ursprung
Commit
8db8b6a4d1
3 geänderte Dateien mit 564 neuen und 25 gelöschten Zeilen
  1. 1 0
      src/jy/util/clearHtml.go
  2. 562 24
      src/lua/comm.lua
  3. 1 1
      src/web/templates/admin/com_header.html

+ 1 - 0
src/jy/util/clearHtml.go

@@ -55,6 +55,7 @@ func NewCut() *Cut {
 //清理HTML标签
 func (c *Cut) ClearHtml(src string) string {
 	src = strings.Replace(src, ">\n", ">", -1)
+	src = strings.Replace(src, " ", "", -1)
 	//标签全转小写
 	src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
 	//清script,style

+ 562 - 24
src/lua/comm.lua

@@ -1,36 +1,27 @@
---[[抽取脚本工具类]]
+--[[
+企明星爬虫系统,公共文件
+Author:a7
+Date:2016/4/7
+]]
 
 common={}
 
---根据field获取结果对象
-function common.getFieldObjects(result,field)
-	for key, val in pairs(result) do
-		if key==field then
-			return val
-		end
-	end
-	return nil
-end
-
---抽取field新增结果赋值object(field,code,ruletext,extfrom,field,value,type,matchtype)
-function common.setFieldObjects(result,field,object)
-	local nofield=true
-	for key, val in pairs(result) do
-		if key==field then
-			nofield=false
-			table.insert(val, object)
-		end
-	end
-	if nofield then
-		result[field]={object}
+-- Lua "eval": rewrites JSON-like text with common.clearJson, prefixes it with
+-- "return " and compiles it with loadstring.
+-- Returns nil when the text does not compile; otherwise returns whatever the
+-- compiled chunk returns (runtime errors raised by the chunk propagate).
+-- NOTE(review): the text is executed as Lua code. Presumably it originates
+-- from crawled pages, which would make this unsafe for untrusted input -
+-- confirm against callers.
+function common.eval(script)
+	script=common.clearJson(script)
+	local tmp = "return "..script;
+	local s = loadstring(tmp);
+	if s==nil then
+		return nil
 	end
-	return result
+	return s()
 end
 
 -- Output helper: prints the result of dump(obj).
 function printf(obj)
 	print(dump(obj) )
 end
+
 function dump(obj)  
     local getIndent, quoteStr, wrapKey, wrapVal, isArray, dumpObj  
     getIndent = function(level)  
@@ -99,6 +90,553 @@ function dump(obj)
         return table.concat(tokens, "\n")  
     end  
     return dumpObj(obj, 0)  
-end 
+end  
+
+-- JSON clean-up: rewrites JSON text into Lua table-constructor syntax so it
+-- can be compiled as a Lua expression.
+--   json: JSON text (string)
+--   returns: the rewritten text (string)
+-- Keys are emitted as ["key"]= rather than key= so that keys which are not
+-- valid Lua identifiers ("a-b", "1", "end", ...) still produce compilable Lua.
+-- Known limitation: the substitutions are purely textual, so square brackets
+-- inside string values are rewritten as well.
+function common.clearJson(json)
+	-- Square brackets (JSON arrays) become braces (Lua tables). This must run
+	-- before the key rewrite below, which introduces new square brackets.
+	json=string.gsub(json,"%[","{")
+	json=string.gsub(json,"%]","}")
+	-- "key": becomes ["key"]=
+	json=string.gsub(json,"\"([^\"]*)\":","[\"%1\"]=")
+	return json
+end
+-- Replace HTML character entities with the characters they stand for.
+--   c: text containing entities (string)
+--   returns: the text with &lt; &gt; &quot; &#34; &amp; replaced (string)
+-- Note that &quot; is mapped to a single quote while &#34; is mapped to a
+-- double quote; that asymmetry is kept as-is.
+function common.replaceEscString(c)
+	c=string.gsub(c,"&lt;","<")
+	c=string.gsub(c,"&gt;",">")
+	c=string.gsub(c,"&quot;","'")
+	c=string.gsub(c,"&#34;","\"")
+	-- &amp; is decoded last so that text such as "&amp;#34;" yields the
+	-- literal "&#34;" instead of being unescaped a second time.
+	c=string.gsub(c,"&amp;","&")
+	return c
+end
+
+-- Return the current date and time formatted as "YYYY-MM-DD HH:MM:SS".
+function common.nowDate()
+	local now = os.time()
+	return os.date("%Y-%m-%d %H:%M:%S", now)
+end
+-- Lookup from English month names/abbreviations to two-digit month numbers.
+monthmap={
+	["Jan"]="01",["Feb"]="02",["Mar"]="03",["Apr"]="04",["May"]="05",
+	["June"]="06",["Jun"]="06",["July"]="07",["Jul"]="07",["Aug"]="08",
+	["Sept"]="09",["Sep"]="09",["Oct"]="10",["Nov"]="11",["Dec"]="12"
+}
+-- Reformat a space-separated timestamp into "year-month-day time".
+-- The 2nd field is looked up in monthmap, the 3rd is used as the day, the 4th
+-- as the time of day and the 6th as the year (a layout such as
+-- "Thu Apr 07 10:20:30 CST 2016" fits this shape).
+-- Raises an error if a field is missing or the month name is not in monthmap.
+function common.timeStrByCST(strtime)
+	local fields=common.split(strtime," ")
+	local year=fields[6]
+	local month=monthmap[fields[2]]
+	local day=fields[3]
+	local clock=fields[4]
+	return year.."-"..month.."-"..day.." "..clock
+end
+
+
+-- Date parsing: extracts the runs of digits from datestr and assembles them
+-- into "YYYY-MM-DD HH:MM:SS" according to datetype.
+--   datestr:  text containing the date; any non-digit separators are allowed
+--   datetype: one of
+--     "yyyyMMdd"       year, month, day (current time of day is appended)
+--     "yyyyMMddHHmmss" year, month, day, hour, minute, second
+--     "yyyyMMddHHmm"   year, month, day, hour, minute (seconds set to 00)
+--     "MMdd"           month, day (current year and time of day are used)
+--   returns: the formatted string, or "0" when datetype is unknown or datestr
+--            holds fewer digit groups than the format needs.
+function common.parseDate(datestr,datetype)
+	-- Collect the digit groups in order, counting them explicitly so no
+	-- length operator is needed.
+	local parts = {}
+	local count = 0
+	for digits in string.gmatch(datestr,"(%d+)") do
+		count = count + 1
+		parts[count] = digits
+	end
+	-- Number of digit groups each supported format consumes.
+	local needed = {yyyyMMdd=3, yyyyMMddHHmmss=6, yyyyMMddHHmm=5, MMdd=2}
+	local want = nil
+	if datetype ~= nil then
+		want = needed[datetype]
+	end
+	-- Reject unknown formats and inputs that are too short instead of
+	-- failing later on a missing component.
+	if want == nil or count < want then
+		return "0"
+	end
+	local pad = common.padDigital
+	if datetype=="yyyyMMdd" then
+		return parts[1].."-"..pad(parts[2]).."-"..pad(parts[3])..os.date(" %H:%M:%S", os.time())
+	elseif datetype=="yyyyMMddHHmmss" then
+		-- Minutes and seconds are padded too, so the result always has the
+		-- fixed-width layout.
+		return parts[1].."-"..pad(parts[2]).."-"..pad(parts[3]).." "..pad(parts[4])..":"..pad(parts[5])..":"..pad(parts[6])
+	elseif datetype=="yyyyMMddHHmm" then
+		return parts[1].."-"..pad(parts[2]).."-"..pad(parts[3]).." "..pad(parts[4])..":"..pad(parts[5])..":00"
+	else
+		-- "MMdd": the only remaining supported format.
+		return tostring(os.date("%Y",os.time())).."-"..pad(parts[1]).."-"..pad(parts[2])..os.date(" %H:%M:%S", os.time())
+	end
+end
+
+-- Date padding: prefix values shorter than two characters with "0".
+-- Values that already have two or more characters are returned unchanged.
+function common.padDigital(src)
+	if string.len(src)>=2 then
+		return src
+	end
+	return "0"..src
+end
+--local datestr="2016年05月12日22:05:04"
+--print(parseDate(datestr,"yyyyMMddHHmm"))
+--print(parseDate("4月5日","MMdd"))
+
+-- Convert a date string to a timestamp via os.time.
+--   str: date text laid out as "yyyy-MM-dd HH:mm:ss"; fields are read from
+--        fixed character positions, so the separators themselves are ignored
+--   returns: the os.time value, or 0 when str is not a string, is shorter
+--            than 19 characters, or any field is not numeric.
+function common.strToTimestamp(str)
+	if type(str)~="string" or string.len(str)<19 then
+		return 0
+	end
+	-- Cut year, month, day, hour, minute and second out of the string.
+	local Y = tonumber(string.sub(str,1,4))
+	local M = tonumber(string.sub(str,6,7))
+	local D = tonumber(string.sub(str,9,10))
+	local H = tonumber(string.sub(str,12,13))
+	local MM = tonumber(string.sub(str,15,16))
+	local SS = tonumber(string.sub(str,18,19))
+	-- tonumber yields nil for non-numeric text; os.time would raise on a
+	-- missing field, so report failure the same way as for short input.
+	if Y==nil or M==nil or D==nil or H==nil or MM==nil or SS==nil then
+		return 0
+	end
+	return os.time{year=Y, month=M, day=D, hour=H,min=MM,sec=SS}
+end
+
+-- Remove every carriage return, line feed, space, tab and "|" character from
+-- s (not only leading/trailing ones) and return the resulting string.
+-- Only the string is returned; string.gsub's second result (the replacement
+-- count) is dropped so the call is safe as the last argument of another call.
+-- NOTE(review): "|" is a literal inside a Lua character class, not an
+-- alternation, so pipes are stripped as well. That looks unintended, but it is
+-- kept because callers may rely on it - confirm before changing the set.
+function common.trim(s)
+	local cleaned = string.gsub(s, "[\r|\n| |\t]+", "")
+	return cleaned
+end
+
+-- Split a string on a delimiter.
+--   str:       text to split
+--   delimiter: separator; it is appended literally to str and also used as
+--              a Lua pattern when matching
+--   returns: array of pieces, or nil when str is nil/empty or delimiter is nil
+function common.split(str, delimiter)
+	local usable = str~=nil and str~='' and delimiter~=nil
+	if not usable then
+		return nil
+	end
+
+	local pieces = {}
+	-- Terminating the text with the delimiter lets the lazy capture below
+	-- pick up the final piece as well.
+	local terminated = str..delimiter
+	local pattern = "(.-)"..delimiter
+	for piece in string.gmatch(terminated, pattern) do
+		table.insert(pieces, piece)
+	end
+	return pieces
+end
+
+-- Pattern-match wrapper: returns the first result of string.match(con,reg),
+-- or "" instead of nil when nothing matches.
+function common.regTab(con,reg)
+	local found=string.match(con,reg)
+	if found~=nil then
+		return found
+	end
+	return ""
+end
+
+-- Check that the listed fields are present and non-empty.
+--   tab1: table whose values are the field names to check
+--   tab2: object being validated
+--   returns: ok (true when no field is nil or ""), followed by a string that
+--            lists each failing field as "<field>:值空,"
+function common.checkData(tab1,tab2)
+	local ok=true
+	local failures={}
+	for _,field in pairs(tab1) do
+		local value=tab2[field]
+		if value==nil or value=="" then
+			ok=false
+			table.insert(failures, field..":值空"..",")
+		end
+	end
+	return ok,table.concat(failures)
+end
+
+-- URL decoding: replaces every %XX escape (two hex digits) in s with the
+-- character that has that byte value. "+" characters are left untouched.
+function common.decodeURI(s)
+	local function hexToChar(h)
+		return string.char(tonumber(h, 16))
+	end
+	local decoded = string.gsub(s, '%%(%x%x)', hexToChar)
+	return decoded
+end
+
+-- URL encoding: every character other than letters, digits, ".", "-" and
+-- space is replaced by %XX (upper-case hex of its byte value); spaces are then
+-- replaced by "+".
+--   s: text to encode
+--   returns: the encoded string only. string.gsub's second result (the
+--            replacement count) is deliberately not returned, so the call is
+--            safe as the last argument of another call.
+function common.encodeURI(s)
+	s = string.gsub(s, "([^%w%.%- ])", function(c) return string.format("%%%02X", string.byte(c)) end)
+	s = string.gsub(s, " ", "+")
+	return s
+end
+
+
+-- Resolve a link found on a page against that page's URL.
+--   channel: URL of the page the link was found on
+--   href:    link as written in the page (absolute, "//host/...", "/rooted",
+--            "./relative", "../relative" or bare "relative")
+--   returns: an absolute URL. href is returned unchanged when it already
+--            starts with "http" (case-insensitive).
+-- The result uses "https://" when channel starts with "https"
+-- (case-insensitive) and "http://" otherwise.
+function common.gethref(channel,href)
+	local prehttp=string.sub(channel,1,5)
+	if string.lower(prehttp)=="https" then
+		prehttp="https://"
+	else
+		prehttp="http://"
+	end
+	local pre=string.sub(href,1,4)
+	if string.lower(pre)=="http" then
+		return href
+	end
+	-- Protocol-relative link: only the scheme is missing.
+	if string.sub(href,1,2)=="//" then
+		return string.sub(prehttp,1,-3)..href
+	end
+
+	-- Drop the scheme from the channel; a channel without a scheme is used
+	-- as-is instead of failing on a nil match.
+	channel=channel:match("[Hh][Tt][Tt][Pp][Ss]?://(.*)$") or channel
+	local channelpath=common.split(channel,"/")
+
+	-- Bare relative links are treated as "./relative".
+	pre=string.sub(href,1,1)
+	if pre~="." and  pre~="/" then
+		href = "./"..href
+	end
+	pre=string.sub(href,1,2)
+	if pre==".." then
+		-- Splitting on the pattern "%./" yields one piece per "../" step plus
+		-- the remaining path; because common.split appends the delimiter text
+		-- literally, that last piece ends in a stray "%" which is cut off below.
+		local infopath=common.split(href,"%./")
+		for i=1,table.getn(infopath) do
+			-- Drop the last path segment (table.remove without a position
+			-- removes the final element).
+			table.remove(channelpath)
+		end
+		local tmp=""
+		for i=1,table.getn(channelpath) do
+			tmp=tmp..channelpath[i].."/"
+		end
+		local infourl = infopath[table.getn(infopath)]
+		href=prehttp..tmp..string.sub(infourl,0,string.len(infourl)-1)
+	elseif pre=="./" then
+		-- Same directory as the channel: drop the channel's last segment.
+		table.remove(channelpath)
+		local tmp=prehttp
+		for i=1,table.getn(channelpath) do
+			tmp=tmp..channelpath[i].."/"
+		end
+		href=tmp..string.sub(href,3)
+	elseif string.sub(href,0,1)=="/" then
+		-- Rooted path: host + path.
+		href=prehttp..channelpath[1]..href
+	else
+		href=prehttp..channelpath[1].."/"..href
+	end
+	return href
+end
+
+-- Split a string on a delimiter; identical contract to common.split, to
+-- which it delegates so the two cannot drift apart.
+--   returns: array of pieces, or nil when str is nil/empty or delimiter is nil
+function common.splitf(str, delimiter)
+	return common.split(str, delimiter)
+end
+
+
+-- Check extracted values against expectations.
+--   content: text handed to findMap
+--   update:  newline-separated rules of the form "<selector>==<expected>"
+--   returns: 0 when update is nil/empty or every rule matches, -1 when at
+--            least one rule's findMap result differs from its expected value.
+-- Lines without "==" (including blank lines and a trailing newline) are
+-- ignored.
+function common.checkUpdate(content,update)
+	if update == "" or update == nil then
+		return 0
+	end
+	local mismatch=false
+	for _,rule in pairs(common.splitf(update,"\n")) do
+		local pair=common.splitf(rule,"==")
+		-- splitf returns nil for an empty line; skip it rather than fail.
+		if pair~=nil and table.getn(pair)>1 then
+			local found=findMap({tmp=pair[1]},content)["tmp"]
+			if found~=pair[2] then
+				mismatch=true
+			end
+		end
+	end
+
+	if mismatch then
+		return -1
+	end
+	return 0
+end
+
+-- Get attachment titles.
+--   href:    unused here; kept so the signature matches the sibling helpers
+--   content: HTML to scan
+--   returns: table mapping the position k of each <a> (as enumerated by
+--            findListHtml) whose href ends in a known file extension to the
+--            text of that <a>. Keys of non-matching links are left unset, so
+--            the table can be sparse.
+function common.getEnclosureTitle(href,content)
+	local patterns = {"%.jpg$","%.JPG$","%.bid$","%.pdf$","%.doc$","%.docx$","%.xls$","%.xlsx$","%.zip$","%.rar$"}
+	local fileTitles = {}
+
+	local linkList = findListHtml("a", content)
+
+	for k in pairs(linkList) do
+		local selector = "a:eq("..tostring(k-1)..")"
+		-- Look the href up once per link instead of once per extension.
+		local linkHref = findOneText(selector..":attr(href)", content)
+		for _,pattern in ipairs(patterns) do
+			if string.find(linkHref, pattern) ~= nil then
+				local tempTitle = findOneText(selector, content)
+				fileTitles[k] = tempTitle
+				break
+			end
+		end
+	end
+
+	return fileTitles
+end
+
+-- Get attachment links by testing each entry returned by findListHtml.
+--   href:    page URL used to resolve relative links
+--   content: HTML to scan
+--   returns: table mapping the position k of each entry that ends in a known
+--            file extension to its href. The href is passed through
+--            common.gethref when it does not contain "http", and backslashes
+--            are replaced by "/". The table can be sparse.
+function common.getEnclosureHref(href,content)
+	local patterns = {"%.jpg$","%.JPG$","%.bid$","%.pdf$","%.doc$","%.docx$","%.xls$","%.xlsx$","%.zip$","%.rar$"}
+	local hrefs = {}
+
+	local linkList = findListHtml("a", content)
+
+	for k,v in pairs(linkList) do
+		local isFile = false
+		for _,pattern in ipairs(patterns) do
+			if string.find(v, pattern) ~= nil then
+				isFile = true
+				break
+			end
+		end
+
+		if isFile then
+			local tempHref = findOneText("a:eq("..tostring(k-1).."):attr(href)", content)
+			if string.find(tempHref, "http") == nil then
+				tempHref = common.gethref(href, tempHref)
+			end
+			tempHref = string.gsub(tempHref, "\\", "/")
+			hrefs[k] = tempHref
+		end
+	end
+	return hrefs
+end
+
+-- Get attachment links (variant 2): tests each link's href attribute rather
+-- than the entry returned by findListHtml.
+--   href:    page URL used to resolve relative links
+--   content: HTML to scan
+--   returns: table mapping the position k of each <a> whose href ends in a
+--            known file extension to that href. The href is passed through
+--            common.gethref when it does not contain "http", and backslashes
+--            are replaced by "/". The table can be sparse.
+function common.getEnclosureHrefByList(href,content)
+	local patterns = {"%.jpg$","%.JPG$","%.bid$","%.pdf$","%.doc$","%.docx$","%.xls$","%.xlsx$","%.zip$","%.rar$"}
+	local hrefs = {}
+
+	local linkList = findListHtml("a", content)
+
+	for k in pairs(linkList) do
+		-- Look the href up once per link; it is reused for the extension
+		-- test and for the stored value.
+		local tempHref = findOneText("a:eq("..tostring(k-1).."):attr(href)", content)
+		local isFile = false
+		for _,pattern in ipairs(patterns) do
+			if string.find(tempHref, pattern) ~= nil then
+				isFile = true
+				break
+			end
+		end
+
+		if isFile then
+			if string.find(tempHref, "http") == nil then
+				tempHref = common.gethref(href, tempHref)
+			end
+			tempHref = string.gsub(tempHref, "\\", "/")
+			hrefs[k] = tempHref
+		end
+	end
+	return hrefs
+end
+
+
+-- Download several attachments.
+--   fileNameArray: names, indexed by the same keys as fileLinkArray
+--   fileLinkArray: links to download
+--   returns: array of {url, filename, size, ftype, fid} records, one per
+--            download that produced a non-empty url and name.
+-- A download whose url comes back as "" is retried up to five more times;
+-- if it is still "" after that, the link is reported through saveErrLog.
+function common.getFileAttachmentsArray(fileNameArray,fileLinkArray)
+	local attachments = {}
+	for i,fileLink in pairs(fileLinkArray) do
+		local fileName = fileNameArray[i]
+		local url,name,size,ftype,fid=downloadFile(fileName, fileLink, "get",{},{},"")
+		local attempts=1
+		while url=="" and attempts<6 do
+			url,name,size,ftype,fid=downloadFile(fileName,fileLink,"get",{},{},"")
+			attempts=attempts+1
+		end
+		-- attempts only reaches 6 when every retry was used up.
+		if attempts==6 and url=="" then
+			saveErrLog(fileLink,"comm附件下载失败")
+		end
+		local downloaded = url~=nil and url~="" and name~=nil and name~=""
+		if downloaded then
+			table.insert(attachments, {
+				["url"]=url,
+				["filename"]=name,
+				["size"]=size,
+				["ftype"]=ftype,
+				["fid"]=fid
+			})
+		end
+	end
+	return attachments
+end
+
+-- Download all attachments of a page in one step: collects titles and links
+-- from content and hands them to common.getFileAttachmentsArray.
+--   href:    page URL used to resolve relative links
+--   content: HTML to scan
+--   returns: the attachment records produced by common.getFileAttachmentsArray
+-- The title and link tables are keyed by link position and can be sparse, so
+-- they are handled with next/pairs and entries are cleared in place; this
+-- keeps titles and links aligned on the same keys.
+function common.getFileAttachmentsArrayByHrefAndContent(href,content)
+	local fileTitles = common.getEnclosureTitle(href, content)
+	local fileLinks = common.getEnclosureHrefByList(href, content)
+	-- next() detects emptiness even when the first key is not 1.
+	if next(fileLinks) == nil then
+		fileLinks = common.getEnclosureHref(href, content)
+	end
+	-- Drop links whose title is empty. Clearing existing keys during a pairs
+	-- traversal is permitted and, unlike table.remove, shifts nothing.
+	for k,title in pairs(fileTitles) do
+		if title == "" then
+			fileTitles[k] = nil
+			fileLinks[k] = nil
+		end
+	end
+	local attachments = common.getFileAttachmentsArray(fileTitles, fileLinks)
+
+	return attachments
+end
+
+
+-- Attachment lookup restricted to one section of the page (collects title
+-- and href of each attachment link).
+--   href:    page URL used to resolve relative links
+--   tags:    selector of the section to search
+--   content: HTML to scan
+--   withend: true  - the file extension must be at the end of the href/title
+--            false - the extension may appear anywhere in the href/title
+--   returns: array of {href=..., title=...} items whose href or title matches
+--            one of the extensions in filetype and whose title is not empty
+filetype={"jpg","JPG","bid","pdf","PDF","doc","docx","xls","xlsx","zip","rar","swf","DOCX","DOC","PDF","XLS","XLSX","ZIP","RAR","SWF"}
+function common.getFilesLinkByTag(href,tags,content,withend)
+	local dhtml = findOneHtml(tags, content)
+	local alist = findListHtml(tags.." a", content)
+	-- "$" anchors the extension to the end of the text when withend is set.
+	local anchor = ""
+	if withend then
+		anchor = "$"
+	end
+	local flist={}
+	for k in pairs(alist) do
+		local selector="a:eq("..tostring(k-1)..")"
+		local item=findMap({href=selector..":attr(href)", title=selector},dhtml)
+		item["title"]=common.trim(tostring(item["title"]))
+		item["href"]=common.gethref(href,tostring(item["href"]))
+		item["href"] = string.gsub(item["href"], "\\", "/")
+		if string.find(item["href"], "http") == nil then
+			item["href"] = transCode("utf8",item["href"])
+		end
+		-- Try each extension against the href first, then against the title.
+		local statehref
+		for _,ftype in pairs(filetype) do
+			local pattern="%."..ftype..anchor
+			statehref=string.find(item["href"], pattern)
+			if statehref==nil then
+				statehref=string.find(item["title"], pattern)
+			end
+			if statehref then
+				break
+			end
+		end
+
+		if statehref~=nil and item["title"]~="" then
+			table.insert(flist,item)
+		end
+	end
+	return flist
+end
+
+-- Attachment download restricted to one section of the page (wrapper around
+-- common.getFilesLinkByTag plus downloadFile).
+--   href, tags, content, withend: passed through to common.getFilesLinkByTag
+--   returns: array with one record per link found:
+--     success: {url, filename, size, ftype, fid, org_url}
+--     failure: {filename, org_url}
+-- When the first download returns url "", it is retried up to six times; a
+-- retry counts as successful once both url and size are non-empty. If the
+-- sixth retry still fails, the link is reported through saveErrLog.
+function common.getFileAttachmentsArrayWithTag(href,tags,content,withend)
+	local attachments = {}
+	local titleAndHrefList = common.getFilesLinkByTag(href,tags,content,withend)
+	for _,link in ipairs(titleAndHrefList) do
+		local title = link["title"]
+		-- Original address of the attachment, reported in every record.
+		local init_url = link["href"]
+		local url,name,size,ftype,fid=downloadFile(title, init_url, "get",{},{},"")
+		if url == "" then
+			for attempt=1,6 do
+				url,name,size,ftype,fid=downloadFile(title,init_url,"get",{},{},"")
+				if url ~= "" and size ~= "" then
+					break   -- download succeeded, stop retrying
+				end
+				if attempt==6 then
+					saveErrLog(init_url,"comm附件下载失败")
+				end
+			end
+		end
+
+		-- Nothing came back at all: fall back to the link text as the name.
+		if url == "" and size == "" then
+			name = title
+		end
+
+		if type(url) ~= "string" then
+			url = ""
+		end
+
+		local file
+		if url~="" and name~=nil and name~="" and size ~= "" then
+			-- Download succeeded: full record.
+			file = {
+				["url"]=url,
+				["filename"]=name,
+				["size"]=size,
+				["ftype"]=ftype,
+				["fid"]=fid,
+				["org_url"] = init_url
+			}
+		else
+			-- Download failed: keep only the name and the original address.
+			file = {
+				["filename"]=name,
+				["org_url"] = init_url
+			}
+		end
+		table.insert(attachments, file)
+	end
+	return attachments
+end
+
+
+-- Remove HTML comments ("<!--" up to and including the next "-->") from
+-- content and return the remaining text.
+-- The closing marker is searched for after its opening marker, so a stray
+-- "-->" earlier in the text cannot be paired with a later "<!--".
+-- An opening marker with no closing marker is left in place together with
+-- everything after it, so no content is lost.
+function common.getPureContent(content)
+	local pieces = {}
+	local pos = 1
+	while true do
+		-- Plain (non-pattern) searches, starting where the last comment ended.
+		local startChar = string.find(content, "<!--", pos, true)
+		if startChar == nil then
+			break
+		end
+		local _,endChar = string.find(content, "-->", startChar+4, true)
+		if endChar == nil then
+			break
+		end
+		table.insert(pieces, string.sub(content, pos, startChar-1))
+		pos = endChar+1
+	end
+	table.insert(pieces, string.sub(content, pos))
+	return table.concat(pieces)
+end
+
+
+-- Extract an amount and its currency from free text.
+--   orgStr: text such as "人民币1,234.5万元"
+--   returns: amount, currency
+--     amount   - the first number in the text, multiplied by 10000 when the
+--                text contains "万", rounded to two decimals; 0 when the text
+--                contains no parsable number
+--     currency - "人民币" when the text contains "人民币", otherwise "美元"
+--                when it contains "美元", otherwise "人民币"
+function common.getMoneyAndType(orgStr)
+	orgStr = common.trim(orgStr)
+	-- Remove brackets and thousands separators (ASCII and full-width) so the
+	-- digits form one run. "(" and ")" are pattern metacharacters and must be
+	-- escaped with "%".
+	for _,pattern in ipairs({"%(","%)",",","（","）","，"}) do
+		orgStr = string.gsub(orgStr, pattern, "")
+	end
+
+	local multiplier = 1
+	if string.find(orgStr, "万") ~= nil then
+		multiplier = 10000
+	end
+
+	local moneyType = "人民币"
+	if string.find(orgStr, "人民币") == nil and string.find(orgStr, "美元") ~= nil then
+		moneyType = "美元"
+	end
+
+	-- Currency words and symbols contain no digits, so the first number can
+	-- be read directly without stripping them first.
+	local digits = string.match(orgStr, "[0-9]+%.*[0-9]*")
+	local num = nil
+	if digits ~= nil then
+		num = tonumber(digits)
+	end
+	if num == nil then
+		return 0, moneyType
+	end
+
+	local resNum = tonumber(string.format("%.2f", num*multiplier))
+	return resNum, moneyType
+end
+
+
+
+
 --通用方法结束
 return common;

+ 1 - 1
src/web/templates/admin/com_header.html

@@ -113,7 +113,7 @@
 <style>
 	.dataTable tr td {
 		/* for Firefox,mozilla */
-		max-width: 100px;
+		max-width: 50px;
 		overflow: hidden;
 		text-overflow:ellipsis;
 		white-space: nowrap