|
@@ -1,6 +1,7 @@
|
|
package luacheck
|
|
package luacheck
|
|
|
|
|
|
import (
|
|
import (
|
|
|
|
+ "encoding/json"
|
|
"fmt"
|
|
"fmt"
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/PuerkitoBio/goquery"
|
|
lua "github.com/yuin/gopher-lua"
|
|
lua "github.com/yuin/gopher-lua"
|
|
@@ -25,12 +26,6 @@ var (
|
|
reg_fileter_text = regexp.MustCompile("([<>《》[]()()【】\\[\\]『』。;、;,\\s\u3000\u2003\u00a0]+|(\\\\n)+(\\\\t)+)")
|
|
reg_fileter_text = regexp.MustCompile("([<>《》[]()()【】\\[\\]『』。;、;,\\s\u3000\u2003\u00a0]+|(\\\\n)+(\\\\t)+)")
|
|
reg_filetype = regexp.MustCompile(`\.(docx|gif|jpg|doc|pdf|rar|png|zip|gz|swf|xlsx|xls|wps|jpeg|javascript:void\(0\)(;))$`)
|
|
reg_filetype = regexp.MustCompile(`\.(docx|gif|jpg|doc|pdf|rar|png|zip|gz|swf|xlsx|xls|wps|jpeg|javascript:void\(0\)(;))$`)
|
|
reg_err_filetype = regexp.MustCompile(`(\.(jtbz|jxzf|tytbz|hbz|tbyj|et|tbz|rtf|dwg|bmp|htbz|qttbz|application|zbid|pptx|gef)$|^(#_|file:))`)
|
|
reg_err_filetype = regexp.MustCompile(`(\.(jtbz|jxzf|tytbz|hbz|tbyj|et|tbz|rtf|dwg|bmp|htbz|qttbz|application|zbid|pptx|gef)$|^(#_|file:))`)
|
|
- //全匹配无效内容
|
|
|
|
- reg_invalid_text = regexp.MustCompile(`^(\d{4}年\d{1,2}月\d{1,2}日|潜在供应商|递交|查看评论|flash插件|打印文章|收藏|请点击|更多|无|采购详细内容|申请履约保函|关于我们|返回|百度一下|登录(系统)?|查看|网站首页|(免费)?注册|其他|立即报名|我要(报价|投诉|投标|留言)|[\d.])$`)
|
|
|
|
- //包含无效关键词
|
|
|
|
- reg_filter_text1 = regexp.MustCompile(`(\.(jsp|jspx|aspx|home|com|cn|shtml|jhtml|chtml|html|htm)(/)?$|网站|政府|财产|得分|合同|业绩|负责人|页面|注意事项|注册|投诉|导航|登录|办理|请到原网|我要纠错|([\p{Han}]|\d)+[a-zA-z\d-]{5,}$|[上下首尾](一)?[页篇条]|跳转|详情请见原网站|详见(项目|公告)详情|原(文|公告)链接(地址)?|点击(报名|查看|查阅)(原公告(内容|详情))?|(点[击我])?(查看|查阅)(资质等级树|标的物详情|内容|公告|详情))`)
|
|
|
|
- //以关键词结尾
|
|
|
|
- reg_filter_text2 = regexp.MustCompile(`((公司|代理|单位|中心|采购办|机构|设计室|(事务|鉴定|研究|管理)所|(卫生|研究|法|设计|医)院|(工程|办事)[部处]|博物馆|工作站|幼儿园|学校|委员会|平台|局|队|[小中大]学)$|(\.{3}|…|管委会|指引|视频|主页|活动|指南|总结|核查|评审|投诉|磋商|调查|列表|处理|须知|审查|名单|需求书|确认书|规则|通知|评价|征询|咨询|采购意向|审计|招标|监理|监测|测量|钻探|测绘|服务|评估|公示|信息|采购|公告|勘察|施工|标段|工程|项目|编制|谈判|意见|设计|邀请函|审核|检测|(意见|建议)书?)$)`)
|
|
|
|
|
|
|
|
CheckLuaChan = make(chan map[string]interface{}, 1000)
|
|
CheckLuaChan = make(chan map[string]interface{}, 1000)
|
|
CheckLuaMap = make(map[string]bool)
|
|
CheckLuaMap = make(map[string]bool)
|
|
@@ -242,15 +237,15 @@ func luaDownload(pageNum int, lua map[string]interface{}, err, warn map[string]s
|
|
script = lua["luacontent"].(string)
|
|
script = lua["luacontent"].(string)
|
|
}
|
|
}
|
|
s := spider.CreateSpider(DOWNLOADNODE, script)
|
|
s := spider.CreateSpider(DOWNLOADNODE, script)
|
|
- s.SpiderMaxPage = SPIDER_MAXPAGENUM //采集列表页总页数
|
|
|
|
s.Timeout = 60
|
|
s.Timeout = 60
|
|
- result := map[int64][]map[string]interface{}{}
|
|
|
|
|
|
+ result := map[int][]map[string]interface{}{}
|
|
downloadNum := 0
|
|
downloadNum := 0
|
|
for page := 1; page <= SPIDER_MAXPAGENUM; page++ {
|
|
for page := 1; page <= SPIDER_MAXPAGENUM; page++ {
|
|
- for i := 1; i <= 3; i++ { //每页列表重试三次
|
|
|
|
|
|
+ for i := 1; i <= 2; i++ { //每页列表重试2次
|
|
s.SpiderStartPage = int64(page)
|
|
s.SpiderStartPage = int64(page)
|
|
|
|
+ s.SpiderMaxPage = s.SpiderStartPage
|
|
result_page, downloadNum_page, _ := s.DownListPageItem() //列表页采集结果
|
|
result_page, downloadNum_page, _ := s.DownListPageItem() //列表页采集结果
|
|
- result[s.SpiderStartPage] = result_page[s.SpiderStartPage]
|
|
|
|
|
|
+ result[page] = result_page[int64(page)]
|
|
if downloadNum_page > 0 {
|
|
if downloadNum_page > 0 {
|
|
downloadNum += downloadNum_page
|
|
downloadNum += downloadNum_page
|
|
break
|
|
break
|
|
@@ -269,7 +264,7 @@ func luaDownload(pageNum int, lua map[string]interface{}, err, warn map[string]s
|
|
}
|
|
}
|
|
|
|
|
|
// 列表页下载检测
|
|
// 列表页下载检测
|
|
-func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err, warn map[string]string, s *spider.Spider) {
|
|
|
|
|
|
+func listResultCheck(pageNum int, lists map[int][]map[string]interface{}, err, warn map[string]string, s *spider.Spider) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
//翻页
|
|
//翻页
|
|
if pageNum > 1 && len(lists) < SPIDER_MAXPAGENUM {
|
|
if pageNum > 1 && len(lists) < SPIDER_MAXPAGENUM {
|
|
@@ -277,13 +272,16 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
|
|
} else if pageNum == 1 && len(lists) > 1 {
|
|
} else if pageNum == 1 && len(lists) > 1 {
|
|
warn["列表页翻页异常"] = "列表页可以翻页,最大页却为1"
|
|
warn["列表页翻页异常"] = "列表页可以翻页,最大页却为1"
|
|
}
|
|
}
|
|
- var fileNum, downloadOkNum int //附件下载量,详情页下载量
|
|
|
|
|
|
+ var fileNum, downloadOkNum, fileFailedNum int //附件下载量,详情页下载量,附件下载失败量
|
|
n := 0
|
|
n := 0
|
|
wg := &sync.WaitGroup{}
|
|
wg := &sync.WaitGroup{}
|
|
lock := &sync.Mutex{}
|
|
lock := &sync.Mutex{}
|
|
ch := make(chan bool, 10)
|
|
ch := make(chan bool, 10)
|
|
|
|
+ hrefs := make([][]string, 2) //记录前两页数据链接
|
|
for _, list := range lists {
|
|
for _, list := range lists {
|
|
|
|
+ pageHrefs := []string{}
|
|
for _, l := range list {
|
|
for _, l := range list {
|
|
|
|
+ pageHrefs = append(pageHrefs, qu.ObjToString(l["href"]))
|
|
if n > 50 {
|
|
if n > 50 {
|
|
break
|
|
break
|
|
}
|
|
}
|
|
@@ -298,20 +296,8 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
|
|
spTmp := spider.CreateSpider(DOWNLOADNODE, s.ScriptFile)
|
|
spTmp := spider.CreateSpider(DOWNLOADNODE, s.ScriptFile)
|
|
warnTmp := map[string]string{}
|
|
warnTmp := map[string]string{}
|
|
errTmp := map[string]string{}
|
|
errTmp := map[string]string{}
|
|
- fileNumTmp := 0
|
|
|
|
- downloadOkNumTmp := 0
|
|
|
|
- //校验title
|
|
|
|
|
|
+ var fileNumTmp, fileFailedNumTmp, downloadOkNumTmp int
|
|
title := qu.ObjToString(tmp["title"])
|
|
title := qu.ObjToString(tmp["title"])
|
|
- titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //敏感词匹配乱码集
|
|
|
|
- if len(titleRandomArr) > 0 {
|
|
|
|
- warnTmp["列表页title含乱码"] = "列表页title含乱码(" + title + "):" + strings.Join(titleRandomArr, "")
|
|
|
|
- }
|
|
|
|
- if !util.TitleHanReg.MatchString(title) {
|
|
|
|
- warnTmp["列表页title无汉字"] = "列表页title中无汉字(" + title + "):"
|
|
|
|
- } else if str := util.TitleFilterReg.FindString(title); str != "" {
|
|
|
|
- str = "列表页title中包含异常信息(" + title + "):" + str
|
|
|
|
- errTmp["列表页title中含异常信息"] = str
|
|
|
|
- }
|
|
|
|
//校验发布时间
|
|
//校验发布时间
|
|
publishtime := qu.ObjToString(tmp["publishtime"])
|
|
publishtime := qu.ObjToString(tmp["publishtime"])
|
|
if publishtime == "0" || publishtime == "" {
|
|
if publishtime == "0" || publishtime == "" {
|
|
@@ -328,12 +314,13 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
|
|
errTmp["列表页链接异常"] = "公告链接存在异常后缀(" + title + "):" + str
|
|
errTmp["列表页链接异常"] = "公告链接存在异常后缀(" + title + "):" + str
|
|
}
|
|
}
|
|
//详情页下载校验
|
|
//详情页下载校验
|
|
- downloadDetail(&fileNumTmp, &downloadOkNumTmp, errTmp, warnTmp, tmp, spTmp)
|
|
|
|
|
|
+ downloadDetail(&fileNumTmp, &fileFailedNumTmp, &downloadOkNumTmp, errTmp, warnTmp, tmp, spTmp)
|
|
spTmp.L.Close()
|
|
spTmp.L.Close()
|
|
|
|
|
|
lock.Lock()
|
|
lock.Lock()
|
|
fileNum += fileNumTmp
|
|
fileNum += fileNumTmp
|
|
downloadOkNum += downloadOkNumTmp
|
|
downloadOkNum += downloadOkNumTmp
|
|
|
|
+ fileFailedNum += fileFailedNumTmp
|
|
for k, v := range warnTmp {
|
|
for k, v := range warnTmp {
|
|
warn[k] = v
|
|
warn[k] = v
|
|
}
|
|
}
|
|
@@ -343,18 +330,29 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
|
|
lock.Unlock()
|
|
lock.Unlock()
|
|
}(l)
|
|
}(l)
|
|
}
|
|
}
|
|
|
|
+ hrefs = append(hrefs, pageHrefs)
|
|
}
|
|
}
|
|
wg.Wait()
|
|
wg.Wait()
|
|
|
|
+ if len(hrefs) >= 4 { //make预置了两个nil元素,追加的前两页链接位于下标2、3;不足两页不比较
|
|
+ hrefByte1, _ := json.Marshal(hrefs[2])
|
|
+ hrefByte2, _ := json.Marshal(hrefs[3])
|
|
+ if sp.GetHashKey(hrefByte1) == sp.GetHashKey(hrefByte2) {
|
|
+ err["前两页数据一致"] = "前两页数据一致"
|
|
+ }
|
|
+ }
|
|
if fileNum == 0 {
|
|
if fileNum == 0 {
|
|
warn["未下载任何附件"] = "未下载任何附件"
|
|
warn["未下载任何附件"] = "未下载任何附件"
|
|
}
|
|
}
|
|
|
|
+ if fileFailedNum > 0 {
|
|
|
|
+ rate := float64(fileFailedNum) / float64(fileNum)
|
|
|
|
+ warn["部分附件未成功下载"] = "部分附件未成功下载,一共" + fmt.Sprint(fileNum) + "个,失败占比:" + fmt.Sprintf("%.1f", rate) + "(" + warn["附件未成功下载"] + ")"
|
|
|
|
+ }
|
|
if n := len(lists) - downloadOkNum; n > 0 {
|
|
if n := len(lists) - downloadOkNum; n > 0 {
|
|
- warn["部分详情页未下载成功"] = "部分详情页未成功下载,失败" + fmt.Sprint(n) + "条"
|
|
|
|
|
|
+ rate := float64(n) / float64(len(lists))
|
|
|
|
+ warn["部分详情页未下载成功"] = "部分详情页未成功下载,一共" + fmt.Sprint(len(lists)) + "条,失败占比:" + fmt.Sprintf("%.1f", rate) + "(" + warn["详情页下载失败"] + ")"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// 详情页下载校验
|
|
// 详情页下载校验
|
|
-func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, list map[string]interface{}, s *spider.Spider) {
|
|
|
|
|
|
+func downloadDetail(fileNum, fileFailedNumTmp, downloadOkNum *int, err, warn map[string]string, list map[string]interface{}, s *spider.Spider) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
param := map[string]string{}
|
|
param := map[string]string{}
|
|
data := map[string]interface{}{} //详情页下载结果集
|
|
data := map[string]interface{}{} //详情页下载结果集
|
|
@@ -379,8 +377,9 @@ func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, li
|
|
getFile = true
|
|
getFile = true
|
|
warn["附件含乱码"] = "部分附件中含有乱码(" + qu.ObjToString(param["href"]) + ")"
|
|
warn["附件含乱码"] = "部分附件中含有乱码(" + qu.ObjToString(param["href"]) + ")"
|
|
} else if qu.ObjToString(tmpMap["org_url"]) != "" && qu.ObjToString(tmpMap["fid"]) == "" { //附件未下载成功
|
|
} else if qu.ObjToString(tmpMap["org_url"]) != "" && qu.ObjToString(tmpMap["fid"]) == "" { //附件未下载成功
|
|
- warn["部分附件未成功下载"] = "部分附件未成功下载(" + qu.ObjToString(param["href"]) + ")"
|
|
|
|
|
|
+ warn["附件未成功下载"] = qu.ObjToString(param["href"])
|
|
getFile = true
|
|
getFile = true
|
|
|
|
+ *fileFailedNumTmp++
|
|
} else if qu.ObjToString(tmpMap["fid"]) != "" {
|
|
} else if qu.ObjToString(tmpMap["fid"]) != "" {
|
|
getFile = true
|
|
getFile = true
|
|
}
|
|
}
|
|
@@ -402,13 +401,14 @@ func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, li
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if !downloadDetailOk {
|
|
if !downloadDetailOk {
|
|
- warn["详情页下载失败"] = "详情页下载失败(" + qu.ObjToString(param["href"]) + ")"
|
|
|
|
|
|
+ warn["详情页下载失败"] = qu.ObjToString(param["href"])
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// 详情页下载检测
|
|
// 详情页下载检测
|
|
func detailResultCheck(data map[string]interface{}, warn, err map[string]string) {
|
|
func detailResultCheck(data map[string]interface{}, warn, err map[string]string) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
|
|
+ href := qu.ObjToString(data["href"])
|
|
//校验s_title
|
|
//校验s_title
|
|
if data["s_title"] == nil {
|
|
if data["s_title"] == nil {
|
|
err["s_title缺失"] = "详情页s_title缺失"
|
|
err["s_title缺失"] = "详情页s_title缺失"
|
|
@@ -417,18 +417,17 @@ func detailResultCheck(data map[string]interface{}, warn, err map[string]string)
|
|
s_title := qu.ObjToString(data["s_title"])
|
|
s_title := qu.ObjToString(data["s_title"])
|
|
s_titleRandomArr := util.RandomDFA.CheckSensitiveWord(s_title) //敏感词匹配乱码集
|
|
s_titleRandomArr := util.RandomDFA.CheckSensitiveWord(s_title) //敏感词匹配乱码集
|
|
if len(s_titleRandomArr) > 0 {
|
|
if len(s_titleRandomArr) > 0 {
|
|
- warn["s_title含乱码"] = "s_title含乱码:" + strings.Join(s_titleRandomArr, "")
|
|
|
|
|
|
+ warn["s_title含乱码"] = "s_title含乱码(" + href + "):" + strings.Join(s_titleRandomArr, "")
|
|
}
|
|
}
|
|
if !util.TitleHanReg.MatchString(s_title) {
|
|
if !util.TitleHanReg.MatchString(s_title) {
|
|
warn["s_title无汉字"] = "s_title中无汉字"
|
|
warn["s_title无汉字"] = "s_title中无汉字"
|
|
} else if str := util.TitleFilterReg.FindString(s_title); str != "" {
|
|
} else if str := util.TitleFilterReg.FindString(s_title); str != "" {
|
|
- str = "s_title中包含异常信息:" + str
|
|
|
|
|
|
+ str = "s_title中包含异常信息(" + href + "):" + str
|
|
err["s_title中含异常信息"] = str
|
|
err["s_title中含异常信息"] = str
|
|
}
|
|
}
|
|
}
|
|
}
|
|
//校验title
|
|
//校验title
|
|
title := qu.ObjToString(data["title"])
|
|
title := qu.ObjToString(data["title"])
|
|
- href := qu.ObjToString(data["href"])
|
|
|
|
titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //敏感词匹配乱码集
|
|
titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //敏感词匹配乱码集
|
|
if len(titleRandomArr) > 0 {
|
|
if len(titleRandomArr) > 0 {
|
|
warn["详情页title含乱码"] = "详情页title含乱码(" + href + "):" + strings.Join(titleRandomArr, "")
|
|
warn["详情页title含乱码"] = "详情页title含乱码(" + href + "):" + strings.Join(titleRandomArr, "")
|
|
@@ -548,7 +547,7 @@ func findFileInfo(url, text string) (result []*Data) {
|
|
By: "text",
|
|
By: "text",
|
|
FileType: strings.ReplaceAll(fileType, ".", ""),
|
|
FileType: strings.ReplaceAll(fileType, ".", ""),
|
|
})
|
|
})
|
|
- } else {
|
|
|
|
|
|
+ } /*else {
|
|
//textStr = reg_fileter_text.ReplaceAllString(textStr, "") //过滤无效字符
|
|
//textStr = reg_fileter_text.ReplaceAllString(textStr, "") //过滤无效字符
|
|
if reg_invalid_text.ReplaceAllString(text, "") == "" { //无效,全文本匹配,舍弃
|
|
if reg_invalid_text.ReplaceAllString(text, "") == "" { //无效,全文本匹配,舍弃
|
|
return
|
|
return
|
|
@@ -560,6 +559,6 @@ func findFileInfo(url, text string) (result []*Data) {
|
|
Text: text,
|
|
Text: text,
|
|
By: "filter",
|
|
By: "filter",
|
|
})
|
|
})
|
|
- }
|
|
|
|
|
|
+ }*/
|
|
return
|
|
return
|
|
}
|
|
}
|