|
@@ -1,6 +1,7 @@
|
|
package luacheck
|
|
package luacheck
|
|
|
|
|
|
import (
|
|
import (
|
|
|
|
+ "encoding/json"
|
|
"fmt"
|
|
"fmt"
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/PuerkitoBio/goquery"
|
|
lua "github.com/yuin/gopher-lua"
|
|
lua "github.com/yuin/gopher-lua"
|
|
@@ -25,12 +26,6 @@ var (
|
|
reg_fileter_text = regexp.MustCompile("([<>《》[]()()【】\\[\\]『』。;、;,\\s\u3000\u2003\u00a0]+|(\\\\n)+(\\\\t)+)")
|
|
reg_fileter_text = regexp.MustCompile("([<>《》[]()()【】\\[\\]『』。;、;,\\s\u3000\u2003\u00a0]+|(\\\\n)+(\\\\t)+)")
|
|
reg_filetype = regexp.MustCompile(`\.(docx|gif|jpg|doc|pdf|rar|png|zip|gz|swf|xlsx|xls|wps|jpeg|javascript:void\(0\)(;))$`)
|
|
reg_filetype = regexp.MustCompile(`\.(docx|gif|jpg|doc|pdf|rar|png|zip|gz|swf|xlsx|xls|wps|jpeg|javascript:void\(0\)(;))$`)
|
|
reg_err_filetype = regexp.MustCompile(`(\.(jtbz|jxzf|tytbz|hbz|tbyj|et|tbz|rtf|dwg|bmp|htbz|qttbz|application|zbid|pptx|gef)$|^(#_|file:))`)
|
|
reg_err_filetype = regexp.MustCompile(`(\.(jtbz|jxzf|tytbz|hbz|tbyj|et|tbz|rtf|dwg|bmp|htbz|qttbz|application|zbid|pptx|gef)$|^(#_|file:))`)
|
|
- //全匹配无效内容
|
|
|
|
- reg_invalid_text = regexp.MustCompile(`^(\d{4}年\d{1,2}月\d{1,2}日|潜在供应商|递交|查看评论|flash插件|打印文章|收藏|请点击|更多|无|采购详细内容|申请履约保函|关于我们|返回|百度一下|登录(系统)?|查看|网站首页|(免费)?注册|其他|立即报名|我要(报价|投诉|投标|留言)|[\d.])$`)
|
|
|
|
- //包含无效关键词
|
|
|
|
- reg_filter_text1 = regexp.MustCompile(`(\.(jsp|jspx|aspx|home|com|cn|shtml|jhtml|chtml|html|htm)(/)?$|网站|政府|财产|得分|合同|业绩|负责人|页面|注意事项|注册|投诉|导航|登录|办理|请到原网|我要纠错|([\p{Han}]|\d)+[a-zA-z\d-]{5,}$|[上下首尾](一)?[页篇条]|跳转|详情请见原网站|详见(项目|公告)详情|原(文|公告)链接(地址)?|点击(报名|查看|查阅)(原公告(内容|详情))?|(点[击我])?(查看|查阅)(资质等级树|标的物详情|内容|公告|详情))`)
|
|
|
|
- //以关键词结尾
|
|
|
|
- reg_filter_text2 = regexp.MustCompile(`((公司|代理|单位|中心|采购办|机构|设计室|(事务|鉴定|研究|管理)所|(卫生|研究|法|设计|医)院|(工程|办事)[部处]|博物馆|工作站|幼儿园|学校|委员会|平台|局|队|[小中大]学)$|(\.{3}|…|管委会|指引|视频|主页|活动|指南|总结|核查|评审|投诉|磋商|调查|列表|处理|须知|审查|名单|需求书|确认书|规则|通知|评价|征询|咨询|采购意向|审计|招标|监理|监测|测量|钻探|测绘|服务|评估|公示|信息|采购|公告|勘察|施工|标段|工程|项目|编制|谈判|意见|设计|邀请函|审核|检测|(意见|建议)书?)$)`)
|
|
|
|
|
|
|
|
CheckLuaChan = make(chan map[string]interface{}, 1000)
|
|
CheckLuaChan = make(chan map[string]interface{}, 1000)
|
|
CheckLuaMap = make(map[string]bool)
|
|
CheckLuaMap = make(map[string]bool)
|
|
@@ -242,15 +237,15 @@ func luaDownload(pageNum int, lua map[string]interface{}, err, warn map[string]s
|
|
script = lua["luacontent"].(string)
|
|
script = lua["luacontent"].(string)
|
|
}
|
|
}
|
|
s := spider.CreateSpider(DOWNLOADNODE, script)
|
|
s := spider.CreateSpider(DOWNLOADNODE, script)
|
|
- s.SpiderMaxPage = SPIDER_MAXPAGENUM //采集列表页总页数
|
|
|
|
s.Timeout = 60
|
|
s.Timeout = 60
|
|
- result := map[int64][]map[string]interface{}{}
|
|
|
|
|
|
+ result := map[int][]map[string]interface{}{}
|
|
downloadNum := 0
|
|
downloadNum := 0
|
|
for page := 1; page <= SPIDER_MAXPAGENUM; page++ {
|
|
for page := 1; page <= SPIDER_MAXPAGENUM; page++ {
|
|
- for i := 1; i <= 3; i++ { //每页列表重试三次
|
|
|
|
|
|
+ for i := 1; i <= 2; i++ { //每页列表重试2次
|
|
s.SpiderStartPage = int64(page)
|
|
s.SpiderStartPage = int64(page)
|
|
|
|
+ s.SpiderMaxPage = s.SpiderStartPage
|
|
result_page, downloadNum_page, _ := s.DownListPageItem() //列表页采集结果
|
|
result_page, downloadNum_page, _ := s.DownListPageItem() //列表页采集结果
|
|
- result[s.SpiderStartPage] = result_page[s.SpiderStartPage]
|
|
|
|
|
|
+ result[page] = result_page[int64(page)]
|
|
if downloadNum_page > 0 {
|
|
if downloadNum_page > 0 {
|
|
downloadNum += downloadNum_page
|
|
downloadNum += downloadNum_page
|
|
break
|
|
break
|
|
@@ -269,7 +264,7 @@ func luaDownload(pageNum int, lua map[string]interface{}, err, warn map[string]s
|
|
}
|
|
}
|
|
|
|
|
|
// 列表页下载检测
|
|
// 列表页下载检测
|
|
-func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err, warn map[string]string, s *spider.Spider) {
|
|
|
|
|
|
+func listResultCheck(pageNum int, lists map[int][]map[string]interface{}, err, warn map[string]string, s *spider.Spider) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
//翻页
|
|
//翻页
|
|
if pageNum > 1 && len(lists) < SPIDER_MAXPAGENUM {
|
|
if pageNum > 1 && len(lists) < SPIDER_MAXPAGENUM {
|
|
@@ -277,13 +272,16 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
|
|
} else if pageNum == 1 && len(lists) > 1 {
|
|
} else if pageNum == 1 && len(lists) > 1 {
|
|
warn["列表页翻页异常"] = "列表页可以翻页,最大页却为1"
|
|
warn["列表页翻页异常"] = "列表页可以翻页,最大页却为1"
|
|
}
|
|
}
|
|
- var fileNum, downloadOkNum int //附件下载量,详情页下载量
|
|
|
|
|
|
+ var fileNum, downloadOkNum, fileFailedNum int //附件下载量,详情页下载量,附件下载失败量
|
|
n := 0
|
|
n := 0
|
|
wg := &sync.WaitGroup{}
|
|
wg := &sync.WaitGroup{}
|
|
lock := &sync.Mutex{}
|
|
lock := &sync.Mutex{}
|
|
ch := make(chan bool, 10)
|
|
ch := make(chan bool, 10)
|
|
|
|
+ hrefs := make([][]string, 2) //记录前两页数据链接
|
|
for _, list := range lists {
|
|
for _, list := range lists {
|
|
|
|
+ pageHrefs := []string{}
|
|
for _, l := range list {
|
|
for _, l := range list {
|
|
|
|
+ pageHrefs = append(pageHrefs, qu.ObjToString(l["href"]))
|
|
if n > 50 {
|
|
if n > 50 {
|
|
break
|
|
break
|
|
}
|
|
}
|
|
@@ -298,20 +296,8 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
|
|
spTmp := spider.CreateSpider(DOWNLOADNODE, s.ScriptFile)
|
|
spTmp := spider.CreateSpider(DOWNLOADNODE, s.ScriptFile)
|
|
warnTmp := map[string]string{}
|
|
warnTmp := map[string]string{}
|
|
errTmp := map[string]string{}
|
|
errTmp := map[string]string{}
|
|
- fileNumTmp := 0
|
|
|
|
- downloadOkNumTmp := 0
|
|
|
|
- //校验title
|
|
|
|
|
|
+ var fileNumTmp, fileFailedNumTmp, downloadOkNumTmp int
|
|
title := qu.ObjToString(tmp["title"])
|
|
title := qu.ObjToString(tmp["title"])
|
|
- titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //敏感词匹配乱码集
|
|
|
|
- if len(titleRandomArr) > 0 {
|
|
|
|
- warnTmp["列表页title含乱码"] = "列表页title含乱码(" + title + "):" + strings.Join(titleRandomArr, "")
|
|
|
|
- }
|
|
|
|
- if !util.TitleHanReg.MatchString(title) {
|
|
|
|
- warnTmp["列表页title无汉字"] = "列表页title中无汉字(" + title + "):"
|
|
|
|
- } else if str := util.TitleFilterReg.FindString(title); str != "" {
|
|
|
|
- str = "列表页title中包含异常信息(" + title + "):" + str
|
|
|
|
- errTmp["列表页title中含异常信息"] = str
|
|
|
|
- }
|
|
|
|
//校验发布时间
|
|
//校验发布时间
|
|
publishtime := qu.ObjToString(tmp["publishtime"])
|
|
publishtime := qu.ObjToString(tmp["publishtime"])
|
|
if publishtime == "0" || publishtime == "" {
|
|
if publishtime == "0" || publishtime == "" {
|
|
@@ -328,12 +314,13 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
|
|
errTmp["列表页链接异常"] = "公告链接存在异常后缀(" + title + "):" + str
|
|
errTmp["列表页链接异常"] = "公告链接存在异常后缀(" + title + "):" + str
|
|
}
|
|
}
|
|
//详情页下载校验
|
|
//详情页下载校验
|
|
- downloadDetail(&fileNumTmp, &downloadOkNumTmp, errTmp, warnTmp, tmp, spTmp)
|
|
|
|
|
|
+ downloadDetail(&fileNumTmp, &fileFailedNumTmp, &downloadOkNumTmp, errTmp, warnTmp, tmp, spTmp)
|
|
spTmp.L.Close()
|
|
spTmp.L.Close()
|
|
|
|
|
|
lock.Lock()
|
|
lock.Lock()
|
|
fileNum += fileNumTmp
|
|
fileNum += fileNumTmp
|
|
downloadOkNum += downloadOkNumTmp
|
|
downloadOkNum += downloadOkNumTmp
|
|
|
|
+ fileFailedNum += fileFailedNumTmp
|
|
for k, v := range warnTmp {
|
|
for k, v := range warnTmp {
|
|
warn[k] = v
|
|
warn[k] = v
|
|
}
|
|
}
|
|
@@ -343,18 +330,29 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
|
|
lock.Unlock()
|
|
lock.Unlock()
|
|
}(l)
|
|
}(l)
|
|
}
|
|
}
|
|
|
|
+ hrefs = append(hrefs, pageHrefs)
|
|
}
|
|
}
|
|
wg.Wait()
|
|
wg.Wait()
|
|
|
|
+ if len(hrefs) >= 4 { //make预置了两个nil元素,追加的前两页链接位于下标2、3;不足两页不比较
|
|
+ hrefByte1, _ := json.Marshal(hrefs[2])
|
|
+ hrefByte2, _ := json.Marshal(hrefs[3])
|
|
+ if sp.GetHashKey(hrefByte1) == sp.GetHashKey(hrefByte2) {
|
|
+ err["前两页数据一致"] = "前两页数据一致"
|
|
+ }
|
|
+ }
|
|
if fileNum == 0 {
|
|
if fileNum == 0 {
|
|
warn["未下载任何附件"] = "未下载任何附件"
|
|
warn["未下载任何附件"] = "未下载任何附件"
|
|
}
|
|
}
|
|
|
|
+ if fileFailedNum > 0 {
|
|
|
|
+ rate := float64(fileFailedNum) / float64(fileNum)
|
|
|
|
+ warn["部分附件未成功下载"] = "部分附件未成功下载,一共" + fmt.Sprint(fileNum) + "个,失败占比:" + fmt.Sprintf("%.1f", rate) + "(" + warn["附件未成功下载"] + ")"
|
|
|
|
+ }
|
|
if n := len(lists) - downloadOkNum; n > 0 {
|
|
if n := len(lists) - downloadOkNum; n > 0 {
|
|
- warn["部分详情页未下载成功"] = "部分详情页未成功下载,失败" + fmt.Sprint(n) + "条"
|
|
|
|
|
|
+ rate := float64(n) / float64(len(lists))
|
|
|
|
+ warn["部分详情页未下载成功"] = "部分详情页未成功下载,一共" + fmt.Sprint(len(lists)) + "条,失败占比:" + fmt.Sprintf("%.1f", rate) + "(" + warn["详情页下载失败"] + ")"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// 详情页下载校验
|
|
// 详情页下载校验
|
|
-func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, list map[string]interface{}, s *spider.Spider) {
|
|
|
|
|
|
+func downloadDetail(fileNum, fileFailedNumTmp, downloadOkNum *int, err, warn map[string]string, list map[string]interface{}, s *spider.Spider) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
param := map[string]string{}
|
|
param := map[string]string{}
|
|
data := map[string]interface{}{} //详情页下载结果集
|
|
data := map[string]interface{}{} //详情页下载结果集
|
|
@@ -379,8 +377,9 @@ func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, li
|
|
getFile = true
|
|
getFile = true
|
|
warn["附件含乱码"] = "部分附件中含有乱码(" + qu.ObjToString(param["href"]) + ")"
|
|
warn["附件含乱码"] = "部分附件中含有乱码(" + qu.ObjToString(param["href"]) + ")"
|
|
} else if qu.ObjToString(tmpMap["org_url"]) != "" && qu.ObjToString(tmpMap["fid"]) == "" { //附件未下载成功
|
|
} else if qu.ObjToString(tmpMap["org_url"]) != "" && qu.ObjToString(tmpMap["fid"]) == "" { //附件未下载成功
|
|
- warn["部分附件未成功下载"] = "部分附件未成功下载(" + qu.ObjToString(param["href"]) + ")"
|
|
|
|
|
|
+ warn["附件未成功下载"] = qu.ObjToString(param["href"])
|
|
getFile = true
|
|
getFile = true
|
|
|
|
+ *fileFailedNumTmp++
|
|
} else if qu.ObjToString(tmpMap["fid"]) != "" {
|
|
} else if qu.ObjToString(tmpMap["fid"]) != "" {
|
|
getFile = true
|
|
getFile = true
|
|
}
|
|
}
|
|
@@ -402,13 +401,14 @@ func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, li
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if !downloadDetailOk {
|
|
if !downloadDetailOk {
|
|
- warn["详情页下载失败"] = "详情页下载失败(" + qu.ObjToString(param["href"]) + ")"
|
|
|
|
|
|
+ warn["详情页下载失败"] = qu.ObjToString(param["href"])
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// 详情页下载检测
|
|
// 详情页下载检测
|
|
func detailResultCheck(data map[string]interface{}, warn, err map[string]string) {
|
|
func detailResultCheck(data map[string]interface{}, warn, err map[string]string) {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
|
|
+ href := qu.ObjToString(data["href"])
|
|
//校验s_title
|
|
//校验s_title
|
|
if data["s_title"] == nil {
|
|
if data["s_title"] == nil {
|
|
err["s_title缺失"] = "详情页s_title缺失"
|
|
err["s_title缺失"] = "详情页s_title缺失"
|
|
@@ -417,18 +417,17 @@ func detailResultCheck(data map[string]interface{}, warn, err map[string]string)
|
|
s_title := qu.ObjToString(data["s_title"])
|
|
s_title := qu.ObjToString(data["s_title"])
|
|
s_titleRandomArr := util.RandomDFA.CheckSensitiveWord(s_title) //敏感词匹配乱码集
|
|
s_titleRandomArr := util.RandomDFA.CheckSensitiveWord(s_title) //敏感词匹配乱码集
|
|
if len(s_titleRandomArr) > 0 {
|
|
if len(s_titleRandomArr) > 0 {
|
|
- warn["s_title含乱码"] = "s_title含乱码:" + strings.Join(s_titleRandomArr, "")
|
|
|
|
|
|
+ warn["s_title含乱码"] = "s_title含乱码(" + href + "):" + strings.Join(s_titleRandomArr, "")
|
|
}
|
|
}
|
|
if !util.TitleHanReg.MatchString(s_title) {
|
|
if !util.TitleHanReg.MatchString(s_title) {
|
|
warn["s_title无汉字"] = "s_title中无汉字"
|
|
warn["s_title无汉字"] = "s_title中无汉字"
|
|
} else if str := util.TitleFilterReg.FindString(s_title); str != "" {
|
|
} else if str := util.TitleFilterReg.FindString(s_title); str != "" {
|
|
- str = "s_title中包含异常信息:" + str
|
|
|
|
|
|
+ str = "s_title中包含异常信息(" + href + "):" + str
|
|
err["s_title中含异常信息"] = str
|
|
err["s_title中含异常信息"] = str
|
|
}
|
|
}
|
|
}
|
|
}
|
|
//校验title
|
|
//校验title
|
|
title := qu.ObjToString(data["title"])
|
|
title := qu.ObjToString(data["title"])
|
|
- href := qu.ObjToString(data["href"])
|
|
|
|
titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //敏感词匹配乱码集
|
|
titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //敏感词匹配乱码集
|
|
if len(titleRandomArr) > 0 {
|
|
if len(titleRandomArr) > 0 {
|
|
warn["详情页title含乱码"] = "详情页title含乱码(" + href + "):" + strings.Join(titleRandomArr, "")
|
|
warn["详情页title含乱码"] = "详情页title含乱码(" + href + "):" + strings.Join(titleRandomArr, "")
|
|
@@ -548,7 +547,7 @@ func findFileInfo(url, text string) (result []*Data) {
|
|
By: "text",
|
|
By: "text",
|
|
FileType: strings.ReplaceAll(fileType, ".", ""),
|
|
FileType: strings.ReplaceAll(fileType, ".", ""),
|
|
})
|
|
})
|
|
- } else {
|
|
|
|
|
|
+ } /*else {
|
|
//textStr = reg_fileter_text.ReplaceAllString(textStr, "") //过滤无效字符
|
|
//textStr = reg_fileter_text.ReplaceAllString(textStr, "") //过滤无效字符
|
|
if reg_invalid_text.ReplaceAllString(text, "") == "" { //无效,全文本匹配,舍弃
|
|
if reg_invalid_text.ReplaceAllString(text, "") == "" { //无效,全文本匹配,舍弃
|
|
return
|
|
return
|
|
@@ -560,6 +559,6 @@ func findFileInfo(url, text string) (result []*Data) {
|
|
Text: text,
|
|
Text: text,
|
|
By: "filter",
|
|
By: "filter",
|
|
})
|
|
})
|
|
- }
|
|
|
|
|
|
+ }*/
|
|
return
|
|
return
|
|
}
|
|
}
|