
Automated check revisions

mxs 1 year ago
parent commit e1fe1c4f87
4 files changed, 48 insertions(+), 49 deletions(-)
  1. src/front/spider.go       +3  -3
  2. src/luacheck/luacheck.go  +34 -35
  3. src/timetask/timetask.go  +1  -1
  4. src/util/util.go          +10 -10

+ 3 - 3
src/front/spider.go

@@ -801,10 +801,10 @@ func LuaTextCheck(infoformat int, param map[string]interface{}, param_list_chrom
 			errmsg += "详情页代码中含有lua原生方法;"
 		}
 		if u.ListFilterReg.MatchString(detail) && !strings.Contains(detail, "delete") { //detail page matches the filter regex but lacks data["delete"]="true"
-			errmsg += `三级页缺少data["delete"]="true";`
+			warnmsg += `详情页缺少data["delete"]="true";`
 		}
 		if !strings.Contains(detail, "s_title") {
-			errmsg += "三级页缺少s_title;"
+			errmsg += "详情页缺少s_title;"
 		}
 		if strings.Contains(detail, "downloadByChrome") { //check action params of the chrome download method
 			for _, act := range param_content_chrome {
@@ -816,7 +816,7 @@ func LuaTextCheck(infoformat int, param map[string]interface{}, param_list_chrom
 	}
 	//2. advisory checks
 	if !strings.Contains(detail, "downloadFile") && !strings.Contains(detail, "getFileAttachmentsArrayWithTag") {
-		warnmsg += "三级页缺少下载附件方法;"
+		warnmsg += "详情页缺少下载附件方法;"
 	}
 	msgResult["warn"] += warnmsg
 	msgResult["err"] += errmsg

+ 34 - 35
src/luacheck/luacheck.go

@@ -1,6 +1,7 @@
 package luacheck
 
 import (
+	"encoding/json"
 	"fmt"
 	"github.com/PuerkitoBio/goquery"
 	lua "github.com/yuin/gopher-lua"
@@ -25,12 +26,6 @@ var (
 	reg_fileter_text = regexp.MustCompile("([<>《》[]()()【】\\[\\]『』。;、;,\\s\u3000\u2003\u00a0]+|(\\\\n)+(\\\\t)+)")
 	reg_filetype     = regexp.MustCompile(`\.(docx|gif|jpg|doc|pdf|rar|png|zip|gz|swf|xlsx|xls|wps|jpeg|javascript:void\(0\)(;))$`)
 	reg_err_filetype = regexp.MustCompile(`(\.(jtbz|jxzf|tytbz|hbz|tbyj|et|tbz|rtf|dwg|bmp|htbz|qttbz|application|zbid|pptx|gef)$|^(#_|file:))`)
-	//full-string match of invalid content
-	reg_invalid_text = regexp.MustCompile(`^(\d{4}年\d{1,2}月\d{1,2}日|潜在供应商|递交|查看评论|flash插件|打印文章|收藏|请点击|更多|无|采购详细内容|申请履约保函|关于我们|返回|百度一下|登录(系统)?|查看|网站首页|(免费)?注册|其他|立即报名|我要(报价|投诉|投标|留言)|[\d.])$`)
-	//contains invalid keywords
-	reg_filter_text1 = regexp.MustCompile(`(\.(jsp|jspx|aspx|home|com|cn|shtml|jhtml|chtml|html|htm)(/)?$|网站|政府|财产|得分|合同|业绩|负责人|页面|注意事项|注册|投诉|导航|登录|办理|请到原网|我要纠错|([\p{Han}]|\d)+[a-zA-z\d-]{5,}$|[上下首尾](一)?[页篇条]|跳转|详情请见原网站|详见(项目|公告)详情|原(文|公告)链接(地址)?|点击(报名|查看|查阅)(原公告(内容|详情))?|(点[击我])?(查看|查阅)(资质等级树|标的物详情|内容|公告|详情))`)
-	//ends with a keyword
-	reg_filter_text2 = regexp.MustCompile(`((公司|代理|单位|中心|采购办|机构|设计室|(事务|鉴定|研究|管理)所|(卫生|研究|法|设计|医)院|(工程|办事)[部处]|博物馆|工作站|幼儿园|学校|委员会|平台|局|队|[小中大]学)$|(\.{3}|…|管委会|指引|视频|主页|活动|指南|总结|核查|评审|投诉|磋商|调查|列表|处理|须知|审查|名单|需求书|确认书|规则|通知|评价|征询|咨询|采购意向|审计|招标|监理|监测|测量|钻探|测绘|服务|评估|公示|信息|采购|公告|勘察|施工|标段|工程|项目|编制|谈判|意见|设计|邀请函|审核|检测|(意见|建议)书?)$)`)
 
 	CheckLuaChan    = make(chan map[string]interface{}, 1000)
 	CheckLuaMap     = make(map[string]bool)
@@ -242,15 +237,15 @@ func luaDownload(pageNum int, lua map[string]interface{}, err, warn map[string]s
 		script = lua["luacontent"].(string)
 	}
 	s := spider.CreateSpider(DOWNLOADNODE, script)
-	s.SpiderMaxPage = SPIDER_MAXPAGENUM //total number of list pages to crawl
 	s.Timeout = 60
-	result := map[int64][]map[string]interface{}{}
+	result := map[int][]map[string]interface{}{}
 	downloadNum := 0
 	for page := 1; page <= SPIDER_MAXPAGENUM; page++ {
-		for i := 1; i <= 3; i++ { //retry each list page up to 3 times
+		for i := 1; i <= 2; i++ { //retry each list page up to 2 times
 			s.SpiderStartPage = int64(page)
+			s.SpiderMaxPage = s.SpiderStartPage
 			result_page, downloadNum_page, _ := s.DownListPageItem() //list-page crawl result
-			result[s.SpiderStartPage] = result_page[s.SpiderStartPage]
+			result[page] = result_page[int64(page)]
 			if downloadNum_page > 0 {
 				downloadNum += downloadNum_page
 				break
@@ -269,7 +264,7 @@ func luaDownload(pageNum int, lua map[string]interface{}, err, warn map[string]s
 }
 
// list-page download check
-func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err, warn map[string]string, s *spider.Spider) {
+func listResultCheck(pageNum int, lists map[int][]map[string]interface{}, err, warn map[string]string, s *spider.Spider) {
 	defer qu.Catch()
 	//pagination check
 	if pageNum > 1 && len(lists) < SPIDER_MAXPAGENUM {
@@ -277,13 +272,16 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
 	} else if pageNum == 1 && len(lists) > 1 {
 		warn["列表页翻页异常"] = "列表页可以翻页,最大页却为1"
 	}
-	var fileNum, downloadOkNum int //attachment count, detail-page download count
+	var fileNum, downloadOkNum, fileFailedNum int //attachment count, detail-page download count, failed-attachment count
 	n := 0
 	wg := &sync.WaitGroup{}
 	lock := &sync.Mutex{}
 	ch := make(chan bool, 10)
+	hrefs := make([][]string, 2) //hrefs of the first two pages, keyed by page-1
-	for _, list := range lists {
+	for page, list := range lists { //page key needed: map iteration order is random, and plain append would leave hrefs[0] and hrefs[1] nil
+		pageHrefs := []string{}
 		for _, l := range list {
+			pageHrefs = append(pageHrefs, qu.ObjToString(l["href"]))
 			if n > 50 {
 				break
 			}
@@ -298,20 +296,8 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
 				spTmp := spider.CreateSpider(DOWNLOADNODE, s.ScriptFile)
 				warnTmp := map[string]string{}
 				errTmp := map[string]string{}
-				fileNumTmp := 0
-				downloadOkNumTmp := 0
-				//validate title
+				var fileNumTmp, fileFailedNumTmp, downloadOkNumTmp int
 				title := qu.ObjToString(tmp["title"])
-				titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //garbled chars matched by the sensitive-word DFA
-				if len(titleRandomArr) > 0 {
-					warnTmp["列表页title含乱码"] = "列表页title含乱码(" + title + "):" + strings.Join(titleRandomArr, "")
-				}
-				if !util.TitleHanReg.MatchString(title) {
-					warnTmp["列表页title无汉字"] = "列表页title中无汉字(" + title + "):"
-				} else if str := util.TitleFilterReg.FindString(title); str != "" {
-					str = "列表页title中包含异常信息(" + title + "):" + str
-					errTmp["列表页title中含异常信息"] = str
-				}
 				//validate publishtime
 				publishtime := qu.ObjToString(tmp["publishtime"])
 				if publishtime == "0" || publishtime == "" {
@@ -328,12 +314,13 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
 					errTmp["列表页链接异常"] = "公告链接存在异常后缀(" + title + "):" + str
 				}
 				//detail-page download check
-				downloadDetail(&fileNumTmp, &downloadOkNumTmp, errTmp, warnTmp, tmp, spTmp)
+				downloadDetail(&fileNumTmp, &fileFailedNumTmp, &downloadOkNumTmp, errTmp, warnTmp, tmp, spTmp)
 				spTmp.L.Close()
 
 				lock.Lock()
 				fileNum += fileNumTmp
 				downloadOkNum += downloadOkNumTmp
+				fileFailedNum += fileFailedNumTmp
 				for k, v := range warnTmp {
 					warn[k] = v
 				}
@@ -343,18 +330,29 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
 				lock.Unlock()
 			}(l)
 		}
+		if page <= 2 { //only the first two pages enter the duplicate check
+			hrefs[page-1] = pageHrefs
+		}
 	}
 	wg.Wait()
+	hrefByte1, _ := json.Marshal(hrefs[0])
+	hrefByte2, _ := json.Marshal(hrefs[1])
+	if len(hrefs[0]) > 0 && sp.GetHashKey(hrefByte1) == sp.GetHashKey(hrefByte2) { //guard: two empty pages would also hash equal
+		err["前两页数据一致"] = "前两页数据一致"
+	}
 	if fileNum == 0 {
 		warn["未下载任何附件"] = "未下载任何附件"
 	}
+	if fileFailedNum > 0 && fileNum > 0 { //guard against division by zero
+		rate := float64(fileFailedNum) / float64(fileNum)
+		warn["部分附件未成功下载"] = "部分附件未成功下载,一共" + fmt.Sprint(fileNum) + "个,失败占比:" + fmt.Sprintf("%.1f", rate) + "(" + warn["附件未成功下载"] + ")"
+	}
 	if n := len(lists) - downloadOkNum; n > 0 {
-		warn["部分详情页未下载成功"] = "部分详情页未成功下载,失败" + fmt.Sprint(n) + "条"
+		rate := float64(n) / float64(len(lists))
+		warn["部分详情页未下载成功"] = "部分详情页未成功下载,一共" + fmt.Sprint(len(lists)) + "条,失败占比:" + fmt.Sprintf("%.1f", rate) + "(" + warn["详情页下载失败"] + ")"
 	}
 }
 
// detail-page download check
-func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, list map[string]interface{}, s *spider.Spider) {
+func downloadDetail(fileNum, fileFailedNumTmp, downloadOkNum *int, err, warn map[string]string, list map[string]interface{}, s *spider.Spider) {
 	defer qu.Catch()
 	param := map[string]string{}
 	data := map[string]interface{}{} //detail-page download result set
@@ -379,8 +377,9 @@ func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, li
 							getFile = true
 							warn["附件含乱码"] = "部分附件中含有乱码(" + qu.ObjToString(param["href"]) + ")"
 						} else if qu.ObjToString(tmpMap["org_url"]) != "" && qu.ObjToString(tmpMap["fid"]) == "" { //attachment failed to download
-							warn["部分附件未成功下载"] = "部分附件未成功下载(" + qu.ObjToString(param["href"]) + ")"
+							warn["附件未成功下载"] = qu.ObjToString(param["href"])
 							getFile = true
+							*fileFailedNumTmp++
 						} else if qu.ObjToString(tmpMap["fid"]) != "" {
 							getFile = true
 						}
@@ -402,13 +401,14 @@ func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, li
 		}
 	}
 	if !downloadDetailOk {
-		warn["详情页下载失败"] = "详情页下载失败(" + qu.ObjToString(param["href"]) + ")"
+		warn["详情页下载失败"] = qu.ObjToString(param["href"])
 	}
 }
 
// detail-page download result check
 func detailResultCheck(data map[string]interface{}, warn, err map[string]string) {
 	defer qu.Catch()
+	href := qu.ObjToString(data["href"])
 	//validate s_title
 	if data["s_title"] == nil {
 		err["s_title缺失"] = "详情页s_title缺失"
@@ -417,18 +417,17 @@ func detailResultCheck(data map[string]interface{}, warn, err map[string]string)
 		s_title := qu.ObjToString(data["s_title"])
 		s_titleRandomArr := util.RandomDFA.CheckSensitiveWord(s_title) //garbled chars matched by the sensitive-word DFA
 		if len(s_titleRandomArr) > 0 {
-			warn["s_title含乱码"] = "s_title含乱码:" + strings.Join(s_titleRandomArr, "")
+			warn["s_title含乱码"] = "s_title含乱码(" + href + "):" + strings.Join(s_titleRandomArr, "")
 		}
 		if !util.TitleHanReg.MatchString(s_title) {
 			warn["s_title无汉字"] = "s_title中无汉字"
 		} else if str := util.TitleFilterReg.FindString(s_title); str != "" {
-			str = "s_title中包含异常信息:" + str
+			str = "s_title中包含异常信息(" + href + "):" + str
 			err["s_title中含异常信息"] = str
 		}
 	}
 	//validate title
 	title := qu.ObjToString(data["title"])
-	href := qu.ObjToString(data["href"])
 	titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //garbled chars matched by the sensitive-word DFA
 	if len(titleRandomArr) > 0 {
 		warn["详情页title含乱码"] = "详情页title含乱码(" + href + "):" + strings.Join(titleRandomArr, "")
@@ -548,7 +547,7 @@ func findFileInfo(url, text string) (result []*Data) {
 			By:       "text",
 			FileType: strings.ReplaceAll(fileType, ".", ""),
 		})
-	} else {
+	} /*else {
 		//textStr = reg_fileter_text.ReplaceAllString(textStr, "")  //filter out invalid characters
 		if reg_invalid_text.ReplaceAllString(text, "") == "" { //invalid: the whole text matched, discard
 			return
@@ -560,6 +559,6 @@ func findFileInfo(url, text string) (result []*Data) {
 			Text: text,
 			By:   "filter",
 		})
-	}
+	}*/
 	return
 }
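
The new duplicate-page check marshals each page's href list and compares hashes: if page 2 returns exactly the links of page 1, the site's pagination is broken. A standalone sketch of the idea, using crypto/sha1 in place of sp.GetHashKey (whose implementation is not part of this diff):

package main

import (
	"crypto/sha1"
	"encoding/json"
	"fmt"
)

// pagesIdentical reports whether two list pages yielded the same ordered hrefs.
// sha1 stands in for sp.GetHashKey here; empty pages are skipped because they
// would trivially hash equal and mask a download failure as a pagination one.
func pagesIdentical(page1, page2 []string) bool {
	if len(page1) == 0 || len(page2) == 0 {
		return false
	}
	b1, _ := json.Marshal(page1)
	b2, _ := json.Marshal(page2)
	return sha1.Sum(b1) == sha1.Sum(b2)
}

func main() {
	p1 := []string{"http://a/1", "http://a/2"}
	fmt.Println(pagesIdentical(p1, p1))                      // true: stuck on page 1
	fmt.Println(pagesIdentical(p1, []string{"http://a/3"})) // false: pagination works
}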

+ 1 - 1
src/timetask/timetask.go

@@ -86,7 +86,7 @@ func UpdateSiteInfo() {
 	qu.Debug("定时更新站点信息完成...")
 }
 
-// update spider info for key websites
+// UpdateImportantCode updates spider info for key websites
 func UpdateImportantCode() {
 	data, _ := util.MgoEB.Find("site_code_baseinfo", nil, nil, map[string]interface{}{"spidercode": 1}, false, -1, -1)
 	for _, d := range *data {

+ 10 - 10
src/util/util.go

@@ -26,7 +26,7 @@ var (
 	DownLoadReg          = regexp.MustCompile(`download\(.*?\)`)
 	CodeTypeReg          = regexp.MustCompile(`(utf8|utf-8|gbk)`)
 	TitleHanReg          = regexp.MustCompile(`[\p{Han}]`)
-	TitleFilterReg       = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数|查看详情|转发|<[^>]*?>)|(\.){3,6}`)
+	TitleFilterReg       = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数|查看详情|转发|<[^>]*?>|(\.){3,6})`)
 	DetailFilterReg      = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数|浏览次数|扫一扫|分享|区块链存证)`)
 	ContentHtmlFilterReg = regexp.MustCompile(`(iframe|img)`)
 	Area                 []string //provinces
@@ -231,7 +231,7 @@ func SpiderPassCheckListAndDetail(result map[int64][]map[string]interface{}, dat
 			if !TitleHanReg.MatchString(title) {
 				msgMap["列表页title中无汉字"] = true
 			} else if str := TitleFilterReg.FindString(title); str != "" {
-				str = "列表页title中包含异常信息" + str
+				str = "列表页title中包含异常信息" + str
 				msgMap[str] = true
 			}
 			//validate publishtime
@@ -240,29 +240,29 @@ func SpiderPassCheckListAndDetail(result map[int64][]map[string]interface{}, dat
 				msgMap["列表页publishtime取值异常"] = true
 			} else {
 				t, err := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
-				if err != nil || t.Unix() <= 0 {
+				if err != nil || t.Unix() <= 0 || t.Unix() > time.Now().Unix() {
 					msgMap["列表页publishtime取值异常"] = true
 				}
 			}
 			//href
 			href := qu.ObjToString(l["href"])
 			if str := HrefReg.FindString(href); str != "" {
-				msgMap["公告链接存在异常后缀"+str] = true
+				msgMap["公告链接存在异常后缀"+str] = true
 			}
 		}
 	}
 	if len(data) > 0 {
 		//validate publishtime
 		if l_np_publishtime, ok := data["l_np_publishtime"].(lua.LNumber); ok {
-			if l_np_publishtime <= 0 || l_np_publishtime > 0 && l_np_publishtime < 1000000000 {
-				msgMap["三级页publishtime取值异常"] = true
+			if l_np_publishtime <= 0 || (l_np_publishtime > 0 && l_np_publishtime < 1000000000) || int64(l_np_publishtime) > time.Now().Unix() {
+				msgMap["详情页publishtime取值异常"] = true
 			}
 		} else if l_np_publishtime, ok := data["l_np_publishtime"].(int64); ok {
-			if l_np_publishtime <= 0 || l_np_publishtime > 0 && l_np_publishtime < 1000000000 {
-				msgMap["三级页publishtime取值异常"] = true
+			if l_np_publishtime <= 0 || (l_np_publishtime > 0 && l_np_publishtime < 1000000000) || l_np_publishtime > time.Now().Unix() {
+				msgMap["详情页publishtime取值异常"] = true
 			}
 		} else {
-			msgMap["三级页publishtime值类型异常"] = true
+			msgMap["详情页publishtime值类型异常"] = true
 		}
 		contenthtml := qu.ObjToString(data["contenthtml"])
 		if strings.Contains(contenthtml, "img") {
@@ -273,7 +273,7 @@ func SpiderPassCheckListAndDetail(result map[int64][]map[string]interface{}, dat
 		}
 		detail := qu.ObjToString(data["detail"])
 		if DetailFilterReg.MatchString(detail) {
-			msgMap["三级页正文提取包含无效内容"] = true
+			msgMap["详情页正文提取包含无效内容"] = true
 		}
 		//validate jsondata
 		if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
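
The publishtime changes above tighten the sanity window: a value is now rejected if it is non-positive, below 1000000000 (i.e. before 2001-09-09, which catches zeroed or otherwise implausible epochs), or later than the current time. A minimal sketch of the rule, assuming second-granularity timestamps as in the diff:

package main

import (
	"fmt"
	"time"
)

// publishtimeOK mirrors the validation in SpiderPassCheckListAndDetail:
// the epoch must be plausible (>= 1e9, i.e. after 2001) and not in the future.
func publishtimeOK(ts int64) bool {
	return ts >= 1000000000 && ts <= time.Now().Unix()
}

func main() {
	fmt.Println(publishtimeOK(0))                        // false: zero value
	fmt.Println(publishtimeOK(time.Now().Unix() + 3600)) // false: one hour in the future
	fmt.Println(publishtimeOK(1700000000))               // true: late 2023
}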