
Automated check revisions

mxs 1 year ago
parent commit e1fe1c4f87
4 files changed, 48 insertions(+), 49 deletions(-)
  1. src/front/spider.go       +3  -3
  2. src/luacheck/luacheck.go  +34 -35
  3. src/timetask/timetask.go  +1  -1
  4. src/util/util.go          +10 -10

+ 3 - 3
src/front/spider.go

@@ -801,10 +801,10 @@ func LuaTextCheck(infoformat int, param map[string]interface{}, param_list_chrom
 			errmsg += "详情页代码中含有lua原生方法;"
 		}
 		if u.ListFilterReg.MatchString(detail) && !strings.Contains(detail, "delete") { //detail page matches the filter regex but lacks data["delete"]="true"
-			errmsg += `三级页缺少data["delete"]="true";`
+			warnmsg += `详情页缺少data["delete"]="true";`
 		}
 		if !strings.Contains(detail, "s_title") {
-			errmsg += "三级页缺少s_title;"
+			errmsg += "详情页缺少s_title;"
 		}
 		if strings.Contains(detail, "downloadByChrome") { //check action params of the chrome download method
 			for _, act := range param_content_chrome {
@@ -816,7 +816,7 @@ func LuaTextCheck(infoformat int, param map[string]interface{}, param_list_chrom
 	}
 	//2. advisory checks
 	if !strings.Contains(detail, "downloadFile") && !strings.Contains(detail, "getFileAttachmentsArrayWithTag") {
-		warnmsg += "三级页缺少下载附件方法;"
+		warnmsg += "详情页缺少下载附件方法;"
 	}
 	msgResult["warn"] += warnmsg
 	msgResult["err"] += errmsg

+ 34 - 35
src/luacheck/luacheck.go

@@ -1,6 +1,7 @@
 package luacheck
 
 import (
+	"encoding/json"
 	"fmt"
 	"github.com/PuerkitoBio/goquery"
 	lua "github.com/yuin/gopher-lua"
@@ -25,12 +26,6 @@ var (
 	reg_fileter_text = regexp.MustCompile("([<>《》[]()()【】\\[\\]『』。;、;,\\s\u3000\u2003\u00a0]+|(\\\\n)+(\\\\t)+)")
 	reg_filetype     = regexp.MustCompile(`\.(docx|gif|jpg|doc|pdf|rar|png|zip|gz|swf|xlsx|xls|wps|jpeg|javascript:void\(0\)(;))$`)
 	reg_err_filetype = regexp.MustCompile(`(\.(jtbz|jxzf|tytbz|hbz|tbyj|et|tbz|rtf|dwg|bmp|htbz|qttbz|application|zbid|pptx|gef)$|^(#_|file:))`)
-	//full-string match of invalid content
-	reg_invalid_text = regexp.MustCompile(`^(\d{4}年\d{1,2}月\d{1,2}日|潜在供应商|递交|查看评论|flash插件|打印文章|收藏|请点击|更多|无|采购详细内容|申请履约保函|关于我们|返回|百度一下|登录(系统)?|查看|网站首页|(免费)?注册|其他|立即报名|我要(报价|投诉|投标|留言)|[\d.])$`)
-	//contains invalid keywords
-	reg_filter_text1 = regexp.MustCompile(`(\.(jsp|jspx|aspx|home|com|cn|shtml|jhtml|chtml|html|htm)(/)?$|网站|政府|财产|得分|合同|业绩|负责人|页面|注意事项|注册|投诉|导航|登录|办理|请到原网|我要纠错|([\p{Han}]|\d)+[a-zA-z\d-]{5,}$|[上下首尾](一)?[页篇条]|跳转|详情请见原网站|详见(项目|公告)详情|原(文|公告)链接(地址)?|点击(报名|查看|查阅)(原公告(内容|详情))?|(点[击我])?(查看|查阅)(资质等级树|标的物详情|内容|公告|详情))`)
-	//ends with a keyword
-	reg_filter_text2 = regexp.MustCompile(`((公司|代理|单位|中心|采购办|机构|设计室|(事务|鉴定|研究|管理)所|(卫生|研究|法|设计|医)院|(工程|办事)[部处]|博物馆|工作站|幼儿园|学校|委员会|平台|局|队|[小中大]学)$|(\.{3}|…|管委会|指引|视频|主页|活动|指南|总结|核查|评审|投诉|磋商|调查|列表|处理|须知|审查|名单|需求书|确认书|规则|通知|评价|征询|咨询|采购意向|审计|招标|监理|监测|测量|钻探|测绘|服务|评估|公示|信息|采购|公告|勘察|施工|标段|工程|项目|编制|谈判|意见|设计|邀请函|审核|检测|(意见|建议)书?)$)`)
 
 	CheckLuaChan    = make(chan map[string]interface{}, 1000)
 	CheckLuaMap     = make(map[string]bool)
@@ -242,15 +237,15 @@ func luaDownload(pageNum int, lua map[string]interface{}, err, warn map[string]s
 		script = lua["luacontent"].(string)
 	}
 	s := spider.CreateSpider(DOWNLOADNODE, script)
-	s.SpiderMaxPage = SPIDER_MAXPAGENUM //total number of list pages to crawl
 	s.Timeout = 60
-	result := map[int64][]map[string]interface{}{}
+	result := map[int][]map[string]interface{}{}
 	downloadNum := 0
 	for page := 1; page <= SPIDER_MAXPAGENUM; page++ {
-		for i := 1; i <= 3; i++ { //retry each list page up to 3 times
+		for i := 1; i <= 2; i++ { //retry each list page up to 2 times
 			s.SpiderStartPage = int64(page)
+			s.SpiderMaxPage = s.SpiderStartPage
 			result_page, downloadNum_page, _ := s.DownListPageItem() //list-page crawl result
-			result[s.SpiderStartPage] = result_page[s.SpiderStartPage]
+			result[page] = result_page[int64(page)]
 			if downloadNum_page > 0 {
 				downloadNum += downloadNum_page
 				break
@@ -269,7 +264,7 @@ func luaDownload(pageNum int, lua map[string]interface{}, err, warn map[string]s
 }
 
// list-page download check
-func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err, warn map[string]string, s *spider.Spider) {
+func listResultCheck(pageNum int, lists map[int][]map[string]interface{}, err, warn map[string]string, s *spider.Spider) {
 	defer qu.Catch()
 	//pagination check
 	if pageNum > 1 && len(lists) < SPIDER_MAXPAGENUM {
@@ -277,13 +272,16 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
 	} else if pageNum == 1 && len(lists) > 1 {
 		warn["列表页翻页异常"] = "列表页可以翻页,最大页却为1"
 	}
-	var fileNum, downloadOkNum int //attachment count, detail-page download count
+	var fileNum, downloadOkNum, fileFailedNum int //attachment count, detail-page download count, failed-attachment count
 	n := 0
 	wg := &sync.WaitGroup{}
 	lock := &sync.Mutex{}
 	ch := make(chan bool, 10)
+	hrefs := make([][]string, 2) //hrefs of the first two pages, keyed by page-1
-	for _, list := range lists {
+	for page, list := range lists { //page key needed: map iteration order is random, and plain append would leave hrefs[0] and hrefs[1] nil
+		pageHrefs := []string{}
 		for _, l := range list {
+			pageHrefs = append(pageHrefs, qu.ObjToString(l["href"]))
 			if n > 50 {
 				break
 			}
@@ -298,20 +296,8 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
 				spTmp := spider.CreateSpider(DOWNLOADNODE, s.ScriptFile)
 				warnTmp := map[string]string{}
 				errTmp := map[string]string{}
-				fileNumTmp := 0
-				downloadOkNumTmp := 0
-				//validate title
+				var fileNumTmp, fileFailedNumTmp, downloadOkNumTmp int
 				title := qu.ObjToString(tmp["title"])
-				titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //garbled chars matched by the sensitive-word DFA
-				if len(titleRandomArr) > 0 {
-					warnTmp["列表页title含乱码"] = "列表页title含乱码(" + title + "):" + strings.Join(titleRandomArr, "")
-				}
-				if !util.TitleHanReg.MatchString(title) {
-					warnTmp["列表页title无汉字"] = "列表页title中无汉字(" + title + "):"
-				} else if str := util.TitleFilterReg.FindString(title); str != "" {
-					str = "列表页title中包含异常信息(" + title + "):" + str
-					errTmp["列表页title中含异常信息"] = str
-				}
 				//validate publishtime
 				publishtime := qu.ObjToString(tmp["publishtime"])
 				if publishtime == "0" || publishtime == "" {
@@ -328,12 +314,13 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
 					errTmp["列表页链接异常"] = "公告链接存在异常后缀(" + title + "):" + str
 				}
 				//detail-page download check
-				downloadDetail(&fileNumTmp, &downloadOkNumTmp, errTmp, warnTmp, tmp, spTmp)
+				downloadDetail(&fileNumTmp, &fileFailedNumTmp, &downloadOkNumTmp, errTmp, warnTmp, tmp, spTmp)
 				spTmp.L.Close()
 
 				lock.Lock()
 				fileNum += fileNumTmp
 				downloadOkNum += downloadOkNumTmp
+				fileFailedNum += fileFailedNumTmp
 				for k, v := range warnTmp {
 					warn[k] = v
 				}
@@ -343,18 +330,29 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
 				lock.Unlock()
 			}(l)
 		}
+		if page <= 2 { //only the first two pages enter the duplicate check
+			hrefs[page-1] = pageHrefs
+		}
 	}
 	wg.Wait()
+	hrefByte1, _ := json.Marshal(hrefs[0])
+	hrefByte2, _ := json.Marshal(hrefs[1])
+	if len(hrefs[0]) > 0 && sp.GetHashKey(hrefByte1) == sp.GetHashKey(hrefByte2) { //guard: two empty pages would also hash equal
+		err["前两页数据一致"] = "前两页数据一致"
+	}
 	if fileNum == 0 {
 		warn["未下载任何附件"] = "未下载任何附件"
 	}
+	if fileFailedNum > 0 && fileNum > 0 { //guard against division by zero
+		rate := float64(fileFailedNum) / float64(fileNum)
+		warn["部分附件未成功下载"] = "部分附件未成功下载,一共" + fmt.Sprint(fileNum) + "个,失败占比:" + fmt.Sprintf("%.1f", rate) + "(" + warn["附件未成功下载"] + ")"
+	}
 	if n := len(lists) - downloadOkNum; n > 0 {
-		warn["部分详情页未下载成功"] = "部分详情页未成功下载,失败" + fmt.Sprint(n) + "条"
+		rate := float64(n) / float64(len(lists))
+		warn["部分详情页未下载成功"] = "部分详情页未成功下载,一共" + fmt.Sprint(len(lists)) + "条,失败占比:" + fmt.Sprintf("%.1f", rate) + "(" + warn["详情页下载失败"] + ")"
 	}
 }
 
// detail-page download check
-func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, list map[string]interface{}, s *spider.Spider) {
+func downloadDetail(fileNum, fileFailedNumTmp, downloadOkNum *int, err, warn map[string]string, list map[string]interface{}, s *spider.Spider) {
 	defer qu.Catch()
 	param := map[string]string{}
 	data := map[string]interface{}{} //detail-page download result set
@@ -379,8 +377,9 @@ func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, li
 							getFile = true
 							warn["附件含乱码"] = "部分附件中含有乱码(" + qu.ObjToString(param["href"]) + ")"
 						} else if qu.ObjToString(tmpMap["org_url"]) != "" && qu.ObjToString(tmpMap["fid"]) == "" { //attachment failed to download
-							warn["部分附件未成功下载"] = "部分附件未成功下载(" + qu.ObjToString(param["href"]) + ")"
+							warn["附件未成功下载"] = qu.ObjToString(param["href"])
 							getFile = true
+							*fileFailedNumTmp++
 						} else if qu.ObjToString(tmpMap["fid"]) != "" {
 							getFile = true
 						}
@@ -402,13 +401,14 @@ func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, li
 		}
 	}
 	if !downloadDetailOk {
-		warn["详情页下载失败"] = "详情页下载失败(" + qu.ObjToString(param["href"]) + ")"
+		warn["详情页下载失败"] = qu.ObjToString(param["href"])
 	}
 }
 
// detail-page download result check
 func detailResultCheck(data map[string]interface{}, warn, err map[string]string) {
 	defer qu.Catch()
+	href := qu.ObjToString(data["href"])
 	//validate s_title
 	if data["s_title"] == nil {
 		err["s_title缺失"] = "详情页s_title缺失"
@@ -417,18 +417,17 @@ func detailResultCheck(data map[string]interface{}, warn, err map[string]string)
 		s_title := qu.ObjToString(data["s_title"])
 		s_titleRandomArr := util.RandomDFA.CheckSensitiveWord(s_title) //garbled chars matched by the sensitive-word DFA
 		if len(s_titleRandomArr) > 0 {
-			warn["s_title含乱码"] = "s_title含乱码:" + strings.Join(s_titleRandomArr, "")
+			warn["s_title含乱码"] = "s_title含乱码(" + href + "):" + strings.Join(s_titleRandomArr, "")
 		}
 		if !util.TitleHanReg.MatchString(s_title) {
 			warn["s_title无汉字"] = "s_title中无汉字"
 		} else if str := util.TitleFilterReg.FindString(s_title); str != "" {
-			str = "s_title中包含异常信息:" + str
+			str = "s_title中包含异常信息(" + href + "):" + str
 			err["s_title中含异常信息"] = str
 		}
 	}
 	//validate title
 	title := qu.ObjToString(data["title"])
-	href := qu.ObjToString(data["href"])
 	titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //garbled chars matched by the sensitive-word DFA
 	if len(titleRandomArr) > 0 {
 		warn["详情页title含乱码"] = "详情页title含乱码(" + href + "):" + strings.Join(titleRandomArr, "")
@@ -548,7 +547,7 @@ func findFileInfo(url, text string) (result []*Data) {
 			By:       "text",
 			FileType: strings.ReplaceAll(fileType, ".", ""),
 		})
-	} else {
+	} /*else {
 		//textStr = reg_fileter_text.ReplaceAllString(textStr, "")  //filter out invalid characters
 		if reg_invalid_text.ReplaceAllString(text, "") == "" { //invalid: the whole text matched, discard
 			return
@@ -560,6 +559,6 @@ func findFileInfo(url, text string) (result []*Data) {
 			Text: text,
 			By:   "filter",
 		})
-	}
+	}*/
 	return
 }
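
The new duplicate-page check marshals each page's href list and compares hashes: if page 2 returns exactly the links of page 1, the site's pagination is broken. A standalone sketch of the idea, using crypto/sha1 in place of sp.GetHashKey (whose implementation is not part of this diff):

package main

import (
	"crypto/sha1"
	"encoding/json"
	"fmt"
)

// pagesIdentical reports whether two list pages yielded the same ordered hrefs.
// sha1 stands in for sp.GetHashKey here; empty pages are skipped because they
// would trivially hash equal and mask a download failure as a pagination one.
func pagesIdentical(page1, page2 []string) bool {
	if len(page1) == 0 || len(page2) == 0 {
		return false
	}
	b1, _ := json.Marshal(page1)
	b2, _ := json.Marshal(page2)
	return sha1.Sum(b1) == sha1.Sum(b2)
}

func main() {
	p1 := []string{"http://a/1", "http://a/2"}
	fmt.Println(pagesIdentical(p1, p1))                      // true: stuck on page 1
	fmt.Println(pagesIdentical(p1, []string{"http://a/3"})) // false: pagination works
}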

+ 1 - 1
src/timetask/timetask.go

@@ -86,7 +86,7 @@ func UpdateSiteInfo() {
 	qu.Debug("定时更新站点信息完成...")
 }
 
-// update spider info for key websites
+// UpdateImportantCode updates spider info for key websites
 func UpdateImportantCode() {
 	data, _ := util.MgoEB.Find("site_code_baseinfo", nil, nil, map[string]interface{}{"spidercode": 1}, false, -1, -1)
 	for _, d := range *data {

+ 10 - 10
src/util/util.go

@@ -26,7 +26,7 @@ var (
 	DownLoadReg          = regexp.MustCompile(`download\(.*?\)`)
 	CodeTypeReg          = regexp.MustCompile(`(utf8|utf-8|gbk)`)
 	TitleHanReg          = regexp.MustCompile(`[\p{Han}]`)
-	TitleFilterReg       = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数|查看详情|转发|<[^>]*?>)|(\.){3,6}`)
+	TitleFilterReg       = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数|查看详情|转发|<[^>]*?>|(\.){3,6})`)
 	DetailFilterReg      = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数|浏览次数|扫一扫|分享|区块链存证)`)
 	ContentHtmlFilterReg = regexp.MustCompile(`(iframe|img)`)
 	Area                 []string //provinces
@@ -231,7 +231,7 @@ func SpiderPassCheckListAndDetail(result map[int64][]map[string]interface{}, dat
 			if !TitleHanReg.MatchString(title) {
 				msgMap["列表页title中无汉字"] = true
 			} else if str := TitleFilterReg.FindString(title); str != "" {
-				str = "列表页title中包含异常信息" + str
+				str = "列表页title中包含异常信息" + str
 				msgMap[str] = true
 			}
 			//validate publishtime
@@ -240,29 +240,29 @@ func SpiderPassCheckListAndDetail(result map[int64][]map[string]interface{}, dat
 				msgMap["列表页publishtime取值异常"] = true
 			} else {
 				t, err := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
-				if err != nil || t.Unix() <= 0 {
+				if err != nil || t.Unix() <= 0 || t.Unix() > time.Now().Unix() {
 					msgMap["列表页publishtime取值异常"] = true
 				}
 			}
 			//href
 			href := qu.ObjToString(l["href"])
 			if str := HrefReg.FindString(href); str != "" {
-				msgMap["公告链接存在异常后缀"+str] = true
+				msgMap["公告链接存在异常后缀"+str] = true
 			}
 		}
 	}
 	if len(data) > 0 {
 		//validate publishtime
 		if l_np_publishtime, ok := data["l_np_publishtime"].(lua.LNumber); ok {
-			if l_np_publishtime <= 0 || l_np_publishtime > 0 && l_np_publishtime < 1000000000 {
-				msgMap["三级页publishtime取值异常"] = true
+			if l_np_publishtime <= 0 || (l_np_publishtime > 0 && l_np_publishtime < 1000000000) || int64(l_np_publishtime) > time.Now().Unix() {
+				msgMap["详情页publishtime取值异常"] = true
 			}
 		} else if l_np_publishtime, ok := data["l_np_publishtime"].(int64); ok {
-			if l_np_publishtime <= 0 || l_np_publishtime > 0 && l_np_publishtime < 1000000000 {
-				msgMap["三级页publishtime取值异常"] = true
+			if l_np_publishtime <= 0 || (l_np_publishtime > 0 && l_np_publishtime < 1000000000) || l_np_publishtime > time.Now().Unix() {
+				msgMap["详情页publishtime取值异常"] = true
 			}
 		} else {
-			msgMap["三级页publishtime值类型异常"] = true
+			msgMap["详情页publishtime值类型异常"] = true
 		}
 		contenthtml := qu.ObjToString(data["contenthtml"])
 		if strings.Contains(contenthtml, "img") {
@@ -273,7 +273,7 @@ func SpiderPassCheckListAndDetail(result map[int64][]map[string]interface{}, dat
 		}
 		detail := qu.ObjToString(data["detail"])
 		if DetailFilterReg.MatchString(detail) {
-			msgMap["三级页正文提取包含无效内容"] = true
+			msgMap["详情页正文提取包含无效内容"] = true
 		}
 		//validate jsondata
 		if jsondata, ok := data["jsondata"].(map[string]interface{}); ok && len(jsondata) > 0 {
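
The publishtime changes above tighten the sanity window: a value is now rejected if it is non-positive, below 1000000000 (i.e. before 2001-09-09, which catches zeroed or otherwise implausible epochs), or later than the current time. A minimal sketch of the rule, assuming second-granularity timestamps as in the diff:

package main

import (
	"fmt"
	"time"
)

// publishtimeOK mirrors the validation in SpiderPassCheckListAndDetail:
// the epoch must be plausible (>= 1e9, i.e. after 2001) and not in the future.
func publishtimeOK(ts int64) bool {
	return ts >= 1000000000 && ts <= time.Now().Unix()
}

func main() {
	fmt.Println(publishtimeOK(0))                        // false: zero value
	fmt.Println(publishtimeOK(time.Now().Unix() + 3600)) // false: one hour in the future
	fmt.Println(publishtimeOK(1700000000))               // true: late 2023
}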