Ver código fonte

机检修改

mxs 1 ano atrás
pai
commit
1dfbebf469

+ 105 - 48
src/luacheck/luacheck.go

@@ -15,6 +15,7 @@ import (
 )
 
 const SPIDER_MAXPAGENUM = 2 //默认列表页采集页数
+const DOWNLOADNODE = "bid"
 
 var (
 	htmlModelReg    = regexp.MustCompile(`{{[a-zA-z.()\d,:]{5,}}}|^(\$)`) //过滤模板语言
@@ -31,7 +32,9 @@ var (
 	//以关键词结尾
 	reg_filter_text2 = regexp.MustCompile(`((公司|代理|单位|中心|采购办|机构|设计室|(事务|鉴定|研究|管理)所|(卫生|研究|法|设计|医)院|(工程|办事)[部处]|博物馆|工作站|幼儿园|学校|委员会|平台|局|队|[小中大]学)$|(\.{3}|…|管委会|指引|视频|主页|活动|指南|总结|核查|评审|投诉|磋商|调查|列表|处理|须知|审查|名单|需求书|确认书|规则|通知|评价|征询|咨询|采购意向|审计|招标|监理|监测|测量|钻探|测绘|服务|评估|公示|信息|采购|公告|勘察|施工|标段|工程|项目|编制|谈判|意见|设计|邀请函|审核|检测|(意见|建议)书?)$)`)
 
-	CheckStart int
+	CheckLuaChan    = make(chan map[string]interface{}, 1000)
+	CheckLuaMap     = make(map[string]bool)
+	CheckLuaMapLock = &sync.Mutex{}
 )
 
 type Data struct {
@@ -43,16 +46,8 @@ type Data struct {
 	Base64Type bool
 }
 
-func TimeTaskLuaCheck() {
-	defer qu.Catch()
-	if CheckStart != 0 {
-		return
-	}
-	CheckStart = 1
-	defer func() {
-		CheckStart = 0
-	}()
-	qu.Debug("爬虫质检开始...", time.Now().Unix())
+// TimeTaskGetLua 定时取爬虫放入通道
+func TimeTaskGetLua() {
 	query := map[string]interface{}{
 		"event": map[string]interface{}{
 			"$ne": 7410,
@@ -67,16 +62,38 @@ func TimeTaskLuaCheck() {
 		},
 	}
 	list, _ := util.MgoEB.Find("luaconfig", query, nil, nil, false, -1, -1)
+	for _, lua := range *list {
+		if qu.ObjToString(lua["report"]) != "" { //反馈问题的待审核爬虫,不参与机检
+			continue
+		}
+		code := qu.ObjToString(lua["code"])
+		CheckLuaMapLock.Lock()
+		if !CheckLuaMap[code] {
+			CheckLuaChan <- lua
+			CheckLuaMap[code] = true
+		}
+		CheckLuaMapLock.Unlock()
+	}
+	qu.Debug("当前待质检爬虫个数:", len(CheckLuaMap))
+}
+
+// LuaCheckStart 质检流程
+func LuaCheckStart() {
 	wg := &sync.WaitGroup{}
 	ch := make(chan bool, 5)
-	for _, l := range *list {
+	for {
+		lua := <-CheckLuaChan //取数据
 		ch <- true
 		wg.Add(1)
 		go func(lua map[string]interface{}) {
 			defer func() {
 				<-ch
 				wg.Done()
+				CheckLuaMapLock.Lock()
+				delete(CheckLuaMap, qu.ObjToString(lua["code"]))
+				CheckLuaMapLock.Unlock()
 			}()
+			qu.Debug("开始机检爬虫:", lua["code"], time.Now().Unix())
 			warnMap := map[string]string{} //异常集合
 			errMap := map[string]string{}  //错误集合
 			LuaCheck(lua, errMap, warnMap)
@@ -99,10 +116,11 @@ func TimeTaskLuaCheck() {
 				},
 			}
 			util.MgoEB.UpdateById("luaconfig", lua["_id"], set)
-		}(l)
+			qu.Debug("结束机检爬虫:", lua["code"], time.Now().Unix())
+		}(lua)
 	}
 	wg.Wait()
-	qu.Debug("爬虫质检结束...", time.Now().Unix())
+	qu.Debug("爬虫质检异常结束...", time.Now().Unix())
 }
 
 // LuaCheck 爬虫机检
@@ -217,24 +235,29 @@ func LuaCheck(lua map[string]interface{}, err, warn map[string]string) {
 
 func luaDownload(pageNum int, lua map[string]interface{}, err, warn map[string]string) {
 	defer qu.Catch()
-	downloadnode := "bid" //使用下载节点test、bid、comm
 	var script string
 	if lua["oldlua"] == nil {
 		script, _, _ = spider.GetScript(lua)
 	} else {
 		script = lua["luacontent"].(string)
 	}
-	s := spider.CreateSpider(downloadnode, script)
+	s := spider.CreateSpider(DOWNLOADNODE, script)
 	s.SpiderMaxPage = SPIDER_MAXPAGENUM //采集列表页总页数
 	s.Timeout = 60
 	result := map[int64][]map[string]interface{}{}
 	downloadNum := 0
-	for i := 1; i <= 3; i++ { //列表页重试三次
-		result, downloadNum, _ = s.DownListPageItem() //列表页采集结果
-		if downloadNum > 0 {
-			break
+	for page := 1; page <= SPIDER_MAXPAGENUM; page++ {
+		for i := 1; i <= 3; i++ { //每页列表重试三次
+			s.SpiderStartPage = int64(page)
+			result_page, downloadNum_page, _ := s.DownListPageItem() //列表页采集结果
+			result[s.SpiderStartPage] = result_page[s.SpiderStartPage]
+			if downloadNum_page > 0 {
+				downloadNum += downloadNum_page
+				break
+			}
 		}
 	}
+
 	if downloadNum == 0 {
 		err["列表页下载异常"] = "列表页下载量为0"
 		return
@@ -255,39 +278,73 @@ func listResultCheck(pageNum int, lists map[int64][]map[string]interface{}, err,
 		warn["列表页翻页异常"] = "列表页可以翻页,最大页却为1"
 	}
 	var fileNum, downloadOkNum int //附件下载量,详情页下载量
+	n := 0
+	wg := &sync.WaitGroup{}
+	lock := &sync.Mutex{}
+	ch := make(chan bool, 10)
 	for _, list := range lists {
 		for _, l := range list {
-			//校验title
-			title := qu.ObjToString(l["title"])
-			titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //敏感词匹配乱码集
-			if len(titleRandomArr) > 0 {
-				warn["列表页title含乱码"] = "列表页title含乱码(" + title + "):" + strings.Join(titleRandomArr, "")
+			if n > 50 {
+				break
 			}
-			if !util.TitleHanReg.MatchString(title) {
-				warn["列表页title无汉字"] = "列表页title中无汉字(" + title + "):"
-			} else if str := util.TitleFilterReg.FindString(title); str != "" {
-				str = "列表页title中包含异常信息(" + title + "):" + str
-				err["列表页title中含异常信息"] = str
-			}
-			//校验发布时间
-			publishtime := qu.ObjToString(l["publishtime"])
-			if publishtime == "0" || publishtime == "" {
-				warn["列表页publishtime"] = "列表页publishtime取值异常(" + title + ")"
-			} else {
-				t, err_p := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
-				if err_p != nil || t.Unix() <= 0 || t.Unix() > time.Now().Unix() {
-					warn["列表页publishtime"] = "列表页publishtime取值异常(" + title + ")"
+			n++
+			ch <- true
+			wg.Add(1)
+			go func(tmp map[string]interface{}) {
+				defer func() {
+					<-ch
+					wg.Done()
+				}()
+				spTmp := spider.CreateSpider(DOWNLOADNODE, s.ScriptFile)
+				warnTmp := map[string]string{}
+				errTmp := map[string]string{}
+				fileNumTmp := 0
+				downloadOkNumTmp := 0
+				//校验title
+				title := qu.ObjToString(tmp["title"])
+				titleRandomArr := util.RandomDFA.CheckSensitiveWord(title) //敏感词匹配乱码集
+				if len(titleRandomArr) > 0 {
+					warnTmp["列表页title含乱码"] = "列表页title含乱码(" + title + "):" + strings.Join(titleRandomArr, "")
 				}
-			}
-			//href
-			href := qu.ObjToString(l["href"])
-			if str := util.HrefReg.FindString(href); str != "" {
-				err["列表页链接异常"] = "公告链接存在异常后缀(" + title + "):" + str
-			}
-			//详情页下载校验
-			downloadDetail(&fileNum, &downloadOkNum, err, warn, l, s)
+				if !util.TitleHanReg.MatchString(title) {
+					warnTmp["列表页title无汉字"] = "列表页title中无汉字(" + title + "):"
+				} else if str := util.TitleFilterReg.FindString(title); str != "" {
+					str = "列表页title中包含异常信息(" + title + "):" + str
+					errTmp["列表页title中含异常信息"] = str
+				}
+				//校验发布时间
+				publishtime := qu.ObjToString(tmp["publishtime"])
+				if publishtime == "0" || publishtime == "" {
+					warnTmp["列表页publishtime"] = "列表页publishtime取值异常(" + title + ")"
+				} else {
+					t, err_p := time.ParseInLocation(qu.Date_Full_Layout, publishtime, time.Local)
+					if err_p != nil || t.Unix() <= 0 || t.Unix() > time.Now().Unix() {
+						warnTmp["列表页publishtime"] = "列表页publishtime取值异常(" + title + ")"
+					}
+				}
+				//href
+				href := qu.ObjToString(tmp["href"])
+				if str := util.HrefReg.FindString(href); str != "" {
+					errTmp["列表页链接异常"] = "公告链接存在异常后缀(" + title + "):" + str
+				}
+				//详情页下载校验
+				downloadDetail(&fileNumTmp, &downloadOkNumTmp, errTmp, warnTmp, tmp, spTmp)
+				spTmp.L.Close()
+
+				lock.Lock()
+				fileNum += fileNumTmp
+				downloadOkNum += downloadOkNumTmp
+				for k, v := range warnTmp {
+					warn[k] = v
+				}
+				for k, v := range errTmp {
+					err[k] = v
+				}
+				lock.Unlock()
+			}(l)
 		}
 	}
+	wg.Wait()
 	if fileNum == 0 {
 		warn["未下载任何附件"] = "未下载任何附件"
 	}
@@ -306,7 +363,7 @@ func downloadDetail(fileNum, downloadOkNum *int, err, warn map[string]string, li
 	}
 	var downloadDetailOk bool
 	for i := 1; i <= 3; i++ { //重试三次
-		_, rep_err := s.DownloadDetailPageTest(param, data)
+		_, rep_err := s.DownloadDetailPage(param, data)
 		if rep_err == nil && len(data) > 0 {
 			downloadDetailOk = true
 			*downloadOkNum++ //记录详情页下载成功个数

+ 4 - 1
src/main.go

@@ -3,9 +3,11 @@ package main
 import (
 	_ "filter"
 	"front"
+	"luacheck"
 	"quesManager"
 	"spider"
 	"task"
+	"timetask"
 	"vps"
 
 	//. "luaweb/task"
@@ -14,7 +16,6 @@ import (
 	qu "qfw/util"
 	"qfw/util/redis"
 	"taskManager"
-	"timetask"
 	u "util"
 
 	codegrpc "analysiscode/client"
@@ -99,6 +100,8 @@ func init() {
 func main() {
 	//定时任务
 	go timetask.TimeTask()
+	//爬虫质检
+	go luacheck.LuaCheckStart()
 	//提供接口,接收其他数据
 	http.HandleFunc("/spider/infos", func(w http.ResponseWriter, req *http.Request) {
 		data := req.FormValue("data")

+ 10 - 14
src/spider/msclient.go

@@ -7,7 +7,6 @@ import (
 	"time"
 )
 
-//
 type DynamicIPMap struct {
 	Code        string
 	InvalidTime int64
@@ -26,7 +25,6 @@ var AlldownloaderTest map[string]DynamicIPMap = make(map[string]DynamicIPMap)
 var AlldownloaderChromedp map[string]DynamicIPMap = make(map[string]DynamicIPMap)
 var AlldownloaderChromedpTest map[string]DynamicIPMap = make(map[string]DynamicIPMap)
 
-//
 func processevent(p *mu.Packet) {
 	defer mu.Catch()
 	var data []byte
@@ -166,7 +164,6 @@ func processeventChromedpTest(p *mu.Packet) {
 	}
 }
 
-//
 func gc4Alldownloader() {
 	n := time.Now().Unix()
 	for _, v := range Alldownloader {
@@ -222,7 +219,6 @@ func gc4AlldownloaderChromedpTest() {
 	time.AfterFunc(1*time.Minute, gc4AlldownloaderChromedpTest)
 }
 
-//
 func GetOneDownloader() string {
 	if len(AlldownloaderTest) < 1 {
 		return ""
@@ -241,14 +237,14 @@ func GetOneDownloader() string {
 	return retcode
 }
 
-//初始化,启动消息客户端
+// 初始化,启动消息客户端
 func InitMsgClient(serveraddr, serveraddrbid, serveraddrtest, name, namebid, nametest string) {
 	Msclient, _ = mu.NewClient(&mu.ClientConfig{ClientName: name,
 		MsgServerAddr:   serveraddr,
 		EventHandler:    processevent,
 		CanHandleEvents: []int{mu.SERVICE_DOWNLOAD_APPEND_NODE, mu.SERVICE_DOWNLOAD_DELETE_NODE},
-		ReadBufferSize:  10,
-		WriteBufferSize: 10,
+		ReadBufferSize:  500,
+		WriteBufferSize: 500,
 	})
 	go gc4Alldownloader()
 
@@ -256,8 +252,8 @@ func InitMsgClient(serveraddr, serveraddrbid, serveraddrtest, name, namebid, nam
 		MsgServerAddr:   serveraddrbid,
 		EventHandler:    processeventbid,
 		CanHandleEvents: []int{mu.SERVICE_DOWNLOAD_APPEND_NODE, mu.SERVICE_DOWNLOAD_DELETE_NODE},
-		ReadBufferSize:  10,
-		WriteBufferSize: 10,
+		ReadBufferSize:  500,
+		WriteBufferSize: 500,
 	})
 	go gc4AlldownloaderBid()
 
@@ -265,13 +261,13 @@ func InitMsgClient(serveraddr, serveraddrbid, serveraddrtest, name, namebid, nam
 		MsgServerAddr:   serveraddrtest,
 		EventHandler:    processeventTest,
 		CanHandleEvents: []int{mu.SERVICE_DOWNLOAD_APPEND_NODE, mu.SERVICE_DOWNLOAD_DELETE_NODE},
-		ReadBufferSize:  10,
-		WriteBufferSize: 10,
+		ReadBufferSize:  500,
+		WriteBufferSize: 500,
 	})
 	go gc4AlldownloaderTest()
 }
 
-//初始chrome启动消息客户端
+// 初始chrome启动消息客户端
 func InitChromeMsgClient(chromeaddr, chrometestaddr, name, nametest string) {
 	InitMsgClientChromedp(chromeaddr, name)
 	InitMsgClientChromedpTest(chrometestaddr, nametest)
@@ -282,8 +278,8 @@ func InitMsgClientFile(serveraddr, name string) {
 		MsgServerAddr:   serveraddr,
 		EventHandler:    processeventFile,
 		CanHandleEvents: []int{mu.SERVICE_DOWNLOAD_APPEND_NODE, mu.SERVICE_DOWNLOAD_DELETE_NODE},
-		ReadBufferSize:  200,
-		WriteBufferSize: 200,
+		ReadBufferSize:  500,
+		WriteBufferSize: 500,
 	})
 	go gc4AlldownloaderFile()
 }

+ 6 - 6
src/timetask/timetask.go

@@ -22,12 +22,12 @@ func TimeTask() {
 	c := cron.New()
 	c.Start()
 	c.AddFunc("0 20 9 ? * MON-FRI", CheckCreateTask)
-	c.AddFunc("0 30 23 * * *", UpdateSiteInfo)            //定时更新站点信息
-	c.AddFunc("0 0 23 * * *", UpdateCodeHeart)            //定时更新爬虫心跳信息
-	c.AddFunc("0 0 */1 ? * *", CheckLuaMove)              //7000节点转增量爬虫失败告警
-	c.AddFunc("0 */10 * * * *", SpiderMoveEvent)          //7000节点转增量爬虫
-	c.AddFunc("0 0 8 * * *", UpdateImportantCode)         //更新重点网站爬虫信息
-	c.AddFunc("0 */5 * * * *", luacheck.TimeTaskLuaCheck) //爬虫机检定时任务
+	c.AddFunc("0 30 23 * * *", UpdateSiteInfo)          //定时更新站点信息
+	c.AddFunc("0 0 23 * * *", UpdateCodeHeart)          //定时更新爬虫心跳信息
+	c.AddFunc("0 0 */1 ? * *", CheckLuaMove)            //7000节点转增量爬虫失败告警
+	c.AddFunc("0 */10 * * * *", SpiderMoveEvent)        //7000节点转增量爬虫
+	c.AddFunc("0 0 8 * * *", UpdateImportantCode)       //更新重点网站爬虫信息
+	c.AddFunc("0 */1 * * * *", luacheck.TimeTaskGetLua) //爬虫机检定时任务
 }
 
 // 检测创建任务失败的爬虫

+ 1 - 1
src/util/util.go

@@ -245,7 +245,7 @@ func SpiderPassCheckListAndDetail(result map[int64][]map[string]interface{}, dat
 				}
 			}
 			//href
-			href := qu.ObjToString(l["title"])
+			href := qu.ObjToString(l["href"])
 			if str := HrefReg.FindString(href); str != "" {
 				msgMap["公告链接存在异常后缀"+str] = true
 			}

+ 1 - 1
src/web/templates/index.html

@@ -534,7 +534,7 @@ $(function(){
 			}},
 			{ "data": "param_common","width":"95px",render:function(val,a,row){
 				var div=$("<div><div class=\"btn-group\"></div></div>");
-				if (row.state == 1){
+				if (row.state >= 1 && row.state <= 3){
 					var dbutton;
 					if (typeof row.checkok === "undefined"){
 						dbutton = $('<a type="button" class="btn btn-sm disabled btn-default">未机检</a>');

+ 4 - 4
src/web/templates/spideredit.html

@@ -511,10 +511,10 @@
 		{{end}}
 	})
 	function report(code,taskId){
-		if(!issave){
-			alert("请保存爬虫!")
-			return
-		}
+		// if(!issave){
+		// 	alert("请保存爬虫!")
+		// 	return
+		// }
 		var report=window.prompt("==============请输入问题原因==============\n","");
 		if(report && report!=""){
 			common.maskShow("正在提交...");