
Optimize historical-version collection

maxiaoshan, 3 years ago
commit 4acab29110
3 changed files with 48 additions and 46 deletions
  1. src/config.json (+1 −0)
  2. src/spider/handler.go (+39 −40)
  3. src/spider/spider.go (+8 −6)

+ 1 - 0
src/config.json

@@ -17,6 +17,7 @@
     "daynum": 6,
     "daynum": 6,
     "modal": 1,
     "modal": 1,
     "ishistoryevent": false,
     "ishistoryevent": false,
+    "threadbasenum": 50,
     "tesseractadd": "http://test.qmx.top:1688",
     "tesseractadd": "http://test.qmx.top:1688",
     "testdir": "res/test/spider_test.lua",
     "testdir": "res/test/spider_test.lua",
     "redisservers": "title_repeat_judgement=192.168.3.207:2679,title_repeat_fulljudgement=192.168.3.207:2679,title_repeat_listpagehref=192.168.3.207:1679",
     "redisservers": "title_repeat_judgement=192.168.3.207:2679,title_repeat_fulljudgement=192.168.3.207:2679,title_repeat_listpagehref=192.168.3.207:1679",

+ 39 - 40
src/spider/handler.go

@@ -71,32 +71,34 @@ func NoQueueScript() {
 			script := info["script"]
 			sp, errstr := NewSpider(code, script)
 			if errstr == "" && sp != nil && sp.Code != "nil" { //script loaded successfully
-				sp2, _ := NewSpider(code, script)
 				//sp.Index = qu.IntAll(key)
 				//sp2.Index = qu.IntAll(key)
 				if info["createuser"] != "" {
 					sp.UserName = info["createuser"]
-					sp2.UserName = info["createuser"]
 				}
 				if info["createuseremail"] != "" {
 					sp.UserEmail = info["createuseremail"]
-					sp2.UserEmail = info["createuseremail"]
 				}
 				sp.MUserName = info["modifyuser"]
-				sp2.MUserName = info["modifyuser"]
 				sp.MUserEmail = info["modifyemail"]
-				sp2.MUserEmail = info["modifyemail"]
 				Allspiders.Store(sp.Code, sp)
-				Allspiders2.Store(sp.Code, sp2)
 				for _, tmp := range list {
 					if qu.ObjToString(tmp["code"]) == sp.Code {
 						sp.UpperLimit = qu.IntAll(tmp["uplimit"])
-						sp2.UpperLimit = qu.IntAll(tmp["uplimit"])
+						//sp2.UpperLimit = qu.IntAll(tmp["uplimit"])
 						sp.LowerLimit = qu.IntAll(tmp["lowlimit"])
-						sp2.UpperLimit = qu.IntAll(tmp["uplimit"])
+						//sp2.LowerLimit = qu.IntAll(tmp["lowlimit"])
 						break
 					}
 				}
+				if util.Config.Modal == 1 { //mode where list pages and detail pages are collected separately
+					sp2, _ := NewSpider(code, script)
+					sp2.UserName = sp.UserName
+					sp2.UserEmail = sp.UserEmail
+					sp2.MUserName = sp.MUserName
+					sp2.MUserEmail = sp.MUserEmail
+					Allspiders2.Store(sp.Code, sp2)
+				}
 				sp.StartJob()
 				//util.TimeSleepFunc(10*time.Millisecond, TimeSleepChan)
 			} else {
@@ -126,7 +128,7 @@ func NoQueueScript() {
 						},
 					}, true, false)
 			}
-			time.Sleep(1 * time.Second)
+			time.Sleep(100 * time.Millisecond)
 		}
 		return true
 	})
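
Taken together, the first hunk moves spider registration to a twin-registry pattern: every script gets a list-page spider in Allspiders, while the detail-page twin in Allspiders2 is created only in the split collection mode. A minimal sketch of that pattern (register is a hypothetical wrapper; Spider, NewSpider, and the registries are from this repo):

import "sync"

// Sketch of the registration pattern introduced above.
var (
	Allspiders  sync.Map // code -> *Spider, list-page collection
	Allspiders2 sync.Map // code -> *Spider, detail-page collection (Modal == 1 only)
)

func register(code, script string, modal int) {
	sp, errstr := NewSpider(code, script)
	if errstr != "" || sp == nil {
		return // script failed to load
	}
	Allspiders.Store(sp.Code, sp)
	if modal == 1 { // list and detail pages are collected by separate spiders
		sp2, _ := NewSpider(code, script)
		sp2.UserName, sp2.UserEmail = sp.UserName, sp.UserEmail
		sp2.MUserName, sp2.MUserEmail = sp.MUserName, sp.MUserEmail
		Allspiders2.Store(sp.Code, sp2)
	}
	sp.StartJob()
}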
@@ -417,14 +419,8 @@ func UpdateSpiderByCodeState(code, state string) (bool, error) {
 	var err error
 	if state != "5" && state != "-1" { //script take-down
 		SpiderHeart.Delete(code) //script taken down: delete its heartbeat entry
-		// Mgo.Update("spider_heart", map[string]interface{}{"code": code}, map[string]interface{}{
-		// 	"$set": map[string]interface{}{
-		// 		"del":        true,
-		// 		"updatetime": time.Now().Unix(),
-		// 	},
-		// }, false, false)
 		logger.Info("下架脚本", code)
-		if util.Config.Working == 1 {
+		if util.Config.Working == 1 { //queue mode
 			for i, as := range []sync.Map{Allspiders, Allspiders2} {
 				if i == 1 && util.Config.Modal == 0 { //in queue mode with the original collection model, Allspiders2 is unused (7700 spider take-down)
 					continue
@@ -441,25 +437,25 @@ func UpdateSpiderByCodeState(code, state string) (bool, error) {
 					logger.Info("下架脚本,Allspiders删除")
 					logger.Info("下架脚本,Allspiders删除")
 				}
 				}
 			}
 			}
-			LoopListPath.Range(func(k, v interface{}) bool {
-				//if v != nil {
-				//	info, _ := v.(map[string]string)
-				//	if info["code"] == code {
-				//		LoopListPath.Store(k, nil)
-				//		lock.Lock()
-				//		defer lock.Unlock()
-				//		ChanDels[qu.IntAll(k)] = code
-				//		logger.Info("下架脚本,LoopListPath更新为nil,ChanDels中位置:", k)
-				//	}
-				//}
-				if k == code {
-					LoopListPath.Delete(k)
-					logger.Info(code, "脚本下架成功")
-					return false //跳出循环
-				}
-				return true
-			})
-		} else {
+			//LoopListPath.Range(func(k, v interface{}) bool {
+			//	//if v != nil {
+			//	//	info, _ := v.(map[string]string)
+			//	//	if info["code"] == code {
+			//	//		LoopListPath.Store(k, nil)
+			//	//		lock.Lock()
+			//	//		defer lock.Unlock()
+			//	//		ChanDels[qu.IntAll(k)] = code
+			//	//		logger.Info("下架脚本,LoopListPath更新为nil,ChanDels中位置:", k)
+			//	//	}
+			//	//}
+			//	if k == code {
+			//		LoopListPath.Delete(k)
+			//		logger.Info(code, "脚本下架成功")
+			//		return false //跳出循环
+			//	}
+			//	return true
+			//})
+		} else { //高性能模式
 			for _, as := range []sync.Map{Allspiders, Allspiders2} {
 			for _, as := range []sync.Map{Allspiders, Allspiders2} {
 				if tmp, ok := as.Load(code); ok {
 				if tmp, ok := as.Load(code); ok {
 					sp, ok := tmp.(*Spider)
 					sp, ok := tmp.(*Spider)
@@ -471,6 +467,8 @@ func UpdateSpiderByCodeState(code, state string) (bool, error) {
 				}
 			}
 		}
+		LoopListPath.Delete(code)
+		logger.Info(code, "脚本下架成功")
 		up = true
 		err = nil
 	} else if state == "-1" { //spider re-collection: update the online spider
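
Note that LoopListPath.Delete(code) now runs after both branches, so queue mode and high-performance mode share one cleanup step. Condensed, the take-down path reads roughly like the sketch below (takeDown is a hypothetical name; ranging over *sync.Map pointers avoids the map copies that the original range over []sync.Map values makes):

// Condensed sketch of the unified take-down flow after this hunk;
// the stop/delete details are simplified from the code above.
func takeDown(code string) {
	SpiderHeart.Delete(code) // drop the script's heartbeat entry
	for _, as := range []*sync.Map{&Allspiders, &Allspiders2} {
		if tmp, ok := as.Load(code); ok {
			if sp, ok := tmp.(*Spider); ok {
				sp.Stop = true // assumption: Stop signals the running job to exit
			}
			as.Delete(code)
		}
	}
	LoopListPath.Delete(code) // shared by queue and high-performance modes
	logger.Info(code, "脚本下架成功")
}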
@@ -622,7 +620,8 @@ func UpdateSpiderByCodeState(code, state string) (bool, error) {
 			}
 		} else {
 			for k, v := range scriptMap {
-				//1. Allspiders serves script deployment and data download on 7000, 7100, 7400
+				LoopListPath.Store(k, v)
+				//1. Allspiders serves script deployment and data download on 7000, 7100, 7400 (list-page spider registry)
 				if spd, ok := Allspiders.Load(k); ok { //the script already exists: update it
 					sp := spd.(*Spider)
 					sp.ScriptFile = v["script"]
@@ -680,8 +679,8 @@ func UpdateSpiderByCodeState(code, state string) (bool, error) {
 							}, true, false)
 					}
 				}
-				//2. Allspiders2 serves detail-page collection on 7100, 7400
-				if !util.Config.IsHistoryEvent { //the 7000 history node does not collect detail pages from list-page data
+				//2. Allspiders2 serves detail-page collection on 7100, 7110, 7400 (Allspiders2 is the detail-page spider registry)
+				if util.Config.Modal == 1 { //the legacy high-performance mode does not collect detail pages from list-page data (7000, 7410)
 					//Allspiders2
 					if spd2, ok2 := Allspiders2.Load(k); ok2 { //the script already exists: update it
 						sp2 := spd2.(*Spider)
@@ -892,7 +891,7 @@ func ReloadSpiderFile() {
 	util.TimeAfterFunc(time.Duration(15)*time.Minute, ReloadSpiderFile, TimeChan)
 }
 
-//create a spider (queue mode)
+//queue mode: create a spider
 func NewSpider_New(code, luafile string, newstate bool) (*Spider, string) {
 	defer mu.Catch()
 	spider := &Spider{}
@@ -953,7 +952,7 @@ func NewSpider_New(code, luafile string, newstate bool) (*Spider, string) {
 	return spider, ""
 }
 
-//create a spider
+//high-performance mode: create a spider
 func NewSpider(code, luafile string) (*Spider, string) {
 	defer mu.Catch()
 	spider := &Spider{}

+ 8 - 6
src/spider/spider.go

@@ -829,11 +829,11 @@ func (s *Spider) DownloadHighDetail() {
 		if !s.Stop { //the spider is running
 			comeintimeQuery := map[string]interface{}{"$gte": GetTime(-util.Config.DayNum)} //collect the past week of data so rows that never download do not pile up
 			isEsRepeat := false                                                             //whether to deduplicate against es
-			if delayDay := DelaySites[s.Name]; delayDay > 0 {
+			if delayTime := DelaySites[s.Name]; delayTime > 0 {
 				isEsRepeat = true
-				if delayDay <= util.Config.DayNum*24 { //check whether this spider belongs to a delayed-collection site; data is collected delayDay hours late (7410, 7500, 7700 collect sequentially and cannot delay)
+				if delayTime <= util.Config.DayNum*24 { //check whether this spider belongs to a delayed-collection site; data is collected delayTime hours late (7410, 7500, 7700 collect sequentially and cannot delay)
 					//comeintimeQuery["$lte"] = GetTime(-delayDay + 1)
-					comeintimeQuery["$lte"] = time.Now().Unix() - int64(3600*delayDay)
+					comeintimeQuery["$lte"] = time.Now().Unix() - int64(3600*delayTime)
 				}
 			}
 			q := map[string]interface{}{
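
Concretely, with daynum = 6 from config.json and a site delayed by 24 hours, the query keeps rows whose comeintime falls roughly in [now − 6 days, now − 24 hours]. A self-contained sketch of the window (delayWindow is a hypothetical helper; GetTime is approximated with plain Unix arithmetic, while the real one may snap to day boundaries):

import "time"

// delayWindow mirrors the comeintime bounds built above: no older than
// dayNum days, and for delayed sites no newer than delayTime hours.
func delayWindow(dayNum, delayTime int) map[string]interface{} {
	now := time.Now().Unix()
	window := map[string]interface{}{
		"$gte": now - int64(dayNum)*24*3600, // ~GetTime(-dayNum)
	}
	if delayTime > 0 && delayTime <= dayNum*24 {
		window["$lte"] = now - int64(3600*delayTime) // hold back delayTime hours
	}
	return window
}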
@@ -850,6 +850,8 @@ func (s *Spider) DownloadHighDetail() {
 			if !s.Stop { //if the spider was taken down while downloading detail pages, stop recording heartbeats
 				UpdateHeart(s.Name, s.Channel, s.Code, s.MUserName, "detail") //record the modal=1 detail-page collection heartbeat
 			}
+			//countNum := Mgo.Count("spider_highlistdata", q) //count rows not yet downloaded within util.Config.DayNum days
+			//threadNum := countNum % util.Config.ThreadBaseNum
 			list, _ := Mgo.Find("spider_highlistdata", q, o, f, false, 0, 100)
 			if list != nil && len(*list) > 0 {
 				for _, tmp := range *list {
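
The two commented-out lines sketch where the new threadbasenum key is headed: sizing the detail download against the pending backlog. One plausible reading, as a hypothetical helper (note the draft uses %, but a worker count would more naturally come from integer division):

// Hypothetical backlog-based sizing for detail downloads, following
// the commented-out countNum/threadNum draft above.
func detailWorkers(countNum, threadBaseNum int) int {
	n := countNum / threadBaseNum // one worker per threadBaseNum pending rows
	if n < 1 {
		n = 1 // never drop below a single worker
	}
	return n
}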
@@ -979,11 +981,11 @@ func (s *Spider) DownloadListDetail() {
 	}()
 	comeintimeQuery := map[string]interface{}{"$gte": GetTime(-util.Config.DayNum)} //collect the past week of data so rows that never download do not pile up
 	isEsRepeat := false                                                             //whether to deduplicate against es
-	if delayDay := DelaySites[s.Name]; delayDay > 0 {
+	if delayTime := DelaySites[s.Name]; delayTime > 0 {
 		isEsRepeat = true
-		if delayDay <= util.Config.DayNum*24 { //check whether this spider belongs to a delayed-collection site; data is collected delayDay hours late (7410, 7500, 7700 collect sequentially and cannot delay)
+		if delayTime <= util.Config.DayNum*24 { //check whether this spider belongs to a delayed-collection site; data is collected delayTime hours late (7410, 7500, 7700 collect sequentially and cannot delay)
 			//comeintimeQuery["$lte"] = GetTime(-delayDay + 1)
-			comeintimeQuery["$lte"] = time.Now().Unix() - int64(3600*delayDay)
+			comeintimeQuery["$lte"] = time.Now().Unix() - int64(3600*delayTime)
 		}
 	}
 	q := map[string]interface{}{