
Adjust full and incremental redis val contents

maxiaoshan 3 years ago
Parent commit 91d102726d
4 changed files with 77 additions and 74 deletions
  1. src/spider/msgservice.go (+2 -2)
  2. src/spider/script.go (+63 -60)
  3. src/spider/spider.go (+11 -11)
  4. src/spider/store.go (+1 -1)

+ 2 - 2
src/spider/msgservice.go

@@ -292,12 +292,12 @@ func SaveObj(event int, checkAtrr string, data map[string]interface{}, saveredis
 			}
 			//data the save service failed to receive is stored in data_bak; the resend program relies on it to ensure nothing is lost
 			if id != "" {
-				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 				if !flag { //the save service send succeeded
 					//full (check existence first to avoid overwriting an existing id)
 					isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
 					if !isExist {
-						util.PutRedis("title_repeat_fulljudgement", db, hashHref, "lua_"+id, -1)
+						util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
 					}
 				}
 			}
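
The change in this hunk (and in the matching ones below) keeps the redis entries as pure existence markers: the value written is now an empty string instead of the href or "lua_"+id, which shrinks the dedup stores while leaving the util.ExistRedis checks shown in this diff unaffected. As a rough illustration only, here is a minimal Go sketch of that pattern; it uses the go-redis client as a stand-in and invented names (MarkCollected, inc, full), since the project's own util.PutRedis/util.ExistRedis wrappers are not part of this diff.

package dedup

import (
	"context"
	"time"

	"github.com/redis/go-redis/v9" // assumed client; the project uses its own util wrappers
)

// MarkCollected mirrors the pattern above: the incremental marker lives in db 0
// under "url_repeat_"+href with a one-year TTL, the full marker under the href
// hash with no expiry, and both store "" so redis keeps only the key.
func MarkCollected(ctx context.Context, inc, full *redis.Client, href, hashHref string) error {
	if err := inc.Set(ctx, "url_repeat_"+href, "", 365*24*time.Hour).Err(); err != nil {
		return err
	}
	exists, err := full.Exists(ctx, hashHref).Result()
	if err != nil {
		return err
	}
	if exists == 0 { // do not overwrite an existing full-store entry
		return full.Set(ctx, hashHref, "", 0).Err() // 0 = no expiry, matching the -1 TTL above
	}
	return nil
}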

+ 63 - 60
src/spider/script.go

@@ -127,26 +127,27 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 		}
 		var retLen int64
 		ret := Download(&retLen, s.Downloader, url, "get", util.GetTable(head), charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
-		if retLen > 0 {
-			key := Today + "+" + code
-			if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
-				if sfMap, ok := sf.(*SpiderFlow); ok {
-					sfMap.Flow += retLen
-					//sfMap.Site = *site
-					//sfMap.Channel = *channel
-					//sfMap.ModifyUser = *user
-					SpiderFlowMap.Store(key, sfMap)
-				}
-			} else {
-				SpiderFlowMap.Store(key, &SpiderFlow{
-					//Code:       code,
-					Site:       *site,
-					Channel:    *channel,
-					Flow:       retLen,
-					ModifyUser: *user,
-				})
-			}
-		}
+		//traffic statistics
+		//if retLen > 0 {
+		//	key := Today + "+" + code
+		//	if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
+		//		if sfMap, ok := sf.(*SpiderFlow); ok {
+		//			sfMap.Flow += retLen
+		//			//sfMap.Site = *site
+		//			//sfMap.Channel = *channel
+		//			//sfMap.ModifyUser = *user
+		//			SpiderFlowMap.Store(key, sfMap)
+		//		}
+		//	} else {
+		//		SpiderFlowMap.Store(key, &SpiderFlow{
+		//			//Code:       code,
+		//			Site:       *site,
+		//			Channel:    *channel,
+		//			Flow:       retLen,
+		//			ModifyUser: *user,
+		//		})
+		//	}
+		//}
 		S.Push(lua.LString(ret))
 		atomic.AddInt32(&s.ToDayRequestNum, 1)
 		atomic.AddInt32(&s.TotalRequestNum, 1)
@@ -188,26 +189,27 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 		} else {
 			ret, retcookie, headers = DownloadAdv(&retLen, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
 		}
-		if retLen > 0 {
-			key := Today + "+" + code
-			if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
-				if sfMap, ok := sf.(*SpiderFlow); ok {
-					sfMap.Flow += retLen
-					//sfMap.Site = *site
-					//sfMap.Channel = *channel
-					//sfMap.ModifyUser = *user
-					SpiderFlowMap.Store(key, sfMap)
-				}
-			} else {
-				SpiderFlowMap.Store(key, &SpiderFlow{
-					//Code:       code,
-					Site:       *site,
-					Channel:    *channel,
-					Flow:       retLen,
-					ModifyUser: *user,
-				})
-			}
-		}
+		//traffic statistics
+		//if retLen > 0 {
+		//	key := Today + "+" + code
+		//	if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
+		//		if sfMap, ok := sf.(*SpiderFlow); ok {
+		//			sfMap.Flow += retLen
+		//			//sfMap.Site = *site
+		//			//sfMap.Channel = *channel
+		//			//sfMap.ModifyUser = *user
+		//			SpiderFlowMap.Store(key, sfMap)
+		//		}
+		//	} else {
+		//		SpiderFlowMap.Store(key, &SpiderFlow{
+		//			//Code:       code,
+		//			Site:       *site,
+		//			Channel:    *channel,
+		//			Flow:       retLen,
+		//			ModifyUser: *user,
+		//		})
+		//	}
+		//}
 		S.Push(lua.LString(ret))
 		scookie, _ := json.Marshal(retcookie)
 		S.Push(lua.LString(scookie))
@@ -248,26 +250,27 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 		url = strings.TrimSpace(url)
 		var retLen int64
 		ret := DownloadFile(&retLen, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
-		if retLen > 0 {
-			key := Today + "+" + code
-			if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
-				if sfMap, ok := sf.(*SpiderFlow); ok {
-					sfMap.Flow += retLen
-					//sfMap.Site = *site
-					//sfMap.Channel = *channel
-					//sfMap.ModifyUser = *user
-					SpiderFlowMap.Store(key, sfMap)
-				}
-			} else {
-				SpiderFlowMap.Store(key, &SpiderFlow{
-					//Code:       code,
-					Site:       *site,
-					Channel:    *channel,
-					Flow:       retLen,
-					ModifyUser: *user,
-				})
-			}
-		}
+		//traffic statistics
+		//if retLen > 0 {
+		//	key := Today + "+" + code
+		//	if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
+		//		if sfMap, ok := sf.(*SpiderFlow); ok {
+		//			sfMap.Flow += retLen
+		//			//sfMap.Site = *site
+		//			//sfMap.Channel = *channel
+		//			//sfMap.ModifyUser = *user
+		//			SpiderFlowMap.Store(key, sfMap)
+		//		}
+		//	} else {
+		//		SpiderFlowMap.Store(key, &SpiderFlow{
+		//			//Code:       code,
+		//			Site:       *site,
+		//			Channel:    *channel,
+		//			Flow:       retLen,
+		//			ModifyUser: *user,
+		//		})
+		//	}
+		//}
 
 		url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
 		if strings.TrimSpace(ftype) == "" {
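
The three hunks above disable identical per-request traffic accounting in LoadScript. For reference, the disabled logic can be read as one helper; this is only the code from the commented-out blocks consolidated (the SpiderFlow fields and the Today+"+"+code key are taken from the diff), and the addFlow name exists just for this sketch.

package spider

import "sync"

// SpiderFlow mirrors the fields visible in the disabled blocks above.
type SpiderFlow struct {
	Site       string
	Channel    string
	Flow       int64
	ModifyUser string
}

// SpiderFlowMap is keyed by Today + "+" + code.
var SpiderFlowMap sync.Map

// addFlow accumulates the downloaded byte count per spider code and day,
// as the disabled blocks did for Download, DownloadAdv and DownloadFile.
func addFlow(today, code, site, channel, user string, retLen int64) {
	if retLen <= 0 {
		return
	}
	key := today + "+" + code
	if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
		if sfMap, ok := sf.(*SpiderFlow); ok {
			sfMap.Flow += retLen
			SpiderFlowMap.Store(key, sfMap)
		}
	} else {
		SpiderFlowMap.Store(key, &SpiderFlow{Site: site, Channel: channel, Flow: retLen, ModifyUser: user})
	}
}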

+ 11 - 11
src/spider/spider.go

@@ -340,7 +340,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 								db := HexToBigIntMod(href) //choose the Redis db from the hash of href
 								hashHref := HexText(href)
 								//incremental (redis default db 0)
-								util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+								util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 								//full (check existence first to avoid overwriting the id)
 								isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
 								if !isExist {
@@ -522,7 +522,7 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
 	set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}
 	if data["delete"] != nil {
 		//incremental
-		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 		//full
 		db := HexToBigIntMod(href)
 		hashHref := HexText(href)
@@ -601,7 +601,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 	if util.Config.Modal == 1 { //all nodes except 7000, 7500 and 7700 collect only list-page info
 		isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
 		if isExist { //refresh the redis TTL
-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 			*num++ //already collected
 			return
 		}
@@ -613,7 +613,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 		}
 		isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
 		if isExist { //refresh the redis TTL
-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 			*num++ //already collected
 			return
 		}
@@ -629,7 +629,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 		}
 		SaveListPageData(paramdata, &id, isEsRepeat) //save the list-page info collected by nodes 7000, 7410, 7500 and 7700
 		if isEsRepeat {                              //competitor-like data: add the title-dedup record to redis
-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 			return
 		}
 	}
@@ -650,7 +650,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 	} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //detail-page href replacement makes the href differ before and after
 		log.Println("beforeHref:", href, "afterHref:", tmphref)
 		//incremental
-		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 		//full
 		db := HexToBigIntMod(href)
 		hashHref := HexText(href)
@@ -667,7 +667,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 	//data filtered out on the detail page
 	if data["delete"] != nil {
 		//incremental
-		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 		//full
 		db := HexToBigIntMod(href)
 		hashHref := HexText(href)
@@ -939,7 +939,7 @@ func (s *Spider) DownloadDetail(stype string) {
 						esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
 						count := Es.Count(EsIndex, EsType, esQuery)
 						if count > 0 { //es already contains this title: skip collection and update the list table state
-							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 							set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //already exists: set state to 1
 							update = append(update, query)
 							update = append(update, set)
@@ -977,7 +977,7 @@ func (s *Spider) DownloadDetail(stype string) {
 					} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //detail-page href replacement makes the href differ before and after
 						log.Println("beforeHref:", href, "afterHref:", tmphref)
 						//incremental
-						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 						//full
 						db := HexToBigIntMod(href)
 						hashHref := HexText(href)
@@ -1001,7 +1001,7 @@ func (s *Spider) DownloadDetail(stype string) {
 						return
 					} else if data["delete"] != nil { //filtered on the detail page
 						//incremental
-						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 						//full
 						db := HexToBigIntMod(href)
 						hashHref := HexText(href)
@@ -1103,7 +1103,7 @@ func NewSpiderByScript(num int, code string, info map[string]string, spChan chan
 func AnalysisProjectInfo(data map[string]interface{}) bool {
 	defer qu.Catch()
 	detail := qu.ObjToString(data["detail"])
-	if detail == "详情请访问原网页" || detail == "<br/>详情请访问原网页!" { //不判断包含关系因为有些数据为json拼接,字段不全,会加“详情请访问原网页”
+	if detail == "详情请访问原网页" || detail == "<br/>详情请访问原网页!" { //不判断包含关系因为有些数据为json拼接,字段不全,会加“详情请访问原网页”
 		if projectinfo, ok := data["projectinfo"].(map[string]interface{}); ok && len(projectinfo) > 0 {
 			if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok && len(attachments) > 0 {
 				fileOk := false
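
Throughout spider.go the full-store key and db come from HexToBigIntMod(href) and HexText(href), whose implementations are not part of this diff. Purely as an assumed shape for readers unfamiliar with those helpers, a db-by-hash scheme might look like the sketch below; the hash algorithm, the dbCount parameter and the hashHrefKey name are all assumptions, not the project's code.

package spider

import (
	"crypto/md5"
	"encoding/hex"
	"math/big"
)

// hashHrefKey is an assumed shape only: hash the href, use the hex digest as
// the full-store key, and pick a redis db by taking the hash modulo the db count.
func hashHrefKey(href string, dbCount int64) (db int64, key string) {
	sum := md5.Sum([]byte(href))
	key = hex.EncodeToString(sum[:])
	n := new(big.Int).SetBytes(sum[:])
	db = new(big.Int).Mod(n, big.NewInt(dbCount)).Int64()
	return db, key
}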

+ 1 - 1
src/spider/store.go

@@ -82,7 +82,7 @@ func Store(mode, event int, c, coverAttr string, data map[string]interface{}, fl
 			db := HexToBigIntMod(href) //choose the Redis db from the hash of href
 			hashHref := HexText(href)
 			//incremental
-			lu.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
+			lu.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
 			//full
 			isExist, _ := lu.ExistRedis("title_repeat_fulljudgement", db, hashHref)
 			if !isExist {