3 года назад · 91d102726d
--- a/src/spider/msgservice.go
+++ b/src/spider/msgservice.go
@@ -292,12 +292,12 @@ func SaveObj(event int, checkAtrr string, data map[string]interface{}, saveredis
 
				 			}
			
 
				 			//保存服务未接收成功的数据会存入data_bak中，确保数据不丢失依赖补发程序
			
 
				 			if id != "" {
			
 
				-				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+				util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 				if !flag { //保存服务发送成功
			
 
				 					//全量(判断是否已存在防止覆盖id)
			
 
				 					isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
			
 
				 					if !isExist {
			
 
				-						util.PutRedis("title_repeat_fulljudgement", db, hashHref, "lua_"+id, -1)
			
 
				+						util.PutRedis("title_repeat_fulljudgement", db, hashHref, "", -1)
			
 
				 					}
			
 
				 				}
			
 
				 			}
			
--- a/src/spider/script.go
+++ b/src/spider/script.go
@@ -127,26 +127,27 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 
				 		}
			
 
				 		var retLen int64
			
 
				 		ret := Download(&retLen, s.Downloader, url, "get", util.GetTable(head), charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
			
 
				-		if retLen > 0 {
			
 
				-			key := Today + "+" + code
			
 
				-			if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
			
 
				-				if sfMap, ok := sf.(*SpiderFlow); ok {
			
 
				-					sfMap.Flow += retLen
			
 
				-					//sfMap.Site = *site
			
 
				-					//sfMap.Channel = *channel
			
 
				-					//sfMap.ModifyUser = *user
			
 
				-					SpiderFlowMap.Store(key, sfMap)
			
 
				-				}
			
 
				-			} else {
			
 
				-				SpiderFlowMap.Store(key, &SpiderFlow{
			
 
				-					//Code:       code,
			
 
				-					Site:       *site,
			
 
				-					Channel:    *channel,
			
 
				-					Flow:       retLen,
			
 
				-					ModifyUser: *user,
			
 
				-				})
			
 
				-			}
			
 
				-		}
			
 
				+		//流量统计
			
 
				+		//if retLen > 0 {
			
 
				+		//	key := Today + "+" + code
			
 
				+		//	if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
			
 
				+		//		if sfMap, ok := sf.(*SpiderFlow); ok {
			
 
				+		//			sfMap.Flow += retLen
			
 
				+		//			//sfMap.Site = *site
			
 
				+		//			//sfMap.Channel = *channel
			
 
				+		//			//sfMap.ModifyUser = *user
			
 
				+		//			SpiderFlowMap.Store(key, sfMap)
			
 
				+		//		}
			
 
				+		//	} else {
			
 
				+		//		SpiderFlowMap.Store(key, &SpiderFlow{
			
 
				+		//			//Code:       code,
			
 
				+		//			Site:       *site,
			
 
				+		//			Channel:    *channel,
			
 
				+		//			Flow:       retLen,
			
 
				+		//			ModifyUser: *user,
			
 
				+		//		})
			
 
				+		//	}
			
 
				+		//}
			
 
				 		S.Push(lua.LString(ret))
			
 
				 		atomic.AddInt32(&s.ToDayRequestNum, 1)
			
 
				 		atomic.AddInt32(&s.TotalRequestNum, 1)
			
@@ -188,26 +189,27 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 
				 		} else {
			
 
				 			ret, retcookie, headers = DownloadAdv(&retLen, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
			
 
				 		}
			
 
				-		if retLen > 0 {
			
 
				-			key := Today + "+" + code
			
 
				-			if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
			
 
				-				if sfMap, ok := sf.(*SpiderFlow); ok {
			
 
				-					sfMap.Flow += retLen
			
 
				-					//sfMap.Site = *site
			
 
				-					//sfMap.Channel = *channel
			
 
				-					//sfMap.ModifyUser = *user
			
 
				-					SpiderFlowMap.Store(key, sfMap)
			
 
				-				}
			
 
				-			} else {
			
 
				-				SpiderFlowMap.Store(key, &SpiderFlow{
			
 
				-					//Code:       code,
			
 
				-					Site:       *site,
			
 
				-					Channel:    *channel,
			
 
				-					Flow:       retLen,
			
 
				-					ModifyUser: *user,
			
 
				-				})
			
 
				-			}
			
 
				-		}
			
 
				+		//流量统计
			
 
				+		//if retLen > 0 {
			
 
				+		//	key := Today + "+" + code
			
 
				+		//	if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
			
 
				+		//		if sfMap, ok := sf.(*SpiderFlow); ok {
			
 
				+		//			sfMap.Flow += retLen
			
 
				+		//			//sfMap.Site = *site
			
 
				+		//			//sfMap.Channel = *channel
			
 
				+		//			//sfMap.ModifyUser = *user
			
 
				+		//			SpiderFlowMap.Store(key, sfMap)
			
 
				+		//		}
			
 
				+		//	} else {
			
 
				+		//		SpiderFlowMap.Store(key, &SpiderFlow{
			
 
				+		//			//Code:       code,
			
 
				+		//			Site:       *site,
			
 
				+		//			Channel:    *channel,
			
 
				+		//			Flow:       retLen,
			
 
				+		//			ModifyUser: *user,
			
 
				+		//		})
			
 
				+		//	}
			
 
				+		//}
			
 
				 		S.Push(lua.LString(ret))
			
 
				 		scookie, _ := json.Marshal(retcookie)
			
 
				 		S.Push(lua.LString(scookie))
			
@@ -248,26 +250,27 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 
				 		url = strings.TrimSpace(url)
			
 
				 		var retLen int64
			
 
				 		ret := DownloadFile(&retLen, s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, s.Userproxy, ishttps, s.SCode, s.Timeout)
			
 
				-		if retLen > 0 {
			
 
				-			key := Today + "+" + code
			
 
				-			if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
			
 
				-				if sfMap, ok := sf.(*SpiderFlow); ok {
			
 
				-					sfMap.Flow += retLen
			
 
				-					//sfMap.Site = *site
			
 
				-					//sfMap.Channel = *channel
			
 
				-					//sfMap.ModifyUser = *user
			
 
				-					SpiderFlowMap.Store(key, sfMap)
			
 
				-				}
			
 
				-			} else {
			
 
				-				SpiderFlowMap.Store(key, &SpiderFlow{
			
 
				-					//Code:       code,
			
 
				-					Site:       *site,
			
 
				-					Channel:    *channel,
			
 
				-					Flow:       retLen,
			
 
				-					ModifyUser: *user,
			
 
				-				})
			
 
				-			}
			
 
				-		}
			
 
				+		//流量统计
			
 
				+		//if retLen > 0 {
			
 
				+		//	key := Today + "+" + code
			
 
				+		//	if sf, ok := SpiderFlowMap.Load(key); ok && sf != nil {
			
 
				+		//		if sfMap, ok := sf.(*SpiderFlow); ok {
			
 
				+		//			sfMap.Flow += retLen
			
 
				+		//			//sfMap.Site = *site
			
 
				+		//			//sfMap.Channel = *channel
			
 
				+		//			//sfMap.ModifyUser = *user
			
 
				+		//			SpiderFlowMap.Store(key, sfMap)
			
 
				+		//		}
			
 
				+		//	} else {
			
 
				+		//		SpiderFlowMap.Store(key, &SpiderFlow{
			
 
				+		//			//Code:       code,
			
 
				+		//			Site:       *site,
			
 
				+		//			Channel:    *channel,
			
 
				+		//			Flow:       retLen,
			
 
				+		//			ModifyUser: *user,
			
 
				+		//		})
			
 
				+		//	}
			
 
				+		//}
			
 
				 
			
 
				 		url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
			
 
				 		if strings.TrimSpace(ftype) == "" {
			
--- a/src/spider/spider.go
+++ b/src/spider/spider.go
@@ -340,7 +340,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
 
				 								db := HexToBigIntMod(href) //根据href的哈希值选择Redis的db
			
 
				 								hashHref := HexText(href)
			
 
				 								//增量(redis默认db0)
			
 
				-								util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+								util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 								//全量(判断是否已存在防止覆盖id)
			
 
				 								isExist, _ := util.ExistRedis("title_repeat_fulljudgement", db, hashHref)
			
 
				 								if !isExist {
			
@@ -522,7 +522,7 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
 
				 	set := map[string]interface{}{"state": 1, "updatetime": time.Now().Unix()}
			
 
				 	if data["delete"] != nil {
			
 
				 		//增量
			
 
				-		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 		//全量
			
 
				 		db := HexToBigIntMod(href)
			
 
				 		hashHref := HexText(href)
			
@@ -601,7 +601,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 
				 	if util.Config.Modal == 1 { //除7000、7500、7700节点外所有节点只采集列表页信息
			
 
				 		isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
			
 
				 		if isExist { //更新redis生命周期
			
 
				-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 			*num++ //已采集
			
 
				 			return
			
 
				 		}
			
@@ -613,7 +613,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 
				 		}
			
 
				 		isExist, _ := util.ExistRedis("title_repeat_judgement", 0, "url_repeat_"+href)
			
 
				 		if isExist { //更新redis生命周期
			
 
				-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 			*num++ //已采集
			
 
				 			return
			
 
				 		}
			
@@ -629,7 +629,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 
				 		}
			
 
				 		SaveListPageData(paramdata, &id, isEsRepeat) //保存7000、7410、7500、7700节点列表页采集的信息
			
 
				 		if isEsRepeat {                              //类竞品数据title判重数据加入redis
			
 
				-			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+			util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 			return
			
 
				 		}
			
 
				 	}
			
@@ -650,7 +650,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 
				 	} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
			
 
				 		log.Println("beforeHref:", href, "afterHref:", tmphref)
			
 
				 		//增量
			
 
				-		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 		//全量
			
 
				 		db := HexToBigIntMod(href)
			
 
				 		hashHref := HexText(href)
			
@@ -667,7 +667,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
 
				 	//详情页过滤数据
			
 
				 	if data["delete"] != nil {
			
 
				 		//增量
			
 
				-		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+		util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 		//全量
			
 
				 		db := HexToBigIntMod(href)
			
 
				 		hashHref := HexText(href)
			
@@ -939,7 +939,7 @@ func (s *Spider) DownloadDetail(stype string) {
 
				 						esQuery := `{"query": {"filtered": {"filter": {"bool": {"must": [{"range": {"comeintime": {"gte": "` + fmt.Sprint(sTime) + `","lte": "` + fmt.Sprint(eTime) + `"}}}]}},"query": {"bool": {"must": [{"multi_match": {"query": "` + title + `","type": "phrase","fields": ["title"]}}]}}}}}`
			
 
				 						count := Es.Count(EsIndex, EsType, esQuery)
			
 
				 						if count > 0 { //es中含本title数据，不再采集，更新list表数据状态
			
 
				-							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+							util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 							set := map[string]interface{}{"$set": map[string]interface{}{"state": 1, "exist": true, "updatetime": time.Now().Unix()}} //已存在state置为1
			
 
				 							update = append(update, query)
			
 
				 							update = append(update, set)
			
@@ -977,7 +977,7 @@ func (s *Spider) DownloadDetail(stype string) {
 
				 					} else if tmphref := qu.ObjToString(data["href"]); tmphref != href { //三级页href替换导致前后href不同
			
 
				 						log.Println("beforeHref:", href, "afterHref:", tmphref)
			
 
				 						//增量
			
 
				-						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 						//全量
			
 
				 						db := HexToBigIntMod(href)
			
 
				 						hashHref := HexText(href)
			
@@ -1001,7 +1001,7 @@ func (s *Spider) DownloadDetail(stype string) {
 
				 						return
			
 
				 					} else if data["delete"] != nil { //三级页过滤
			
 
				 						//增量
			
 
				-						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+						util.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 						//全量
			
 
				 						db := HexToBigIntMod(href)
			
 
				 						hashHref := HexText(href)
			
@@ -1103,7 +1103,7 @@ func NewSpiderByScript(num int, code string, info map[string]string, spChan chan
 
				 func AnalysisProjectInfo(data map[string]interface{}) bool {
			
 
				 	defer qu.Catch()
			
 
				 	detail := qu.ObjToString(data["detail"])
			
 
				-	if detail == "详情请访问原网页" || detail == "<br/>详情请访问原网页！" { //不判断包含关系因为有些数据为json拼接，字段不全，会加“详情请访问原网页”
			
 
				+	if detail == "详情请访问原网页！" || detail == "<br/>详情请访问原网页！" { //不判断包含关系因为有些数据为json拼接，字段不全，会加“详情请访问原网页”
			
 
				 		if projectinfo, ok := data["projectinfo"].(map[string]interface{}); ok && len(projectinfo) > 0 {
			
 
				 			if attachments, ok := projectinfo["attachments"].(map[string]interface{}); ok && len(attachments) > 0 {
			
 
				 				fileOk := false
			
--- a/src/spider/store.go
+++ b/src/spider/store.go
@@ -82,7 +82,7 @@ func Store(mode, event int, c, coverAttr string, data map[string]interface{}, fl
 
				 			db := HexToBigIntMod(href) //根据href的哈希值选择Redis的db
			
 
				 			hashHref := HexText(href)
			
 
				 			//增量
			
 
				-			lu.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, href, 3600*24*365)
			
 
				+			lu.PutRedis("title_repeat_judgement", 0, "url_repeat_"+href, "", 3600*24*365)
			
 
				 			//全量
			
 
				 			isExist, _ := lu.ExistRedis("title_repeat_fulljudgement", db, hashHref)
			
 
				 			if !isExist {