
Adjust spider_listdata-related statistics

maxiaoshan · 2 years ago
Commit 4b79e25b5f
3 files changed, 735 insertions(+), 9 deletions(-)
  1. src/luatask/newtask.go    +516 −0
  2. src/luatask/sitecount.go  +37 −2
  3. src/luatask/task.go       +182 −7

+ 516 - 0
src/luatask/newtask.go

@@ -0,0 +1,516 @@
+package luatask
+
+import (
+	"encoding/json"
+	"github.com/donnie4w/go-logger/logger"
+	qu "qfw/util"
+	"sync"
+	"util"
+)
+
+var NewCodeInfoMap = map[string]*NewSpider{}
+
+type NewSpider struct {
+	// spider base info
+	Code         string                 `json:"code"`
+	Site         string                 `json:"site"`
+	Channel      string                 `json:"channel"`
+	Platform     string                 `json:"platform"`
+	Event        int                    `json:"event"`
+	PendState    int                    `json:"pendstate"`
+	ModifyUser   string                 `json:"modifyuser"`
+	ModifyId     string                 `json:"modifyuserid"`
+	ModifyTime   int64                  `json:"modifytime"`
+	Model        int                    `json:"model"`
+	Working      int                    `json:"working"`
+	AuditTime    int64                  `json:"l_uploadtime"`
+	ListIsFilter bool                   `json:"listisfilter"`
+	TaskTags     map[string]interface{} `json:"tasktags"`
+	// statistics
+	Detail_DownloadNum        int               `json:"detail_downloadnum"`
+	Detail_DownloadSuccessNum int               `json:"detail_downloadsuccessnum"`
+	Detail_DownloadFailNum    int               `json:"detail_downloadfailnum"`
+	List_IsGetData            bool              `json:"list_isgetdata"`
+	List_RunTimes             int               `json:"list_runtimes"`
+	List_NoDataTimes          int               `json:"list_nodatatimes"`
+	List_AllInTimes           int               `json:"list_allintimes"`
+	WarnInfoMap               map[int]*WarnInfo `json:"warninfo"`
+	//python
+	Py_TaskId   string `json:"py_taskid"`
+	Py_NodeName string `json:"py_nodename"`
+	// supplementary info
+	Comeintime int64 `json:"comeintime"`
+}
+
+type WarnInfo struct {
+	Info   string         `json:"info"`
+	Num    int            `json:"num"`
+	Fields map[string]int `json:"fields"`
+}
+
+func NewStartTask() {
+	InitInfo() // initialize the statistics time window
+	logger.Info(StartTime, EndTime, Publishtime)
+	getCodeBaseInfo()      // load spider base info
+	getPythonSummaryInfo() // aggregate the python summary
+	getLuaSummaryInfo()    // aggregate the lua summary
+	getWarnInfo()          // aggregate warning info
+	saveCodeInfo()         // persist the results
+}
+
+func getCodeBaseInfo() {
+	defer qu.Catch()
+	sess := util.MgoEB.GetMgoConn()
+	defer util.MgoEB.DestoryMongoConn(sess)
+	lock := &sync.Mutex{}
+	wg := &sync.WaitGroup{}
+	ch := make(chan bool, 5)
+	query := map[string]interface{}{
+		"$or": []interface{}{
+			// online lua and python spiders
+			map[string]interface{}{
+				"state": map[string]interface{}{
+					"$in": []int{5, 11}, //上架、上线爬虫
+				},
+			},
+			// lua spiders under maintenance, plus published ones
+			map[string]interface{}{
+				"platform": "golua平台",
+				"state": map[string]interface{}{
+					"$in": []int{0, 1, 2}, //待完成、待审核、未通过
+				},
+				"event": map[string]interface{}{
+					"$ne": 7000,
+				},
+			},
+		},
+	}
+	fields := map[string]interface{}{
+		"code":         1,
+		"site":         1,
+		"channel":      1,
+		"platform":     1,
+		"event":        1,
+		"pendstate":    1,
+		"modifyuser":   1,
+		"modifyuserid": 1,
+		"modifytime":   1,
+		"l_uploadtime": 1,
+		"listisfilter": 1,
+		"tasktags":     1,
+	}
+	it := sess.DB(util.MgoEB.DbName).C("luaconfig").Find(&query).Select(&fields).Iter()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
+		wg.Add(1)
+		ch <- true
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			sp := &NewSpider{
+				WarnInfoMap: map[int]*WarnInfo{},
+			}
+			luaByte, _ := json.Marshal(tmp)
+			if json.Unmarshal(luaByte, &sp) != nil {
+				qu.Info("failed to init spider:", tmp["_id"])
+				return
+			}
+			sp.Working = util.CodeEventWorking[sp.Event] // keyed by event, like Model below ("working" is not in the projection, so sp.Working is always 0 at this point)
+			sp.Model = util.CodeEventModel[sp.Event]
+			lock.Lock()
+			NewCodeInfoMap[sp.Code] = sp
+			lock.Unlock()
+		}(tmp)
+		if n%1000 == 0 {
+			logger.Info(n)
+		}
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	logger.Info("爬虫基本信息准备完成...", len(CodeInfoMap))
+}
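Note: every collection scan in this file repeats the same bounded-concurrency shape — a buffered channel caps in-flight goroutines at 5, a WaitGroup joins them, a mutex guards the shared map, and the loop passes tmp into the goroutine before re-making it so workers never share the iterator's buffer. A minimal, self-contained sketch of that pattern (illustrative names only, nothing from the codebase):

package main

import (
	"fmt"
	"sync"
)

func main() {
	results := map[int]int{} // shared state, guarded by lock
	lock := &sync.Mutex{}
	wg := &sync.WaitGroup{}
	ch := make(chan bool, 5) // semaphore: at most 5 workers in flight

	for i := 0; i < 100; i++ {
		wg.Add(1)
		ch <- true // blocks until a slot frees up
		go func(i int) { // pass i by value, as the loops here pass tmp
			defer func() {
				<-ch // release the slot
				wg.Done()
			}()
			lock.Lock()
			results[i] = i * i
			lock.Unlock()
		}(i)
	}
	wg.Wait()
	fmt.Println(len(results)) // 100
}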
+
+func getPythonSummaryInfo() {
+	defer qu.Catch()
+	sess := util.MgoPy.GetMgoConn()
+	defer util.MgoPy.DestoryMongoConn(sess)
+	lock := &sync.Mutex{}
+	wg := &sync.WaitGroup{}
+	ch := make(chan bool, 5)
+	query := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$gte": util.GetTime(0),
+		},
+	}
+	it := sess.DB(util.MgoPy.DbName).C("spider_monitor").Find(&query).Iter()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
+		wg.Add(1)
+		ch <- true
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			code := qu.ObjToString(tmp["code"])
+			if is_valid, _ := tmp["is_valid"].(bool); !is_valid { // monitoring marks the spider invalid: drop it
+				lock.Lock()
+				delete(NewCodeInfoMap, code)
+				lock.Unlock()
+				return
+			}
+			py_taskid := qu.ObjToString(tmp["py_taskid"])
+			py_nodename := qu.ObjToString(tmp["py_nodename"])
+			list_isgetdata, _ := tmp["list_isgetdata"].(bool)
+			list_allintimes := qu.IntAll(tmp["list_allintimes"])
+			list_nodatatimes := qu.IntAll(tmp["list_nodatatimes"])
+			list_runtimes := qu.IntAll(tmp["list_runtimes"])
+			detail_downloadnum := qu.IntAll(tmp["detail_downloadnum"])
+			detail_downloadsuccessnum := qu.IntAll(tmp["detail_downloadsuccessnum"])
+			detail_downloadfailnum := qu.IntAll(tmp["detail_downloadfailnum"])
+			lock.Lock()
+			if sp := NewCodeInfoMap[code]; sp != nil {
+				sp.Py_TaskId = py_taskid
+				sp.Py_NodeName = py_nodename
+				sp.List_IsGetData = list_isgetdata
+				sp.List_AllInTimes = list_allintimes
+				sp.List_NoDataTimes = list_nodatatimes
+				sp.List_RunTimes = list_runtimes
+				sp.Detail_DownloadNum = detail_downloadnum
+				sp.Detail_DownloadSuccessNum = detail_downloadsuccessnum
+				sp.Detail_DownloadFailNum = detail_downloadfailnum
+			}
+			lock.Unlock()
+		}(tmp)
+		if n%1000 == 0 {
+			logger.Info(n)
+		}
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	logger.Info("python汇总信息完成...")
+
+}
+
+func getLuaSummaryInfo() {
+	getCodeHeart()                 // heartbeat info
+	getSpiderHighListDownloadNum() // download counts from spider_highlistdata
+	getSpiderListDownloadNum()     // download counts from spider_listdata
+	getSpiderDownloadRateDataNew() // per-day download-rate details
+}
+
+func getWarnInfo() {
+	defer qu.Catch()
+	sess := util.MgoS.GetMgoConn()
+	defer util.MgoS.DestoryMongoConn(sess)
+	query := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$gte": StartTime,
+			"$lt":  EndTime,
+		},
+	}
+	fields := map[string]interface{}{
+		"data": 0,
+	}
+	it := sess.DB(util.MgoS.DbName).C("spider_warn").Find(&query).Select(&fields).Iter()
+	n := 0
+	ch := make(chan bool, 5)
+	wg := &sync.WaitGroup{}
+	lock := &sync.Mutex{}
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		wg.Add(1)
+		ch <- true
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			infotype := qu.IntAll(tmp["infotype"])
+			level := qu.IntAll(tmp["level"])
+			field := qu.ObjToString(tmp["field"])
+			if infotype == 3 || infotype == 7 {
+				return
+			}
+			if (infotype == 5 || infotype == 6) && level == 1 {
+				return
+			} else if infotype == 8 && field == "projectinfo" {
+				return
+			}
+			code := qu.ObjToString(tmp["code"])
+			info := qu.ObjToString(tmp["info"])
+			lock.Lock()
+			if sp := NewCodeInfoMap[code]; sp != nil {
+				if wf := sp.WarnInfoMap[infotype]; wf != nil {
+					wf.Num++ // keep the total in step with the per-field counts
+					wf.Fields[field]++
+				} else {
+					sp.WarnInfoMap[infotype] = &WarnInfo{
+						Info:   info,
+						Num:    1,
+						Fields: map[string]int{field: 1},
+					}
+				}
+			}
+			lock.Unlock()
+		}(tmp)
+		if n%1000 == 0 {
+			logger.Info(n)
+		}
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	logger.Info("错误信息数据统计完成...")
+}
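The update above bumps Num together with Fields (as written originally, only the per-field counter moved, leaving Num frozen at 1). Factored out, the intended invariant — Num equals the sum of the Fields counts — is easier to see; accumulate is a hypothetical helper name, WarnInfo is the struct defined above:

func accumulate(m map[int]*WarnInfo, infotype int, info, field string) {
	if wf := m[infotype]; wf != nil {
		wf.Num++           // total occurrences of this infotype
		wf.Fields[field]++ // per-field breakdown
	} else {
		m[infotype] = &WarnInfo{Info: info, Num: 1, Fields: map[string]int{field: 1}}
	}
}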
+
+func getCodeHeart() {
+	defer qu.Catch()
+	sess := util.MgoS.GetMgoConn()
+	defer util.MgoS.DestoryMongoConn(sess)
+	query := map[string]interface{}{
+		"del": false,
+	}
+	fields := map[string]interface{}{
+		"code":     1,
+		"findlist": 1,
+	}
+	lock := &sync.Mutex{}
+	wg := &sync.WaitGroup{}
+	ch := make(chan bool, 5)
+	it := sess.DB(util.MgoS.DbName).C("spider_heart").Find(&query).Select(&fields).Iter()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		wg.Add(1)
+		ch <- true
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			code := qu.ObjToString(tmp["code"])
+			findListHeart := qu.Int64All(tmp["findlist"])
+			lock.Lock()
+			if sp := NewCodeInfoMap[code]; sp != nil {
+				limitDayNum := 0
+				if sp.Event == 7520 { // node 7520 cycles are long, so the heartbeat may still be from the previous day
+					limitDayNum = -1
+				}
+				sp.List_IsGetData = findListHeart > util.GetTime(limitDayNum)
+			}
+			lock.Unlock()
+		}(tmp)
+		if n%100 == 0 {
+			logger.Info(n)
+		}
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	logger.Info("lua统计心跳信息完成...")
+}
+
+func getSpiderHighListDownloadNum() {
+	defer qu.Catch()
+	sess := util.MgoS.GetMgoConn()
+	defer util.MgoS.DestoryMongoConn(sess)
+	query := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$gte": StartTime,
+			"$lt":  EndTime,
+		},
+	}
+	fields := map[string]interface{}{
+		"spidercode": 1,
+		"state":      1,
+	}
+	lock := &sync.Mutex{}
+	wg := &sync.WaitGroup{}
+	ch := make(chan bool, 5)
+	// 1. aggregate spider_highlistdata
+	it := sess.DB(util.MgoS.DbName).C("spider_highlistdata").Find(&query).Select(&fields).Iter()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		wg.Add(1)
+		ch <- true
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			code := qu.ObjToString(tmp["spidercode"])
+			state := qu.IntAll(tmp["state"])
+			lock.Lock()
+			if sp := NewCodeInfoMap[code]; sp != nil {
+				if state == 1 {
+					sp.Detail_DownloadSuccessNum++
+				} else {
+					sp.Detail_DownloadFailNum++
+				}
+				sp.Detail_DownloadNum++
+			}
+			lock.Unlock()
+		}(tmp)
+		if n%1000 == 0 {
+			logger.Info(n)
+		}
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	logger.Info("lua统计采集量spider_highlistdata完成...")
+}
+
+func getSpiderListDownloadNum() {
+	defer qu.Catch()
+	sess := util.MgoS.GetMgoConn()
+	defer util.MgoS.DestoryMongoConn(sess)
+	query := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$gte": StartTime,
+			"$lt":  EndTime,
+		},
+	}
+	fields := map[string]interface{}{
+		"spidercode": 1,
+		"state":      1,
+		"href":       1,
+	}
+	lock := &sync.Mutex{}
+	wg := &sync.WaitGroup{}
+	ch := make(chan bool, 5)
+	repeatHrefMap := map[string]int{}
+	it := sess.DB(util.MgoS.DbName).C("spider_listdata").Find(&query).Select(&fields).Sort("_id").Iter()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		wg.Add(1)
+		ch <- true
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			state := qu.IntAll(tmp["state"])
+			code := qu.ObjToString(tmp["spidercode"])
+			href := qu.ObjToString(tmp["href"])
+			lock.Lock()
+			defer lock.Unlock()
+			tmpState := repeatHrefMap[href]
+			if tmpState == 1 { // href already recorded as a success; ignore later rows
+				return
+			} else if tmpState == 0 { // href not seen yet
+				if sp := NewCodeInfoMap[code]; sp != nil {
+					if state == 1 {
+						sp.Detail_DownloadSuccessNum++
+					} else {
+						state = -1
+						sp.Detail_DownloadFailNum++
+					}
+					sp.Detail_DownloadNum++
+					repeatHrefMap[href] = state
+				}
+			} else if tmpState == -1 && state == 1 { // recorded as failed, but this row succeeded: the href's final state is success
+				if sp := NewCodeInfoMap[code]; sp != nil {
+					sp.Detail_DownloadSuccessNum++
+					sp.Detail_DownloadFailNum--
+					repeatHrefMap[href] = state
+				}
+			}
+		}(tmp)
+		if n%1000 == 0 {
+			logger.Info(n)
+		}
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	repeatHrefMap = map[string]int{}
+	logger.Info("lua统计spider_listdata采集量完成...")
+}
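The counting above hinges on a three-valued map: 0 means the href was never seen, 1 means a success was recorded (final), and -1 means only failures so far, which a later success may upgrade (moving one count from Detail_DownloadFailNum to Detail_DownloadSuccessNum). A compact sketch of just that state machine, assuming rows replay in _id order; row is a hypothetical pared-down document:

type row struct {
	Href  string
	State int // 1 = success; anything else is treated as failure
}

func finalStates(rows []row) map[string]int {
	final := map[string]int{} // 0 unseen, 1 success, -1 failed so far
	for _, r := range rows {
		state := r.State
		if state != 1 {
			state = -1
		}
		switch final[r.Href] {
		case 1: // a success is final; ignore later rows
		case 0: // first sighting
			final[r.Href] = state
		case -1: // failure so far; a success overrides it
			if state == 1 {
				final[r.Href] = state
			}
		}
	}
	return final
}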
+
+func getSpiderDownloadRateDataNew() {
+	defer qu.Catch()
+	sess := util.MgoS.GetMgoConn()
+	defer util.MgoS.DestoryMongoConn(sess)
+	ch := make(chan bool, 5)
+	wg := &sync.WaitGroup{}
+	lock := &sync.Mutex{}
+	date := qu.FormatDateByInt64(&StartTime, qu.Date_Short_Layout)
+	query := map[string]interface{}{
+		"date": date,
+		"event": map[string]interface{}{
+			"$ne": 7000,
+		},
+	}
+	fields := map[string]interface{}{
+		"spidercode": 1,
+		"alltimes":   1,
+		"zero":       1,
+		"oh_percent": 1,
+	}
+	it := sess.DB(util.MgoS.DbName).C("spider_downloadrate").Find(&query).Select(&fields).Iter()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(tmp); n++ {
+		ch <- true
+		wg.Add(1)
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			code := qu.ObjToString(tmp["spidercode"])
+			alltimes := qu.IntAll(tmp["alltimes"])
+			zero := qu.IntAll(tmp["zero"])
+			oh_percent := qu.IntAll(tmp["oh_percent"])
+			lock.Lock()
+			if sp := NewCodeInfoMap[code]; sp != nil {
+				sp.List_NoDataTimes = zero
+				sp.List_RunTimes = alltimes // total runs (the original assigned List_AllInTimes twice, clobbering alltimes)
+				sp.List_AllInTimes = oh_percent
+			}
+			lock.Unlock()
+		}(tmp)
+		if n%1000 == 0 {
+			logger.Info("current:", n)
+		}
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	logger.Info("lua爬虫采集详情统计完成...")
+}
+
+func saveCodeInfo() {
+	defer qu.Catch()
+	lock := &sync.Mutex{}
+	wg := &sync.WaitGroup{}
+	ch := make(chan bool, 5)
+	arr := []map[string]interface{}{}
+	for _, spider := range NewCodeInfoMap {
+		ch <- true
+		wg.Add(1)
+		go func(sp *NewSpider) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			spByte, err := json.Marshal(sp)
+			if err != nil {
+				logger.Info("Json Marshal Error", sp.Code)
+				return
+			}
+			tmp := map[string]interface{}{}
+			if json.Unmarshal(spByte, &tmp) == nil {
+				lock.Lock()
+				arr = append(arr, tmp)
+				if len(arr) > 500 {
+					util.MgoS.SaveBulk("spider_info", arr...)
+					arr = []map[string]interface{}{}
+				}
+				lock.Unlock()
+			}
+
+		}(spider)
+	}
+	wg.Wait()
+	if len(arr) > 0 {
+		util.MgoS.SaveBulk("spider_info", arr...)
+		arr = []map[string]interface{}{}
+	}
+	logger.Info("爬虫统计完成...")
+}
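saveCodeInfo buffers rows and bulk-saves every 500, with a final flush for the remainder after wg.Wait(). The same flush pattern, serialized over a channel instead of a mutex for brevity; saveBulk is a stand-in for util.MgoS.SaveBulk:

func flushInBatches(rows <-chan map[string]interface{}, saveBulk func(...map[string]interface{})) {
	arr := []map[string]interface{}{}
	for row := range rows {
		arr = append(arr, row)
		if len(arr) >= 500 { // batch is full: write and start a fresh buffer
			saveBulk(arr...)
			arr = []map[string]interface{}{}
		}
	}
	if len(arr) > 0 { // remainder after the channel closes
		saveBulk(arr...)
	}
}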

+ 37 - 2
src/luatask/sitecount.go

@@ -243,7 +243,7 @@ func GetSpiderHighListDataNum(stime int64, strStime string, siteInfoMap map[stri
 	logger.Debug("spider_highlistdata stats for rows published three days ago done...")
 }
 
-func GetSpiderListDataNum(stime int64, strStime string, siteInfoMap map[string]*SiteInfo) {
+func GetSpiderListDataNum_back(stime int64, strStime string, siteInfoMap map[string]*SiteInfo) {
 	defer qu.Catch()
 	sess := util.MgoS.GetMgoConn()
 	defer util.MgoS.DestoryMongoConn(sess)
@@ -261,6 +261,7 @@ func GetSpiderListDataNum(stime int64, strStime string, siteInfoMap map[string]*
 	fields := map[string]interface{}{
 		"site":  1,
 		"event": 1,
+		"count": 1,
 	}
 	it := sess.DB(util.MgoS.DbName).C("spider_listdata").Find(&query).Select(&fields).Iter()
 	n := 0
@@ -272,7 +273,7 @@ func GetSpiderListDataNum(stime int64, strStime string, siteInfoMap map[string]*
 				<-ch
 				wg.Done()
 			}()
-			if qu.IntAll(tmp["event"]) == 7000 { //排除7000节点
+			if qu.IntAll(tmp["event"]) == 7000 || qu.IntAll(tmp["count"]) != 0 { // skip node 7000 and duplicate rows
 				return
 			}
 			site := qu.ObjToString(tmp["site"])
@@ -290,6 +291,40 @@ func GetSpiderListDataNum(stime int64, strStime string, siteInfoMap map[string]*
 	wg.Wait()
 	logger.Debug("spider_listdata stats for rows published three days ago done...")
 }
+func GetSpiderListDataNum(stime int64, strStime string, siteInfoMap map[string]*SiteInfo) {
+	query := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$gte": stime,
+		},
+		"publishtime": map[string]interface{}{
+			"$regex": strStime,
+		},
+	}
+	result := getSpiderListHrefState(query)
+	lock := &sync.Mutex{}
+	wg := &sync.WaitGroup{}
+	ch := make(chan bool, 5)
+	for href, hrefInfo := range result {
+		ch <- true
+		wg.Add(1)
+		go func(href string, hf *HrefInfo) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			if hf.Event == 7000 { // skip node 7000
+				return
+			}
+			lock.Lock()
+			if sInfo := siteInfoMap[hf.Site]; sInfo != nil { // one of the key sites being tracked
+				sInfo.ThreeDaysAgoNum++
+			}
+			lock.Unlock()
+		}(href, hrefInfo)
+	}
+	wg.Wait()
+	result = map[string]*HrefInfo{}
+}
 
 func GetPythonDataNum(stime int64, strStime string, siteInfoMap map[string]*SiteInfo) {
 	defer qu.Catch()

+ 182 - 7
src/luatask/task.go

@@ -245,12 +245,12 @@ func GetCodeBaseInfo() {
 				},
 			},
 			// python spiders under maintenance or online
-			map[string]interface{}{
-				"platform": "python",
-				"state": map[string]interface{}{
-					"$in": []int{1, 2, 6}, // pending review, rejected, delisted
-				},
-			},
+			//map[string]interface{}{
+			//	"platform": "python",
+			//	"state": map[string]interface{}{
+			//		"$in": []int{1, 2, 6}, // pending review, rejected, delisted
+			//	},
+			//},
 		},
 	}
 	fields := map[string]interface{}{
@@ -595,7 +595,7 @@ func GetSpiderHighListDownloadNum() {
 	logger.Debug("download counts from spider_highlistdata done...")
 }
 
-func GetSpiderListDownloadNum() {
+func GetSpiderListDownloadNum_back() {
 	defer qu.Catch()
 	sess := util.MgoS.GetMgoConn()
 	defer util.MgoS.DestoryMongoConn(sess)
@@ -612,6 +612,7 @@ func GetSpiderListDownloadNum() {
 		"site":        1,
 		"times":       1,
 		"publishtime": 1,
+		"count":       1,
 	}
 	lock := &sync.Mutex{}
 	wg := &sync.WaitGroup{}
@@ -626,6 +627,9 @@ func GetSpiderListDownloadNum() {
 				<-ch
 				wg.Done()
 			}()
+			if qu.IntAll(tmp["count"]) != 0 { // skip duplicate rows
+				return
+			}
 			code := qu.ObjToString(tmp["spidercode"])
 			href := qu.ObjToString(tmp["href"])
 			state := qu.IntAll(tmp["state"])
@@ -751,6 +755,177 @@ func GetSpiderListDownloadNum() {
 	SameDayHref = map[string]string{}
 	logger.Debug("spider_listdata download counts done...")
 }
+func GetSpiderListDownloadNum() {
+	defer qu.Catch()
+	query := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$gte": StartTime,
+			"$lt":  EndTime,
+		},
+	}
+	result := getSpiderListHrefState(query)
+	lock := &sync.Mutex{}
+	wg := &sync.WaitGroup{}
+	ch := make(chan bool, 5)
+	for href, hrefInfo := range result {
+		ch <- true
+		wg.Add(1)
+		go func(href string, hf *HrefInfo) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			sameDay := strings.Contains(hf.Publishtime, Publishtime) // whether the row was published on the stats day
+			lock.Lock()
+			if sp := CodeInfoMap[hf.Code]; sp != nil {
+				// counts without href dedup
+				success := true
+				sp.DownloadAllNum++
+				if sameDay {
+					sp.PTimeAllNum++
+				}
+				if hf.State == 1 { // download succeeded
+					sp.DownloadSuccessNum++
+					if sameDay {
+						sp.PTimeSuccessNum++
+					}
+				} else if hf.State == -1 { // download failed
+					success = false
+					sp.DownloadFailedNum++
+					if sameDay {
+						sp.PTimeFailedNum++
+					}
+				} else { // not downloaded (unreachable here: getSpiderListHrefState normalizes states to ±1)
+					sp.NoDownloadNum++
+					if sameDay {
+						sp.PTimeNoDownloadNum++
+					}
+				}
+				// dedup by href among rows published on the stats day
+				if sameDay && SameDayHref[href] != hf.Site {
+					sp.RepeatDownloadAllNum++
+					sp.RepeatPTimeAllNum++
+					if hf.State == 1 {
+						sp.RepeatDownloadSuccessNum++
+						sp.RepeatPTimeSuccessNum++
+					} else if hf.State == -1 { // download failed
+						sp.RepeatDownloadFailedNum++
+						sp.RepeatPTimeFailedNum++
+					}
+					SameDayHref[href] = hf.Site
+					AllHref[href] = hf.Site
+				} else if AllHref[href] != hf.Site { // dedup against all hrefs ever seen
+					sp.RepeatDownloadAllNum++
+					if hf.State == 1 { // download succeeded
+						sp.RepeatDownloadSuccessNum++
+					} else if hf.State == -1 { // download failed
+						sp.RepeatDownloadFailedNum++
+					}
+					AllHref[href] = hf.Site
+				}
+				if !success { // record the href of a failed download
+					if errorInfo := sp.Error["download"]; errorInfo == nil {
+						sp.Error["download"] = &ErrorInfo{
+							Num: sp.DownloadFailedNum,
+							Err: []*ErrRemark{
+								&ErrRemark{
+									Href:   href,
+									Remark: "Download Failed",
+								},
+							},
+						}
+					} else {
+						errorInfo.Num = sp.DownloadFailedNum
+						if len(errorInfo.Err) < 3 {
+							errorInfo.Err = append(errorInfo.Err, &ErrRemark{
+								Href:   href,
+								Remark: "Download Failed",
+							})
+						}
+					}
+				}
+			}
+			lock.Unlock()
+		}(href, hrefInfo)
+	}
+	wg.Wait()
+	result = map[string]*HrefInfo{}
+}
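The rewritten GetSpiderListDownloadNum layers two dedup maps: SameDayHref for hrefs published on the stats day and AllHref for everything ever seen, with a same-day hit also claiming the href in the all-time map. A simplified sketch of just that decision (dedupTier is a hypothetical helper; the real code additionally splits the counts by success/failure):

func dedupTier(sameDay, all map[string]string, href, site string, isSameDay bool) bool {
	if isSameDay && sameDay[href] != site { // first same-day sighting
		sameDay[href] = site
		all[href] = site // claim it in the all-time map too
		return true
	} else if all[href] != site { // first all-time sighting
		all[href] = site
		return true
	}
	return false // duplicate; do not count again
}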
+
+type HrefInfo struct {
+	Publishtime string
+	Site        string
+	State       int
+	Event       int
+	Code        string
+}
+
+// getSpiderListHrefState resolves, per href, the final download state after the day's rows are replayed in _id order.
+func getSpiderListHrefState(query map[string]interface{}) (repeatHrefMap map[string]*HrefInfo) {
+	defer qu.Catch()
+	sess := util.MgoS.GetMgoConn()
+	defer util.MgoS.DestoryMongoConn(sess)
+	fields := map[string]interface{}{
+		"spidercode":  1,
+		"state":       1,
+		"href":        1,
+		"publishtime": 1,
+		"site":        1,
+		"event":       1,
+	}
+	lock := &sync.Mutex{}
+	wg := &sync.WaitGroup{}
+	ch := make(chan bool, 5)
+	repeatHrefMap = map[string]*HrefInfo{}
+	it := sess.DB(util.MgoS.DbName).C("spider_listdata").Find(&query).Select(&fields).Sort("_id").Iter()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		wg.Add(1)
+		ch <- true
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			state := qu.IntAll(tmp["state"])
+			if state != 1 { // normalize: anything but 1 (success) is recorded as failure
+				state = -1
+			}
+			event := qu.IntAll(tmp["event"])
+			code := qu.ObjToString(tmp["spidercode"])
+			href := qu.ObjToString(tmp["href"])
+			publishtime := qu.ObjToString(tmp["publishtime"])
+			site := qu.ObjToString(tmp["site"])
+			tmpHf := &HrefInfo{
+				State:       state,
+				Publishtime: publishtime,
+				Site:        site,
+				Event:       event,
+				Code:        code,
+			}
+			lock.Lock()
+			defer lock.Unlock()
+			if hf := repeatHrefMap[href]; hf != nil {
+				if hf.State == 1 { // a recorded success is final
+					return
+				} else if hf.State == -1 && state == 1 { // recorded failure, current row succeeded: keep the success
+					repeatHrefMap[href] = tmpHf
+				}
+			} else { // first sighting of this href (stored states are always ±1, so no zero branch is needed)
+				repeatHrefMap[href] = tmpHf
+			}
+		}(tmp)
+		if n%1000 == 0 {
+			logger.Info(n)
+		}
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	logger.Info("lua统计spider_listdata内部去重完成...")
+	return
+}
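Both GetSpiderListDownloadNum above and the new GetSpiderListDataNum in sitecount.go feed a query into this helper and then walk the deduped result, so every href is counted exactly once at its final state. A hypothetical driver mirroring the sitecount.go caller:

func countBySite(result map[string]*HrefInfo) map[string]int {
	perSite := map[string]int{}
	for _, hf := range result {
		if hf.Event == 7000 { // skip node 7000, as both callers do
			continue
		}
		perSite[hf.Site]++
	}
	return perSite
}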
 
 func GetSpiderDataBakDownloadNum() {
 	defer qu.Catch()