Ver código fonte

爬虫补采模块修改

mxs 1 ano atrás
pai
commit
de42a4f20c
4 arquivos alterados com 24 adições e 9 exclusões
  1. 1 0
      src/spider/spider.go
  2. 0 8
      src/spider/store.go
  3. 5 0
      src/spider/supplement.go
  4. 18 1
      src/spider/util.go

+ 1 - 0
src/spider/spider.go

@@ -957,6 +957,7 @@ func (s *Spider) SupplementDownListPageItem() (errs interface{}) {
 		Spidercode: s.Code,
 		Modifyuser: s.MUserName,
 		Finish:     finish,
+		Comeintime: time.Now().Unix(), //提前赋值(无法运行完毕的爬虫不会有后期赋值)
 	}
 	for {
 		if errtimes >= Supplement_MaxErrorTimes || publishtimeAllZeroTimes > Supplement_Publishtime_ZeroTimes { //连续异常次数超过10次,爬虫不再翻页

+ 0 - 8
src/spider/store.go

@@ -220,14 +220,6 @@ func SaveListPageData(tmp map[string]interface{}, id *string, isEsRepeat bool) {
 	*id = MgoS.Save("spider_listdata", tmp)
 }
 
-// GetTime returns the Unix timestamp of the early-morning start of the day that is `day` days from today (negative values select past days).
-func GetTime(day int) int64 {
-	nowTime := time.Now().AddDate(0, 0, day) // shift the current time by `day` calendar days
-	timeStr := util.FormatDate(&nowTime, util.Date_Short_Layout) // NOTE(review): Date_Short_Layout is presumably date-only, which is what drops the time of day — confirm in the util package
-	t, _ := time.ParseInLocation(util.Date_Short_Layout, timeStr, time.Local) // parse back in the local zone; the parse error is discarded
-	return t.Unix()
-}
-
 // 更新state状态
 func UpdateHighListDataByCode(code string) {
 	query := map[string]interface{}{

+ 5 - 0
src/spider/supplement.go

@@ -41,6 +41,7 @@ type SupplementSpider struct {
 	Success            int    `bson:"success"`
 	Failed             int    `bson:"failed"`
 	PublishtimeZeroNum int    `bson:"ptimezeronum"`
+	EffectiveNum       int    `bson:"effectivenum"`
 }
 
 func InitSupplement() {
@@ -82,6 +83,8 @@ func SupplementEnd() {
 
 func SupplementDataCount() {
 	logger.Info("补采数据统计开始...")
+	timeEnd := GetStrTime(-1)
+	timeStart := GetStrTime(-3)
 	sess := MgoS.GetMgoConn()
 	defer MgoS.DestoryMongoConn(sess)
 	ch := make(chan bool, 5)
@@ -124,6 +127,8 @@ func SupplementDataCount() {
 				}
 				if publishtime == "0" || publishtime == "" {
 					ss.PublishtimeZeroNum++
+				} else if publishtime >= timeStart && publishtime < timeEnd {
+					ss.EffectiveNum++
 				}
 			}
 			lock.Unlock()

+ 18 - 1
src/spider/util.go

@@ -1,7 +1,9 @@
 package spider
 
 import (
+	util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	qu "qfw/util"
+	"time"
 )
 
 // var SpaceReg = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
@@ -9,7 +11,7 @@ import (
 // var HanReg = regexp.MustCompile("[\u4e00-\u9fa5]+")
 var ErrFid = "a6879f0a8570256aa21fb978e6dabb50429a30dfacff697cf0b898abbc5c262e" //限制访问的附件
 
-//初始化延迟采集站点集合
+// 初始化延迟采集站点集合
 func InitOther() {
 	defer qu.Catch()
 	DelaySiteMap = map[string]*DelaySite{}
@@ -24,3 +26,18 @@ func InitOther() {
 		}
 	}
 }
+
+// GetTime returns the Unix timestamp of the early-morning start of the day that is `day` days from today (negative values select past days).
+func GetTime(day int) int64 {
+	nowTime := time.Now().AddDate(0, 0, day) // shift the current time by `day` calendar days
+	timeStr := util.FormatDate(&nowTime, util.Date_Short_Layout) // NOTE(review): Date_Short_Layout is presumably date-only, which is what drops the time of day — confirm in common_utils
+	t, _ := time.ParseInLocation(util.Date_Short_Layout, timeStr, time.Local) // parse back in the local zone; the parse error is discarded
+	return t.Unix()
+}
+
+// GetStrTime returns the day that is `day` days from today as a string formatted with util.Date_Short_Layout (negative values select past days).
+func GetStrTime(day int) string {
+	nowTime := time.Now().AddDate(0, 0, day) // shift the current time by `day` calendar days
+	timeStr := util.FormatDate(&nowTime, util.Date_Short_Layout) // NOTE(review): callers compare this string lexicographically, so the layout presumably matches the stored publishtime format — confirm
+	return timeStr
+}