Procházet zdrojové kódy

更新数据指标统计

wcc před 1 rokem
rodič
revize
3341ede689
Změněny 3 soubory: 122 přidání a 13 odebrání
  1. 4 4
      data_indicators/go.mod
  2. 8 8
      data_indicators/go.sum
  3. 110 1
      data_indicators/main.go

+ 4 - 4
data_indicators/go.mod

@@ -34,11 +34,11 @@ require (
 	github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect
 	go.uber.org/atomic v1.9.0 // indirect
 	go.uber.org/multierr v1.8.0 // indirect
-	golang.org/x/crypto v0.9.0 // indirect
-	golang.org/x/net v0.10.0 // indirect
+	golang.org/x/crypto v0.17.0 // indirect
+	golang.org/x/net v0.19.0 // indirect
 	golang.org/x/sync v0.1.0 // indirect
-	golang.org/x/sys v0.8.0 // indirect
-	golang.org/x/text v0.9.0 // indirect
+	golang.org/x/sys v0.15.0 // indirect
+	golang.org/x/text v0.14.0 // indirect
 	gopkg.in/ini.v1 v1.67.0 // indirect
 	gopkg.in/mgo.v2 v2.0.0-20190816093944-a6b53ec6cb22 // indirect
 	gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect

+ 8 - 8
data_indicators/go.sum

@@ -252,8 +252,8 @@ golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPh
 golang.org/x/crypto v0.0.0-20210421170649-83a5a9bb288b/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4=
 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
 golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
-golang.org/x/crypto v0.9.0 h1:LF6fAI+IutBocDJ2OT0Q1g8plpYljMZ4+lty+dsqw3g=
-golang.org/x/crypto v0.9.0/go.mod h1:yrmDGqONDYtNj3tH8X9dzUun2m2lzPa9ngI6/RUPGR0=
+golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k=
+golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
@@ -324,8 +324,8 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b
 golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
 golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
 golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
-golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
-golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
+golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c=
+golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U=
 golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
 golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
 golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
@@ -387,8 +387,8 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU=
-golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc=
+golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
 golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
@@ -399,8 +399,8 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
-golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
-golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
+golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
 golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
 golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=

+ 110 - 1
data_indicators/main.go

@@ -33,6 +33,8 @@ var (
 	dataCompete    = make(map[string]interface{}, 0) //竞品对比指标
 	dataTime       = make(map[string]interface{}, 0) //数据时效指标
 	dataQuality    = make(map[string]interface{}, 0) //数据质量指标
+	//竞品网站
+	competeSites = []string{"元博网(采购与招标网)", "中国招标与采购网", "北京隆道网络科技有限公司", "友云采"}
 )
 
 func main() {
@@ -299,6 +301,8 @@ func coverageA() {
 			"rate":      fmt.Sprintf("%.2f%%", float64(matches)/float64(len(qlmData))*100),
 		},
 	}
+	dataCompete["千里马对剑鱼多出数比例(标讯)"] = fmt.Sprintf("%.2f%%", float64(len(qlmData)-matches)/float64(count)*100)
+
 	//5.1.2 统计 标讯-招标预告 数据
 	matchesPre := countMatches(preData, titlesInB, projectsInB)
 	matchesA["招标预告"] = map[string]interface{}{
@@ -437,6 +441,8 @@ func coverageB() {
 			"rate":         fmt.Sprintf("%.2f%%", float64(matches)/float64(len(qlmData))*100),
 		},
 	}
+	dataCompete["剑鱼对千里马多出数据量(标讯)"] = fmt.Sprintf("%.2f%%", float64(len(qlmData)-matches)/float64(count)*100)
+
 	//5.1.2 统计 标讯-招标预告 数据
 	matchesPre := countMatches(preData, titlesInB, projectsInB)
 	matchesA["招标预告"] = map[string]interface{}{
@@ -492,6 +498,36 @@ func coverageB() {
 
 // getTimeLines 获取时效性指标
 func getTimeLines() {
+	type MaxDifference struct {
+		Bidding    map[string]interface{}
+		Difference int64
+	}
+	// 保存差值最大的前1000条数据及对应的bidding数据
+	var maxDifferences []MaxDifference
+
+	quantileMap := make(map[string]int) //分位数统计指标
+	quantileTotal := 0
+	whereAuditor := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$gt": time.Now().AddDate(0, -3, 0).Unix(),
+		},
+		"types": "审核",
+	}
+	//最近3天上架或者维护的采集;lua  python 脚本
+	auditors, _ := MgoC.Find("lua_logs_auditor", whereAuditor, nil, map[string]interface{}{"code": 1, "comeintime": 1}, false, -1, -1)
+	codeMap := make([]string, 0)
+	for _, v := range *auditors {
+		code := utils.ObjToString(v["code"])
+		codeMap = append(codeMap, code)
+	}
+
+	auditors2, _ := MgoC.Find("python_logs_auditor", whereAuditor, nil, map[string]interface{}{"spidercode": 1, "comeintime": 1}, false, -1, -1)
+	for _, v := range *auditors2 {
+		code := utils.ObjToString(v["spidercode"])
+		codeMap = append(codeMap, code)
+	}
+
+	log.Info("最近3天上架或者维护的采集", zap.Int("脚本总数是:", len(codeMap)))
 	//6.数据整体流程均耗时(分钟)
 	whereBidding := map[string]interface{}{
 		"comeintime": map[string]interface{}{
@@ -501,7 +537,7 @@ func getTimeLines() {
 	}
 	sessB := MgoB.GetMgoConn()
 	defer MgoB.DestoryMongoConn(sessB)
-	fd := bson.M{"extracttype": 1, "sensitive": 1, "dataging": 1, "site": 1, "infoformat": 1, "comeintime": 1, "pici": 1, "publishtime": 1, "competehref": 1, "attach_text": 1}
+	fd := bson.M{"extracttype": 1, "sensitive": 1, "dataging": 1, "site": 1, "infoformat": 1, "comeintime": 1, "pici": 1, "publishtime": 1, "competehref": 1, "attach_text": 1, "spidercode": 1, "href": 1, "title": 1, "projectname": 1}
 
 	queryB := sessB.DB("qfw").C("bidding").Find(whereBidding).Select(fd).Iter()
 
@@ -514,6 +550,9 @@ func getTimeLines() {
 			comeintime := utils.Int64All(tmp["comeintime"])
 			publishtime := utils.Int64All(tmp["publishtime"])
 			pici := utils.Int64All(tmp["pici"])
+			if pici == 0 || publishtime == 0 || comeintime == 0 {
+				continue
+			}
 			if pici > 0 {
 				esCount++
 			}
@@ -524,9 +563,79 @@ func getTimeLines() {
 				pici_publish_totaltime += diff1
 				pici_comein_totaltime += diff2
 			}
+			//排除竞品网站
+			if !IsInStringArray(utils.ObjToString(tmp["site"]), competeSites) {
+				diff := pici - publishtime
+				if diff < 0 {
+					continue
+				} else if diff < 5*60 {
+					quantileMap["a1"]++
+				} else if diff < 15*60 {
+					quantileMap["a2"]++
+				} else if diff < 30*60 {
+					quantileMap["a3"]++
+				} else if diff < 60*60 {
+					quantileMap["a4"]++
+				} else if diff < 3*60*60 {
+					quantileMap["a5"]++
+				} else if diff < 7*60*60 {
+					quantileMap["a6"]++
+				} else if diff < 15*60*60 {
+					quantileMap["a7"]++
+				} else if diff < 24*60*60 {
+					quantileMap["a8"]++
+				} else if diff < 48*60*60 {
+					quantileMap["a9"]++
+				} else if diff < 72*60*60 {
+					quantileMap["a10"]++
+				} else {
+					quantileMap["a11"]++
+				}
+				quantileTotal++
+				spiderCode := utils.ObjToString(tmp["spidercode"])
+				//爬虫代码不是 三天内新上架和三天内维护的代码
+				if !IsInStringArray(spiderCode, codeMap) {
+					if diff > 7*24*3600 {
+						continue
+					}
+					// 如果切片还没有满,直接追加
+					if len(maxDifferences) < 1000 {
+						maxDifferences = append(maxDifferences, MaxDifference{Bidding: tmp, Difference: diff})
+					} else {
+						// 替换切片中最小的值
+						minIndex := 0
+						for j, d := range maxDifferences {
+							if d.Difference < maxDifferences[minIndex].Difference {
+								minIndex = j
+							}
+						}
+						if diff > maxDifferences[minIndex].Difference {
+							maxDifferences[minIndex] = MaxDifference{Bidding: tmp, Difference: diff}
+						}
+					}
+				}
+			}
 		}
+		tmp = make(map[string]interface{})
 	}
 
+	//统计前1000条最大时差
+	diffTotal := int64(0)
+	for k, _ := range maxDifferences {
+		data := maxDifferences[k]
+		diffTotal += data.Difference
+		data.Bidding["top_time"] = time.Now().Format("2006-01-02")
+		data.Bidding["diff"] = data.Difference
+		MgoB.SaveByOriID("bidding_top_1000", data.Bidding)
+	}
+	dataTime["数据时效极值(用时最长)top1000"] = fmt.Sprintf("%.2f", float64(diffTotal)/float64(1000*60))
+
+	quantileData := make(map[string]interface{})
+	//统计时效分位数
+	for k, v := range quantileMap {
+		quantileData[k] = fmt.Sprintf("%.2f", float64(v)/float64(quantileTotal))
+	}
+	dataTime["数据时效分位数统计"] = quantileData
 	dataCollection["数据采集日索引量"] = esCount //数据采集指标-数据采集日索引量
 	if biddingRealCount > 0 {
 		pici_publish_avgtime := pici_publish_totaltime / int64(biddingRealCount)