|
@@ -33,6 +33,8 @@ var (
|
|
|
dataCompete = make(map[string]interface{}, 0) //竞品对比指标
|
|
|
dataTime = make(map[string]interface{}, 0) //数据时效指标
|
|
|
dataQuality = make(map[string]interface{}, 0) //数据质量指标
|
|
|
+ //竞品网站
|
|
|
+ competeSites = []string{"元博网(采购与招标网)", "中国招标与采购网", "北京隆道网络科技有限公司", "友云采"}
|
|
|
)
|
|
|
|
|
|
func main() {
|
|
@@ -299,6 +301,8 @@ func coverageA() {
|
|
|
"rate": fmt.Sprintf("%.2f%%", float64(matches)/float64(len(qlmData))*100),
|
|
|
},
|
|
|
}
|
|
|
+ dataCompete["千里马对剑鱼多出数比例(标讯)"] = fmt.Sprintf("%.2f%%", float64(len(qlmData)-matches)/float64(count)*100)
|
|
|
+
|
|
|
//5.1.2 统计 标讯-招标预告 数据
|
|
|
matchesPre := countMatches(preData, titlesInB, projectsInB)
|
|
|
matchesA["招标预告"] = map[string]interface{}{
|
|
@@ -437,6 +441,8 @@ func coverageB() {
|
|
|
"rate": fmt.Sprintf("%.2f%%", float64(matches)/float64(len(qlmData))*100),
|
|
|
},
|
|
|
}
|
|
|
+ dataCompete["剑鱼对千里马多出数据量(标讯)"] = fmt.Sprintf("%.2f%%", float64(len(qlmData)-matches)/float64(count)*100)
|
|
|
+
|
|
|
//5.1.2 统计 标讯-招标预告 数据
|
|
|
matchesPre := countMatches(preData, titlesInB, projectsInB)
|
|
|
matchesA["招标预告"] = map[string]interface{}{
|
|
@@ -492,6 +498,36 @@ func coverageB() {
|
|
|
|
|
|
// getTimeLines 获取时效性指标
|
|
|
func getTimeLines() {
|
|
|
+ type MaxDifference struct {
|
|
|
+ Bidding map[string]interface{}
|
|
|
+ Difference int64
|
|
|
+ }
|
|
|
+ // 保存差值最大的前1000条数据及对应的bidding数据
|
|
|
+ var maxDifferences []MaxDifference
|
|
|
+
|
|
|
+ quantileMap := make(map[string]int) //分位数统计指标
|
|
|
+ quantileTotal := 0
|
|
|
+ whereAuditor := map[string]interface{}{
|
|
|
+ "comeintime": map[string]interface{}{
|
|
|
+ "$gt": time.Now().AddDate(0, -3, 0).Unix(),
|
|
|
+ },
|
|
|
+ "types": "审核",
|
|
|
+ }
|
|
|
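+ // Crawler scripts (lua and python) listed or maintained recently. NOTE(review): the original comment and the log message below say "last 3 days", but whereAuditor uses time.Now().AddDate(0, -3, 0), which is 3 months back (3 days would be AddDate(0, 0, -3)) — confirm which window is intended.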
|
|
|
+ auditors, _ := MgoC.Find("lua_logs_auditor", whereAuditor, nil, map[string]interface{}{"code": 1, "comeintime": 1}, false, -1, -1)
|
|
|
+ codeMap := make([]string, 0)
|
|
|
+ for _, v := range *auditors {
|
|
|
+ code := utils.ObjToString(v["code"])
|
|
|
+ codeMap = append(codeMap, code)
|
|
|
+ }
|
|
|
+
|
|
|
+ auditors2, _ := MgoC.Find("python_logs_auditor", whereAuditor, nil, map[string]interface{}{"spidercode": 1, "comeintime": 1}, false, -1, -1)
|
|
|
+ for _, v := range *auditors2 {
|
|
|
+ code := utils.ObjToString(v["spidercode"])
|
|
|
+ codeMap = append(codeMap, code)
|
|
|
+ }
|
|
|
+
|
|
|
+ log.Info("最近3天上架或者维护的采集", zap.Int("脚本总数是:", len(codeMap)))
|
|
|
//6.数据整体流程均耗时(分钟)
|
|
|
whereBidding := map[string]interface{}{
|
|
|
"comeintime": map[string]interface{}{
|
|
@@ -501,7 +537,7 @@ func getTimeLines() {
|
|
|
}
|
|
|
sessB := MgoB.GetMgoConn()
|
|
|
defer MgoB.DestoryMongoConn(sessB)
|
|
|
- fd := bson.M{"extracttype": 1, "sensitive": 1, "dataging": 1, "site": 1, "infoformat": 1, "comeintime": 1, "pici": 1, "publishtime": 1, "competehref": 1, "attach_text": 1}
|
|
|
+ fd := bson.M{"extracttype": 1, "sensitive": 1, "dataging": 1, "site": 1, "infoformat": 1, "comeintime": 1, "pici": 1, "publishtime": 1, "competehref": 1, "attach_text": 1, "spidercode": 1, "href": 1, "title": 1, "projectname": 1}
|
|
|
|
|
|
queryB := sessB.DB("qfw").C("bidding").Find(whereBidding).Select(fd).Iter()
|
|
|
|
|
@@ -514,6 +550,9 @@ func getTimeLines() {
|
|
|
comeintime := utils.Int64All(tmp["comeintime"])
|
|
|
publishtime := utils.Int64All(tmp["publishtime"])
|
|
|
pici := utils.Int64All(tmp["pici"])
|
|
|
+ if pici == 0 || publishtime == 0 || comeintime == 0 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
if pici > 0 {
|
|
|
esCount++
|
|
|
}
|
|
@@ -524,9 +563,79 @@ func getTimeLines() {
|
|
|
pici_publish_totaltime += diff1
|
|
|
pici_comein_totaltime += diff2
|
|
|
}
|
|
|
+ //排除竞品网站
|
|
|
+ if !IsInStringArray(utils.ObjToString(tmp["site"]), competeSites) {
|
|
|
+ diff := pici - publishtime
|
|
|
+ if diff < 0 {
|
|
|
+ continue
|
|
|
+ } else if diff < 5*60 {
|
|
|
+ quantileMap["a1"]++
|
|
|
+ } else if diff < 15*60 {
|
|
|
+ quantileMap["a2"]++
|
|
|
+ } else if diff < 30*60 {
|
|
|
+ quantileMap["a3"]++
|
|
|
+ } else if diff < 60*60 {
|
|
|
+ quantileMap["a4"]++
|
|
|
+ } else if diff < 3*60*60 {
|
|
|
+ quantileMap["a5"]++
|
|
|
+ } else if diff < 7*60*60 {
|
|
|
+ quantileMap["a6"]++
|
|
|
+ } else if diff < 15*60*60 {
|
|
|
+ quantileMap["a7"]++
|
|
|
+ } else if diff < 24*60*60 {
|
|
|
+ quantileMap["a8"]++
|
|
|
+ } else if diff < 48*60*60 {
|
|
|
+ quantileMap["a9"]++
|
|
|
+ } else if diff < 72*60*60 {
|
|
|
+ quantileMap["a10"]++
|
|
|
+ } else {
|
|
|
+ quantileMap["a11"]++
|
|
|
+ }
|
|
|
+ quantileTotal++
|
|
|
+ spiderCode := utils.ObjToString(tmp["spidercode"])
|
|
|
+ //爬虫代码不是 三天内新上架和三天内维护的代码
|
|
|
+ if !IsInStringArray(spiderCode, codeMap) {
|
|
|
+ if diff > 7*24*3600 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ // 如果切片还没有满,直接追加
|
|
|
+ if len(maxDifferences) < 1000 {
|
|
|
+ maxDifferences = append(maxDifferences, MaxDifference{Bidding: tmp, Difference: diff})
|
|
|
+ } else {
|
|
|
+ // 替换切片中最小的值
|
|
|
+ minIndex := 0
|
|
|
+ for j, d := range maxDifferences {
|
|
|
+ if d.Difference < maxDifferences[minIndex].Difference {
|
|
|
+ minIndex = j
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if diff > maxDifferences[minIndex].Difference {
|
|
|
+ maxDifferences[minIndex] = MaxDifference{Bidding: tmp, Difference: diff}
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
+ tmp = make(map[string]interface{})
|
|
|
}
|
|
|
|
|
|
+ //统计前1000条最大时差
|
|
|
+ diffTotal := int64(0)
|
|
|
+ for k, _ := range maxDifferences {
|
|
|
+ data := maxDifferences[k]
|
|
|
+ diffTotal += data.Difference
|
|
|
+ data.Bidding["top_time"] = time.Now().Format("2006-01-02")
|
|
|
+ data.Bidding["diff"] = data.Difference
|
|
|
+ MgoB.SaveByOriID("bidding_top_1000", data.Bidding)
|
|
|
+ }
|
|
|
+ dataTime["数据时效极值(用时最长)top1000"] = fmt.Sprintf("%.2f", float64(diffTotal)/float64(1000*60))
|
|
|
+
|
|
|
+ quantileData := make(map[string]interface{})
|
|
|
+ //统计时效分位数
|
|
|
+ for k, v := range quantileMap {
|
|
|
+ quantileData[k] = fmt.Sprintf("%.2f", float64(v)/float64(quantileTotal))
|
|
|
+ }
|
|
|
+ dataTime["数据时效分位数统计"] = quantileData
|
|
|
dataCollection["数据采集日索引量"] = esCount //数据采集指标-数据采集日索引量
|
|
|
if biddingRealCount > 0 {
|
|
|
pici_publish_avgtime := pici_publish_totaltime / int64(biddingRealCount)
|