/* 抽取结果对比 */ package main import ( "flag" "fmt" "jy/mongodbutil" "log" qu "qfw/util" "strings" "github.com/tealeg/xlsx" "gopkg.in/mgo.v2/bson" ) var ( SysConfig map[string]interface{} Premgo *mongodbutil.Pool //上个版本库 Newmgo *mongodbutil.Pool //当前版本库 FieldData map[string]map[string]*Data Compares map[string]*Compare Sid, Eid string Fields []string FieldsQuery string Url = "https://www.jianyu360.com/article/content/%s.html" ) type Compare struct { Field string //属性 PreExtNum, NewExtNum int //上个版、当前版有值数量 PreNilnum, NewNilnum int //上个版、当前版无值数量 EqNum, NEqNum int //相等、不等数据量 } type Data struct { Id string PreVal, NewVal string } func init() { flag.StringVar(&Sid, "sid", "5df5071ce9d1f601e495fa54", "开始id") flag.StringVar(&Eid, "eid", "5e09c05f0cf41612e0626abc", "结束id") flag.Parse() qu.ReadConfig(&SysConfig) Premgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["premgo"]), qu.ObjToString(SysConfig["predb"])) Newmgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["newmgo"]), qu.ObjToString(SysConfig["newdb"])) tmp, _ := SysConfig["fields"].([]interface{}) for k, v := range tmp { Fields = append(Fields, qu.ObjToString(v)) if k < (len(tmp) - 1) { FieldsQuery += `"` + qu.ObjToString(v) + `":1,` } else { FieldsQuery += `"` + qu.ObjToString(v) + `":1` } } FieldData = map[string]map[string]*Data{} Compares = map[string]*Compare{} } func main() { getVersionData() createXlsx() //biaozhucompare() } func createXlsx() { xf, err := xlsx.OpenFile("template.xlsx") if err != nil { log.Println(err) return } //生成第一个sheet信息 sh := xf.Sheets[0] for i, field := range Fields { for k, row := range sh.Rows { if k > 2+i { style := (*row).Cells[1].GetStyle() style.Font.Color = "000000" (*row).Cells[0].SetString(field) (*row).Cells[1].SetStyle(style) (*row).Cells[2].SetStyle(style) (*row).Cells[3].SetStyle(style) (*row).Cells[4].SetStyle(style) if Compares[field] == nil { (*row).Cells[1].SetInt(0) (*row).Cells[2].SetInt(0) (*row).Cells[3].SetInt(0) (*row).Cells[4].SetInt(0) } else { (*row).Cells[1].SetInt(Compares[field].PreExtNum) (*row).Cells[2].SetInt(Compares[field].NewExtNum) (*row).Cells[3].SetInt(Compares[field].EqNum) (*row).Cells[4].SetInt(Compares[field].NEqNum) } } sh.Rows[k] = row } } var idsall = map[string]bool{} //生成信息sheet for _, field := range Fields { sh, _ := xf.AddSheet(field) rowh := sh.AddRow() rowh.AddCell().SetString("id") rowh.AddCell().SetString("preval") rowh.AddCell().SetString("newval") rowh.AddCell().SetString("url") tmp := FieldData[field] for k, v := range tmp { if v.NewVal != v.PreVal { row := sh.AddRow() row.AddCell().SetString(k) row.AddCell().SetString(v.PreVal) row.AddCell().SetString(v.NewVal) row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", v.Id))) idsall[v.Id] = true } } } log.Println("不同数据总量", len(idsall)) //生全量信息不同部分 shall, _ := xf.AddSheet("全量数据(不同部分)") rowh := shall.AddRow() rowh.AddCell().SetString("id") for _, v := range Fields { rowh.AddCell().SetString("preval_" + v) rowh.AddCell().SetString("newval_" + v) } rowh.AddCell().SetString("url") i := 0 for k, _ := range idsall { i++ row := shall.AddRow() row.AddCell().SetString(k) for _, field := range Fields { tmp := FieldData[field] v := tmp[k] if v != nil { if v.NewVal != v.PreVal { row.AddCell().SetString(v.PreVal) row.AddCell().SetString(v.NewVal) } else { row.AddCell().SetString("") row.AddCell().SetString("") } } else { row.AddCell().SetString("") row.AddCell().SetString("") } } row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", k))) } log.Println("数据处理完成,正在生成文件") err = xf.Save("result.xlsx") if err != nil { log.Println("保存xlsx失败:", err) return } log.Println("xlsx保存成功") } func getVersionData() { query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(Sid), "$lte": bson.ObjectIdHex(Eid)}} log.Println(qu.ObjToString(SysConfig["prec"]), query) list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{`+FieldsQuery+`,}`, false, -1, -1) for _, v := range *list1 { for _, key := range Fields { rd := FieldData[key] if rd == nil { rd = map[string]*Data{} } if v[key] == nil && strings.TrimSpace(qu.ObjToString(v[key])) == "" { continue } rd[qu.BsonIdToSId(v["_id"])] = &Data{ Id: qu.BsonIdToSId(v["_id"]), PreVal: fmt.Sprint(v[key]), } FieldData[key] = rd } } log.Println("pre version 加载完成") list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{`+FieldsQuery+`,"s_winner":1}`, false, -1, -1) for _, v := range *list2 { for _, field := range Fields { rd := FieldData[field] if field == "winner" { field = "s_winner" } if rd == nil { rd = map[string]*Data{} } if v[field] == nil && strings.TrimSpace(qu.ObjToString(v[field])) == "" { continue } _id := qu.BsonIdToSId(v["_id"]) tmp := rd[_id] if tmp != nil { tmp.NewVal = fmt.Sprint(v[field]) rd[_id] = tmp } else { rd[_id] = &Data{ Id: qu.BsonIdToSId(_id), NewVal: fmt.Sprint(v[field]), } } FieldData[field] = rd } } log.Println("new version 加载完成") for k, v := range FieldData { cp := &Compare{Field: k} for _, d := range v { if d.NewVal != "" && d.PreVal != "" { if d.NewVal == d.PreVal { cp.EqNum++ } else { cp.NEqNum++ } cp.PreExtNum++ cp.NewExtNum++ } else { if d.NewVal == "" { cp.NewNilnum++ if d.PreVal != "" { cp.NEqNum++ cp.PreExtNum++ } } if d.PreVal == "" { cp.PreNilnum++ if d.NewVal != "" { cp.NewExtNum++ cp.NEqNum++ } } } } Compares[k] = cp } } type BidData struct { id string key map[string]interface{} } type BidCom struct { Val []int Ids []map[string]interface{} } //标注正确率统计 func biaozhucompare() { exts, _ := Newmgo.Find("bid_v3", `{}`, `{"_id":1}`, nil, false, -1, -1) extDatas := []BidData{} for _, v := range *exts { key := map[string]interface{}{ "projectname": v["projectname"], "projectcode": v["projectcode"], "buyer": v["buyer"], "budget": qu.Float64All(v["budget"]), "bidamount": qu.Float64All(v["bidamount"]), "agency": v["agency"], "buyerperson": v["buyerperson"], "buyertel": v["buyertel"], } ext := BidData{ id: qu.BsonIdToSId(v["_id"]), key: key, } extDatas = append(extDatas, ext) } log.Println("exts ok") bzs, _ := Newmgo.Find("bid_biaozhuid", `{}`, `{"_id":1}`, nil, false, -1, -1) bzDatas := []BidData{} for _, v := range *bzs { bidamount := float64(0) if bigprices, ok := v["bigprice"].([]interface{}); ok { bidamount = qu.Float64All(bigprices[0]) } key := map[string]interface{}{ "projectname": qu.ObjToString(v["projectname"]), "projectcode": qu.ObjToString(v["projectcode"]), //qu.If(qu.ObjToString(v["t_bidno"]) == "", qu.ObjToString(v["b_projectno"]), qu.ObjToString(v["t_bidno"])), "buyer": qu.ObjToString(v["buyer"]), // qu.If(qu.ObjToString(v["t_buyer"]) == "", qu.ObjToString(v["b_buyer"]), qu.ObjToString(v["t_buyer"])), "budget": qu.Float64All(qu.ObjToString(v["budget"])), // qu.Float64All(qu.ObjToString(v["t_budget"])), "bidamount": bidamount, "agency": qu.ObjToString(v["agency"]), "buyerperson": qu.ObjToString(v["buyerperson"]), "buyertel": qu.ObjToString(v["buyertel"]), } bz := BidData{ id: qu.BsonIdToSId(v["_id"]), key: key, } bzDatas = append(bzDatas, bz) } log.Println("bzs ok") bcoms := map[string]*BidCom{} for _, ext := range extDatas { for _, bz := range bzDatas { if bz.id == ext.id { for key, val := range ext.key { // if key == "budget" { // log.Println(key, ext.key[key], ";;;;;", bz.key[key]) // } if qu.ObjToString(val) != "" || qu.Float64All(val) > 0 { bcom := bcoms[key] if bcom == nil { bcom = &BidCom{ Val: []int{0, 0}, Ids: []map[string]interface{}{}, } } if val == bz.key[key] { bcom.Val[0] += 1 } else { bcom.Val[1] += 1 tmp := map[string]interface{}{ "id": ext.id, "ext": val, "bz": bz.key[key], } bcom.Ids = append(bcom.Ids, tmp) } bcoms[key] = bcom } } break } } } xl := xlsx.NewFile() sh, _ := xl.AddSheet("统计") h := sh.AddRow() h.AddCell().SetString("field") h.AddCell().SetString("相同") h.AddCell().SetString("不同") for k, v := range bcoms { row := sh.AddRow() row.AddCell().SetString(k) row.AddCell().SetInt(v.Val[0]) row.AddCell().SetInt(v.Val[1]) ksh, _ := xl.AddSheet(k) rh := ksh.AddRow() rh.AddCell().SetString("id") rh.AddCell().SetString("标注") rh.AddCell().SetString("抽取") rh.AddCell().SetString("url") for _, v := range v.Ids { rw := ksh.AddRow() rw.AddCell().SetString(qu.ObjToString(v["id"])) rw.AddCell().SetString(fmt.Sprint(v["bz"])) rw.AddCell().SetString(fmt.Sprint(v["ext"])) rw.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", qu.ObjToString(v["id"])))) } log.Println(k, v.Val) } xl.Save("ext_bz.xlsx") }