123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323 |
- /*
- 抽取结果对比
- */
- package main
- import (
- "flag"
- "fmt"
- "jy/mongodbutil"
- "log"
- qu "qfw/util"
- "strings"
- "github.com/tealeg/xlsx"
- "gopkg.in/mgo.v2/bson"
- )
- var (
- SysConfig map[string]interface{}
- Premgo *mongodbutil.Pool //上个版本库
- Newmgo *mongodbutil.Pool //当前版本库
- FieldData map[string]map[string]*Data
- Compares map[string]*Compare
- Sid, Eid string
- Fields []string
- FieldsQuery string
- Url = "https://www.jianyu360.com/article/content/%s.html"
- )
// Compare holds the per-field counters produced by comparing the previous
// and current versions.
type Compare struct {
	// Field is the name of the compared attribute.
	Field string

	// PreExtNum counts documents that have a value in the previous version.
	PreExtNum int
	// NewExtNum counts documents that have a value in the current version.
	NewExtNum int

	// PreNilnum counts documents without a value in the previous version.
	PreNilnum int
	// NewNilnum counts documents without a value in the current version.
	NewNilnum int

	// EqNum counts documents whose two values are equal.
	EqNum int
	// NEqNum counts documents whose two values differ.
	NEqNum int
}
// Data pairs the values one document holds for a single field in the
// previous and current versions.
type Data struct {
	// Id is the document id as a string.
	Id string

	// PreVal is the value from the previous version ("" when absent).
	PreVal string
	// NewVal is the value from the current version ("" when absent).
	NewVal string
}
- func init() {
- flag.StringVar(&Sid, "sid", "5df5071ce9d1f601e495fa54", "开始id")
- flag.StringVar(&Eid, "eid", "5e09c05f0cf41612e0626abc", "结束id")
- flag.Parse()
- qu.ReadConfig(&SysConfig)
- Premgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["premgo"]), qu.ObjToString(SysConfig["predb"]))
- Newmgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["newmgo"]), qu.ObjToString(SysConfig["newdb"]))
- tmp, _ := SysConfig["fields"].([]interface{})
- for k, v := range tmp {
- Fields = append(Fields, qu.ObjToString(v))
- if k < (len(tmp) - 1) {
- FieldsQuery += `"` + qu.ObjToString(v) + `":1,`
- } else {
- FieldsQuery += `"` + qu.ObjToString(v) + `":1`
- }
- }
- FieldData = map[string]map[string]*Data{}
- Compares = map[string]*Compare{}
- }
- func main() {
- getVersionData()
- createXlsx()
- //biaozhucompare()
- }
- func createXlsx() {
- xf, err := xlsx.OpenFile("template.xlsx")
- if err != nil {
- log.Println(err)
- return
- }
- //生成第一个sheet信息
- sh := xf.Sheets[0]
- for i, field := range Fields {
- for k, row := range sh.Rows {
- if k > 2+i {
- style := (*row).Cells[1].GetStyle()
- style.Font.Color = "000000"
- (*row).Cells[0].SetString(field)
- (*row).Cells[1].SetStyle(style)
- (*row).Cells[2].SetStyle(style)
- (*row).Cells[3].SetStyle(style)
- (*row).Cells[4].SetStyle(style)
- if Compares[field] == nil {
- (*row).Cells[1].SetInt(0)
- (*row).Cells[2].SetInt(0)
- (*row).Cells[3].SetInt(0)
- (*row).Cells[4].SetInt(0)
- } else {
- (*row).Cells[1].SetInt(Compares[field].PreExtNum)
- (*row).Cells[2].SetInt(Compares[field].NewExtNum)
- (*row).Cells[3].SetInt(Compares[field].EqNum)
- (*row).Cells[4].SetInt(Compares[field].NEqNum)
- }
- }
- sh.Rows[k] = row
- }
- }
- //生成信息sheet
- for _, field := range Fields {
- sh, _ := xf.AddSheet(field)
- rowh := sh.AddRow()
- rowh.AddCell().SetString("id")
- rowh.AddCell().SetString("preval")
- rowh.AddCell().SetString("newval")
- rowh.AddCell().SetString("url")
- tmp := FieldData[field]
- for k, v := range tmp {
- if v.NewVal != v.PreVal {
- row := sh.AddRow()
- row.AddCell().SetString(k)
- row.AddCell().SetString(v.PreVal)
- row.AddCell().SetString(v.NewVal)
- row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", v.Id)))
- }
- }
- }
- err = xf.Save("result.xlsx")
- if err != nil {
- log.Println("保存xlsx失败:", err)
- return
- }
- log.Println("xlsx保存成功")
- }
- func getVersionData() {
- query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(Sid), "$lte": bson.ObjectIdHex(Eid)}}
- log.Println(qu.ObjToString(SysConfig["prec"]), query)
- list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
- for _, v := range *list1 {
- for _, key := range Fields {
- rd := FieldData[key]
- if rd == nil {
- rd = map[string]*Data{}
- }
- if v[key] == nil && strings.TrimSpace(qu.ObjToString(v[key])) == "" {
- continue
- }
- rd[qu.BsonIdToSId(v["_id"])] = &Data{
- Id: qu.BsonIdToSId(v["_id"]),
- PreVal: fmt.Sprint(v[key]),
- }
- FieldData[key] = rd
- }
- }
- log.Println("pre version 加载完成")
- list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
- for _, v := range *list2 {
- for _, field := range Fields {
- rd := FieldData[field]
- if rd == nil {
- rd = map[string]*Data{}
- }
- if v[field] == nil && strings.TrimSpace(qu.ObjToString(v[field])) == "" {
- continue
- }
- _id := qu.BsonIdToSId(v["_id"])
- tmp := rd[_id]
- if tmp != nil {
- tmp.NewVal = fmt.Sprint(v[field])
- rd[_id] = tmp
- } else {
- rd[_id] = &Data{
- Id: qu.BsonIdToSId(_id),
- NewVal: fmt.Sprint(v[field]),
- }
- }
- FieldData[field] = rd
- }
- }
- log.Println("new version 加载完成")
- for k, v := range FieldData {
- cp := &Compare{Field: k}
- for _, d := range v {
- if d.NewVal != "" && d.PreVal != "" {
- if d.NewVal == d.PreVal {
- cp.EqNum++
- } else {
- cp.NEqNum++
- }
- cp.PreExtNum++
- cp.NewExtNum++
- } else {
- if d.NewVal == "" {
- cp.NewNilnum++
- if d.PreVal != "" {
- cp.NEqNum++
- cp.PreExtNum++
- }
- }
- if d.PreVal == "" {
- cp.PreNilnum++
- if d.NewVal != "" {
- cp.NewExtNum++
- cp.NEqNum++
- }
- }
- }
- }
- Compares[k] = cp
- }
- }
// BidData is one document reduced to its id and the fields being compared.
type BidData struct {
	// id is the document id as a string.
	id string

	// key maps a field name to that document's value for the field.
	key map[string]interface{}
}
// BidCom accumulates the comparison result for one field.
type BidCom struct {
	// Val holds two counters: index 0 for equal values, index 1 for
	// differing values.
	Val []int

	// Ids holds one entry per differing document, with the keys "id", "ext"
	// and "bz".
	Ids []map[string]interface{}
}
- //标注正确率统计
- func biaozhucompare() {
- exts, _ := Newmgo.Find("bid_v3", `{}`, `{"_id":1}`, nil, false, -1, -1)
- extDatas := []BidData{}
- for _, v := range *exts {
- key := map[string]interface{}{
- "projectname": v["projectname"],
- "projectcode": v["projectcode"],
- "buyer": v["buyer"],
- "budget": qu.Float64All(v["budget"]),
- "bidamount": qu.Float64All(v["bidamount"]),
- "agency": v["agency"],
- "buyerperson": v["buyerperson"],
- "buyertel": v["buyertel"],
- }
- ext := BidData{
- id: qu.BsonIdToSId(v["_id"]),
- key: key,
- }
- extDatas = append(extDatas, ext)
- }
- log.Println("exts ok")
- bzs, _ := Newmgo.Find("bid_biaozhuid", `{}`, `{"_id":1}`, nil, false, -1, -1)
- bzDatas := []BidData{}
- for _, v := range *bzs {
- bidamount := float64(0)
- if bigprices, ok := v["bigprice"].([]interface{}); ok {
- bidamount = qu.Float64All(bigprices[0])
- }
- key := map[string]interface{}{
- "projectname": qu.ObjToString(v["projectname"]),
- "projectcode": qu.ObjToString(v["projectcode"]), //qu.If(qu.ObjToString(v["t_bidno"]) == "", qu.ObjToString(v["b_projectno"]), qu.ObjToString(v["t_bidno"])),
- "buyer": qu.ObjToString(v["buyer"]), // qu.If(qu.ObjToString(v["t_buyer"]) == "", qu.ObjToString(v["b_buyer"]), qu.ObjToString(v["t_buyer"])),
- "budget": qu.Float64All(qu.ObjToString(v["budget"])), // qu.Float64All(qu.ObjToString(v["t_budget"])),
- "bidamount": bidamount,
- "agency": qu.ObjToString(v["agency"]),
- "buyerperson": qu.ObjToString(v["buyerperson"]),
- "buyertel": qu.ObjToString(v["buyertel"]),
- }
- bz := BidData{
- id: qu.BsonIdToSId(v["_id"]),
- key: key,
- }
- bzDatas = append(bzDatas, bz)
- }
- log.Println("bzs ok")
- bcoms := map[string]*BidCom{}
- for _, ext := range extDatas {
- for _, bz := range bzDatas {
- if bz.id == ext.id {
- for key, val := range ext.key {
- // if key == "budget" {
- // log.Println(key, ext.key[key], ";;;;;", bz.key[key])
- // }
- if qu.ObjToString(val) != "" || qu.Float64All(val) > 0 {
- bcom := bcoms[key]
- if bcom == nil {
- bcom = &BidCom{
- Val: []int{0, 0},
- Ids: []map[string]interface{}{},
- }
- }
- if val == bz.key[key] {
- bcom.Val[0] += 1
- } else {
- bcom.Val[1] += 1
- tmp := map[string]interface{}{
- "id": ext.id,
- "ext": val,
- "bz": bz.key[key],
- }
- bcom.Ids = append(bcom.Ids, tmp)
- }
- bcoms[key] = bcom
- }
- }
- break
- }
- }
- }
- xl := xlsx.NewFile()
- sh, _ := xl.AddSheet("统计")
- h := sh.AddRow()
- h.AddCell().SetString("field")
- h.AddCell().SetString("相同")
- h.AddCell().SetString("不同")
- for k, v := range bcoms {
- row := sh.AddRow()
- row.AddCell().SetString(k)
- row.AddCell().SetInt(v.Val[0])
- row.AddCell().SetInt(v.Val[1])
- ksh, _ := xl.AddSheet(k)
- rh := ksh.AddRow()
- rh.AddCell().SetString("id")
- rh.AddCell().SetString("标注")
- rh.AddCell().SetString("抽取")
- rh.AddCell().SetString("url")
- for _, v := range v.Ids {
- rw := ksh.AddRow()
- rw.AddCell().SetString(qu.ObjToString(v["id"]))
- rw.AddCell().SetString(fmt.Sprint(v["bz"]))
- rw.AddCell().SetString(fmt.Sprint(v["ext"]))
- rw.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", qu.ObjToString(v["id"]))))
- }
- log.Println(k, v.Val)
- }
- xl.Save("ext_bz.xlsx")
- }
|