main.go 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. /*
  2. 抽取结果对比
  3. */
  4. package main
  5. import (
  6. "flag"
  7. "fmt"
  8. "jy/mongodbutil"
  9. "log"
  10. qu "qfw/util"
  11. "strings"
  12. "github.com/tealeg/xlsx"
  13. "gopkg.in/mgo.v2/bson"
  14. )
  15. var (
  16. SysConfig map[string]interface{}
  17. Premgo *mongodbutil.Pool //上个版本库
  18. Newmgo *mongodbutil.Pool //当前版本库
  19. FieldData map[string]map[string]*Data
  20. Compares map[string]*Compare
  21. Sid, Eid string
  22. Fields []string
  23. FieldsQuery string
  24. Url = "https://www.jianyu360.com/article/content/%s.html"
  25. )
  26. type Compare struct {
  27. Field string //属性
  28. PreExtNum, NewExtNum int //上个版、当前版有值数量
  29. PreNilnum, NewNilnum int //上个版、当前版无值数量
  30. EqNum, NEqNum int //相等、不等数据量
  31. }
  32. type Data struct {
  33. Id string
  34. PreVal, NewVal string
  35. }
  36. func init() {
  37. flag.StringVar(&Sid, "sid", "5df5071ce9d1f601e495fa54", "开始id")
  38. flag.StringVar(&Eid, "eid", "5e09c05f0cf41612e0626abc", "结束id")
  39. flag.Parse()
  40. qu.ReadConfig(&SysConfig)
  41. Premgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["premgo"]), qu.ObjToString(SysConfig["predb"]))
  42. Newmgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["newmgo"]), qu.ObjToString(SysConfig["newdb"]))
  43. tmp, _ := SysConfig["fields"].([]interface{})
  44. for k, v := range tmp {
  45. Fields = append(Fields, qu.ObjToString(v))
  46. if k < (len(tmp) - 1) {
  47. FieldsQuery += `"` + qu.ObjToString(v) + `":1,`
  48. } else {
  49. FieldsQuery += `"` + qu.ObjToString(v) + `":1`
  50. }
  51. }
  52. FieldData = map[string]map[string]*Data{}
  53. Compares = map[string]*Compare{}
  54. }
  55. func main() {
  56. getVersionData()
  57. createXlsx()
  58. //biaozhucompare()
  59. }
  60. func createXlsx() {
  61. xf, err := xlsx.OpenFile("template.xlsx")
  62. if err != nil {
  63. log.Println(err)
  64. return
  65. }
  66. //生成第一个sheet信息
  67. sh := xf.Sheets[0]
  68. for i, field := range Fields {
  69. for k, row := range sh.Rows {
  70. if k > 2+i {
  71. style := (*row).Cells[1].GetStyle()
  72. style.Font.Color = "000000"
  73. (*row).Cells[0].SetString(field)
  74. (*row).Cells[1].SetStyle(style)
  75. (*row).Cells[2].SetStyle(style)
  76. (*row).Cells[3].SetStyle(style)
  77. (*row).Cells[4].SetStyle(style)
  78. if Compares[field] == nil {
  79. (*row).Cells[1].SetInt(0)
  80. (*row).Cells[2].SetInt(0)
  81. (*row).Cells[3].SetInt(0)
  82. (*row).Cells[4].SetInt(0)
  83. } else {
  84. (*row).Cells[1].SetInt(Compares[field].PreExtNum)
  85. (*row).Cells[2].SetInt(Compares[field].NewExtNum)
  86. (*row).Cells[3].SetInt(Compares[field].EqNum)
  87. (*row).Cells[4].SetInt(Compares[field].NEqNum)
  88. }
  89. }
  90. sh.Rows[k] = row
  91. }
  92. }
  93. //生成信息sheet
  94. for _, field := range Fields {
  95. sh, _ := xf.AddSheet(field)
  96. rowh := sh.AddRow()
  97. rowh.AddCell().SetString("id")
  98. rowh.AddCell().SetString("preval")
  99. rowh.AddCell().SetString("newval")
  100. rowh.AddCell().SetString("url")
  101. tmp := FieldData[field]
  102. for k, v := range tmp {
  103. if v.NewVal != v.PreVal {
  104. row := sh.AddRow()
  105. row.AddCell().SetString(k)
  106. row.AddCell().SetString(v.PreVal)
  107. row.AddCell().SetString(v.NewVal)
  108. row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", v.Id)))
  109. }
  110. }
  111. }
  112. err = xf.Save("result.xlsx")
  113. if err != nil {
  114. log.Println("保存xlsx失败:", err)
  115. return
  116. }
  117. log.Println("xlsx保存成功")
  118. }
  119. func getVersionData() {
  120. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(Sid), "$lte": bson.ObjectIdHex(Eid)}}
  121. log.Println(qu.ObjToString(SysConfig["prec"]), query)
  122. list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
  123. for _, v := range *list1 {
  124. for _, key := range Fields {
  125. rd := FieldData[key]
  126. if rd == nil {
  127. rd = map[string]*Data{}
  128. }
  129. if v[key] == nil && strings.TrimSpace(qu.ObjToString(v[key])) == "" {
  130. continue
  131. }
  132. rd[qu.BsonIdToSId(v["_id"])] = &Data{
  133. Id: qu.BsonIdToSId(v["_id"]),
  134. PreVal: fmt.Sprint(v[key]),
  135. }
  136. FieldData[key] = rd
  137. }
  138. }
  139. log.Println("pre version 加载完成")
  140. list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
  141. for _, v := range *list2 {
  142. for _, field := range Fields {
  143. rd := FieldData[field]
  144. if rd == nil {
  145. rd = map[string]*Data{}
  146. }
  147. if v[field] == nil && strings.TrimSpace(qu.ObjToString(v[field])) == "" {
  148. continue
  149. }
  150. _id := qu.BsonIdToSId(v["_id"])
  151. tmp := rd[_id]
  152. if tmp != nil {
  153. tmp.NewVal = fmt.Sprint(v[field])
  154. rd[_id] = tmp
  155. } else {
  156. rd[_id] = &Data{
  157. Id: qu.BsonIdToSId(_id),
  158. NewVal: fmt.Sprint(v[field]),
  159. }
  160. }
  161. FieldData[field] = rd
  162. }
  163. }
  164. log.Println("new version 加载完成")
  165. for k, v := range FieldData {
  166. cp := &Compare{Field: k}
  167. for _, d := range v {
  168. if d.NewVal != "" && d.PreVal != "" {
  169. if d.NewVal == d.PreVal {
  170. cp.EqNum++
  171. } else {
  172. cp.NEqNum++
  173. }
  174. cp.PreExtNum++
  175. cp.NewExtNum++
  176. } else {
  177. if d.NewVal == "" {
  178. cp.NewNilnum++
  179. if d.PreVal != "" {
  180. cp.NEqNum++
  181. cp.PreExtNum++
  182. }
  183. }
  184. if d.PreVal == "" {
  185. cp.PreNilnum++
  186. if d.NewVal != "" {
  187. cp.NewExtNum++
  188. cp.NEqNum++
  189. }
  190. }
  191. }
  192. }
  193. Compares[k] = cp
  194. }
  195. }
  196. type BidData struct {
  197. id string
  198. key map[string]interface{}
  199. }
  200. type BidCom struct {
  201. Val []int
  202. Ids []map[string]interface{}
  203. }
  204. //标注正确率统计
  205. func biaozhucompare() {
  206. exts, _ := Newmgo.Find("bid_v3", `{}`, `{"_id":1}`, nil, false, -1, -1)
  207. extDatas := []BidData{}
  208. for _, v := range *exts {
  209. key := map[string]interface{}{
  210. "projectname": v["projectname"],
  211. "projectcode": v["projectcode"],
  212. "buyer": v["buyer"],
  213. "budget": qu.Float64All(v["budget"]),
  214. "bidamount": qu.Float64All(v["bidamount"]),
  215. "agency": v["agency"],
  216. "buyerperson": v["buyerperson"],
  217. "buyertel": v["buyertel"],
  218. }
  219. ext := BidData{
  220. id: qu.BsonIdToSId(v["_id"]),
  221. key: key,
  222. }
  223. extDatas = append(extDatas, ext)
  224. }
  225. log.Println("exts ok")
  226. bzs, _ := Newmgo.Find("bid_biaozhuid", `{}`, `{"_id":1}`, nil, false, -1, -1)
  227. bzDatas := []BidData{}
  228. for _, v := range *bzs {
  229. bidamount := float64(0)
  230. if bigprices, ok := v["bigprice"].([]interface{}); ok {
  231. bidamount = qu.Float64All(bigprices[0])
  232. }
  233. key := map[string]interface{}{
  234. "projectname": qu.ObjToString(v["projectname"]),
  235. "projectcode": qu.ObjToString(v["projectcode"]), //qu.If(qu.ObjToString(v["t_bidno"]) == "", qu.ObjToString(v["b_projectno"]), qu.ObjToString(v["t_bidno"])),
  236. "buyer": qu.ObjToString(v["buyer"]), // qu.If(qu.ObjToString(v["t_buyer"]) == "", qu.ObjToString(v["b_buyer"]), qu.ObjToString(v["t_buyer"])),
  237. "budget": qu.Float64All(qu.ObjToString(v["budget"])), // qu.Float64All(qu.ObjToString(v["t_budget"])),
  238. "bidamount": bidamount,
  239. "agency": qu.ObjToString(v["agency"]),
  240. "buyerperson": qu.ObjToString(v["buyerperson"]),
  241. "buyertel": qu.ObjToString(v["buyertel"]),
  242. }
  243. bz := BidData{
  244. id: qu.BsonIdToSId(v["_id"]),
  245. key: key,
  246. }
  247. bzDatas = append(bzDatas, bz)
  248. }
  249. log.Println("bzs ok")
  250. bcoms := map[string]*BidCom{}
  251. for _, ext := range extDatas {
  252. for _, bz := range bzDatas {
  253. if bz.id == ext.id {
  254. for key, val := range ext.key {
  255. // if key == "budget" {
  256. // log.Println(key, ext.key[key], ";;;;;", bz.key[key])
  257. // }
  258. if qu.ObjToString(val) != "" || qu.Float64All(val) > 0 {
  259. bcom := bcoms[key]
  260. if bcom == nil {
  261. bcom = &BidCom{
  262. Val: []int{0, 0},
  263. Ids: []map[string]interface{}{},
  264. }
  265. }
  266. if val == bz.key[key] {
  267. bcom.Val[0] += 1
  268. } else {
  269. bcom.Val[1] += 1
  270. tmp := map[string]interface{}{
  271. "id": ext.id,
  272. "ext": val,
  273. "bz": bz.key[key],
  274. }
  275. bcom.Ids = append(bcom.Ids, tmp)
  276. }
  277. bcoms[key] = bcom
  278. }
  279. }
  280. break
  281. }
  282. }
  283. }
  284. xl := xlsx.NewFile()
  285. sh, _ := xl.AddSheet("统计")
  286. h := sh.AddRow()
  287. h.AddCell().SetString("field")
  288. h.AddCell().SetString("相同")
  289. h.AddCell().SetString("不同")
  290. for k, v := range bcoms {
  291. row := sh.AddRow()
  292. row.AddCell().SetString(k)
  293. row.AddCell().SetInt(v.Val[0])
  294. row.AddCell().SetInt(v.Val[1])
  295. ksh, _ := xl.AddSheet(k)
  296. rh := ksh.AddRow()
  297. rh.AddCell().SetString("id")
  298. rh.AddCell().SetString("标注")
  299. rh.AddCell().SetString("抽取")
  300. rh.AddCell().SetString("url")
  301. for _, v := range v.Ids {
  302. rw := ksh.AddRow()
  303. rw.AddCell().SetString(qu.ObjToString(v["id"]))
  304. rw.AddCell().SetString(fmt.Sprint(v["bz"]))
  305. rw.AddCell().SetString(fmt.Sprint(v["ext"]))
  306. rw.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", qu.ObjToString(v["id"]))))
  307. }
  308. log.Println(k, v.Val)
  309. }
  310. xl.Save("ext_bz.xlsx")
  311. }