main.go 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. /*
  2. 抽取结果对比
  3. */
  4. package main
  5. import (
  6. "flag"
  7. "fmt"
  8. "jy/mongodbutil"
  9. "log"
  10. qu "qfw/util"
  11. "strings"
  12. "github.com/tealeg/xlsx"
  13. "gopkg.in/mgo.v2/bson"
  14. )
  15. var (
  16. SysConfig map[string]interface{}
  17. Premgo *mongodbutil.Pool //上个版本库
  18. Newmgo *mongodbutil.Pool //当前版本库
  19. FieldData map[string]map[string]*Data
  20. Compares map[string]*Compare
  21. Sid, Eid string
  22. Fields []string
  23. FieldsQuery string
  24. Url = "https://www.jianyu360.com/article/content/%s.html"
  25. )
  // Compare holds the counters collected for one field across the previous and
  // the current version.
  type Compare struct {
  	// Field is the name of the compared field.
  	Field string

  	// PreExtNum counts records where the previous version has a value.
  	PreExtNum int
  	// NewExtNum counts records where the current version has a value.
  	NewExtNum int

  	// PreNilnum counts records where the previous version has no value.
  	PreNilnum int
  	// NewNilnum counts records where the current version has no value.
  	NewNilnum int

  	// EqNum counts records whose two values are equal.
  	EqNum int
  	// NEqNum counts records whose two values differ.
  	NEqNum int
  }
  // Data carries the value of one field of one document in both versions.
  type Data struct {
  	// Id is the document id in string form.
  	Id string

  	// PreVal is the value in the previous version; empty when absent.
  	PreVal string
  	// NewVal is the value in the current version; empty when absent.
  	NewVal string
  }
  36. func init() {
  37. flag.StringVar(&Sid, "sid", "5df5071ce9d1f601e495fa54", "开始id")
  38. flag.StringVar(&Eid, "eid", "5e09c05f0cf41612e0626abc", "结束id")
  39. flag.Parse()
  40. qu.ReadConfig(&SysConfig)
  41. Premgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["premgo"]), qu.ObjToString(SysConfig["predb"]))
  42. Newmgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["newmgo"]), qu.ObjToString(SysConfig["newdb"]))
  43. tmp, _ := SysConfig["fields"].([]interface{})
  44. for k, v := range tmp {
  45. Fields = append(Fields, qu.ObjToString(v))
  46. if k < (len(tmp) - 1) {
  47. FieldsQuery += `"` + qu.ObjToString(v) + `":1,`
  48. } else {
  49. FieldsQuery += `"` + qu.ObjToString(v) + `":1`
  50. }
  51. }
  52. FieldData = map[string]map[string]*Data{}
  53. Compares = map[string]*Compare{}
  54. }
  55. func main() {
  56. getVersionData()
  57. createXlsx()
  58. //biaozhucompare()
  59. }
  60. func createXlsx() {
  61. xf, err := xlsx.OpenFile("template.xlsx")
  62. if err != nil {
  63. log.Println(err)
  64. return
  65. }
  66. //生成第一个sheet信息
  67. sh := xf.Sheets[0]
  68. for i, field := range Fields {
  69. for k, row := range sh.Rows {
  70. if k > 2+i {
  71. style := (*row).Cells[1].GetStyle()
  72. style.Font.Color = "000000"
  73. (*row).Cells[0].SetString(field)
  74. (*row).Cells[1].SetStyle(style)
  75. (*row).Cells[2].SetStyle(style)
  76. (*row).Cells[3].SetStyle(style)
  77. (*row).Cells[4].SetStyle(style)
  78. if Compares[field] == nil {
  79. (*row).Cells[1].SetInt(0)
  80. (*row).Cells[2].SetInt(0)
  81. (*row).Cells[3].SetInt(0)
  82. (*row).Cells[4].SetInt(0)
  83. } else {
  84. (*row).Cells[1].SetInt(Compares[field].PreExtNum)
  85. (*row).Cells[2].SetInt(Compares[field].NewExtNum)
  86. (*row).Cells[3].SetInt(Compares[field].EqNum)
  87. (*row).Cells[4].SetInt(Compares[field].NEqNum)
  88. }
  89. }
  90. sh.Rows[k] = row
  91. }
  92. }
  93. var idsall = map[string]bool{}
  94. //生成信息sheet
  95. for _, field := range Fields {
  96. sh, _ := xf.AddSheet(field)
  97. rowh := sh.AddRow()
  98. rowh.AddCell().SetString("id")
  99. rowh.AddCell().SetString("preval")
  100. rowh.AddCell().SetString("newval")
  101. rowh.AddCell().SetString("url")
  102. tmp := FieldData[field]
  103. for k, v := range tmp {
  104. if v.NewVal != v.PreVal {
  105. row := sh.AddRow()
  106. row.AddCell().SetString(k)
  107. row.AddCell().SetString(v.PreVal)
  108. row.AddCell().SetString(v.NewVal)
  109. row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", v.Id)))
  110. idsall[v.Id] = true
  111. }
  112. }
  113. }
  114. log.Println("不同数据总量", len(idsall))
  115. //生全量信息不同部分
  116. shall, _ := xf.AddSheet("全量数据(不同部分)")
  117. rowh := shall.AddRow()
  118. rowh.AddCell().SetString("id")
  119. for _, v := range Fields {
  120. rowh.AddCell().SetString("preval_" + v)
  121. rowh.AddCell().SetString("newval_" + v)
  122. }
  123. rowh.AddCell().SetString("url")
  124. i := 0
  125. for k, _ := range idsall {
  126. i++
  127. row := shall.AddRow()
  128. row.AddCell().SetString(k)
  129. for _, field := range Fields {
  130. tmp := FieldData[field]
  131. v := tmp[k]
  132. if v != nil {
  133. if v.NewVal != v.PreVal {
  134. row.AddCell().SetString(v.PreVal)
  135. row.AddCell().SetString(v.NewVal)
  136. } else {
  137. row.AddCell().SetString("")
  138. row.AddCell().SetString("")
  139. }
  140. } else {
  141. row.AddCell().SetString("")
  142. row.AddCell().SetString("")
  143. }
  144. }
  145. row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", k)))
  146. }
  147. log.Println("数据处理完成,正在生成文件")
  148. err = xf.Save("result.xlsx")
  149. if err != nil {
  150. log.Println("保存xlsx失败:", err)
  151. return
  152. }
  153. log.Println("xlsx保存成功")
  154. }
  155. func getVersionData() {
  156. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(Sid), "$lte": bson.ObjectIdHex(Eid)}}
  157. log.Println(qu.ObjToString(SysConfig["prec"]), query)
  158. list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
  159. for _, v := range *list1 {
  160. for _, key := range Fields {
  161. rd := FieldData[key]
  162. if rd == nil {
  163. rd = map[string]*Data{}
  164. }
  165. if v[key] == nil && strings.TrimSpace(qu.ObjToString(v[key])) == "" {
  166. continue
  167. }
  168. rd[qu.BsonIdToSId(v["_id"])] = &Data{
  169. Id: qu.BsonIdToSId(v["_id"]),
  170. PreVal: fmt.Sprint(v[key]),
  171. }
  172. FieldData[key] = rd
  173. }
  174. }
  175. log.Println("pre version 加载完成")
  176. list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{`+FieldsQuery+`}`, false, -1, -1)
  177. for _, v := range *list2 {
  178. for _, field := range Fields {
  179. rd := FieldData[field]
  180. if rd == nil {
  181. rd = map[string]*Data{}
  182. }
  183. if v[field] == nil && strings.TrimSpace(qu.ObjToString(v[field])) == "" {
  184. continue
  185. }
  186. _id := qu.BsonIdToSId(v["_id"])
  187. tmp := rd[_id]
  188. if tmp != nil {
  189. tmp.NewVal = fmt.Sprint(v[field])
  190. rd[_id] = tmp
  191. } else {
  192. rd[_id] = &Data{
  193. Id: qu.BsonIdToSId(_id),
  194. NewVal: fmt.Sprint(v[field]),
  195. }
  196. }
  197. FieldData[field] = rd
  198. }
  199. }
  200. log.Println("new version 加载完成")
  201. for k, v := range FieldData {
  202. cp := &Compare{Field: k}
  203. for _, d := range v {
  204. if d.NewVal != "" && d.PreVal != "" {
  205. if d.NewVal == d.PreVal {
  206. cp.EqNum++
  207. } else {
  208. cp.NEqNum++
  209. }
  210. cp.PreExtNum++
  211. cp.NewExtNum++
  212. } else {
  213. if d.NewVal == "" {
  214. cp.NewNilnum++
  215. if d.PreVal != "" {
  216. cp.NEqNum++
  217. cp.PreExtNum++
  218. }
  219. }
  220. if d.PreVal == "" {
  221. cp.PreNilnum++
  222. if d.NewVal != "" {
  223. cp.NewExtNum++
  224. cp.NEqNum++
  225. }
  226. }
  227. }
  228. }
  229. Compares[k] = cp
  230. }
  231. }
  // BidData pairs a document id with the field values that biaozhucompare
  // compares.
  type BidData struct {
  	// id is the document id in string form.
  	id string
  	// key maps a field name to that document's value for the field.
  	key map[string]interface{}
  }
  // BidCom accumulates the comparison result for one field in biaozhucompare.
  type BidCom struct {
  	// Val holds two counters: Val[0] for equal values, Val[1] for different ones.
  	Val []int
  	// Ids holds one entry per mismatch with the keys "id", "ext" and "bz".
  	Ids []map[string]interface{}
  }
  240. //标注正确率统计
  241. func biaozhucompare() {
  242. exts, _ := Newmgo.Find("bid_v3", `{}`, `{"_id":1}`, nil, false, -1, -1)
  243. extDatas := []BidData{}
  244. for _, v := range *exts {
  245. key := map[string]interface{}{
  246. "projectname": v["projectname"],
  247. "projectcode": v["projectcode"],
  248. "buyer": v["buyer"],
  249. "budget": qu.Float64All(v["budget"]),
  250. "bidamount": qu.Float64All(v["bidamount"]),
  251. "agency": v["agency"],
  252. "buyerperson": v["buyerperson"],
  253. "buyertel": v["buyertel"],
  254. }
  255. ext := BidData{
  256. id: qu.BsonIdToSId(v["_id"]),
  257. key: key,
  258. }
  259. extDatas = append(extDatas, ext)
  260. }
  261. log.Println("exts ok")
  262. bzs, _ := Newmgo.Find("bid_biaozhuid", `{}`, `{"_id":1}`, nil, false, -1, -1)
  263. bzDatas := []BidData{}
  264. for _, v := range *bzs {
  265. bidamount := float64(0)
  266. if bigprices, ok := v["bigprice"].([]interface{}); ok {
  267. bidamount = qu.Float64All(bigprices[0])
  268. }
  269. key := map[string]interface{}{
  270. "projectname": qu.ObjToString(v["projectname"]),
  271. "projectcode": qu.ObjToString(v["projectcode"]), //qu.If(qu.ObjToString(v["t_bidno"]) == "", qu.ObjToString(v["b_projectno"]), qu.ObjToString(v["t_bidno"])),
  272. "buyer": qu.ObjToString(v["buyer"]), // qu.If(qu.ObjToString(v["t_buyer"]) == "", qu.ObjToString(v["b_buyer"]), qu.ObjToString(v["t_buyer"])),
  273. "budget": qu.Float64All(qu.ObjToString(v["budget"])), // qu.Float64All(qu.ObjToString(v["t_budget"])),
  274. "bidamount": bidamount,
  275. "agency": qu.ObjToString(v["agency"]),
  276. "buyerperson": qu.ObjToString(v["buyerperson"]),
  277. "buyertel": qu.ObjToString(v["buyertel"]),
  278. }
  279. bz := BidData{
  280. id: qu.BsonIdToSId(v["_id"]),
  281. key: key,
  282. }
  283. bzDatas = append(bzDatas, bz)
  284. }
  285. log.Println("bzs ok")
  286. bcoms := map[string]*BidCom{}
  287. for _, ext := range extDatas {
  288. for _, bz := range bzDatas {
  289. if bz.id == ext.id {
  290. for key, val := range ext.key {
  291. // if key == "budget" {
  292. // log.Println(key, ext.key[key], ";;;;;", bz.key[key])
  293. // }
  294. if qu.ObjToString(val) != "" || qu.Float64All(val) > 0 {
  295. bcom := bcoms[key]
  296. if bcom == nil {
  297. bcom = &BidCom{
  298. Val: []int{0, 0},
  299. Ids: []map[string]interface{}{},
  300. }
  301. }
  302. if val == bz.key[key] {
  303. bcom.Val[0] += 1
  304. } else {
  305. bcom.Val[1] += 1
  306. tmp := map[string]interface{}{
  307. "id": ext.id,
  308. "ext": val,
  309. "bz": bz.key[key],
  310. }
  311. bcom.Ids = append(bcom.Ids, tmp)
  312. }
  313. bcoms[key] = bcom
  314. }
  315. }
  316. break
  317. }
  318. }
  319. }
  320. xl := xlsx.NewFile()
  321. sh, _ := xl.AddSheet("统计")
  322. h := sh.AddRow()
  323. h.AddCell().SetString("field")
  324. h.AddCell().SetString("相同")
  325. h.AddCell().SetString("不同")
  326. for k, v := range bcoms {
  327. row := sh.AddRow()
  328. row.AddCell().SetString(k)
  329. row.AddCell().SetInt(v.Val[0])
  330. row.AddCell().SetInt(v.Val[1])
  331. ksh, _ := xl.AddSheet(k)
  332. rh := ksh.AddRow()
  333. rh.AddCell().SetString("id")
  334. rh.AddCell().SetString("标注")
  335. rh.AddCell().SetString("抽取")
  336. rh.AddCell().SetString("url")
  337. for _, v := range v.Ids {
  338. rw := ksh.AddRow()
  339. rw.AddCell().SetString(qu.ObjToString(v["id"]))
  340. rw.AddCell().SetString(fmt.Sprint(v["bz"]))
  341. rw.AddCell().SetString(fmt.Sprint(v["ext"]))
  342. rw.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", qu.ObjToString(v["id"]))))
  343. }
  344. log.Println(k, v.Val)
  345. }
  346. xl.Save("ext_bz.xlsx")
  347. }