main.go 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. /*
  2. 抽取结果对比
  3. */
  4. package main
import (
	"flag"
	"fmt"
	"jy/mongodbutil"
	"log"
	qu "qfw/util"
	"sort"
	"strings"

	"github.com/tealeg/xlsx"
	"gopkg.in/mgo.v2/bson"
)
  15. var (
  16. SysConfig map[string]interface{}
  17. Premgo *mongodbutil.Pool //上个版本库
  18. Newmgo *mongodbutil.Pool //当前版本库
  19. FieldData map[string]map[string]*Data
  20. Compares map[string]*Compare
  21. Sid, Eid string
  22. Fields []string
  23. FieldsQuery string
  24. Url = "https://www.jianyu360.com/article/content/%s.html"
  25. )
  26. type Compare struct {
  27. Field string //属性
  28. PreExtNum, NewExtNum int //上个版、当前版有值数量
  29. PreNilnum, NewNilnum int //上个版、当前版无值数量
  30. EqNum, NEqNum int //相等、不等数据量
  31. }
  32. type Data struct {
  33. Id string
  34. PreVal, NewVal string
  35. }
  36. func init() {
  37. flag.StringVar(&Sid, "sid", "5df5071ce9d1f601e495fa54", "开始id")
  38. flag.StringVar(&Eid, "eid", "5e09c05f0cf41612e0626abc", "结束id")
  39. flag.Parse()
  40. qu.ReadConfig(&SysConfig)
  41. Premgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["premgo"]), qu.ObjToString(SysConfig["predb"]))
  42. Newmgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["newmgo"]), qu.ObjToString(SysConfig["newdb"]))
  43. tmp, _ := SysConfig["fields"].([]interface{})
  44. for k, v := range tmp {
  45. Fields = append(Fields, qu.ObjToString(v))
  46. if k < (len(tmp) - 1) {
  47. FieldsQuery += `"` + qu.ObjToString(v) + `":1,`
  48. } else {
  49. FieldsQuery += `"` + qu.ObjToString(v) + `":1`
  50. }
  51. }
  52. FieldData = map[string]map[string]*Data{}
  53. Compares = map[string]*Compare{}
  54. }
  55. func main() {
  56. getVersionData()
  57. createXlsx()
  58. //biaozhucompare()
  59. }
  60. func createXlsx() {
  61. xf, err := xlsx.OpenFile("template.xlsx")
  62. if err != nil {
  63. log.Println(err)
  64. return
  65. }
  66. //生成第一个sheet信息
  67. sh := xf.Sheets[0]
  68. for i, field := range Fields {
  69. for k, row := range sh.Rows {
  70. if k > 2+i {
  71. style := (*row).Cells[1].GetStyle()
  72. style.Font.Color = "000000"
  73. (*row).Cells[0].SetString(field)
  74. (*row).Cells[1].SetStyle(style)
  75. (*row).Cells[2].SetStyle(style)
  76. (*row).Cells[3].SetStyle(style)
  77. (*row).Cells[4].SetStyle(style)
  78. if Compares[field] == nil {
  79. (*row).Cells[1].SetInt(0)
  80. (*row).Cells[2].SetInt(0)
  81. (*row).Cells[3].SetInt(0)
  82. (*row).Cells[4].SetInt(0)
  83. } else {
  84. (*row).Cells[1].SetInt(Compares[field].PreExtNum)
  85. (*row).Cells[2].SetInt(Compares[field].NewExtNum)
  86. (*row).Cells[3].SetInt(Compares[field].EqNum)
  87. (*row).Cells[4].SetInt(Compares[field].NEqNum)
  88. }
  89. }
  90. sh.Rows[k] = row
  91. }
  92. }
  93. var idsall = map[string]bool{}
  94. //生成信息sheet
  95. for _, field := range Fields {
  96. sh, _ := xf.AddSheet(field)
  97. rowh := sh.AddRow()
  98. rowh.AddCell().SetString("id")
  99. rowh.AddCell().SetString("preval")
  100. rowh.AddCell().SetString("newval")
  101. rowh.AddCell().SetString("url")
  102. tmp := FieldData[field]
  103. for k, v := range tmp {
  104. if v.NewVal != v.PreVal {
  105. row := sh.AddRow()
  106. row.AddCell().SetString(k)
  107. row.AddCell().SetString(v.PreVal)
  108. row.AddCell().SetString(v.NewVal)
  109. row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", v.Id)))
  110. idsall[v.Id] = true
  111. }
  112. }
  113. }
  114. log.Println("不同数据总量", len(idsall))
  115. //生全量信息不同部分
  116. shall, _ := xf.AddSheet("全量数据(不同部分)")
  117. rowh := shall.AddRow()
  118. rowh.AddCell().SetString("id")
  119. for _, v := range Fields {
  120. rowh.AddCell().SetString("preval_" + v)
  121. rowh.AddCell().SetString("newval_" + v)
  122. }
  123. rowh.AddCell().SetString("url")
  124. i := 0
  125. for k, _ := range idsall {
  126. i++
  127. row := shall.AddRow()
  128. row.AddCell().SetString(k)
  129. for _, field := range Fields {
  130. tmp := FieldData[field]
  131. v := tmp[k]
  132. if v != nil {
  133. if v.NewVal != v.PreVal {
  134. row.AddCell().SetString(v.PreVal)
  135. row.AddCell().SetString(v.NewVal)
  136. } else {
  137. row.AddCell().SetString("")
  138. row.AddCell().SetString("")
  139. }
  140. } else {
  141. row.AddCell().SetString("")
  142. row.AddCell().SetString("")
  143. }
  144. }
  145. row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", k)))
  146. }
  147. log.Println("数据处理完成,正在生成文件")
  148. err = xf.Save("result.xlsx")
  149. if err != nil {
  150. log.Println("保存xlsx失败:", err)
  151. return
  152. }
  153. log.Println("xlsx保存成功")
  154. }
  155. func getVersionData() {
  156. query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(Sid), "$lte": bson.ObjectIdHex(Eid)}}
  157. log.Println(qu.ObjToString(SysConfig["prec"]), query)
  158. list1, _ := Premgo.Find(qu.ObjToString(SysConfig["prec"]), query, nil, `{`+FieldsQuery+`,}`, false, -1, -1)
  159. for _, v := range *list1 {
  160. for _, key := range Fields {
  161. rd := FieldData[key]
  162. if rd == nil {
  163. rd = map[string]*Data{}
  164. }
  165. if v[key] == nil && strings.TrimSpace(qu.ObjToString(v[key])) == "" {
  166. continue
  167. }
  168. rd[qu.BsonIdToSId(v["_id"])] = &Data{
  169. Id: qu.BsonIdToSId(v["_id"]),
  170. PreVal: fmt.Sprint(v[key]),
  171. }
  172. FieldData[key] = rd
  173. }
  174. }
  175. log.Println("pre version 加载完成")
  176. list2, _ := Newmgo.Find(qu.ObjToString(SysConfig["newc"]), query, nil, `{`+FieldsQuery+`,"s_winner":1}`, false, -1, -1)
  177. for _, v := range *list2 {
  178. for _, field := range Fields {
  179. rd := FieldData[field]
  180. if field == "winner" {
  181. field = "s_winner"
  182. }
  183. if rd == nil {
  184. rd = map[string]*Data{}
  185. }
  186. if v[field] == nil && strings.TrimSpace(qu.ObjToString(v[field])) == "" {
  187. continue
  188. }
  189. _id := qu.BsonIdToSId(v["_id"])
  190. tmp := rd[_id]
  191. if tmp != nil {
  192. tmp.NewVal = fmt.Sprint(v[field])
  193. rd[_id] = tmp
  194. } else {
  195. rd[_id] = &Data{
  196. Id: qu.BsonIdToSId(_id),
  197. NewVal: fmt.Sprint(v[field]),
  198. }
  199. }
  200. FieldData[field] = rd
  201. }
  202. }
  203. log.Println("new version 加载完成")
  204. for k, v := range FieldData {
  205. cp := &Compare{Field: k}
  206. for _, d := range v {
  207. if d.NewVal != "" && d.PreVal != "" {
  208. if d.NewVal == d.PreVal {
  209. cp.EqNum++
  210. } else {
  211. cp.NEqNum++
  212. }
  213. cp.PreExtNum++
  214. cp.NewExtNum++
  215. } else {
  216. if d.NewVal == "" {
  217. cp.NewNilnum++
  218. if d.PreVal != "" {
  219. cp.NEqNum++
  220. cp.PreExtNum++
  221. }
  222. }
  223. if d.PreVal == "" {
  224. cp.PreNilnum++
  225. if d.NewVal != "" {
  226. cp.NewExtNum++
  227. cp.NEqNum++
  228. }
  229. }
  230. }
  231. }
  232. Compares[k] = cp
  233. }
  234. }
  235. type BidData struct {
  236. id string
  237. key map[string]interface{}
  238. }
  239. type BidCom struct {
  240. Val []int
  241. Ids []map[string]interface{}
  242. }
  243. //标注正确率统计
  244. func biaozhucompare() {
  245. exts, _ := Newmgo.Find("bid_v3", `{}`, `{"_id":1}`, nil, false, -1, -1)
  246. extDatas := []BidData{}
  247. for _, v := range *exts {
  248. key := map[string]interface{}{
  249. "projectname": v["projectname"],
  250. "projectcode": v["projectcode"],
  251. "buyer": v["buyer"],
  252. "budget": qu.Float64All(v["budget"]),
  253. "bidamount": qu.Float64All(v["bidamount"]),
  254. "agency": v["agency"],
  255. "buyerperson": v["buyerperson"],
  256. "buyertel": v["buyertel"],
  257. }
  258. ext := BidData{
  259. id: qu.BsonIdToSId(v["_id"]),
  260. key: key,
  261. }
  262. extDatas = append(extDatas, ext)
  263. }
  264. log.Println("exts ok")
  265. bzs, _ := Newmgo.Find("bid_biaozhuid", `{}`, `{"_id":1}`, nil, false, -1, -1)
  266. bzDatas := []BidData{}
  267. for _, v := range *bzs {
  268. bidamount := float64(0)
  269. if bigprices, ok := v["bigprice"].([]interface{}); ok {
  270. bidamount = qu.Float64All(bigprices[0])
  271. }
  272. key := map[string]interface{}{
  273. "projectname": qu.ObjToString(v["projectname"]),
  274. "projectcode": qu.ObjToString(v["projectcode"]), //qu.If(qu.ObjToString(v["t_bidno"]) == "", qu.ObjToString(v["b_projectno"]), qu.ObjToString(v["t_bidno"])),
  275. "buyer": qu.ObjToString(v["buyer"]), // qu.If(qu.ObjToString(v["t_buyer"]) == "", qu.ObjToString(v["b_buyer"]), qu.ObjToString(v["t_buyer"])),
  276. "budget": qu.Float64All(qu.ObjToString(v["budget"])), // qu.Float64All(qu.ObjToString(v["t_budget"])),
  277. "bidamount": bidamount,
  278. "agency": qu.ObjToString(v["agency"]),
  279. "buyerperson": qu.ObjToString(v["buyerperson"]),
  280. "buyertel": qu.ObjToString(v["buyertel"]),
  281. }
  282. bz := BidData{
  283. id: qu.BsonIdToSId(v["_id"]),
  284. key: key,
  285. }
  286. bzDatas = append(bzDatas, bz)
  287. }
  288. log.Println("bzs ok")
  289. bcoms := map[string]*BidCom{}
  290. for _, ext := range extDatas {
  291. for _, bz := range bzDatas {
  292. if bz.id == ext.id {
  293. for key, val := range ext.key {
  294. // if key == "budget" {
  295. // log.Println(key, ext.key[key], ";;;;;", bz.key[key])
  296. // }
  297. if qu.ObjToString(val) != "" || qu.Float64All(val) > 0 {
  298. bcom := bcoms[key]
  299. if bcom == nil {
  300. bcom = &BidCom{
  301. Val: []int{0, 0},
  302. Ids: []map[string]interface{}{},
  303. }
  304. }
  305. if val == bz.key[key] {
  306. bcom.Val[0] += 1
  307. } else {
  308. bcom.Val[1] += 1
  309. tmp := map[string]interface{}{
  310. "id": ext.id,
  311. "ext": val,
  312. "bz": bz.key[key],
  313. }
  314. bcom.Ids = append(bcom.Ids, tmp)
  315. }
  316. bcoms[key] = bcom
  317. }
  318. }
  319. break
  320. }
  321. }
  322. }
  323. xl := xlsx.NewFile()
  324. sh, _ := xl.AddSheet("统计")
  325. h := sh.AddRow()
  326. h.AddCell().SetString("field")
  327. h.AddCell().SetString("相同")
  328. h.AddCell().SetString("不同")
  329. for k, v := range bcoms {
  330. row := sh.AddRow()
  331. row.AddCell().SetString(k)
  332. row.AddCell().SetInt(v.Val[0])
  333. row.AddCell().SetInt(v.Val[1])
  334. ksh, _ := xl.AddSheet(k)
  335. rh := ksh.AddRow()
  336. rh.AddCell().SetString("id")
  337. rh.AddCell().SetString("标注")
  338. rh.AddCell().SetString("抽取")
  339. rh.AddCell().SetString("url")
  340. for _, v := range v.Ids {
  341. rw := ksh.AddRow()
  342. rw.AddCell().SetString(qu.ObjToString(v["id"]))
  343. rw.AddCell().SetString(fmt.Sprint(v["bz"]))
  344. rw.AddCell().SetString(fmt.Sprint(v["ext"]))
  345. rw.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", qu.ObjToString(v["id"]))))
  346. }
  347. log.Println(k, v.Val)
  348. }
  349. xl.Save("ext_bz.xlsx")
  350. }