deduplication.go 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. package main
  2. import (
  3. "crypto/sha256"
  4. "encoding/hex"
  5. "fmt"
  6. "log"
  7. "strings"
  8. "app.yhyue.com/moapp/jybase/mongodb"
  9. "github.com/PuerkitoBio/goquery"
  10. "github.com/gogf/gf/v2/util/gconv"
  11. )
  12. //年份、项目名称、项目情况、绩效目标、实施单位 的哈希值
  13. func Hash(year, projectname, procure_content, kpi, institution string) string {
  14. procure_content = CleanString(procure_content)
  15. kpi = CleanString(kpi)
  16. str := fmt.Sprintf("%s%s%s%s%s", year, projectname, procure_content, kpi, institution)
  17. return HashCode(str)
  18. }
  19. //刷库 更新project_yusuan
  20. func UpdateProject_yusuan() {
  21. log.Println("开始..")
  22. sess := db.GetMgoConn()
  23. defer db.DestoryMongoConn(sess)
  24. it := sess.DB(cf.Collections).C(cf.ProjectItem).Find(nil).Select(map[string]interface{}{
  25. "year": 1,
  26. "projectname": 1,
  27. "procure_content": 1,
  28. "kpi": 1,
  29. "institution": 1,
  30. "_id": 1,
  31. "hash_code": 1,
  32. }).Iter()
  33. i := 0
  34. for m := make(map[string]interface{}); it.Next(&m); {
  35. i++
  36. if i%1000 == 0 {
  37. log.Println(cf.ProjectItem, "count:", i)
  38. }
  39. hash_code := gconv.String(m["hash_code"])
  40. year := gconv.String(m["year"])
  41. projectname := gconv.String(m["projectname"])
  42. procure_content := gconv.String(m["procure_content"])
  43. kpi := gconv.String(m["kpi"])
  44. institution := gconv.String(m["institution"])
  45. id := mongodb.BsonIdToSId(m["_id"])
  46. //清洗
  47. procure_content = CleanString(procure_content)
  48. kpi = CleanString(kpi)
  49. newHashCode := Hash(year, projectname, procure_content, kpi, institution)
  50. //修改hash值
  51. if hash_code != newHashCode {
  52. db.UpdateById(cf.ProjectItem, id, map[string]interface{}{
  53. "$set": map[string]interface{}{
  54. "hash_code": newHashCode,
  55. },
  56. })
  57. }
  58. m = make(map[string]interface{})
  59. }
  60. }
  61. //生成hashCode
  62. func HashCode(input string) string {
  63. hash := sha256.Sum256([]byte(input))
  64. hashString := hex.EncodeToString(hash[:])
  65. return hashString
  66. }
  67. // 纯文本
  68. func HtmlToText(con string) string {
  69. doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  70. //log.Println(doc2.Html())
  71. doc2.Find("tr").Each(func(i int, selection *goquery.Selection) {
  72. selection.AfterHtml(string(rune(10)))
  73. })
  74. //log.Println(doc2.Html())
  75. return doc2.Text()
  76. }
  77. //刷库 更新project_yusuan
  78. func UpdateProject_huipu() {
  79. huipudb := "zxl_project_huipu"
  80. sess := db.GetMgoConn()
  81. defer db.DestoryMongoConn(sess)
  82. it := sess.DB(cf.Collections).C(huipudb).Find(nil).Select(map[string]interface{}{
  83. "year": 1,
  84. "projectname": 1,
  85. "procure_content": 1,
  86. "kpi": 1,
  87. "institution": 1,
  88. "_id": 1,
  89. "hash_code": 1,
  90. }).Iter()
  91. i := 0
  92. for m := make(map[string]interface{}); it.Next(&m); {
  93. i++
  94. if i%1000 == 0 {
  95. log.Println("count:", i)
  96. }
  97. hash_code := gconv.String(m["hash_code"])
  98. year := gconv.String(m["year"])
  99. projectname := gconv.String(m["projectname"])
  100. procure_content := gconv.String(m["procure_content"])
  101. kpi := gconv.String(m["kpi"])
  102. institution := gconv.String(m["institution"])
  103. id := mongodb.BsonIdToSId(m["_id"])
  104. //清洗
  105. procure_content = CleanString(procure_content)
  106. kpi = CleanString(kpi)
  107. newHashCode := Hash(year, projectname, procure_content, kpi, institution)
  108. //修改hash值
  109. if hash_code != newHashCode {
  110. db.UpdateById(huipudb, id, map[string]interface{}{
  111. "$set": map[string]interface{}{
  112. "hash_code": newHashCode,
  113. },
  114. })
  115. }
  116. m = make(map[string]interface{})
  117. }
  118. }