deduplication.go 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. package main
  2. import (
  3. "crypto/sha256"
  4. "encoding/hex"
  5. "fmt"
  6. "log"
  7. "strings"
  8. "app.yhyue.com/moapp/jybase/mongodb"
  9. "github.com/PuerkitoBio/goquery"
  10. "github.com/gogf/gf/v2/util/gconv"
  11. )
  12. //年份、项目名称、项目情况、绩效目标、实施单位 的哈希值
  13. func Hash(year, projectname, procure_content, kpi, institution string) string {
  14. procure_content = CleanString(procure_content)
  15. kpi = CleanString(kpi)
  16. str := fmt.Sprintf("%s%s%s%s%s", year, projectname, procure_content, kpi, institution)
  17. return HashCode(str)
  18. }
  19. //刷库 更新project_yusuan
  20. func UpdateProject_yusuan() {
  21. sess := db.GetMgoConn()
  22. defer db.DestoryMongoConn(sess)
  23. it := sess.DB(cf.Collections).C(cf.ProjectItem).Find(nil).Select(map[string]interface{}{
  24. "year": 1,
  25. "projectname": 1,
  26. "procure_content": 1,
  27. "kpi": 1,
  28. "institution": 1,
  29. "_id": 1,
  30. "hash_code": 1,
  31. }).Iter()
  32. i := 0
  33. for m := make(map[string]interface{}); it.Next(&m); {
  34. i++
  35. if i%1000 == 0 {
  36. log.Println("count:", i)
  37. }
  38. hash_code := gconv.String(m["hash_code"])
  39. year := gconv.String(m["year"])
  40. projectname := gconv.String(m["projectname"])
  41. procure_content := gconv.String(m["procure_content"])
  42. kpi := gconv.String(m["kpi"])
  43. institution := gconv.String(m["institution"])
  44. id := mongodb.BsonIdToSId(m["_id"])
  45. //清洗
  46. procure_content = CleanString(procure_content)
  47. kpi = CleanString(kpi)
  48. newHashCode := Hash(year, projectname, procure_content, kpi, institution)
  49. //修改hash值
  50. if hash_code != newHashCode {
  51. db.UpdateById(cf.ProjectItem, id, map[string]interface{}{
  52. "$set": map[string]interface{}{
  53. "hash_code": newHashCode,
  54. },
  55. })
  56. }
  57. m = make(map[string]interface{})
  58. }
  59. }
  // HashCode returns the SHA-256 digest of input as a lower-case hexadecimal
  // string.
  func HashCode(input string) string {
  	digest := sha256.New()
  	// hash.Hash.Write never returns an error, so the result is not checked.
  	digest.Write([]byte(input))
  	return hex.EncodeToString(digest.Sum(nil))
  }
  66. // 纯文本
  67. func HtmlToText(con string) string {
  68. doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
  69. //log.Println(doc2.Html())
  70. doc2.Find("tr").Each(func(i int, selection *goquery.Selection) {
  71. selection.AfterHtml(string(rune(10)))
  72. })
  73. //log.Println(doc2.Html())
  74. return doc2.Text()
  75. }
  76. //刷库 更新project_yusuan
  77. func UpdateProject_huipu() {
  78. huipudb := "zxl_project_huipu"
  79. sess := db.GetMgoConn()
  80. defer db.DestoryMongoConn(sess)
  81. it := sess.DB(cf.Collections).C(huipudb).Find(nil).Select(map[string]interface{}{
  82. "year": 1,
  83. "projectname": 1,
  84. "procure_content": 1,
  85. "kpi": 1,
  86. "institution": 1,
  87. "_id": 1,
  88. "hash_code": 1,
  89. }).Iter()
  90. i := 0
  91. for m := make(map[string]interface{}); it.Next(&m); {
  92. i++
  93. if i%1000 == 0 {
  94. log.Println("count:", i)
  95. }
  96. hash_code := gconv.String(m["hash_code"])
  97. year := gconv.String(m["year"])
  98. projectname := gconv.String(m["projectname"])
  99. procure_content := gconv.String(m["procure_content"])
  100. kpi := gconv.String(m["kpi"])
  101. institution := gconv.String(m["institution"])
  102. id := mongodb.BsonIdToSId(m["_id"])
  103. //清洗
  104. procure_content = CleanString(procure_content)
  105. kpi = CleanString(kpi)
  106. newHashCode := Hash(year, projectname, procure_content, kpi, institution)
  107. //修改hash值
  108. if hash_code != newHashCode {
  109. db.UpdateById(huipudb, id, map[string]interface{}{
  110. "$set": map[string]interface{}{
  111. "hash_code": newHashCode,
  112. },
  113. })
  114. }
  115. m = make(map[string]interface{})
  116. }
  117. }