task.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. package main
  2. import (
  3. "go.mongodb.org/mongo-driver/bson"
  4. "log"
  5. "mongodb"
  6. qu "qfw/util"
  7. "regexp"
  8. "strings"
  9. "time"
  10. )
  11. var zhb_key_list = []string{"budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"}
  12. var packreg, _ = regexp.Compile(`([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十](包|标|段)[::]?)`)
  13. var listSource []*dataSource
  14. type dataSource struct {
  15. _id, id, title string
  16. projectname, projectcode, contractcode string
  17. buyer, agency, s_winner string
  18. budget, bidamount float64
  19. isrepeat bool
  20. repeat_id_source string
  21. repeat_id map[string]string
  22. repeatText string
  23. }
  24. func task4(coll, startTime, endTime, sortType string) {
  25. log.Printf("表名:%s,开始时间:%s,结束时间:%s,排序方式:%s", coll, startTime, endTime, sortType)
  26. stime, _ := time.Parse(qu.Date_Short_Layout, startTime)
  27. etime, _ := time.Parse(qu.Date_Short_Layout, endTime)
  28. query := bson.M{}
  29. query["$and"] = []interface{}{
  30. bson.M{"publishtime": bson.M{"$gte": stime.Unix()}},
  31. bson.M{"publishtime": bson.M{"$lte": etime.Unix()}},
  32. }
  33. sort := "publishtime"
  34. if sortType == "-1" {
  35. sort = "-publishtime"
  36. }
  37. log.Println(query, sort)
  38. sess := Mgo.GetMgoConn()
  39. defer Mgo.DestoryMongoConn(sess)
  40. it := sess.DB(Mgo.DbName).C(coll).Find(query).Sort(sort).Iter()
  41. index := 0
  42. for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
  43. d := &dataSource{
  44. _id: mongodb.BsonIdToSId(tmp["_id"]),
  45. id: qu.ObjToString(tmp["id"]),
  46. title: strings.ToLower(qu.ObjToString(tmp["title"])),
  47. projectname: strings.ToLower(qu.ObjToString(tmp["projectname"])),
  48. projectcode: strings.ToLower(qu.ObjToString(tmp["projectcode"])),
  49. contractcode: strings.ToLower(qu.ObjToString(tmp["contractcode"])),
  50. buyer: strings.ToLower(qu.ObjToString(tmp["buyer"])),
  51. agency: strings.ToLower(qu.ObjToString(tmp["agency"])),
  52. s_winner: strings.ToLower(qu.ObjToString(tmp["s_winner"])),
  53. budget: qu.Float64All(tmp["budget"]),
  54. bidamount: qu.Float64All(tmp["bidamount"]),
  55. repeat_id: map[string]string{},
  56. }
  57. //log.Println(tmp["_id"], d.id)
  58. if index%10000 == 0 {
  59. log.Println("加载数据:", index)
  60. }
  61. listSource = append(listSource, d)
  62. tmp = map[string]interface{}{}
  63. }
  64. log.Println("数据加载完成")
  65. dataItem()
  66. dd := 0
  67. for i := 0; i < len(listSource); i++ {
  68. a := listSource[i]
  69. if a.isrepeat {
  70. dd++
  71. }
  72. //更新数据
  73. Mgo.UpdateById(coll, a._id,
  74. map[string]interface{}{"$set": map[string]interface{}{
  75. "repeatid": a.repeat_id_source, //和那条数据重复id
  76. "repeat": a.isrepeat, //本条数据是否本判重
  77. "repeatid_ids": a.repeat_id, //和我重复的数据都有哪些
  78. "repeattext": a.repeatText, //本数据被判重的原因
  79. }})
  80. if i%1000 == 0 {
  81. log.Println("已更新", i)
  82. }
  83. }
  84. log.Println(dd)
  85. }
  86. var listSize = 20000
  87. func dataItem() {
  88. for i := 0; i < len(listSource); i++ {
  89. a := listSource[i]
  90. // if a.isrepeat {
  91. // continue
  92. // }
  93. b := &dataSource{}
  94. for j := i + 1; j < len(listSource); j++ {
  95. b = listSource[j]
  96. // if b.isrepeat {
  97. // continue
  98. // }
  99. a, b = panchong(*a, *b)
  100. listSource[j] = b
  101. listSource[i] = a
  102. // if b.isrepeat {
  103. // log.Println("sss", a.id, b.isrepeat, b.repeat_id)
  104. // }
  105. }
  106. if i%500 == 0 {
  107. log.Println("已处理:", i)
  108. }
  109. }
  110. }
  111. func panchong(a, b dataSource) (c, d *dataSource) {
  112. switch {
  113. case a.title == b.title: //标题相等
  114. if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
  115. } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  116. if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
  117. if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
  118. b.repeat_id_source = a.id
  119. a.repeat_id[b.id] = ""
  120. b.isrepeat = true
  121. b.repeatText = "标题相等 && buyer && s_winner"
  122. //log.Println("1111", a.id, b.id, b.isrepeat)
  123. }
  124. } else {
  125. r := key_list(a, b)
  126. if r {
  127. b.repeat_id_source = a.id
  128. a.repeat_id[b.id] = ""
  129. b.isrepeat = true
  130. b.repeatText = "标题相等 && budget && key_list"
  131. }
  132. }
  133. } else if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
  134. r := key_list(a, b)
  135. if r {
  136. b.repeat_id_source = a.id
  137. a.repeat_id[b.id] = ""
  138. b.isrepeat = true
  139. b.repeatText = "标题相等 && projectcode && key_list"
  140. }
  141. } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  142. if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
  143. if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
  144. b.repeat_id_source = a.id
  145. a.repeat_id[b.id] = ""
  146. b.isrepeat = true
  147. b.repeatText = "标题相等 && bidamount && buyer && s_winner"
  148. //log.Println("1111", a.id, b.id, b.isrepeat)
  149. }
  150. } else {
  151. r := key_list(a, b)
  152. if r {
  153. b.repeat_id_source = a.id
  154. a.repeat_id[b.id] = ""
  155. b.isrepeat = true
  156. b.repeatText = "标题相等 && bidamount && key_list"
  157. }
  158. }
  159. } else {
  160. //
  161. }
  162. case a.title != b.title: //标题不相等
  163. //项目名称包含及相等
  164. if strings.Contains(a.projectname, b.projectname) || strings.Contains(b.projectname, a.projectname) {
  165. isp := packreg.MatchString(a.title)
  166. //有分包
  167. if isp {
  168. //项目名称相等
  169. if a.projectname == b.projectname {
  170. if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
  171. //
  172. } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  173. b.repeat_id_source = a.id
  174. a.repeat_id[b.id] = ""
  175. b.isrepeat = true
  176. b.repeatText = "标题不相等-->有分包 && projectname && bidamount"
  177. } else if a.bidamount != b.bidamount {
  178. //
  179. } else {
  180. if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  181. b.repeat_id_source = a.id
  182. a.repeat_id[b.id] = ""
  183. b.isrepeat = true
  184. b.repeatText = "标题不相等-->有分包 && projectname && s_winner && budget"
  185. }
  186. }
  187. } else { //项目名称包含
  188. if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
  189. //
  190. } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  191. if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
  192. b.repeat_id_source = a.id
  193. a.repeat_id[b.id] = ""
  194. b.isrepeat = true
  195. b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && projectcode"
  196. } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
  197. b.repeat_id_source = a.id
  198. a.repeat_id[b.id] = ""
  199. b.isrepeat = true
  200. b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && s_winner"
  201. } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  202. if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
  203. b.repeat_id_source = a.id
  204. a.repeat_id[b.id] = ""
  205. b.isrepeat = true
  206. b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && buyer"
  207. } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
  208. b.repeat_id_source = a.id
  209. a.repeat_id[b.id] = ""
  210. b.isrepeat = true
  211. b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && agency"
  212. } else {
  213. //
  214. }
  215. }
  216. } else if a.bidamount != b.bidamount {
  217. //
  218. } else {
  219. if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  220. b.repeat_id_source = a.id
  221. a.repeat_id[b.id] = ""
  222. b.isrepeat = true
  223. b.repeatText = "标题不相等-->有分包 && projectname包含 && s_winner && budget"
  224. } else {
  225. //
  226. }
  227. }
  228. }
  229. } else { //无分包
  230. //项目名称相等
  231. if a.projectname == b.projectname {
  232. if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
  233. //
  234. } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  235. b.repeat_id_source = a.id
  236. a.repeat_id[b.id] = ""
  237. b.isrepeat = true
  238. b.repeatText = "标题不相等-->无分包 && projectname && bidamount"
  239. } else if a.bidamount != b.bidamount {
  240. //
  241. } else {
  242. if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
  243. b.repeat_id_source = a.id
  244. a.repeat_id[b.id] = ""
  245. b.isrepeat = true
  246. b.repeatText = "标题不相等-->无分包 && projectname && projectcode"
  247. } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
  248. b.repeat_id_source = a.id
  249. a.repeat_id[b.id] = ""
  250. b.isrepeat = true
  251. b.repeatText = "标题不相等-->无分包 && projectname && s_winner"
  252. } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  253. if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
  254. b.repeat_id_source = a.id
  255. a.repeat_id[b.id] = ""
  256. b.isrepeat = true
  257. b.repeatText = "标题不相等-->无分包 && projectname && budget && buyer"
  258. } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
  259. b.repeat_id_source = a.id
  260. a.repeat_id[b.id] = ""
  261. b.isrepeat = true
  262. b.repeatText = "标题不相等-->无分包 && projectname && budget && agency"
  263. } else {
  264. //
  265. }
  266. }
  267. }
  268. } else { //项目名称包含
  269. if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
  270. //
  271. } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  272. if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
  273. b.repeat_id_source = a.id
  274. a.repeat_id[b.id] = ""
  275. b.isrepeat = true
  276. b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount"
  277. } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
  278. b.repeat_id_source = a.id
  279. a.repeat_id[b.id] = ""
  280. b.isrepeat = true
  281. b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner"
  282. } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  283. if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
  284. b.repeat_id_source = a.id
  285. a.repeat_id[b.id] = ""
  286. b.isrepeat = true
  287. b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && buyer"
  288. } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
  289. b.repeat_id_source = a.id
  290. a.repeat_id[b.id] = ""
  291. b.isrepeat = true
  292. b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && agency"
  293. } else {
  294. //
  295. }
  296. } else {
  297. //
  298. }
  299. } else if a.bidamount != b.bidamount {
  300. //
  301. } else {
  302. if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == a.budget && (a.budget > 0 || b.budget > 0) {
  303. b.repeat_id_source = a.id
  304. a.repeat_id[b.id] = ""
  305. b.isrepeat = true
  306. b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner && budget"
  307. }
  308. }
  309. }
  310. }
  311. }
  312. default:
  313. }
  314. return &a, &b
  315. }
  316. // zhb_key_list 判断
  317. // "budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"
  318. func key_list(a, b dataSource) bool {
  319. for i := 0; i < len(zhb_key_list); i++ {
  320. key := zhb_key_list[i]
  321. switch key {
  322. case "budget":
  323. if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  324. continue
  325. } else {
  326. return false
  327. }
  328. case "buyer":
  329. if a.buyer == b.buyer && pankong(a.buyer) && pankong(b.buyer) {
  330. continue
  331. } else {
  332. return false
  333. }
  334. case "agency":
  335. if a.agency == b.agency && pankong(a.agency) && pankong(b.agency) {
  336. continue
  337. } else {
  338. return false
  339. }
  340. case "s_winner":
  341. if a.s_winner == b.s_winner && pankong(a.s_winner) && pankong(b.s_winner) {
  342. continue
  343. } else {
  344. return false
  345. }
  346. case "bidamount":
  347. if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  348. continue
  349. } else {
  350. return false
  351. }
  352. case "projectcode":
  353. if a.projectcode == b.projectcode && pankong(a.projectcode) && pankong(b.projectcode) {
  354. continue
  355. } else {
  356. return false
  357. }
  358. case "contractcode":
  359. if a.contractcode == b.contractcode && pankong(a.contractcode) && pankong(b.contractcode) {
  360. continue
  361. } else {
  362. return false
  363. }
  364. }
  365. }
  366. return true
  367. }
  368. func pankong(a string) bool {
  369. if a != "" {
  370. return true
  371. } else {
  372. return false
  373. }
  374. }