task.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. package main
  2. import (
  3. "go.mongodb.org/mongo-driver/bson"
  4. "log"
  5. "mongodb"
  6. qu "qfw/util"
  7. "regexp"
  8. "strings"
  9. "time"
  10. )
  11. var zhb_key_list = []string{"budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"}
  12. var packreg, _ = regexp.Compile(`([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十](包|标|段)[::]?)`)
  13. var listSource []*dataSource
  14. type dataSource struct {
  15. _id, id, title string
  16. projectname, projectcode, contractcode string
  17. buyer, agency, s_winner string
  18. budget, bidamount float64
  19. isrepeat bool
  20. repeat_id_source string
  21. repeat_id map[string]string
  22. repeatText string
  23. }
  24. func task4(coll, startTime, endTime, sortType string) {
  25. log.Printf("表名:%s,开始时间:%s,结束时间:%s,排序方式:%s", coll, startTime, endTime, sortType)
  26. stime, _ := time.Parse(qu.Date_Short_Layout, startTime)
  27. etime, _ := time.Parse(qu.Date_Short_Layout, endTime)
  28. query := bson.M{}
  29. query["$and"] = []interface{}{
  30. bson.M{"publishtime": bson.M{"$gte": stime.Unix()}},
  31. bson.M{"publishtime": bson.M{"$lte": etime.Unix()}},
  32. }
  33. sort := "publishtime"
  34. if sortType == "-1" {
  35. sort = "-publishtime"
  36. }
  37. log.Println(query, sort)
  38. sess := Mgo.GetMgoConn()
  39. defer Mgo.DestoryMongoConn(sess)
  40. f := bson.M{"details": 0, "detail": 0, "filetext": 0}
  41. it := sess.DB(Mgo.DbName).C(coll).Find(query).Select(f).Sort(sort).Iter()
  42. index := 0
  43. for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
  44. d := &dataSource{
  45. _id: mongodb.BsonIdToSId(tmp["_id"]),
  46. id: qu.ObjToString(tmp["id"]),
  47. title: strings.ToLower(qu.ObjToString(tmp["title"])),
  48. projectname: strings.ToLower(qu.ObjToString(tmp["projectname"])),
  49. projectcode: strings.ToLower(qu.ObjToString(tmp["projectcode"])),
  50. contractcode: strings.ToLower(qu.ObjToString(tmp["contractcode"])),
  51. buyer: strings.ToLower(qu.ObjToString(tmp["buyer"])),
  52. agency: strings.ToLower(qu.ObjToString(tmp["agency"])),
  53. s_winner: strings.ToLower(qu.ObjToString(tmp["s_winner"])),
  54. budget: qu.Float64All(tmp["budget"]),
  55. bidamount: qu.Float64All(tmp["bidamount"]),
  56. repeat_id: map[string]string{},
  57. }
  58. //log.Println(tmp["_id"], d.id)
  59. if index%10000 == 0 {
  60. log.Println("加载数据:", index)
  61. }
  62. listSource = append(listSource, d)
  63. tmp = map[string]interface{}{}
  64. }
  65. log.Println("数据加载完成")
  66. dataItem()
  67. dd := 0
  68. for i := 0; i < len(listSource); i++ {
  69. a := listSource[i]
  70. if a.isrepeat {
  71. dd++
  72. }
  73. //更新数据
  74. Mgo.UpdateById(coll, a._id,
  75. map[string]interface{}{"$set": map[string]interface{}{
  76. "repeatid": a.repeat_id_source, //和那条数据重复id
  77. "repeat": a.isrepeat, //本条数据是否本判重
  78. "repeatid_ids": a.repeat_id, //和我重复的数据都有哪些
  79. "repeattext": a.repeatText, //本数据被判重的原因
  80. }})
  81. if i%1000 == 0 {
  82. log.Println("已更新", i)
  83. }
  84. }
  85. log.Println(dd)
  86. }
  87. var listSize = 20000
  88. func dataItem() {
  89. for i := 0; i < len(listSource); i++ {
  90. a := listSource[i]
  91. // if a.isrepeat {
  92. // continue
  93. // }
  94. b := &dataSource{}
  95. for j := i + 1; j < len(listSource); j++ {
  96. b = listSource[j]
  97. // if b.isrepeat {
  98. // continue
  99. // }
  100. a, b = panchong(*a, *b)
  101. listSource[j] = b
  102. listSource[i] = a
  103. // if b.isrepeat {
  104. // log.Println("sss", a.id, b.isrepeat, b.repeat_id)
  105. // }
  106. }
  107. if i%500 == 0 {
  108. log.Println("已处理:", i)
  109. }
  110. }
  111. }
  112. func panchong(a, b dataSource) (c, d *dataSource) {
  113. switch {
  114. case a.title == b.title: //标题相等
  115. if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
  116. } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  117. if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
  118. if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
  119. b.repeat_id_source = a.id
  120. a.repeat_id[b.id] = ""
  121. b.isrepeat = true
  122. b.repeatText = "标题相等 && buyer && s_winner"
  123. //log.Println("1111", a.id, b.id, b.isrepeat)
  124. }
  125. } else {
  126. r := key_list(a, b)
  127. if r {
  128. b.repeat_id_source = a.id
  129. a.repeat_id[b.id] = ""
  130. b.isrepeat = true
  131. b.repeatText = "标题相等 && budget && key_list"
  132. }
  133. }
  134. } else if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
  135. r := key_list(a, b)
  136. if r {
  137. b.repeat_id_source = a.id
  138. a.repeat_id[b.id] = ""
  139. b.isrepeat = true
  140. b.repeatText = "标题相等 && projectcode && key_list"
  141. }
  142. } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  143. if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
  144. if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
  145. b.repeat_id_source = a.id
  146. a.repeat_id[b.id] = ""
  147. b.isrepeat = true
  148. b.repeatText = "标题相等 && bidamount && buyer && s_winner"
  149. //log.Println("1111", a.id, b.id, b.isrepeat)
  150. }
  151. } else {
  152. r := key_list(a, b)
  153. if r {
  154. b.repeat_id_source = a.id
  155. a.repeat_id[b.id] = ""
  156. b.isrepeat = true
  157. b.repeatText = "标题相等 && bidamount && key_list"
  158. }
  159. }
  160. } else {
  161. //
  162. }
  163. case a.title != b.title: //标题不相等
  164. //项目名称包含及相等
  165. if strings.Contains(a.projectname, b.projectname) || strings.Contains(b.projectname, a.projectname) {
  166. isp := packreg.MatchString(a.title)
  167. //有分包
  168. if isp {
  169. //项目名称相等
  170. if a.projectname == b.projectname {
  171. if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
  172. //
  173. } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  174. b.repeat_id_source = a.id
  175. a.repeat_id[b.id] = ""
  176. b.isrepeat = true
  177. b.repeatText = "标题不相等-->有分包 && projectname && bidamount"
  178. } else if a.bidamount != b.bidamount {
  179. //
  180. } else {
  181. if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  182. b.repeat_id_source = a.id
  183. a.repeat_id[b.id] = ""
  184. b.isrepeat = true
  185. b.repeatText = "标题不相等-->有分包 && projectname && s_winner && budget"
  186. }
  187. }
  188. } else { //项目名称包含
  189. if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
  190. //
  191. } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  192. if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
  193. b.repeat_id_source = a.id
  194. a.repeat_id[b.id] = ""
  195. b.isrepeat = true
  196. b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && projectcode"
  197. } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
  198. b.repeat_id_source = a.id
  199. a.repeat_id[b.id] = ""
  200. b.isrepeat = true
  201. b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && s_winner"
  202. } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  203. if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
  204. b.repeat_id_source = a.id
  205. a.repeat_id[b.id] = ""
  206. b.isrepeat = true
  207. b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && buyer"
  208. } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
  209. b.repeat_id_source = a.id
  210. a.repeat_id[b.id] = ""
  211. b.isrepeat = true
  212. b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && agency"
  213. } else {
  214. //
  215. }
  216. }
  217. } else if a.bidamount != b.bidamount {
  218. //
  219. } else {
  220. if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  221. b.repeat_id_source = a.id
  222. a.repeat_id[b.id] = ""
  223. b.isrepeat = true
  224. b.repeatText = "标题不相等-->有分包 && projectname包含 && s_winner && budget"
  225. } else {
  226. //
  227. }
  228. }
  229. }
  230. } else { //无分包
  231. //项目名称相等
  232. if a.projectname == b.projectname {
  233. if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
  234. //
  235. } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  236. b.repeat_id_source = a.id
  237. a.repeat_id[b.id] = ""
  238. b.isrepeat = true
  239. b.repeatText = "标题不相等-->无分包 && projectname && bidamount"
  240. } else if a.bidamount != b.bidamount {
  241. //
  242. } else {
  243. if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
  244. b.repeat_id_source = a.id
  245. a.repeat_id[b.id] = ""
  246. b.isrepeat = true
  247. b.repeatText = "标题不相等-->无分包 && projectname && projectcode"
  248. } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
  249. b.repeat_id_source = a.id
  250. a.repeat_id[b.id] = ""
  251. b.isrepeat = true
  252. b.repeatText = "标题不相等-->无分包 && projectname && s_winner"
  253. } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  254. if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
  255. b.repeat_id_source = a.id
  256. a.repeat_id[b.id] = ""
  257. b.isrepeat = true
  258. b.repeatText = "标题不相等-->无分包 && projectname && budget && buyer"
  259. } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
  260. b.repeat_id_source = a.id
  261. a.repeat_id[b.id] = ""
  262. b.isrepeat = true
  263. b.repeatText = "标题不相等-->无分包 && projectname && budget && agency"
  264. } else {
  265. //
  266. }
  267. }
  268. }
  269. } else { //项目名称包含
  270. if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
  271. //
  272. } else if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  273. if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
  274. b.repeat_id_source = a.id
  275. a.repeat_id[b.id] = ""
  276. b.isrepeat = true
  277. b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount"
  278. } else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
  279. b.repeat_id_source = a.id
  280. a.repeat_id[b.id] = ""
  281. b.isrepeat = true
  282. b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner"
  283. } else if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  284. if pankong(a.buyer) && pankong(b.buyer) && a.buyer == b.buyer {
  285. b.repeat_id_source = a.id
  286. a.repeat_id[b.id] = ""
  287. b.isrepeat = true
  288. b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && buyer"
  289. } else if pankong(a.agency) && pankong(b.agency) && a.agency == b.agency {
  290. b.repeat_id_source = a.id
  291. a.repeat_id[b.id] = ""
  292. b.isrepeat = true
  293. b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && agency"
  294. } else {
  295. //
  296. }
  297. } else {
  298. //
  299. }
  300. } else if a.bidamount != b.bidamount {
  301. //
  302. } else {
  303. if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && a.budget == a.budget && (a.budget > 0 || b.budget > 0) {
  304. b.repeat_id_source = a.id
  305. a.repeat_id[b.id] = ""
  306. b.isrepeat = true
  307. b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner && budget"
  308. }
  309. }
  310. }
  311. }
  312. }
  313. default:
  314. }
  315. return &a, &b
  316. }
  317. // zhb_key_list 判断
  318. // "budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"
  319. func key_list(a, b dataSource) bool {
  320. for i := 0; i < len(zhb_key_list); i++ {
  321. key := zhb_key_list[i]
  322. switch key {
  323. case "budget":
  324. if a.budget == b.budget && (a.budget > 0 || b.budget > 0) {
  325. continue
  326. } else {
  327. return false
  328. }
  329. case "buyer":
  330. if a.buyer == b.buyer && pankong(a.buyer) && pankong(b.buyer) {
  331. continue
  332. } else {
  333. return false
  334. }
  335. case "agency":
  336. if a.agency == b.agency && pankong(a.agency) && pankong(b.agency) {
  337. continue
  338. } else {
  339. return false
  340. }
  341. case "s_winner":
  342. if a.s_winner == b.s_winner && pankong(a.s_winner) && pankong(b.s_winner) {
  343. continue
  344. } else {
  345. return false
  346. }
  347. case "bidamount":
  348. if a.bidamount == b.bidamount && (a.bidamount > 0 || b.bidamount > 0) {
  349. continue
  350. } else {
  351. return false
  352. }
  353. case "projectcode":
  354. if a.projectcode == b.projectcode && pankong(a.projectcode) && pankong(b.projectcode) {
  355. continue
  356. } else {
  357. return false
  358. }
  359. case "contractcode":
  360. if a.contractcode == b.contractcode && pankong(a.contractcode) && pankong(b.contractcode) {
  361. continue
  362. } else {
  363. return false
  364. }
  365. }
  366. }
  367. return true
  368. }
  369. func pankong(a string) bool {
  370. if a != "" {
  371. return true
  372. } else {
  373. return false
  374. }
  375. }