datamap.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563
  1. package src1
  2. import (
  3. "fmt"
  4. "log"
  5. qutil "qfw/util"
  6. "regexp"
  7. "strings"
  8. "sync"
  9. "time"
  10. )
  11. type Info struct {
  12. id string //id
  13. title string //标题
  14. area string //省份
  15. city string //城市
  16. subtype string //信息类型
  17. buyer string //采购单位
  18. agency string //代理机构
  19. winner string //中标单位
  20. budget float64 //预算金额
  21. bidamount float64 //中标金额
  22. projectname string //项目名称
  23. projectcode string //项目编号
  24. contractnumber string //合同编号
  25. publishtime int64 //发布时间
  26. comeintime int64 //入库时间
  27. bidopentime int64 //开标时间
  28. bidopenaddress string //开标地点
  29. site string //站点
  30. href string //正文的url
  31. repeatid string //重复id
  32. titleSpecialWord bool //标题特殊词
  33. specialWord bool //再次判断的特殊词
  34. mergemap map[string]interface{} //合并记录
  35. is_site bool //是否站点城市
  36. }
  37. var datelimit = float64(432000) //五天
  38. var sitelock sync.Mutex //锁
  39. //一般数据判重
  40. type datamap struct {
  41. lock sync.Mutex //锁
  42. days int //保留几天数据
  43. data map[string][]*Info
  44. keymap []string
  45. areakeys []string
  46. keys map[string]bool
  47. }
  48. //历史
  49. func TimedTaskDatamap(days int,lasttime int64) *datamap {
  50. log.Println("数据池开始重新构建")
  51. datelimit = qutil.Float64All(days * 86400)
  52. dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, []string{},map[string]bool{}}
  53. if lasttime <0 {
  54. log.Println("数据池空数据")
  55. return dm
  56. }
  57. start := int(time.Now().Unix())
  58. sess := mgo.GetMgoConn()
  59. defer mgo.DestoryMongoConn(sess)
  60. query := map[string]interface{}{"publishtime": map[string]interface{}{
  61. "$lt": lasttime,
  62. }}
  63. log.Println("query", query)
  64. it := sess.DB(mgo.DbName).C(extract_back).Find(query).Sort("-publishtime").Iter()
  65. n, continuSum := 0, 0
  66. for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
  67. //qutil.IntAll(tmp["dataging"]) == 1
  68. if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 ||
  69. qutil.IntAll(tmp["dataging"]) == 1 {
  70. } else {
  71. pt := tmp["publishtime"]
  72. pt_time := qutil.Int64All(pt)
  73. if pt_time <= 0 {
  74. break
  75. }
  76. if qutil.Float64All(lasttime-pt_time) < datelimit {
  77. continuSum++
  78. info := NewInfo(tmp)
  79. dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
  80. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  81. data := dm.data[k]
  82. if data == nil {
  83. data = []*Info{}
  84. }
  85. data = append(data, info)
  86. dm.data[k] = data
  87. dm.keys[dkey] = true
  88. //添加省
  89. isAreaExist :=false
  90. for _,v:= range dm.areakeys {
  91. if v==info.area {
  92. isAreaExist = true
  93. }
  94. }
  95. if !isAreaExist {
  96. areaArr := dm.areakeys
  97. areaArr = append(areaArr,info.area)
  98. dm.areakeys = areaArr
  99. }
  100. } else {
  101. break
  102. }
  103. }
  104. if n%50000 == 0 {
  105. log.Println("当前数据池:", n, continuSum)
  106. }
  107. tmp = make(map[string]interface{})
  108. }
  109. log.Printf("数据池构建完成:%d秒,%d个\n", int(time.Now().Unix())-start, n)
  110. return dm
  111. }
  112. //增量
  113. func NewDatamap(days int, lastid string) *datamap {
  114. datelimit = qutil.Float64All(days * 86400 * 2)
  115. dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{},[]string{}, map[string]bool{}}
  116. if lastid == "" {
  117. return dm
  118. }
  119. //初始化加载数据
  120. sess := mgo.GetMgoConn()
  121. defer mgo.DestoryMongoConn(sess)
  122. query := map[string]interface{}{"_id": map[string]interface{}{
  123. "$lte": StringTOBsonId(lastid),
  124. }}
  125. log.Println("query", query)
  126. it := sess.DB(mgo.DbName).C(extract).Find(query).Sort("-publishtime").Iter()
  127. now1 := int64(0)
  128. n, continuSum := 0, 0
  129. for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
  130. if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1{
  131. } else {
  132. pt := tmp["publishtime"]
  133. pt_time := qutil.Int64All(pt)
  134. if pt_time <= 0 {
  135. break
  136. }
  137. if now1 == 0 {
  138. now1 = pt_time
  139. }
  140. if qutil.Float64All(now1-pt_time) < datelimit {
  141. continuSum++
  142. info := NewInfo(tmp)
  143. dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
  144. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  145. data := dm.data[k]
  146. if data == nil {
  147. data = []*Info{}
  148. }
  149. data = append(data, info)
  150. dm.data[k] = data
  151. dm.keys[dkey] = true
  152. //添加省
  153. isAreaExist :=false
  154. for _,v:= range dm.areakeys {
  155. if v==info.area {
  156. isAreaExist = true
  157. }
  158. }
  159. if !isAreaExist {
  160. areaArr := dm.areakeys
  161. areaArr = append(areaArr,info.area)
  162. dm.areakeys = areaArr
  163. }
  164. } else {
  165. break
  166. }
  167. }
  168. if n%10000 == 0 {
  169. log.Println("当前 n:", n,"数量:" ,continuSum)
  170. }
  171. tmp = make(map[string]interface{})
  172. }
  173. log.Println("load data:", n,"总数:",continuSum)
  174. return dm
  175. }
  176. //数据构建
  177. func NewInfo(tmp map[string]interface{}) *Info {
  178. subtype := qutil.ObjToString(tmp["subtype"])
  179. area := qutil.ObjToString(tmp["area"])
  180. if area == "A" {
  181. area = "全国"
  182. }
  183. info := &Info{}
  184. if IdType {
  185. info.id = qutil.ObjToString(tmp["_id"])
  186. }else {
  187. info.id = BsonTOStringId(tmp["_id"])
  188. }
  189. info.title = qutil.ObjToString(tmp["title"])
  190. info.area = area
  191. info.subtype = subtype
  192. info.buyer = qutil.ObjToString(tmp["buyer"])
  193. info.projectname = qutil.ObjToString(tmp["projectname"])
  194. info.contractnumber = qutil.ObjToString(tmp["contractnumber"])
  195. info.projectcode = qutil.ObjToString(tmp["projectcode"])
  196. info.city = qutil.ObjToString(tmp["city"])
  197. info.agency = qutil.ObjToString(tmp["agency"])
  198. info.winner = qutil.ObjToString(tmp["winner"])
  199. info.budget = qutil.Float64All(tmp["budget"])
  200. info.bidamount = qutil.Float64All(tmp["bidamount"])
  201. info.publishtime = qutil.Int64All(tmp["publishtime"])
  202. info.comeintime = qutil.Int64All(tmp["comeintime"])
  203. info.bidopentime = qutil.Int64All(tmp["bidopentime"])
  204. info.bidopenaddress = qutil.ObjToString(tmp["bidopenaddress"])
  205. info.site = qutil.ObjToString(tmp["site"])
  206. info.href = qutil.ObjToString(tmp["href"])
  207. info.repeatid = qutil.ObjToString(tmp["repeatid"])
  208. info.specialWord = FilterRegTitle.MatchString(info.title)
  209. info.titleSpecialWord = FilterRegTitle_0.MatchString(info.title) ||FilterRegTitle_1.MatchString(info.title) || FilterRegTitle_2.MatchString(info.title)
  210. info.mergemap = *qutil.ObjToMap(tmp["merge_map"])
  211. if info.mergemap == nil {
  212. info.mergemap = make(map[string]interface{}, 0)
  213. }
  214. info.is_site = false
  215. return info
  216. }
  217. //判重方法
  218. //判重方法
  219. //判重方法
  220. func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
  221. reason := ""
  222. keys := []string{}
  223. d.lock.Lock()
  224. for k, _ := range d.keys { //不同时间段
  225. if info.area=="全国" {
  226. //匹配所有省
  227. for _,v := range d.areakeys{
  228. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, v))
  229. }
  230. }else {
  231. //匹配指定省
  232. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
  233. }
  234. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
  235. }
  236. d.lock.Unlock()
  237. L:
  238. for _, k := range keys {
  239. d.lock.Lock()
  240. data := d.data[k]
  241. d.lock.Unlock()
  242. if len(data) > 0 { //对比v 找到同类型,同省或全国的数据作对比
  243. for _, v := range data {
  244. reason = ""
  245. if v.id == info.id { //正常重复
  246. return false, v, ""
  247. }
  248. //buyer 优先级高,有值且不相等过滤
  249. if info.buyer!=""&&v.buyer!=""&&info.buyer!=v.buyer {
  250. if buyerIsContinue(v,info) {
  251. continue
  252. }
  253. }
  254. if info.site != "" {//站点临时赋值
  255. sitelock.Lock()
  256. dict := SiteMap[info.site]
  257. sitelock.Unlock()
  258. if dict != nil {
  259. if (info.area == "全国" && dict["area"] != "")||
  260. (info.city == "" && dict["city"] != ""){
  261. info.is_site = true
  262. info.area = qutil.ObjToString(dict["area"])
  263. info.city = qutil.ObjToString(dict["city"])
  264. }
  265. }
  266. }
  267. //前置条件 - 站点相关
  268. if info.site != "" && info.site == v.site {
  269. if info.href != "" && info.href == v.href {
  270. reason = "同站点-href相同"
  271. b = true
  272. source = v
  273. reasons = reason
  274. break L
  275. }
  276. if info.href != "" && info.href != v.href {
  277. if v.title==info.title&&len([]rune(info.title)) >10 && isTheSameDay(info.publishtime,v.publishtime){
  278. if !againRepeat(v, info) {//进行同站点二次判断
  279. reason = "同站点-href不同-标题相同等"
  280. b = true
  281. source = v
  282. reasons = reason
  283. break L
  284. }else {
  285. continue
  286. }
  287. }else {
  288. continue
  289. }
  290. }
  291. }
  292. specialNum:= dealWithSpecialWordNumber(info,v)
  293. //前置条件 - 标题相关,有且一个关键词
  294. if specialNum==1 {
  295. if info.title != v.title && v.title != "" && info.title != "" {
  296. continue
  297. }
  298. }
  299. //前置条件3 - 标题相关,均含有关键词
  300. if specialNum==2 {
  301. if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
  302. v.title != "" && info.title != "" {
  303. letter1,letter2:=v.title,info.title
  304. res, _ := regexp.Compile("[0-9a-zA-Z]+");
  305. if res.MatchString(letter1)||res.MatchString(letter2) {
  306. letter1=convertArabicNumeralsAndLetters(letter1)
  307. letter2=convertArabicNumeralsAndLetters(letter2)
  308. }
  309. if strings.Contains(letter1,"重新招标")|| strings.Contains(letter2,"重新招标"){
  310. letter1,letter2=dealWithSpecialPhrases(letter1,letter2)
  311. }
  312. if letter1==letter2 {
  313. reason = reason + "标题关键词相等关系"
  314. if !againRepeat(v, info) {//进行二级金额判断
  315. b = true
  316. source = v
  317. reasons = reason
  318. break L
  319. }
  320. }else {
  321. if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
  322. //无包含关系-即不相等
  323. continue
  324. }
  325. }
  326. }
  327. }
  328. //前置条件-五要素均相等
  329. if leadingElementSame(v,info) {
  330. reason = "五要素-相同-满足"
  331. b = true
  332. source = v
  333. reasons = reason
  334. break L
  335. }
  336. //新增快速数据过少判重
  337. if LowHeavy {
  338. repeat := false
  339. if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
  340. b = true
  341. source = v
  342. reasons = reason
  343. break L
  344. }
  345. }
  346. //代理机构相同-非空相等
  347. if v.agency != "" && info.agency != "" && v.agency == info.agency {
  348. reason = reason + "同机构-"
  349. repeat := false
  350. if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
  351. b = true
  352. source = v
  353. reasons = reason
  354. break L
  355. }
  356. } else {
  357. reason = reason + "非同机构-"
  358. if info.city != "" && info.city == v.city {
  359. reason = reason + "同城-"
  360. repeat := false
  361. if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
  362. b = true
  363. source = v
  364. reasons = reason
  365. break L
  366. }
  367. } else {
  368. reason = reason + "不同城-"
  369. repeat := false
  370. if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
  371. b = true
  372. source = v
  373. reasons = reason
  374. break L
  375. }
  376. }
  377. }
  378. }
  379. }
  380. }
  381. //往预存数据 d 添加
  382. if !b {
  383. ct := info.publishtime
  384. dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
  385. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  386. d.lock.Lock()
  387. data := d.data[k]
  388. if data == nil {
  389. data = []*Info{info}
  390. d.data[k] = data
  391. if !d.keys[dkey] {
  392. d.keys[dkey] = true
  393. d.update(ct)
  394. }
  395. } else {
  396. data = append(data, info)
  397. d.data[k] = data
  398. }
  399. //添加省
  400. isAreaExist :=false
  401. for _,v:= range d.areakeys {
  402. if v==info.area {
  403. isAreaExist = true
  404. }
  405. }
  406. if !isAreaExist {
  407. areaArr := d.areakeys
  408. areaArr = append(areaArr,info.area)
  409. d.areakeys = areaArr
  410. }
  411. d.lock.Unlock()
  412. }
  413. return
  414. }
  415. func (d *datamap) update(t int64) {
  416. if TimingTask {
  417. d.keymap = d.GetLatelyFiveDay(t)
  418. }else {
  419. //d.keymap = d.GetLatelyFiveDay(t)//测试数据采用
  420. d.keymap = d.GetLatelyFiveDayDouble(t)
  421. }
  422. m := map[string]bool{}
  423. for _, v := range d.keymap {
  424. m[v] = true
  425. }
  426. all, all1 := 0, 0
  427. for k, v := range d.data {
  428. all += len(v)
  429. if !m[k[:8]] {
  430. delete(d.data, k)
  431. }
  432. }
  433. for k, _ := range d.keys {
  434. if !m[k] {
  435. delete(d.keys, k)
  436. }
  437. }
  438. for _, v := range d.data {
  439. all1 += len(v)
  440. }
  441. //log.Println("更新前后数据:", all, all1)
  442. }
  443. func (d *datamap) GetLatelyFiveDay(t int64) []string {
  444. array := make([]string, d.days)
  445. now := time.Unix(t, 0)
  446. for i := 0; i < d.days; i++ {
  447. array[i] = now.Format(qutil.Date_yyyyMMdd)
  448. now = now.AddDate(0, 0, -1)
  449. }
  450. return array
  451. }
  452. func (d *datamap) GetLatelyFiveDayDouble(t int64) []string {//增量-两倍
  453. array := make([]string, d.days*2)
  454. now := time.Now()
  455. for i := 0; i < d.days*2; i++ {
  456. array[i] = now.Format(qutil.Date_yyyyMMdd)
  457. now = now.AddDate(0, 0, -1)
  458. }
  459. return array
  460. }
  461. //替换原始数据池
  462. func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
  463. //删除数据池的老数据
  464. ct_old := oldData.publishtime
  465. dkey_old := qutil.FormatDateByInt64(&ct_old, qutil.Date_yyyyMMdd)
  466. k_old := fmt.Sprintf("%s_%s_%s", dkey_old, oldData.subtype, oldData.area)
  467. data_old := d.data[k_old]
  468. for k, v := range data_old {
  469. if v.id == oldData.id {//删除对应当前的老数据
  470. data_old = append(data_old[:k], data_old[k+1:]...)
  471. break
  472. }
  473. }
  474. d.data[k_old] = data_old
  475. //添加新的
  476. ct := newData.publishtime
  477. dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
  478. k := fmt.Sprintf("%s_%s_%s", dkey, newData.subtype, newData.area)
  479. d.lock.Lock()
  480. data := d.data[k]
  481. if data == nil {
  482. data = []*Info{newData}
  483. d.data[k] = data
  484. if !d.keys[dkey] {
  485. d.keys[dkey] = true
  486. d.update(ct)
  487. }
  488. } else {
  489. data = append(data, newData)
  490. d.data[k] = data
  491. }
  492. //添加省
  493. isAreaExist :=false
  494. for _,v:= range d.areakeys {
  495. if v==newData.area {
  496. isAreaExist = true
  497. }
  498. }
  499. if !isAreaExist {
  500. areaArr := d.areakeys
  501. areaArr = append(areaArr,newData.area)
  502. d.areakeys = areaArr
  503. }
  504. d.lock.Unlock()
  505. }