datamap.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609
  1. package main
  2. import (
  3. "fmt"
  4. "log"
  5. qutil "qfw/util"
  6. "reflect"
  7. "regexp"
  8. "strings"
  9. "sync"
  10. "time"
  11. )
  12. type Info struct {
  13. id string //id
  14. title string //标题
  15. area string //省份
  16. city string //城市
  17. subtype string //信息类型
  18. buyer string //采购单位
  19. agency string //代理机构
  20. winner string //中标单位
  21. budget float64 //预算金额
  22. bidamount float64 //中标金额
  23. projectname string //项目名称
  24. projectcode string //项目编号
  25. contractnumber string //合同编号
  26. publishtime int64 //发布时间
  27. comeintime int64 //入库时间
  28. bidopentime int64 //开标时间
  29. bidopenaddress string //开标地点
  30. site string //站点
  31. href string //正文的url
  32. repeatid string //重复id
  33. titleSpecialWord bool //标题特殊词
  34. specialWord bool //再次判断的特殊词
  35. mergemap map[string]interface{} //合并记录
  36. is_site bool //是否站点城市
  37. repeat_ids []string //记录所有重复id
  38. }
  39. var datelimit = float64(432000) //五天
  40. var sitelock sync.Mutex //锁
  41. //一般数据判重
  42. type datamap struct {
  43. lock sync.Mutex //锁
  44. days int //保留几天数据
  45. data map[string][]*Info
  46. keymap []string
  47. areakeys []string
  48. keys map[string]bool
  49. }
  50. //历史
  51. func TimedTaskDatamap(days int,lasttime int64,numIndex int) *datamap {
  52. datelimit = qutil.Float64All(days * 86400)
  53. dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, []string{},map[string]bool{}}
  54. if lasttime <0 {
  55. log.Println("数据池空数据")
  56. return dm
  57. }
  58. start := int(time.Now().Unix())
  59. sess := mgo.GetMgoConn()
  60. defer mgo.DestoryMongoConn(sess)
  61. query := map[string]interface{}{"publishtime": map[string]interface{}{
  62. "$lt": lasttime,
  63. }}
  64. log.Println("query", query)
  65. it := sess.DB(mgo.DbName).C(extract_back).Find(query).Sort("-publishtime").Iter()
  66. n, continuSum := 0, 0
  67. for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
  68. if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 ||
  69. qutil.IntAll(tmp["dataging"]) == 1 {
  70. } else {
  71. if fmt.Sprint(reflect.TypeOf(tmp["publishtime"]))=="string" {
  72. continue
  73. }
  74. pt := tmp["publishtime"]
  75. pt_time := qutil.Int64All(pt)
  76. if pt_time > time.Now().Unix() {
  77. continue
  78. }
  79. if qutil.Float64All(lasttime-pt_time) < datelimit {
  80. continuSum++
  81. info := NewInfo(tmp)
  82. dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
  83. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  84. data := dm.data[k]
  85. if data == nil {
  86. data = []*Info{}
  87. }
  88. data = append(data, info)
  89. dm.data[k] = data
  90. dm.keys[dkey] = true
  91. //添加省
  92. isAreaExist :=false
  93. for _,v:= range dm.areakeys {
  94. if v==info.area {
  95. isAreaExist = true
  96. }
  97. }
  98. if !isAreaExist {
  99. areaArr := dm.areakeys
  100. areaArr = append(areaArr,info.area)
  101. dm.areakeys = areaArr
  102. }
  103. } else {
  104. break
  105. }
  106. }
  107. tmp = make(map[string]interface{})
  108. }
  109. log.Printf("第%d组:数据池构建完成:%d秒,%d个\n",numIndex ,int(time.Now().Unix())-start, n)
  110. return dm
  111. }
  112. //增量
  113. func NewDatamap(days int, lastid string) *datamap {
  114. datelimit = qutil.Float64All(days * 86400 * 2)
  115. dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{},[]string{}, map[string]bool{}}
  116. if lastid == "" {
  117. log.Println("不构建数据池")
  118. return dm
  119. }
  120. //初始化加载数据
  121. sess := mgo.GetMgoConn()
  122. defer mgo.DestoryMongoConn(sess)
  123. query := map[string]interface{}{"_id": map[string]interface{}{
  124. "$lte": StringTOBsonId(lastid),
  125. }}
  126. log.Println("query", query)
  127. it := sess.DB(mgo.DbName).C(extract).Find(query).Sort("-publishtime").Iter()
  128. nowTime := time.Now().Unix()//当前时间的时间戳
  129. n, continuSum := 0, 0
  130. for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
  131. //source := util.ObjToMap(tmp["jsondata"]) //修复临时添加
  132. //if util.IntAll((*source)["sourcewebsite"]) == 1 {
  133. // continue
  134. //}
  135. if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1{
  136. } else {
  137. if fmt.Sprint(reflect.TypeOf(tmp["publishtime"]))=="string" {
  138. continue
  139. }
  140. pt:= tmp["publishtime"]
  141. pt_time := qutil.Int64All(pt)
  142. if pt_time > time.Now().Unix() {
  143. continue
  144. }
  145. if qutil.Float64All(nowTime-pt_time) <= datelimit {
  146. continuSum++
  147. info := NewInfo(tmp)
  148. dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
  149. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  150. data := dm.data[k]
  151. if data == nil {
  152. data = []*Info{}
  153. }
  154. data = append(data, info)
  155. dm.data[k] = data
  156. dm.keys[dkey] = true
  157. //添加省
  158. isAreaExist :=false
  159. for _,v:= range dm.areakeys {
  160. if v==info.area {
  161. isAreaExist = true
  162. }
  163. }
  164. if !isAreaExist {
  165. areaArr := dm.areakeys
  166. areaArr = append(areaArr,info.area)
  167. dm.areakeys = areaArr
  168. }
  169. } else {
  170. break
  171. }
  172. }
  173. if n%10000 == 0 {
  174. log.Println("当前 n:", n,"数量:" ,continuSum,tmp["_id"])
  175. }
  176. tmp = make(map[string]interface{})
  177. }
  178. log.Println("load data:", n,"总数:",continuSum)
  179. return dm
  180. }
  181. //数据构建
  182. func NewInfo(tmp map[string]interface{}) *Info {
  183. subtype := qutil.ObjToString(tmp["subtype"])
  184. if subtype=="招标"||subtype=="邀标"||subtype=="询价"||
  185. subtype=="竞谈"||subtype=="竞价" {
  186. subtype = "招标"
  187. }
  188. area := qutil.ObjToString(tmp["area"])
  189. if area == "A" {
  190. area = "全国"
  191. }
  192. info := &Info{}
  193. info.id = BsonTOStringId(tmp["_id"])
  194. info.title = qutil.ObjToString(tmp["title"])
  195. info.area = area
  196. info.subtype = subtype
  197. info.buyer = qutil.ObjToString(tmp["buyer"])
  198. info.projectname = qutil.ObjToString(tmp["projectname"])
  199. info.contractnumber = qutil.ObjToString(tmp["contractnumber"])
  200. info.projectcode = qutil.ObjToString(tmp["projectcode"])
  201. info.city = qutil.ObjToString(tmp["city"])
  202. info.agency = qutil.ObjToString(tmp["agency"])
  203. info.winner = qutil.ObjToString(tmp["winner"])
  204. info.budget = qutil.Float64All(tmp["budget"])
  205. info.bidamount = qutil.Float64All(tmp["bidamount"])
  206. info.publishtime = qutil.Int64All(tmp["publishtime"])
  207. info.comeintime = qutil.Int64All(tmp["comeintime"])
  208. info.bidopentime = qutil.Int64All(tmp["bidopentime"])
  209. info.bidopenaddress = qutil.ObjToString(tmp["bidopenaddress"])
  210. info.site = qutil.ObjToString(tmp["site"])
  211. info.href = qutil.ObjToString(tmp["href"])
  212. info.repeatid = qutil.ObjToString(tmp["repeatid"])
  213. info.specialWord = FilterRegTitle.MatchString(info.title)
  214. info.titleSpecialWord = FilterRegTitle_0.MatchString(info.title) ||FilterRegTitle_1.MatchString(info.title) || FilterRegTitle_2.MatchString(info.title)
  215. info.mergemap = *qutil.ObjToMap(tmp["merge"])
  216. if info.mergemap == nil {
  217. info.mergemap = make(map[string]interface{}, 0)
  218. }
  219. if info.repeat_ids == nil {
  220. info.repeat_ids = make([]string, 0)
  221. }
  222. info.is_site = false
  223. return info
  224. }
  225. //判重方法
  226. //判重方法
  227. //判重方法
  228. func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
  229. reason := ""
  230. keys := []string{}
  231. d.lock.Lock()
  232. for k, _ := range d.keys { //不同时间段
  233. if info.area=="全国" {
  234. //匹配所有省
  235. for _,v := range d.areakeys{
  236. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, v))
  237. }
  238. }else {
  239. //匹配指定省
  240. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
  241. }
  242. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
  243. }
  244. d.lock.Unlock()
  245. L:
  246. for _, k := range keys {
  247. d.lock.Lock()
  248. data := d.data[k]
  249. d.lock.Unlock()
  250. if len(data) > 0 { //对比v 找到同类型,同省或全国的数据作对比
  251. for _, v := range data {
  252. reason = ""
  253. if v.id == info.id { //正常重复
  254. return false, v, ""
  255. }
  256. //buyer 优先级高,有值且不相等过滤
  257. if info.buyer!=""&&v.buyer!=""&&info.buyer!=v.buyer {
  258. if buyerIsContinue(v,info) {
  259. continue
  260. }
  261. }
  262. if info.site != "" {//站点临时赋值
  263. sitelock.Lock()
  264. dict := SiteMap[info.site]
  265. sitelock.Unlock()
  266. if dict != nil {
  267. if (info.area == "全国" && dict["area"] != "")||
  268. (info.city == "" && dict["city"] != ""){
  269. info.is_site = true
  270. info.area = qutil.ObjToString(dict["area"])
  271. info.city = qutil.ObjToString(dict["city"])
  272. }
  273. }
  274. }
  275. //前置条件 - 站点相关
  276. if info.site != "" && info.site == v.site {
  277. if info.href != "" && info.href == v.href {
  278. reason = "同站点-href相同"
  279. b = true
  280. source = v
  281. reasons = reason
  282. break L
  283. }
  284. //相同发布时间-标题无包含关系
  285. if isTheSameDay(info.publishtime,v.publishtime) &&
  286. !(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
  287. continue
  288. }
  289. //不同href
  290. if info.href != "" && info.href != v.href {
  291. if v.title==info.title{
  292. if !againRepeat(v, info,true) {//进行同站点二次判断
  293. reason = "同站点-href不同-标题相同等"
  294. b = true
  295. source = v
  296. reasons = reason
  297. break L
  298. }else {
  299. continue
  300. }
  301. }else {
  302. if againRepeat(v, info,true) {
  303. continue
  304. }
  305. }
  306. }
  307. }
  308. //特殊词处理
  309. specialNum:= dealWithSpecialWordNumber(info,v)
  310. //前置条件 - 标题相关,有且一个关键词
  311. if specialNum==1 {
  312. if againRepeat(v, info,false) {
  313. continue
  314. }
  315. }
  316. //前置条件3 - 标题相关,均含有关键词
  317. if specialNum==2 {
  318. if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
  319. v.title != "" && info.title != "" {
  320. letter1,letter2:=v.title,info.title
  321. res, _ := regexp.Compile("[0-9a-zA-Z]+");
  322. if res.MatchString(letter1)||res.MatchString(letter2) {
  323. letter1=convertArabicNumeralsAndLetters(letter1)
  324. letter2=convertArabicNumeralsAndLetters(letter2)
  325. }
  326. if strings.Contains(letter1,"重新招标")|| strings.Contains(letter2,"重新招标"){
  327. letter1,letter2=dealWithSpecialPhrases(letter1,letter2)
  328. }
  329. if letter1==letter2 {
  330. reason = reason + "标题关键词相等关系"
  331. if !againRepeat(v, info,false) {//进行二级金额判断
  332. b = true
  333. source = v
  334. reasons = reason
  335. break L
  336. }
  337. }else {
  338. if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
  339. //无包含关系-即不相等
  340. if againContainSpecialWord(v, info) {
  341. continue
  342. }
  343. }
  344. }
  345. }
  346. }
  347. //前置条件-五要素均相等
  348. if leadingElementSame(v,info) {
  349. reason = "五要素-相同-满足"
  350. b = true
  351. source = v
  352. reasons = reason
  353. break L
  354. }
  355. //新增快速数据过少判重
  356. if LowHeavy {
  357. repeat := false
  358. if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
  359. b = true
  360. source = v
  361. reasons = reason
  362. break L
  363. }
  364. }
  365. //代理机构相同-非空相等
  366. if v.agency != "" && info.agency != "" && v.agency == info.agency {
  367. reason = reason + "同机构-"
  368. repeat := false
  369. if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
  370. b = true
  371. source = v
  372. reasons = reason
  373. break L
  374. }
  375. } else {
  376. reason = reason + "非同机构-"
  377. if info.city != "" && info.city == v.city {
  378. reason = reason + "同城-"
  379. repeat := false
  380. if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
  381. b = true
  382. source = v
  383. reasons = reason
  384. break L
  385. }
  386. } else {
  387. reason = reason + "不同城-"
  388. repeat := false
  389. if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
  390. b = true
  391. source = v
  392. reasons = reason
  393. break L
  394. }
  395. }
  396. }
  397. }
  398. }
  399. }
  400. //往预存数据 d 添加
  401. if !b {
  402. ct := info.publishtime
  403. dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
  404. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  405. d.lock.Lock()
  406. data := d.data[k]
  407. if data == nil {
  408. data = []*Info{info}
  409. d.data[k] = data
  410. if !d.keys[dkey] {
  411. d.keys[dkey] = true
  412. d.update(ct)
  413. }
  414. } else {
  415. data = append(data, info)
  416. d.data[k] = data
  417. }
  418. //添加省
  419. isAreaExist :=false
  420. for _,v:= range d.areakeys {
  421. if v==info.area {
  422. isAreaExist = true
  423. }
  424. }
  425. if !isAreaExist {
  426. areaArr := d.areakeys
  427. areaArr = append(areaArr,info.area)
  428. d.areakeys = areaArr
  429. }
  430. d.lock.Unlock()
  431. }
  432. return
  433. }
  434. func (d *datamap) update(t int64) {
  435. if TimingTask {
  436. }else {
  437. if IsFull {
  438. d.keymap = d.GetLatelyFiveDay(t)//全量
  439. }else {
  440. d.keymap = d.GetLatelyFiveDayDouble(t) //增量
  441. }
  442. m := map[string]bool{}
  443. for _, v := range d.keymap {
  444. m[v] = true
  445. }
  446. for k, _ := range d.data {
  447. if !m[k[:8]] {
  448. delete(d.data, k)
  449. }
  450. }
  451. for k, _ := range d.keys {
  452. if !m[k] {
  453. delete(d.keys, k)
  454. }
  455. }
  456. }
  457. }
  458. func (d *datamap) GetLatelyFiveDay(t int64) []string {
  459. array := make([]string, d.days)
  460. now := time.Unix(t, 0)
  461. for i := 0; i < d.days; i++ {
  462. array[i] = now.Format(qutil.Date_yyyyMMdd)
  463. now = now.AddDate(0, 0, -1)
  464. }
  465. return array
  466. }
  467. func (d *datamap) GetLatelyFiveDayDouble(t int64) []string {//增量-两倍
  468. array := make([]string, d.days*2)
  469. now := time.Now()
  470. for i := 0; i < d.days*2; i++ {
  471. array[i] = now.Format(qutil.Date_yyyyMMdd)
  472. now = now.AddDate(0, 0, -1)
  473. }
  474. return array
  475. }
  476. //替换原始数据池-更新
  477. func (d *datamap) replacePoolData(newData *Info) {
  478. d.lock.Lock()
  479. ct := newData.publishtime
  480. dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
  481. k := fmt.Sprintf("%s_%s_%s", dkey, newData.subtype, newData.area)
  482. data := d.data[k]
  483. for k, v := range data {
  484. if v.id == newData.id {//替换
  485. data[k] = newData
  486. break
  487. }
  488. }
  489. d.data[k] = data
  490. d.lock.Unlock()
  491. }
  492. //替换 - A-B - 原始数据池
  493. func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
  494. //删除数据池的老数据
  495. ct_old := oldData.publishtime
  496. dkey_old := qutil.FormatDateByInt64(&ct_old, qutil.Date_yyyyMMdd)
  497. k_old := fmt.Sprintf("%s_%s_%s", dkey_old, oldData.subtype, oldData.area)
  498. data_old := d.data[k_old]
  499. for k, v := range data_old {
  500. if v.id == oldData.id {//删除对应当前的老数据
  501. data_old = append(data_old[:k], data_old[k+1:]...)
  502. break
  503. }
  504. }
  505. d.data[k_old] = data_old
  506. //添加新的
  507. ct := newData.publishtime
  508. dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
  509. k := fmt.Sprintf("%s_%s_%s", dkey, newData.subtype, newData.area)
  510. d.lock.Lock()
  511. data := d.data[k]
  512. if data == nil {
  513. data = []*Info{newData}
  514. d.data[k] = data
  515. if !d.keys[dkey] {
  516. d.keys[dkey] = true
  517. d.update(ct)
  518. }
  519. } else {
  520. data = append(data, newData)
  521. d.data[k] = data
  522. }
  523. //添加省
  524. isAreaExist :=false
  525. for _,v:= range d.areakeys {
  526. if v==newData.area {
  527. isAreaExist = true
  528. }
  529. }
  530. if !isAreaExist {
  531. areaArr := d.areakeys
  532. areaArr = append(areaArr,newData.area)
  533. d.areakeys = areaArr
  534. }
  535. d.lock.Unlock()
  536. }
  537. func (d *datamap) currentTotalCount() int {
  538. num:=qutil.IntAll(0)
  539. for _,v:=range d.data {
  540. num = num+qutil.IntAll(len(v))
  541. }
  542. return num
  543. }