// datamap.go (15 KB)

  1. package main
  2. import (
  3. "fmt"
  4. "log"
  5. qutil "qfw/util"
  6. "reflect"
  7. "regexp"
  8. "strings"
  9. "sync"
  10. "time"
  11. )
  12. type Info struct {
  13. id string //id
  14. title string //标题
  15. spidercode string //爬虫代码
  16. area string //省份
  17. city string //城市
  18. subtype string //信息类型
  19. buyer string //采购单位
  20. agency string //代理机构
  21. winner string //中标单位
  22. budget float64 //预算金额
  23. bidamount float64 //中标金额
  24. projectname string //项目名称
  25. projectcode string //项目编号
  26. contractnumber string //合同编号
  27. publishtime int64 //发布时间
  28. comeintime int64 //入库时间
  29. bidopentime int64 //开标时间
  30. bidopenaddress string //开标地点
  31. site string //站点
  32. href string //正文的url
  33. repeatid string //重复id
  34. titleSpecialWord bool //标题特殊词
  35. specialWord bool //再次判断的特殊词
  36. mergemap map[string]interface{} //合并记录
  37. is_site bool //是否站点城市
  38. repeat_ids []string //记录所有重复id
  39. }
  40. var datelimit = float64(432000) //五天
  41. var sitelock sync.Mutex //锁
  42. //一般数据判重
  43. type datamap struct {
  44. lock sync.Mutex //锁
  45. days int //保留几天数据
  46. data map[string][]*Info
  47. keymap []string
  48. areakeys []string
  49. keys map[string]bool
  50. }
  51. //历史
  52. func TimedTaskDatamap(days int,lasttime int64,numIndex int) *datamap {
  53. datelimit = qutil.Float64All(days * 86400)
  54. dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, []string{},map[string]bool{}}
  55. if lasttime <0 {
  56. log.Println("数据池空数据")
  57. return dm
  58. }
  59. start := int(time.Now().Unix())
  60. sess := data_mgo.GetMgoConn()
  61. defer data_mgo.DestoryMongoConn(sess)
  62. query := map[string]interface{}{"publishtime": map[string]interface{}{
  63. "$lt": lasttime,
  64. }}
  65. log.Println("query", query)
  66. it := sess.DB(data_mgo.DbName).C(extract_back).Find(query).Sort("-publishtime").Iter()
  67. n, continuSum := 0, 0
  68. for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
  69. if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 ||
  70. qutil.IntAll(tmp["dataging"]) == 1 {
  71. } else {
  72. if fmt.Sprint(reflect.TypeOf(tmp["publishtime"]))=="string" {
  73. continue
  74. }
  75. pt := tmp["publishtime"]
  76. pt_time := qutil.Int64All(pt)
  77. if pt_time > time.Now().Unix() {
  78. continue
  79. }
  80. if qutil.Float64All(lasttime-pt_time) < datelimit {
  81. continuSum++
  82. info := NewInfo(tmp)
  83. dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
  84. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  85. data := dm.data[k]
  86. if data == nil {
  87. data = []*Info{}
  88. }
  89. data = append(data, info)
  90. dm.data[k] = data
  91. dm.keys[dkey] = true
  92. //添加省
  93. isAreaExist :=false
  94. for _,v:= range dm.areakeys {
  95. if v==info.area {
  96. isAreaExist = true
  97. }
  98. }
  99. if !isAreaExist {
  100. areaArr := dm.areakeys
  101. areaArr = append(areaArr,info.area)
  102. dm.areakeys = areaArr
  103. }
  104. } else {
  105. break
  106. }
  107. }
  108. tmp = make(map[string]interface{})
  109. }
  110. log.Printf("第%d组:数据池构建完成:%d秒,%d个\n",numIndex ,int(time.Now().Unix())-start, n)
  111. return dm
  112. }
  113. //增量
  114. func NewDatamap(days int, lastid string) *datamap {
  115. datelimit = qutil.Float64All(days * 86400 * 2)
  116. dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{},[]string{}, map[string]bool{}}
  117. if lastid == "" {
  118. log.Println("不构建数据池")
  119. return dm
  120. }
  121. //初始化加载数据
  122. sess := data_mgo.GetMgoConn()
  123. defer data_mgo.DestoryMongoConn(sess)
  124. query := map[string]interface{}{"_id": map[string]interface{}{
  125. "$lte": StringTOBsonId(lastid),
  126. }}
  127. log.Println("query", query)
  128. it := sess.DB(data_mgo.DbName).C(extract).Find(query).Sort("-publishtime").Iter()
  129. nowTime := time.Now().Unix()//当前时间的时间戳
  130. n, continuSum := 0, 0
  131. for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
  132. //source := util.ObjToMap(tmp["jsondata"]) //修复临时添加
  133. //if util.IntAll((*source)["sourcewebsite"]) == 1 {
  134. // continue
  135. //}
  136. if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1{
  137. } else {
  138. if fmt.Sprint(reflect.TypeOf(tmp["publishtime"]))=="string" {
  139. continue
  140. }
  141. pt:= tmp["publishtime"]
  142. pt_time := qutil.Int64All(pt)
  143. if pt_time > time.Now().Unix() {
  144. continue
  145. }
  146. if qutil.Float64All(nowTime-pt_time) <= datelimit {
  147. continuSum++
  148. info := NewInfo(tmp)
  149. dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
  150. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  151. data := dm.data[k]
  152. if data == nil {
  153. data = []*Info{}
  154. }
  155. data = append(data, info)
  156. dm.data[k] = data
  157. dm.keys[dkey] = true
  158. //添加省
  159. isAreaExist :=false
  160. for _,v:= range dm.areakeys {
  161. if v==info.area {
  162. isAreaExist = true
  163. }
  164. }
  165. if !isAreaExist {
  166. areaArr := dm.areakeys
  167. areaArr = append(areaArr,info.area)
  168. dm.areakeys = areaArr
  169. }
  170. } else {
  171. break
  172. }
  173. }
  174. if n%10000 == 0 {
  175. log.Println("当前 n:", n,"数量:" ,continuSum,tmp["_id"])
  176. }
  177. tmp = make(map[string]interface{})
  178. }
  179. log.Println("load data:", n,"总数:",continuSum)
  180. return dm
  181. }
  182. //数据构建
  183. func NewInfo(tmp map[string]interface{}) *Info {
  184. subtype := qutil.ObjToString(tmp["subtype"])
  185. if subtype=="招标"||subtype=="邀标"||subtype=="询价"||
  186. subtype=="竞谈"||subtype=="竞价" {
  187. subtype = "招标"
  188. }
  189. area := qutil.ObjToString(tmp["area"])
  190. if area == "A" {
  191. area = "全国"
  192. }
  193. info := &Info{}
  194. info.id = BsonTOStringId(tmp["_id"])
  195. info.title = qutil.ObjToString(tmp["title"])
  196. info.area = area
  197. info.subtype = subtype
  198. info.spidercode = qutil.ObjToString(tmp["spidercode"])
  199. info.buyer = qutil.ObjToString(tmp["buyer"])
  200. info.projectname = qutil.ObjToString(tmp["projectname"])
  201. info.contractnumber = qutil.ObjToString(tmp["contractnumber"])
  202. info.projectcode = qutil.ObjToString(tmp["projectcode"])
  203. info.city = qutil.ObjToString(tmp["city"])
  204. info.agency = qutil.ObjToString(tmp["agency"])
  205. info.winner = qutil.ObjToString(tmp["winner"])
  206. info.budget = qutil.Float64All(tmp["budget"])
  207. info.bidamount = qutil.Float64All(tmp["bidamount"])
  208. info.publishtime = qutil.Int64All(tmp["publishtime"])
  209. info.comeintime = qutil.Int64All(tmp["comeintime"])
  210. info.bidopentime = qutil.Int64All(tmp["bidopentime"])
  211. info.bidopenaddress = qutil.ObjToString(tmp["bidopenaddress"])
  212. info.site = qutil.ObjToString(tmp["site"])
  213. info.href = qutil.ObjToString(tmp["href"])
  214. info.repeatid = qutil.ObjToString(tmp["repeatid"])
  215. info.specialWord = FilterRegTitle.MatchString(info.title)
  216. info.titleSpecialWord = FilterRegTitle_0.MatchString(info.title) ||FilterRegTitle_1.MatchString(info.title) || FilterRegTitle_2.MatchString(info.title)
  217. info.mergemap = *qutil.ObjToMap(tmp["merge"])
  218. if info.mergemap == nil {
  219. info.mergemap = make(map[string]interface{}, 0)
  220. }
  221. if info.repeat_ids == nil {
  222. info.repeat_ids = make([]string, 0)
  223. }
  224. info.is_site = false
  225. return info
  226. }
  227. //判重方法
  228. //判重方法
  229. //判重方法
  230. func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
  231. reason := ""
  232. isTestLog := false
  233. keys := []string{}
  234. d.lock.Lock()
  235. for k, _ := range d.keys { //不同时间段
  236. if info.area=="全国" {//匹配所有省
  237. for _,v := range d.areakeys{
  238. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, v))
  239. }
  240. }else {//匹配指定省
  241. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
  242. }
  243. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
  244. }
  245. d.lock.Unlock()
  246. L:
  247. for _, k := range keys {
  248. d.lock.Lock()
  249. data := d.data[k]
  250. d.lock.Unlock()
  251. if len(data) > 0 { //对比v 找到同类型,同省或全国的数据作对比
  252. for _, v := range data {
  253. reason = ""
  254. isTestLog = false
  255. if v.id == info.id { //正常重复
  256. return false, v, ""
  257. }
  258. //buyer 优先级高,有值且不相等过滤
  259. if info.buyer!=""&&v.buyer!=""&&info.buyer!=v.buyer {
  260. if v.title != info.title && v.title != "" && info.title != "" {
  261. isTestLog = true
  262. }
  263. if buyerIsContinue(v,info) {
  264. continue
  265. }
  266. }
  267. if info.site != "" {//站点临时赋值
  268. sitelock.Lock()
  269. dict := SiteMap[info.site]
  270. sitelock.Unlock()
  271. if dict != nil {
  272. if (info.area == "全国" && dict["area"] != "")||
  273. (info.city == "" && dict["city"] != ""){
  274. info.is_site = true
  275. info.area = qutil.ObjToString(dict["area"])
  276. info.city = qutil.ObjToString(dict["city"])
  277. }
  278. }
  279. }
  280. //前置条件-五要素均相等
  281. if leadingElementSame(v,info) {
  282. reason = "五要素-相同-满足"
  283. b = true
  284. source = v
  285. reasons = reason
  286. break L
  287. }
  288. //前置条件 - 站点相关
  289. if info.site != "" && info.site == v.site {
  290. if info.href != "" && info.href == v.href {
  291. reason = "同站点-href相同"
  292. b = true
  293. source = v
  294. reasons = reason
  295. break L
  296. }
  297. //相同发布时间-标题无包含关系 - 项目名称不等
  298. if isTheSameDay(info.publishtime,v.publishtime) &&
  299. !(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
  300. continue
  301. }
  302. //不同href
  303. if info.href != "" && info.href != v.href {
  304. if v.title==info.title{
  305. if !againRepeat(v, info,true) {//进行同站点二次判断
  306. reason = "同站点-href不同-标题相同等"
  307. b = true
  308. source = v
  309. reasons = reason
  310. break L
  311. }else {
  312. continue
  313. }
  314. }else {
  315. if againRepeat(v, info,true) {
  316. continue
  317. }
  318. }
  319. }
  320. }
  321. //特殊词处理
  322. specialNum:= dealWithSpecialWordNumber(info,v)
  323. //前置条件 - 标题相关,有且一个关键词
  324. if specialNum==1 {
  325. if againRepeat(v, info,false) {
  326. continue
  327. }
  328. }
  329. //前置条件3 - 标题相关,均含有关键词
  330. if specialNum==2 {
  331. if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
  332. v.title != "" && info.title != "" {
  333. letter1,letter2:=v.title,info.title
  334. res, _ := regexp.Compile("[0-9a-zA-Z]+");
  335. if res.MatchString(letter1)||res.MatchString(letter2) {
  336. letter1=convertArabicNumeralsAndLetters(letter1)
  337. letter2=convertArabicNumeralsAndLetters(letter2)
  338. }
  339. if strings.Contains(letter1,"重新招标")|| strings.Contains(letter2,"重新招标"){
  340. letter1,letter2=dealWithSpecialPhrases(letter1,letter2)
  341. }
  342. if letter1==letter2 {
  343. reason = reason + "标题关键词相等关系"
  344. if !againRepeat(v, info,false) {//进行二级金额判断
  345. b = true
  346. source = v
  347. reasons = reason
  348. break L
  349. }
  350. }else {
  351. if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
  352. //无包含关系-即不相等
  353. if againContainSpecialWord(v, info) {
  354. continue
  355. }
  356. }
  357. }
  358. }
  359. }
  360. //新增快速数据过少判重
  361. if LowHeavy {
  362. repeat := false
  363. if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
  364. b = true
  365. source = v
  366. reasons = reason
  367. break L
  368. }
  369. }
  370. //代理机构相同-非空相等
  371. if v.agency != "" && info.agency != "" && v.agency == info.agency {
  372. reason = reason + "同机构-"
  373. repeat := false
  374. if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
  375. b = true
  376. source = v
  377. reasons = reason
  378. break L
  379. }
  380. } else {
  381. reason = reason + "非同机构-"
  382. if info.city != "" && info.city == v.city {
  383. reason = reason + "同城-"
  384. repeat := false
  385. if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
  386. b = true
  387. source = v
  388. reasons = reason
  389. break L
  390. }
  391. } else {
  392. reason = reason + "不同城-"
  393. repeat := false
  394. if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
  395. b = true
  396. source = v
  397. reasons = reason
  398. break L
  399. }
  400. }
  401. }
  402. }
  403. }
  404. }
  405. //往预存数据 d 添加
  406. if !b {
  407. ct := info.publishtime
  408. dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
  409. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  410. d.lock.Lock()
  411. data := d.data[k]
  412. if data == nil {
  413. data = []*Info{info}
  414. d.data[k] = data
  415. if !d.keys[dkey] {
  416. d.keys[dkey] = true
  417. d.update(ct)
  418. }
  419. } else {
  420. data = append(data, info)
  421. d.data[k] = data
  422. }
  423. //添加省
  424. isAreaExist :=false
  425. for _,v:= range d.areakeys {
  426. if v==info.area {
  427. isAreaExist = true
  428. }
  429. }
  430. if !isAreaExist {
  431. areaArr := d.areakeys
  432. areaArr = append(areaArr,info.area)
  433. d.areakeys = areaArr
  434. }
  435. d.lock.Unlock()
  436. }
  437. if isTestLog {
  438. reasons = reasons+"-新修改"
  439. }
  440. return
  441. }
  442. func (d *datamap) update(t int64) {
  443. if TimingTask {
  444. }else {
  445. if IsFull {
  446. d.keymap = d.GetLatelyFiveDay(t)//全量
  447. }else {
  448. d.keymap = d.GetLatelyFiveDayDouble(t) //增量
  449. }
  450. m := map[string]bool{}
  451. for _, v := range d.keymap {
  452. m[v] = true
  453. }
  454. for k, _ := range d.data {
  455. if !m[k[:8]] {
  456. delete(d.data, k)
  457. }
  458. }
  459. for k, _ := range d.keys {
  460. if !m[k] {
  461. delete(d.keys, k)
  462. }
  463. }
  464. }
  465. }
  466. func (d *datamap) GetLatelyFiveDay(t int64) []string {
  467. array := make([]string, d.days)
  468. now := time.Unix(t, 0)
  469. for i := 0; i < d.days; i++ {
  470. array[i] = now.Format(qutil.Date_yyyyMMdd)
  471. now = now.AddDate(0, 0, -1)
  472. }
  473. return array
  474. }
  475. func (d *datamap) GetLatelyFiveDayDouble(t int64) []string {//增量-两倍
  476. array := make([]string, d.days*2)
  477. now := time.Now()
  478. for i := 0; i < d.days*2; i++ {
  479. array[i] = now.Format(qutil.Date_yyyyMMdd)
  480. now = now.AddDate(0, 0, -1)
  481. }
  482. return array
  483. }
  484. //替换原始数据池-更新
  485. func (d *datamap) replacePoolData(newData *Info) {
  486. d.lock.Lock()
  487. ct := newData.publishtime
  488. dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
  489. k := fmt.Sprintf("%s_%s_%s", dkey, newData.subtype, newData.area)
  490. data := d.data[k]
  491. for k, v := range data {
  492. if v.id == newData.id {//替换
  493. data[k] = newData
  494. break
  495. }
  496. }
  497. d.data[k] = data
  498. d.lock.Unlock()
  499. }
  500. //相互替换数据池-暂时弃用
  501. func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
  502. //删除数据池的老数据
  503. ct_old := oldData.publishtime
  504. dkey_old := qutil.FormatDateByInt64(&ct_old, qutil.Date_yyyyMMdd)
  505. k_old := fmt.Sprintf("%s_%s_%s", dkey_old, oldData.subtype, oldData.area)
  506. data_old := d.data[k_old]
  507. for k, v := range data_old {
  508. if v.id == oldData.id {//删除对应当前的老数据
  509. data_old = append(data_old[:k], data_old[k+1:]...)
  510. break
  511. }
  512. }
  513. d.data[k_old] = data_old
  514. //添加新的
  515. ct := newData.publishtime
  516. dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
  517. k := fmt.Sprintf("%s_%s_%s", dkey, newData.subtype, newData.area)
  518. d.lock.Lock()
  519. data := d.data[k]
  520. if data == nil {
  521. data = []*Info{newData}
  522. d.data[k] = data
  523. if !d.keys[dkey] {
  524. d.keys[dkey] = true
  525. d.update(ct)
  526. }
  527. } else {
  528. data = append(data, newData)
  529. d.data[k] = data
  530. }
  531. //添加省
  532. isAreaExist :=false
  533. for _,v:= range d.areakeys {
  534. if v==newData.area {
  535. isAreaExist = true
  536. }
  537. }
  538. if !isAreaExist {
  539. areaArr := d.areakeys
  540. areaArr = append(areaArr,newData.area)
  541. d.areakeys = areaArr
  542. }
  543. d.lock.Unlock()
  544. }
  545. //总计条数-暂时弃用
  546. func (d *datamap) currentTotalCount() int {
  547. num:=qutil.IntAll(0)
  548. for _,v:=range d.data {
  549. num = num+qutil.IntAll(len(v))
  550. }
  551. return num
  552. }