datamap.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582
  1. package main
  2. import (
  3. "fmt"
  4. "log"
  5. qutil "qfw/util"
  6. "reflect"
  7. "regexp"
  8. "strings"
  9. "sync"
  10. "time"
  11. )
  // Info is the in-memory summary of a single announcement record. It holds
  // the raw fields copied from the stored document plus a few derived flags
  // and cleaned strings that the duplicate checker compares.
  type Info struct {
  	// Identity, origin and classification.
  	id         string // record id
  	title      string // title
  	spidercode string // crawler code
  	area       string // province
  	city       string // city
  	subtype    string // information type

  	// Parties involved.
  	buyer  string // purchasing entity
  	agency string // agency
  	winner string // winning bidder

  	// Amounts.
  	budget    float64 // budget amount
  	bidamount float64 // winning bid amount

  	// Project identifiers.
  	projectname    string // project name
  	projectcode    string // project code
  	contractnumber string // contract number

  	// Timestamps.
  	publishtime int64 // publish time
  	comeintime  int64 // ingestion time
  	bidopentime int64 // bid opening time

	// Bid opening location, source site and links.
  	bidopenaddress string // bid opening address
  	site           string // site
  	href           string // URL of the body text
  	repeatid       string // id of the record this one repeats

  	// Flags derived from the title and href.
  	specialWord      bool // contains a special word
  	titleSpecialWord bool // title contains a special word
  	isJphref         bool // whether this is competitor data

  	// Values after the generic cleaning step.
  	c_title       string // cleaned title
  	c_projectname string // cleaned project name
  }
  var (
  	// datelimit is the retention window in seconds. The initial value is
  	// five days; TimedTaskDatamap and NewDatamap overwrite it.
  	datelimit = float64(432000)

  	// sitelock is held while SiteMap is read.
  	sitelock sync.Mutex
  )
  42. //一般数据判重
  43. type datamap struct {
  44. lock sync.Mutex //锁
  45. days int //保留几天数据
  46. data map[string][]*Info
  47. keymap []string
  48. areakeys []string
  49. keys map[string]bool
  50. }
  51. //历史~存量
  52. func TimedTaskDatamap(days int, lasttime int64, numIndex int) *datamap {
  53. datelimit = qutil.Float64All(days * 86400)
  54. dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, []string{}, map[string]bool{}}
  55. if lasttime < 0 {
  56. log.Println("数据池空数据")
  57. return dm
  58. }
  59. start := int(time.Now().Unix())
  60. sess := data_mgo.GetMgoConn()
  61. defer data_mgo.DestoryMongoConn(sess)
  62. query := map[string]interface{}{"publishtime": map[string]interface{}{
  63. "$lt": lasttime,
  64. }}
  65. log.Println("query", query)
  66. it := sess.DB(data_mgo.DbName).C(extract_back).Find(query).Sort("-publishtime").Iter()
  67. n, continuSum := 0, 0
  68. for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
  69. //if n%10000 == 0 {
  70. // log.Println("当前 n:", n, "数量:", continuSum, tmp["_id"], tmp["publishtime"])
  71. //}
  72. if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 ||
  73. qutil.IntAll(tmp["dataging"]) == 1 || qutil.ObjToString(tmp["subtype"]) == "拟建" {
  74. } else {
  75. if fmt.Sprint(reflect.TypeOf(tmp["publishtime"])) == "string" {
  76. continue
  77. }
  78. pt := tmp["publishtime"]
  79. pt_time := qutil.Int64All(pt)
  80. if pt_time > time.Now().Unix() {
  81. continue
  82. }
  83. if qutil.Float64All(lasttime-pt_time) < datelimit {
  84. continuSum++
  85. info := NewInfo(tmp)
  86. dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
  87. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  88. data := dm.data[k]
  89. if data == nil {
  90. data = []*Info{}
  91. }
  92. data = append(data, info)
  93. dm.data[k] = data
  94. dm.keys[dkey] = true
  95. //添加省
  96. isAreaExist := false
  97. for _, v := range dm.areakeys {
  98. if v == info.area {
  99. isAreaExist = true
  100. }
  101. }
  102. if !isAreaExist {
  103. areaArr := dm.areakeys
  104. areaArr = append(areaArr, info.area)
  105. dm.areakeys = areaArr
  106. }
  107. } else {
  108. break
  109. }
  110. }
  111. tmp = make(map[string]interface{})
  112. }
  113. log.Printf("第%d组:数据池构建完成:%d秒,%d个\n", numIndex, int(time.Now().Unix())-start, n)
  114. return dm
  115. }
  116. //增量
  117. func NewDatamap(days int, lastid string) *datamap {
  118. datelimit = qutil.Float64All(days * 86400 * 2)
  119. dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, []string{}, map[string]bool{}}
  120. if lastid == "" {
  121. log.Println("不构建数据池")
  122. return dm
  123. }
  124. //初始化加载数据
  125. sess := data_mgo.GetMgoConn()
  126. defer data_mgo.DestoryMongoConn(sess)
  127. query := map[string]interface{}{"_id": map[string]interface{}{
  128. "$lte": StringTOBsonId(lastid),
  129. }}
  130. log.Println("query", query)
  131. it := sess.DB(data_mgo.DbName).C(extract).Find(query).Sort("-publishtime").Iter()
  132. nowTime := time.Now().Unix() //当前时间的时间戳
  133. n, continuSum := 0, 0
  134. for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
  135. if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 || qutil.ObjToString(tmp["subtype"]) == "拟建" {
  136. } else {
  137. if fmt.Sprint(reflect.TypeOf(tmp["publishtime"])) == "string" {
  138. continue
  139. }
  140. pt := tmp["publishtime"]
  141. pt_time := qutil.Int64All(pt)
  142. if pt_time > time.Now().Unix() {
  143. continue
  144. }
  145. if qutil.Float64All(nowTime-pt_time) <= datelimit {
  146. continuSum++
  147. info := NewInfo(tmp)
  148. dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
  149. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  150. data := dm.data[k]
  151. if data == nil {
  152. data = []*Info{}
  153. }
  154. data = append(data, info)
  155. dm.data[k] = data
  156. dm.keys[dkey] = true
  157. //添加省
  158. isAreaExist := false
  159. for _, v := range dm.areakeys {
  160. if v == info.area {
  161. isAreaExist = true
  162. }
  163. }
  164. if !isAreaExist {
  165. areaArr := dm.areakeys
  166. areaArr = append(areaArr, info.area)
  167. dm.areakeys = areaArr
  168. }
  169. } else {
  170. break
  171. }
  172. }
  173. if n%10000 == 0 {
  174. log.Println("当前 n:", n, "数量:", continuSum, tmp["_id"])
  175. }
  176. tmp = make(map[string]interface{})
  177. }
  178. log.Println("load data:", n, "总数:", continuSum)
  179. return dm
  180. }
  181. //数据构建
  182. func NewInfo(tmp map[string]interface{}) *Info {
  183. subtype := qutil.ObjToString(tmp["subtype"])
  184. if subtype == "招标" || subtype == "邀标" || subtype == "询价" ||
  185. subtype == "竞谈" || subtype == "竞价" {
  186. subtype = "招标"
  187. }
  188. area := qutil.ObjToString(tmp["area"])
  189. if area == "A" {
  190. area = "全国"
  191. }
  192. info := &Info{}
  193. info.id = BsonTOStringId(tmp["_id"])
  194. info.title = qutil.ObjToString(tmp["title"])
  195. info.area = area
  196. info.subtype = subtype
  197. info.spidercode = qutil.ObjToString(tmp["spidercode"])
  198. info.buyer = qutil.ObjToString(tmp["buyer"])
  199. info.projectname = qutil.ObjToString(tmp["projectname"])
  200. info.contractnumber = qutil.ObjToString(tmp["contractnumber"])
  201. info.projectcode = qutil.ObjToString(tmp["projectcode"])
  202. info.city = qutil.ObjToString(tmp["city"])
  203. info.agency = qutil.ObjToString(tmp["agency"])
  204. info.winner = deleteExtraSpaceName(qutil.ObjToString(tmp["winner"]))
  205. info.budget = qutil.Float64All(tmp["budget"])
  206. info.bidamount = qutil.Float64All(tmp["bidamount"])
  207. info.publishtime = qutil.Int64All(tmp["publishtime"])
  208. info.comeintime = qutil.Int64All(tmp["comeintime"])
  209. info.bidopentime = qutil.Int64All(tmp["bidopentime"])
  210. info.bidopenaddress = qutil.ObjToString(tmp["bidopenaddress"])
  211. info.site = qutil.ObjToString(tmp["site"])
  212. info.href = qutil.ObjToString(tmp["href"])
  213. info.repeatid = qutil.ObjToString(tmp["repeatid"])
  214. info.specialWord = FilterRegTitle.MatchString(info.title)
  215. info.titleSpecialWord = FilterRegTitle_0.MatchString(info.title) || FilterRegTitle_1.MatchString(info.title) || FilterRegTitle_2.MatchString(info.title)
  216. info.isJphref = IsJpHref(qutil.ObjToString(tmp["href"]))
  217. //经过通用清洗后
  218. info.c_title = cleanNameFilterRedundant(info.title)
  219. info.c_projectname = cleanNameFilterRedundant(info.projectname)
  220. return info
  221. }
  222. //判重方法
  223. //判重方法
  224. //判重方法
  225. func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
  226. reason := ""
  227. keys := []string{}
  228. d.lock.Lock()
  229. for k, _ := range d.keys { //不同时间段
  230. if info.area == "全国" { //匹配所有省
  231. for _, v := range d.areakeys {
  232. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, v))
  233. }
  234. } else { //匹配指定省
  235. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
  236. }
  237. keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
  238. }
  239. d.lock.Unlock()
  240. L:
  241. for _, k := range keys {
  242. d.lock.Lock()
  243. data := d.data[k]
  244. d.lock.Unlock()
  245. if len(data) > 0 { //对比v 找到同类型,同省或全国的数据作对比
  246. for _, v := range data {
  247. reason = ""
  248. if v.id == info.id { //正常重复
  249. return false, v, ""
  250. }
  251. //buyer 优先级高,有值且不相等过滤
  252. if info.buyer != "" && v.buyer != "" && info.buyer != v.buyer {
  253. if buyerIsContinue(v, info) {
  254. continue
  255. }
  256. }
  257. // 竞品判重模式
  258. if v.isJphref || info.isJphref {
  259. if confirmJingPinIsRepeatData(v, info) {
  260. reason = "竞品模式~重复"
  261. b = true
  262. source = v
  263. reasons = reason
  264. break L
  265. }
  266. }
  267. //站点补城市
  268. if info.site != "" { //站点临时赋值
  269. if info.area == "全国" || info.city == "" {
  270. sitelock.Lock()
  271. dict := SiteMap[info.site]
  272. sitelock.Unlock()
  273. if dict != nil && qutil.ObjToString(dict["city"]) != "" {
  274. info.area = qutil.ObjToString(dict["area"])
  275. info.city = qutil.ObjToString(dict["city"])
  276. }
  277. }
  278. }
  279. //前置条件-五要素均相等
  280. if leadingElementSame(v, info) {
  281. reason = "五要素-相同-满足"
  282. b = true
  283. source = v
  284. reasons = reason
  285. break L
  286. }
  287. //前置条件 - 站点相关
  288. if info.site != "" && info.site == v.site {
  289. if info.href != "" && info.href == v.href {
  290. reason = "同站点-href相同"
  291. b = true
  292. source = v
  293. reasons = reason
  294. break L
  295. }
  296. //相同发布时间-标题无包含关系 - 项目名称不等
  297. if isTheSameDay(info.publishtime, v.publishtime) {
  298. if !isTheSimilarName(info.title, v.title) {
  299. continue
  300. }
  301. }
  302. //不同href
  303. if info.href != "" && info.href != v.href {
  304. if v.title == info.title {
  305. if !againRepeat(v, info, true) { //进行同站点二次判断
  306. reason = "同站点-href不同-标题相同等"
  307. b = true
  308. source = v
  309. reasons = reason
  310. break L
  311. } else {
  312. continue
  313. }
  314. } else {
  315. if againRepeat(v, info, true) {
  316. continue
  317. }
  318. }
  319. }
  320. }
  321. //特殊词处理
  322. specialNum := dealWithSpecialWordNumber(info, v)
  323. //前置条件 - 标题相关,有且一个关键词
  324. if specialNum == 1 {
  325. if againRepeat(v, info, false) {
  326. continue
  327. }
  328. }
  329. //前置条件 - 标题相关,均含有关键词
  330. if specialNum == 2 {
  331. if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
  332. v.title != "" && info.title != "" {
  333. letter1, letter2 := v.title, info.title
  334. res, _ := regexp.Compile("[0-9a-zA-Z]+")
  335. if res.MatchString(letter1) || res.MatchString(letter2) {
  336. letter1 = convertArabicNumeralsAndLetters(letter1)
  337. letter2 = convertArabicNumeralsAndLetters(letter2)
  338. }
  339. if strings.Contains(letter1, "重新招标") || strings.Contains(letter2, "重新招标") {
  340. letter1, letter2 = dealWithSpecialPhrases(letter1, letter2)
  341. }
  342. letter1 = cleanNameFilterRedundant(letter1)
  343. letter2 = cleanNameFilterRedundant(letter2)
  344. if letter1 == letter2 {
  345. reason = reason + "标题关键词相等有效关系"
  346. if !againRepeat(v, info, false) { //进行二级金额判断
  347. b = true
  348. source = v
  349. reasons = reason
  350. break L
  351. }
  352. } else {
  353. if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
  354. if againContainSpecialWord(v, info) { //无包含关系-即不相等
  355. continue
  356. }
  357. }
  358. }
  359. }
  360. }
  361. //新增快速数据过少判重
  362. if LowHeavy {
  363. repeat := false
  364. if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
  365. b = true
  366. source = v
  367. reasons = reason
  368. break L
  369. }
  370. }
  371. //代理机构相同-非空相等
  372. if v.agency != "" && info.agency != "" && v.agency == info.agency {
  373. reason = reason + "同机构-"
  374. repeat := false
  375. if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
  376. b = true
  377. source = v
  378. reasons = reason
  379. break L
  380. }
  381. } else {
  382. reason = reason + "非同机构-"
  383. if info.city != "" && info.city == v.city {
  384. reason = reason + "同城-"
  385. repeat := false
  386. if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
  387. b = true
  388. source = v
  389. reasons = reason
  390. break L
  391. }
  392. } else {
  393. reason = reason + "不同城-"
  394. repeat := false
  395. if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
  396. b = true
  397. source = v
  398. reasons = reason
  399. break L
  400. }
  401. }
  402. }
  403. }
  404. }
  405. }
  406. //往预存数据 d 添加
  407. if !b {
  408. ct := info.publishtime
  409. dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
  410. k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
  411. d.lock.Lock()
  412. data := d.data[k]
  413. if data == nil {
  414. data = []*Info{info}
  415. d.data[k] = data
  416. if !d.keys[dkey] {
  417. d.keys[dkey] = true
  418. d.update(ct)
  419. }
  420. } else {
  421. data = append(data, info)
  422. d.data[k] = data
  423. }
  424. //添加省
  425. isAreaExist := false
  426. for _, v := range d.areakeys {
  427. if v == info.area {
  428. isAreaExist = true
  429. }
  430. }
  431. if !isAreaExist {
  432. areaArr := d.areakeys
  433. areaArr = append(areaArr, info.area)
  434. d.areakeys = areaArr
  435. }
  436. d.lock.Unlock()
  437. }
  438. return
  439. }
  440. func (d *datamap) update(t int64) {
  441. if TimingTask {
  442. } else {
  443. if IsFull {
  444. d.keymap = d.GetLatelyFiveDay(t) //全量
  445. } else {
  446. d.keymap = d.GetLatelyFiveDayDouble(t) //增量
  447. }
  448. m := map[string]bool{}
  449. for _, v := range d.keymap {
  450. m[v] = true
  451. }
  452. for k, _ := range d.data {
  453. if !m[k[:8]] {
  454. delete(d.data, k)
  455. }
  456. }
  457. for k, _ := range d.keys {
  458. if !m[k] {
  459. delete(d.keys, k)
  460. }
  461. }
  462. }
  463. }
  464. func (d *datamap) GetLatelyFiveDay(t int64) []string {
  465. array := make([]string, d.days)
  466. now := time.Unix(t, 0)
  467. for i := 0; i < d.days; i++ {
  468. array[i] = now.Format(qutil.Date_yyyyMMdd)
  469. now = now.AddDate(0, 0, -1)
  470. }
  471. return array
  472. }
  473. func (d *datamap) GetLatelyFiveDayDouble(t int64) []string { //增量-两倍
  474. array := make([]string, d.days*2)
  475. now := time.Now()
  476. for i := 0; i < d.days*2; i++ {
  477. array[i] = now.Format(qutil.Date_yyyyMMdd)
  478. now = now.AddDate(0, 0, -1)
  479. }
  480. return array
  481. }
  482. //替换原始数据池-更新
  483. func (d *datamap) replacePoolData(newData *Info) {
  484. d.lock.Lock()
  485. ct := newData.publishtime
  486. dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
  487. k := fmt.Sprintf("%s_%s_%s", dkey, newData.subtype, newData.area)
  488. data := d.data[k]
  489. for k, v := range data {
  490. if v.id == newData.id { //替换
  491. data[k] = newData
  492. break
  493. }
  494. }
  495. d.data[k] = data
  496. d.lock.Unlock()
  497. }
  498. //相互替换数据池-暂时弃用
  499. func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
  500. //删除数据池的老数据
  501. ct_old := oldData.publishtime
  502. dkey_old := qutil.FormatDateByInt64(&ct_old, qutil.Date_yyyyMMdd)
  503. k_old := fmt.Sprintf("%s_%s_%s", dkey_old, oldData.subtype, oldData.area)
  504. data_old := d.data[k_old]
  505. for k, v := range data_old {
  506. if v.id == oldData.id { //删除对应当前的老数据
  507. data_old = append(data_old[:k], data_old[k+1:]...)
  508. break
  509. }
  510. }
  511. d.data[k_old] = data_old
  512. //添加新的
  513. ct := newData.publishtime
  514. dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
  515. k := fmt.Sprintf("%s_%s_%s", dkey, newData.subtype, newData.area)
  516. d.lock.Lock()
  517. data := d.data[k]
  518. if data == nil {
  519. data = []*Info{newData}
  520. d.data[k] = data
  521. if !d.keys[dkey] {
  522. d.keys[dkey] = true
  523. d.update(ct)
  524. }
  525. } else {
  526. data = append(data, newData)
  527. d.data[k] = data
  528. }
  529. //添加省
  530. isAreaExist := false
  531. for _, v := range d.areakeys {
  532. if v == newData.area {
  533. isAreaExist = true
  534. }
  535. }
  536. if !isAreaExist {
  537. areaArr := d.areakeys
  538. areaArr = append(areaArr, newData.area)
  539. d.areakeys = areaArr
  540. }
  541. d.lock.Unlock()
  542. }
  543. //总计条数-暂时弃用
  544. func (d *datamap) currentTotalCount() int {
  545. num := qutil.IntAll(0)
  546. for _, v := range d.data {
  547. num = num + qutil.IntAll(len(v))
  548. }
  549. return num
  550. }