12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284 |
- package main
- import (
- "fmt"
- "log"
- qutil "qfw/util"
- "qfw/util/mongodb"
- "regexp"
- "strings"
- "sync"
- "time"
- )
// Info is the in-memory form of one announcement record, holding the fields
// that the duplicate checks compare.
type Info struct {
	// id is the record id in string form.
	id string
	// title is the announcement title.
	title string
	// area is the province.
	area string
	// city is the city.
	city string
	// subtype is the announcement type.
	subtype string
	// buyer is the purchasing entity.
	buyer string
	// agency is the tendering agency.
	agency string
	// winner is the winning bidder.
	winner string
	// budget is the budget amount.
	budget float64
	// bidamount is the winning bid amount.
	bidamount float64
	// projectname is the project name.
	projectname string
	// projectcode is the project number.
	projectcode string
	// contractnumber is the contract number.
	contractnumber string
	// publishtime is the publication time.
	publishtime int64
	// comeintime is the time the record entered the store.
	comeintime int64
	// bidopentime is the bid opening time.
	bidopentime int64
	// bidopenaddress is the bid opening location.
	bidopenaddress string
	// site is the source site.
	site string
	// href is the URL of the announcement body.
	href string
	// repeatid is the id of the record this one was judged a duplicate of.
	repeatid string
	// titleSpecialWord reports whether the title contains a special word.
	titleSpecialWord bool
	// specialWord reports whether the title contains a second-pass special word.
	specialWord bool
	// mergemap holds the merge record.
	mergemap map[string]interface{}
	// is_site reports whether area/city were filled in from the site table.
	is_site bool
}
var (
	// datelimit is the look-back window in seconds; the initial value is
	// five days.
	datelimit = float64(5 * 24 * 60 * 60)
	// sitelock is the mutex taken around reads of the site table.
	sitelock sync.Mutex
)
- //一般数据判重
- type datamap struct {
- lock sync.Mutex //锁
- days int //保留几天数据
- data map[string][]*Info
- keymap []string
- keys map[string]bool
- }
- //历史更新数据
- type historymap struct {
- lock sync.Mutex //锁
- days int //保留几天数据
- data map[string][]*Info
- keymap []string
- keys map[string]bool
- }
- func TimedTaskDatamap(days int,lasttime int64) *datamap {
- log.Println("数据池开始重新构建")
- datelimit = qutil.Float64All(days * 86400)
- dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, map[string]bool{}}
- if lasttime <0 {
- log.Println("数据池空数据")
- return dm
- }
- start := int(time.Now().Unix())
- sess := mgo.GetMgoConn()
- defer mgo.DestoryMongoConn(sess)
- query := map[string]interface{}{"publishtime": map[string]interface{}{
- "$lt": lasttime,
- }}
- log.Println("query", query)
- it := sess.DB(mgo.DbName).C(extract).Find(query).Sort("-publishtime").Iter()
- n, continuSum := 0, 0
- for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
- //qutil.IntAll(tmp["dataging"]) == 1
- if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1||qutil.IntAll(tmp["dataging"]) == 1 {
- continuSum++
- } else {
- pt := tmp["publishtime"]
- pt_time := qutil.Int64All(pt)
- if qutil.Float64All(lasttime-pt_time) < datelimit {
- info := NewInfo(tmp)
- dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
- k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
- data := dm.data[k]
- if data == nil {
- data = []*Info{}
- }
- data = append(data, info)
- dm.data[k] = data
- dm.keys[dkey] = true
- } else {
- break
- }
- }
- if n%10000 == 0 {
- log.Println("current 数据池 n:", n, continuSum)
- }
- tmp = make(map[string]interface{})
- }
- log.Printf("数据池构建完成::%d秒,%d个\n", int(time.Now().Unix())-start, n)
- return dm
- }
- func NewDatamap(days int, lastid string) *datamap {
- datelimit = qutil.Float64All(days * 86400)
- dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, map[string]bool{}}
- if lastid == "" {
- return dm
- }
- //初始化加载数据
- sess := mgo.GetMgoConn()
- defer mgo.DestoryMongoConn(sess)
- query := map[string]interface{}{"_id": map[string]interface{}{
- "$lte": StringTOBsonId(lastid),
- }}
- log.Println("query", query)
- it := sess.DB(mgo.DbName).C(extract).Find(query).Sort("-_id").Iter()
- now1 := int64(0)
- n, continuSum := 0, 0
- for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
- if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 {
- continuSum++
- } else {
- pt := tmp["comeintime"]
- if Is_Sort {
- pt = tmp["publishtime"]
- }
- pt_time := qutil.Int64All(pt)
- if pt_time <= 0 {
- continue
- }
- if now1 == 0 {
- now1 = pt_time
- }
- if qutil.Float64All(now1-pt_time) < datelimit {
- info := NewInfo(tmp)
- dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
- k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
- data := dm.data[k]
- if data == nil {
- data = []*Info{}
- }
- data = append(data, info)
- dm.data[k] = data
- dm.keys[dkey] = true
- } else {
- break
- }
- }
- if n%5000 == 0 {
- log.Println("current n:", n, continuSum)
- }
- tmp = make(map[string]interface{})
- }
- log.Println("load data:", n)
- return dm
- }
- //构建新历史数据池
- func NewHistorymap(startid string, lastid string, startTime int64, lastTime int64) *historymap {
- datelimit = qutil.Float64All(5 * 86400)
- hm := &historymap{sync.Mutex{}, 5, map[string][]*Info{}, []string{}, map[string]bool{}}
- if lastid == "" || startid == "" {
- return hm
- }
- //取startid之前5天
- sess_start := mgo.GetMgoConn()
- defer mgo.DestoryMongoConn(sess_start) //lte gte
- it_start := sess_start.DB(mgo.DbName).C(extract).Find(mongodb.ObjToMQ(`{"_id":{"$lte":"`+startid+`"}}`,
- true)).Sort("-_id").Iter()
- m, n := 0, 0
- for tmp_start := make(map[string]interface{}); it_start.Next(&tmp_start); {
- if qutil.IntAll(tmp_start["repeat"]) == 1||qutil.IntAll(tmp_start["repeat"]) == -1 {
- continue
- }
- pt_s := tmp_start["comeintime"]
- if Is_Sort {
- pt_s = tmp_start["publishtime"]
- }
- pt_time := qutil.Int64All(pt_s)
- if pt_time <= 0 {
- continue
- }
- if qutil.Float64All(startTime-pt_time) <= datelimit {
- n++
- info := NewInfo(tmp_start)
- dkey := qutil.FormatDateWithObj(&pt_s, qutil.Date_yyyyMMdd)
- k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
- data := hm.data[k]
- if data == nil {
- data = []*Info{}
- }
- data = append(data, info)
- hm.data[k] = data
- hm.keys[dkey] = true
- } else {
- break
- }
- tmp_start = make(map[string]interface{})
- }
- log.Println("load history 前:", n)
- //取lastid之后5天
- sess_last := mgo.GetMgoConn()
- defer mgo.DestoryMongoConn(sess_last) //lte gte
- it_last := sess_last.DB(mgo.DbName).C(extract).Find(mongodb.ObjToMQ(`{"_id":{"$gte":"`+lastid+`"}}`,
- true)).Sort("_id").Iter()
- for tmp_last := make(map[string]interface{}); it_last.Next(&tmp_last); {
- if qutil.IntAll(tmp_last["repeat"]) == 1||qutil.IntAll(tmp_last["repeat"]) == -1 {
- continue
- }
- pt_l := tmp_last["comeintime"]
- if Is_Sort {
- pt_l = tmp_last["publishtime"]
- }
- pt_time := qutil.Int64All(pt_l)
- if pt_time <= 0 {
- continue
- }
- if qutil.Float64All(pt_time-lastTime) <= datelimit {
- m++
- info := NewInfo(tmp_last)
- dkey := qutil.FormatDateWithObj(&pt_l, qutil.Date_yyyyMMdd)
- k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
- data := hm.data[k]
- if data == nil {
- data = []*Info{}
- }
- data = append(data, info)
- hm.data[k] = data
- hm.keys[dkey] = true
- } else {
- break
- }
- tmp_last = make(map[string]interface{})
- }
- log.Println("load history 后:", m)
- return hm
- }
- func NewInfo(tmp map[string]interface{}) *Info {
- subtype := qutil.ObjToString(tmp["subtype"])
- area := qutil.ObjToString(tmp["area"])
- if area == "A" {
- area = "全国"
- }
- info := &Info{}
- info.id = BsonTOStringId(tmp["_id"])
- info.title = qutil.ObjToString(tmp["title"])
- info.area = area
- info.subtype = subtype
- info.buyer = qutil.ObjToString(tmp["buyer"])
- info.projectname = qutil.ObjToString(tmp["projectname"])
- info.contractnumber = qutil.ObjToString(tmp["contractnumber"])
- info.projectcode = qutil.ObjToString(tmp["projectcode"])
- info.city = qutil.ObjToString(tmp["city"])
- info.agency = qutil.ObjToString(tmp["agency"])
- info.winner = qutil.ObjToString(tmp["winner"])
- info.budget = qutil.Float64All(tmp["budget"])
- info.bidamount = qutil.Float64All(tmp["bidamount"])
- info.publishtime = qutil.Int64All(tmp["publishtime"])
- info.comeintime = qutil.Int64All(tmp["comeintime"])
- info.bidopentime = qutil.Int64All(tmp["bidopentime"])
- info.bidopenaddress = qutil.ObjToString(tmp["bidopenaddress"])
- info.site = qutil.ObjToString(tmp["site"])
- info.href = qutil.ObjToString(tmp["href"])
- info.repeatid = qutil.ObjToString(tmp["repeatid"])
- info.specialWord = FilterRegTitle.MatchString(info.title)
- info.titleSpecialWord = FilterRegTitle_0.MatchString(info.title) ||FilterRegTitle_1.MatchString(info.title) || FilterRegTitle_2.MatchString(info.title)
- info.mergemap = *qutil.ObjToMap(tmp["merge_map"])
- if info.mergemap == nil {
- info.mergemap = make(map[string]interface{}, 0)
- }
- info.is_site = false
- return info
- }
- //判重方法
- func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
- reason := ""
- keys := []string{}
- d.lock.Lock()
- for k, _ := range d.keys { //不同时间段
- keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
- if info.area != "全国" { //这个后续可以不要
- keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
- }
- }
- d.lock.Unlock()
- L:
- for _, k := range keys {
- d.lock.Lock()
- data := d.data[k]
- d.lock.Unlock()
- if len(data) > 0 { //对比v 找到同类型,同省或全国的数据作对比
- for _, v := range data {
- reason = ""
- if v.id == info.id { //正常重复
- return false, v, ""
- }
- if info.subtype == v.subtype {
- if info.site != "" {
- sitelock.Lock()
- dict := SiteMap[info.site]
- sitelock.Unlock()
- if dict != nil {
- if info.area == "全国" && dict["area"] != "" {
- info.is_site = true
- info.area = qutil.ObjToString(dict["area"])
- info.city = qutil.ObjToString(dict["city"])
- } else {
- if info.city == "" && dict["city"] != "" {
- info.is_site = true
- info.area = qutil.ObjToString(dict["area"])
- info.city = qutil.ObjToString(dict["city"])
- }
- }
- }
- }
- //前置条件1 - 站点相关
- if info.site != "" && info.site == v.site {
- if info.href != "" && info.href == v.href {
- reason = "href相同"
- b = true
- source = v
- reasons = reason
- break L
- }
- if info.href != "" && info.href != v.href {
- reason = "href不同-"
- }
- }
- //前置条件2 - 标题相关,有且一个关键词
- if ((info.titleSpecialWord && !v.titleSpecialWord) || (info.specialWord && !v.specialWord)) &&
- info.title != v.title && v.title != "" && info.title != "" {
- continue
- }
- //前置条件3 - 标题相关,均含有关键词
- if ((info.titleSpecialWord && v.titleSpecialWord) || (info.specialWord && v.specialWord)) &&
- len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 && v.title != "" && info.title != "" {
- if !(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
- continue //无包含关系
- }
- if strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title) {
- reason = reason + "标题关键词且包含关系"
- //继续二级金额判断
- if !againRepeat(v, info) {
- b = true
- source = v
- reasons = reason
- break
- }
- }
- }
- //新增快速数据过少判重
- if LowHeavy {
- repeat := false
- if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
- b = true
- source = v
- reasons = reason
- break
- }
- }
- //代理机构相同-非空相等
- if v.agency != "" && info.agency != "" && v.agency == info.agency {
- reason = reason + "同机构-"
- repeat := false
- if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
- b = true
- source = v
- reasons = reason
- break
- }
- } else {
- reason = reason + "非同机构-"
- if info.city != "" && info.city == v.city {
- reason = reason + "同城-"
- repeat := false
- if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
- b = true
- source = v
- reasons = reason
- break
- }
- } else {
- reason = reason + "不同城-"
- repeat := false
- if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
- b = true
- source = v
- reasons = reason
- break
- }
- }
- }
- }
- }
- }
- }
- //往预存数据 d 添加
- if !b {
- ct := info.comeintime
- if Is_Sort ||TimingTask{
- ct = info.publishtime
- }
- dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
- k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
- d.lock.Lock()
- data := d.data[k]
- if data == nil {
- data = []*Info{info}
- d.data[k] = data
- if !d.keys[dkey] {
- d.keys[dkey] = true
- d.update(ct)
- }
- } else {
- data = append(data, info)
- d.data[k] = data
- }
- d.lock.Unlock()
- }
- return
- }
- func (h *historymap) checkHistory(info *Info) (b bool, source *Info, reasons string) {
- reason := ""
- keys := []string{}
- h.lock.Lock()
- for k, _ := range h.keys { //不同时间段
- keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
- if info.area != "全国" { //这个后续可以不要
- keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
- }
- }
- h.lock.Unlock()
- L:
- for _, k := range keys {
- h.lock.Lock()
- data := h.data[k]
- h.lock.Unlock()
- if len(data) > 0 { //对比v 找到同类型,同省或全国的数据作对比
- for _, v := range data {
- reason = ""
- if v.id == info.id { //正常重复
- return false, v, ""
- }
- if info.subtype == v.subtype {
- if info.site != "" {
- sitelock.Lock()
- dict := SiteMap[info.site]
- sitelock.Unlock()
- if dict != nil {
- if info.area == "全国" && dict["area"] != "" {
- info.area = qutil.ObjToString(dict["area"])
- info.city = qutil.ObjToString(dict["city"])
- } else {
- if info.city == "" && dict["city"] != "" {
- info.area = qutil.ObjToString(dict["area"])
- info.city = qutil.ObjToString(dict["city"])
- }
- }
- }
- }
- //前置条件1 - 站点相关
- if info.site != "" && info.site == v.site {
- if info.href != "" && info.href == v.href {
- reason = "href相同"
- b = true
- source = v
- reasons = reason
- break L
- }
- if info.href != "" && info.href != v.href {
- reason = "href不同-"
- }
- }
- //前置条件2 - 标题相关,有且一个关键词
- if ((info.titleSpecialWord && !v.titleSpecialWord) || (info.specialWord && !v.specialWord)) &&
- info.title != v.title && v.title != "" && info.title != "" {
- continue
- }
- //前置条件3 - 标题相关,均含有关键词
- if ((info.titleSpecialWord && v.titleSpecialWord) || (info.specialWord && v.specialWord)) &&
- len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 && v.title != "" && info.title != "" {
- if !(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
- continue //无包含关系
- }
- if strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title) {
- reason = reason + "标题关键词且包含关系"
- //继续二级金额判断
- if !againRepeat(v, info) {
- b = true
- source = v
- reasons = reason
- break
- }
- }
- }
- //新增快速数据过少判重
- if LowHeavy {
- repeat := false
- if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
- b = true
- source = v
- reasons = reason
- break
- }
- }
- //代理机构相同-非空相等
- if v.agency != "" && info.agency != "" && v.agency == info.agency {
- reason = reason + "同机构-"
- repeat := false
- if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
- b = true
- source = v
- reasons = reason
- break
- }
- } else {
- reason = reason + "非同机构-"
- if info.city != "" && info.city == v.city {
- reason = reason + "同城-"
- repeat := false
- if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
- b = true
- source = v
- reasons = reason
- break
- }
- } else {
- reason = reason + "不同城-"
- repeat := false
- if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
- b = true
- source = v
- reasons = reason
- break
- }
- }
- }
- }
- }
- }
- }
- //
- if b {
- if info.repeatid == source.id {
- b = false //重复-无变化-不处理
- }
- } else {
- if source != nil {
- if source.repeatid != "" { //未判重-有变化--记录
- b = true
- reason = "未判重记录"
- reasons = reason
- }
- }
- }
- //往预存数据 d 添加
- if !b {
- ct := info.comeintime
- if Is_Sort {
- ct = info.publishtime
- }
- dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
- k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
- data := h.data[k]
- if data == nil {
- data = []*Info{info}
- h.data[k] = data
- if !h.keys[dkey] {
- h.keys[dkey] = true
- //h.update(ct)
- }
- } else {
- data = append(data, info)
- h.data[k] = data
- }
- }
- return
- }
- //替换原始数据池
- func (d *datamap) replaceSourceData(replaceData *Info, replaceId string) {
- ct := replaceData.comeintime
- if Is_Sort||TimingTask {
- ct = replaceData.publishtime
- }
- dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
- k := fmt.Sprintf("%s_%s_%s", dkey, replaceData.subtype, replaceData.area)
- d.lock.Lock()
- data := d.data[k]
- if data == nil {
- data = []*Info{replaceData}
- d.data[k] = data
- if !d.keys[dkey] {
- d.keys[dkey] = true
- }
- } else {
- //遍历替换
- for k, v := range data {
- if v.id == replaceId {
- data[k] = replaceData
- break
- }
- }
- d.data[k] = data
- }
- d.lock.Unlock()
- }
- func (h *historymap) replaceSourceData(replaceData *Info, replaceId string) {
- ct := replaceData.comeintime
- if Is_Sort {
- ct = replaceData.publishtime
- }
- dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
- k := fmt.Sprintf("%s_%s_%s", dkey, replaceData.subtype, replaceData.area)
- h.lock.Lock()
- data := h.data[k]
- if data == nil {
- data = []*Info{replaceData}
- h.data[k] = data
- if !h.keys[dkey] {
- h.keys[dkey] = true
- }
- } else {
- //遍历替换
- for k, v := range data {
- if v.id == replaceId {
- data[k] = replaceData
- break
- }
- }
- h.data[k] = data
- }
- h.lock.Unlock()
- }
- func (d *datamap) update(t int64) {
- //每天0点清除历史数据
- d.keymap = d.GetLatelyFiveDay(t)
- m := map[string]bool{}
- for _, v := range d.keymap {
- m[v] = true
- }
- all, all1 := 0, 0
- for k, v := range d.data {
- all += len(v)
- if !m[k[:8]] {
- delete(d.data, k)
- }
- }
- for k, _ := range d.keys {
- if !m[k] {
- delete(d.keys, k)
- }
- }
- for _, v := range d.data {
- all1 += len(v)
- }
- //log.Println("更新前后数据:", all, all1)
- }
- func (d *datamap) GetLatelyFiveDay(t int64) []string {
- array := make([]string, d.days)
- now := time.Unix(t, 0)
- for i := 0; i < d.days; i++ {
- array[i] = now.Format(qutil.Date_yyyyMMdd)
- now = now.AddDate(0, 0, -1)
- }
- return array
- }
/*
 **************************
 * Duplicate-check rules follow
 **************************
 */
- //快速低质量数据判重
- func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
- //首先判定是否为低质量数据 info目标数据
- if info.agency==v.agency&&info.title!=""&&
- info.title==v.title &&
- info.projectname==""&&info.projectcode==""&&info.contractnumber==""&&info.buyer=="" {
- isValue:=0//五要素判断
- if info.budget != 0 {//预算
- isValue++
- }
- if info.bidopentime != 0{//开标时间
- isValue++
- }
- if info.bidopenaddress!=""{//开标地点
- isValue++
- }
- if info.winner != ""{//中标单位
- isValue++
- }
- if info.bidamount != 0 {//中标金额
- isValue++
- }
- if isValue==0 {
- //if info.site!=v.site {
- // log.Println("符合低质量条件条件0",info.id,"--",v.id)
- //}
- //log.Println("符合低质量条件条件0",info.id,"--",v.id)
- reason = reason + "---要素均为空,标题包含关系"
- return true, reason
- }else if isValue==1 {
- isMeet := false
- if isMeet, reason = judgeLowQualityData(v, info, reason); isMeet {
- //log.Println("符合低质量条件条件1",info.id,"--",v.id)
- reason = reason + "---有且一个要素组合"
- return true, reason
- }
- }else {
- }
- }
- return false,reason
- }
- //类别细节原因记录
- func judgeLowQualityData(v *Info, info *Info, reason string) (bool, string) {
- if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
- info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
- info.subtype == "变更" || info.subtype == "其他" {
- //招标结果
- if info.budget != 0 && info.budget == v.budget{//预算
- reason = reason + "---招标类:预算"
- return true,reason
- }
- if info.bidopentime != 0 && info.bidopentime == v.bidopentime{//开标时间
- reason = reason + "---招标类:开标时间"
- return true,reason
- }
- if info.bidopenaddress!="" && info.bidopenaddress == v.bidopenaddress{//开标地点
- reason = reason + "---招标类:开标地点"
- return true,reason
- }
- } else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
- //中标结果
- if v.winner != "" && info.winner == v.winner{//中标单位
- reason = reason + "---中标类:中标单位"
- return true,reason
- }
- if v.bidamount != 0 && info.bidamount == v.bidamount{//中标金额
- reason = reason + "---中标类:中标金额"
- return true,reason
- }
- } else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
- //合同
- if info.budget != 0 && info.budget == v.budget{//预算
- reason = reason + "---合同类:预算"
- return true,reason
- }
- if info.bidopentime != 0 && info.bidopentime == v.bidopentime{//开标时间
- reason = reason + "---合同类:开标时间"
- return true,reason
- }
- if info.bidopenaddress!="" && info.bidopenaddress == v.bidopenaddress{//开标地点
- reason = reason + "---合同类:开标地点"
- return true,reason
- }
- if v.winner != "" && info.winner == v.winner{//中标单位
- reason = reason + "---合同类:中标单位"
- return true,reason
- }
- if v.bidamount != 0 && info.bidamount == v.bidamount{//中标金额
- reason = reason + "---合同类:中标金额"
- return true,reason
- }
- } else {
- //招标结果
- if info.budget != 0 && info.budget == v.budget{//预算
- reason = reason + "---类别空-招标类:预算"
- return true,reason
- }
- if info.bidopentime != 0 && info.bidopentime == v.bidopentime{//开标时间
- reason = reason + "---类别空-招标类:开标时间"
- return true,reason
- }
- if info.bidopenaddress!="" && info.bidopenaddress == v.bidopenaddress{//开标地点
- reason = reason + "---类别空-招标类:开标地点"
- return true,reason
- }
- }
- return false,reason
- }
- //判重方法1
- func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
- isMeet := false
- if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
- info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
- info.subtype == "变更" || info.subtype == "其他" {
- //招标结果
- if isMeet, reason = tenderRepeat_A(v, info, reason); isMeet {
- if tenderRepeat_C(v, info) {
- return false, reason
- } else {
- reason = reason + "---招标类"
- return true, reason
- }
- } else {
- return false, reason
- }
- } else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
- //中标结果
- if isMeet, reason = winningRepeat_A(v, info, reason); isMeet {
- if winningRepeat_C(v, info) {
- return false, reason
- } else {
- reason = reason + "---中标类"
- return true, reason
- }
- } else {
- return false, reason
- }
- } else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
- //合同
- if isMeet, reason = contractRepeat_A(v, info, reason); isMeet {
- if contractRepeat_C(v, info) {
- return false, reason
- } else {
- reason = reason + "---合同类"
- return true, reason
- }
- } else {
- return false, reason
- }
- } else {
- //招标结果
- if isMeet, reason = tenderRepeat_A(v, info, reason); isMeet {
- if tenderRepeat_C(v, info) {
- return false, reason
- } else {
- reason = reason + "---类别空-招标类"
- return true, reason
- }
- } else {
- return false, reason
- }
- }
- return false, reason
- }
- //判重方法2
- func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
- isMeet := false
- if v.agency == info.agency && v.agency != "" && info.agency != "" {
- if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
- info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
- info.subtype == "变更" || info.subtype == "其他" {
- //招标结果
- if isMeet, reason = tenderRepeat_B(v, info, reason); isMeet {
- if tenderRepeat_C(v, info) { //有不同
- return false, reason
- } else {
- reason = reason + "---招标类"
- return true, reason
- }
- } else {
- return false, reason
- }
- } else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
- //中标结果
- if isMeet, reason = winningRepeat_B(v, info, reason); isMeet {
- if winningRepeat_C(v, info) { //有不同
- return false, reason
- } else {
- reason = reason + "---中标类"
- return true, reason
- }
- } else {
- return false, reason
- }
- } else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
- //合同
- if isMeet, reason = contractRepeat_B(v, info, reason); isMeet {
- if contractRepeat_C(v, info) { //有不同
- return false, reason
- } else {
- reason = reason + "---合同类"
- return true, reason
- }
- } else {
- return false, reason
- }
- } else {
- //招标结果
- if isMeet, reason = tenderRepeat_B(v, info, reason); isMeet {
- if tenderRepeat_C(v, info) { //有不同
- return false, reason
- } else {
- reason = reason + "---类别空-招标类"
- return true, reason
- }
- } else {
- return false, reason
- }
- }
- }
- //不同
- if v.agency != info.agency && v.agency != "" && info.agency != "" {
- return false, reason
- }
- //机构最少一个为空
- if v.agency == "" || info.agency == "" {
- var repeat = false
- if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
- reason = reason + "---机构最少一个空"
- return true, reason
- } else {
- return false, reason
- }
- }
- return false, reason
- }
- //招标_A
- func tenderRepeat_A(v *Info, info *Info, reason string) (bool, string) {
- var ss string
- p1, p2, p3, p4, p9, p10, p11 := false, false, false, false, false, false, false
- if v.projectname != "" && v.projectname == info.projectname {
- ss = ss + "p1(名称)-"
- p1 = true
- }
- if v.buyer != "" && v.buyer == info.buyer {
- ss = ss + "p2(单位)-"
- p2 = true
- }
- if (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5) ||
- (v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5) {
- ss = ss + "p3(编号组)-"
- p3 = true
- }
- if v.budget != 0 && v.budget == info.budget {
- ss = ss + "p4(预算)-"
- p4 = true
- }
- if v.bidopentime != 0 && v.bidopentime == info.bidopentime {
- ss = ss + "p9(开标时间)-"
- p9 = true
- }
- if v.bidopenaddress != "" && v.bidopenaddress == info.bidopenaddress {
- ss = ss + "p10(开标地点)-"
- p10 = true
- }
- if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
- (strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
- ss = ss + "p11(标题)-"
- p11 = true
- }
- if (p1 && p2 && p3) || (p1 && p2 && p4) || (p1 && p2 && p9) ||
- (p1 && p2 && p10) || (p1 && p2 && p11) || (p1 && p3 && p9) || (p1 && p3 && p10) ||
- (p1 && p4 && p9) || (p1 && p4 && p10) || (p2 && p3 && p4) ||
- (p2 && p3 && p9) || (p2 && p3 && p10) || (p2 && p3 && p11) ||
- (p2 && p4 && p9) || (p2 && p4 && p10) || (p2 && p4 && p11) ||
- (p3 && p4 && p9) || (p3 && p4 && p10) || (p3 && p4 && p11) ||
- (p4 && p9 && p10) || (p4 && p9 && p11) || (p9 && p10 && p11) {
- reason = reason + "满足招标A,3要素组合-" + ss + ","
- return true, reason
- }
- return false, reason
- }
- //招标_B
- func tenderRepeat_B(v *Info, info *Info, reason string) (bool, string) {
- m, n := 0, 0
- if v.projectname != "" && v.projectname == info.projectname {
- m++
- n++
- }
- if v.buyer != "" && v.buyer == info.buyer {
- m++
- }
- if (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5) ||
- (v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5) {
- m++
- }
- if v.budget != 0 && v.budget == info.budget {
- m++
- }
- if v.bidopentime != 0 && v.bidopentime == info.bidopentime {
- m++
- }
- //if v.bidopenaddress != "" && v.bidopenaddress == info.bidopenaddress {
- // m++
- //}
- if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
- (strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
- m++
- n++
- }
- if m >= 2 {
- if n == 2 && m == 2 {
- return false, reason
- } else {
- reason = reason + "满足招标B,六选二,"
- return true, reason
- }
- }
- return false, reason
- }
- //招标_C
- func tenderRepeat_C(v *Info, info *Info) bool {
- if v.budget != 0 && info.budget != 0 && v.budget != info.budget {
- return true
- }
- //原始地址...
- if v.buyer != "" && info.buyer != "" && v.buyer != info.buyer {
- return true
- }
- if v.bidopentime != 0 && info.bidopentime != 0 && v.bidopentime != info.bidopentime {
- return true
- }
- if v.bidopenaddress != "" && info.bidopenaddress != "" && v.bidopenaddress != info.bidopenaddress {
- return true
- }
- return false
- }
- //中标_A
- func winningRepeat_A(v *Info, info *Info, reason string) (bool, string) {
- var ss string
- p1, p2, p3, p5, p6, p11 := false, false, false, false, false, false
- if v.projectname != "" && v.projectname == info.projectname {
- ss = ss + "p1(项目名称)-"
- p1 = true
- }
- if v.buyer != "" && v.buyer == info.buyer {
- ss = ss + "p2(单位)-"
- p2 = true
- }
- if (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5) ||
- (v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5) {
- ss = ss + "p3(编号组)-"
- p3 = true
- }
- //if v.bidamount != 0 && v.bidamount == info.bidamount {
- // ss = ss + "p5(中标金)-"
- // p5 = true
- //}
- //if v.winner != "" && v.winner == info.winner {
- // ss = ss + "p6(中标人)-"
- // p6 = true
- //}
- if v.bidamount != 0 && !isBidWinningAmount(v.bidamount,info.bidamount) {
- ss = ss + "p5(中标金)-"
- p5 = true
- }
- if v.winner != "" && deleteExtraSpace(v.winner) == deleteExtraSpace(info.winner) {
- ss = ss + "p6(中标人)-"
- p6 = true
- }
- if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
- (strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
- ss = ss + "p11(标题)-"
- p11 = true
- }
- if (p1 && p2 && p3) || (p1 && p2 && p5) || (p1 && p2 && p6) ||
- (p1 && p3 && p5) || (p1 && p3 && p6) || (p1 && p5 && p6) ||
- (p2 && p3 && p5) || (p2 && p3 && p6) || (p2 && p3 && p11) ||
- (p2 && p5 && p6) || (p2 && p5 && p11) || (p2 && p6 && p11) ||
- (p3 && p5 && p6) || (p3 && p5 && p11) || (p3 && p6 && p11) ||
- (p5 && p6 && p11) {
- reason = reason + "满足中标A,3要素组合-" + ss + ","
- return true, reason
- }
- return false, reason
- }
- //中标_B
- func winningRepeat_B(v *Info, info *Info, reason string) (bool, string) {
- m, n := 0, 0
- if v.projectname != "" && v.projectname == info.projectname {
- m++
- n++
- }
- if v.buyer != "" && v.buyer == info.buyer {
- m++
- }
- if (v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5) ||
- (v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5) {
- m++
- }
- if v.bidamount != 0 && !isBidWinningAmount(v.bidamount,info.bidamount) {
- m++
- }
- if v.winner != "" && deleteExtraSpace(v.winner) == deleteExtraSpace(info.winner) {
- m++
- }
- if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
- (strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
- m++
- n++
- }
- if m >= 2 {
- if n == 2 && m == 2 {
- return false, reason
- } else {
- reason = reason + "满足中标B.六选二,"
- return true, reason
- }
- }
- return false, reason
- }
- //中标_C
- func winningRepeat_C(v *Info, info *Info) bool {
- //if v.bidamount != 0 && info.bidamount != 0 && v.bidamount != info.bidamount {
- // return true
- //}
- if v.bidamount != 0 && info.bidamount != 0 && isBidWinningAmount(v.bidamount,info.bidamount) {
- return true
- }
- if v.winner != "" && info.winner != "" && deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) {
- return true
- }
- //原始地址...
- if v.buyer != "" && info.buyer != "" && v.buyer != info.buyer {
- return true
- }
- return false
- }
- //合同_A
- func contractRepeat_A(v *Info, info *Info, reason string) (bool, string) {
- isMeet_1 := false
- if isMeet_1, reason = tenderRepeat_A(v, info, reason); isMeet_1 {
- return true, reason
- }
- isMeet_2 := false
- if isMeet_2, reason = winningRepeat_A(v, info, reason); isMeet_2 {
- return true, reason
- }
- return false, reason
- }
- //合同_B
- func contractRepeat_B(v *Info, info *Info, reason string) (bool, string) {
- isMeet_1 := false
- if isMeet_1, reason = tenderRepeat_B(v, info, reason); isMeet_1 {
- return true, reason
- }
- isMeet_2 := false
- if isMeet_2, reason = winningRepeat_B(v, info, reason); isMeet_2 {
- return true, reason
- }
- return false, reason
- }
- //合同_C
- func contractRepeat_C(v *Info, info *Info) bool {
- if tenderRepeat_C(v, info) {
- return true
- }
- if winningRepeat_C(v, info) {
- return true
- }
- return false
- }
- //再次金额判断
- func againRepeat(v *Info, info *Info) bool {
- //相同采购单位下
- if info.buyer != "" && v.buyer == info.buyer {
- if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
- info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
- info.subtype == "其他" || info.subtype == "变更" {
- //预算金额满足条件
- if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
- return true
- }
- } else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" ||
- info.subtype == "流标" || info.subtype == "合同" || info.subtype == "验收" ||
- info.subtype == "违规" {
- //中标金额单位满足条件
- if (isBidWinningAmount(v.bidamount,info.bidamount) && v.bidamount != 0 && info.bidamount != 0) ||
- (deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "") {
- return true
- }
- } else {
- }
- }
- return false
- }
// extraSpaceRun matches a run of two or more whitespace characters. It is
// compiled once at package initialisation instead of on every call.
var extraSpaceRun = regexp.MustCompile(`\s{2,}`)

// deleteExtraSpace normalises whitespace in a winner name so that two names
// differing only in spacing compare equal.
//
// Every tab is first turned into a single space. Then each run of two or more
// whitespace characters is collapsed to the first character of that run, so
// "a   b" becomes "a b". A lone whitespace character is kept as it is, and
// leading/trailing whitespace is collapsed but not trimmed.
func deleteExtraSpace(s string) string {
	// Spell the tab as an escape so the literal cannot silently degrade into
	// a plain space (which would turn this replacement into a no-op).
	s = strings.Replace(s, "\t", " ", -1)

	// \s only matches single-byte ASCII whitespace, so run[:1] is exactly the
	// first whitespace character of the run.
	return extraSpaceRun.ReplaceAllStringFunc(s, func(run string) string {
		return run[:1]
	})
}
// isBidWinningAmount reports whether two bid amounts should be treated as
// DIFFERENT. It returns false when the amounts are equal, or when one is
// 10000 times the other (the same amount expressed in units that differ by a
// factor of 10000); it returns true otherwise.
//
// The comparison uses a small relative tolerance instead of exact float64
// equality, because scaling by 10000 is not exact in binary floating point:
// for example 0.07*10000 evaluates to 700.0000000000001, not 700.
func isBidWinningAmount(f1 float64, f2 float64) bool {
	const (
		ratio     = 10000 // unit multiplier between the two representations
		tolerance = 1e-9  // relative tolerance for float rounding error
	)

	// same reports whether a and b are equal up to the relative tolerance.
	same := func(a, b float64) bool {
		if a == b {
			return true
		}
		diff := a - b
		if diff < 0 {
			diff = -diff
		}
		if a < 0 {
			a = -a
		}
		if b < 0 {
			b = -b
		}
		scale := a
		if b > scale {
			scale = b
		}
		return diff <= tolerance*scale
	}

	if same(f1, f2) || same(f1*ratio, f2) || same(f2*ratio, f1) {
		return false
	}
	return true
}
|