package main import ( "fmt" "math" "os" "project/config" "reflect" "regexp" "sort" "strings" "sync" "go.mongodb.org/mongo-driver/bson/primitive" util "jygit.jydev.jianyu360.cn/data_processing/common_utils" "jygit.jydev.jianyu360.cn/data_processing/common_utils/log" "jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb" "jygit.jydev.jianyu360.cn/data_processing/common_utils/redis" "jygit.jydev.jianyu360.cn/data_processing/common_utils/udp" ) var ( MgoP, MgoB, MgoS *mongodb.MongodbSim //mongodb连接 BiddingColl, ProjectColl, BackupColl, SiteColl string //抽取表、项目表、项目快照表、站点表 Thread int //配置项线程数 BlackList []interface{} SkipSiteList []string BlaskListMap map[string]bool RedisProject, RedisBuyer string P_KEY, B_KEY string ) var ( //判断是日期 _datereg = regexp.MustCompile("20[0-2][0-9][年-][0-9]{1,2}[月-][0-9]{1,2}[日-]([0-9]{1,2}时[0-9]{0,2})?") _numreg1 = regexp.MustCompile("^[0-9-]{1,8}$") _zimureg1 = regexp.MustCompile("^[a-zA-Z-]{1,7}$") _nzreg = regexp.MustCompile("^[0-9a-zA-Z-]+$") _hanreg = regexp.MustCompile(`^[\p{Han}::【】\\[\\]()()--、]+$`) replaceStr = regexp.MustCompile("(工程|采购|项目|[?!、【】()—()--]|栏标价|中标候选人|招标代理)") //判断带有分包、等特定词的 pStr = regexp.MustCompile("(勘察|监理|施工|设计|验收|标段|分包|子包|[0-9A-Z]包|[一二三四五六七八九十0-9]批)") //判断包含数值 nreg1 = regexp.MustCompile("[0-9]{2,}") //判断包含字母 zreg1 = regexp.MustCompile("[a-zA-Z]{1,}") //判断包含汉字 hreg1 = regexp.MustCompile(`[\p{Han}]+`) //判断项目编号是在10以内的纯数字结构 numCheckPc = regexp.MustCompile("^[0-9-]{1,10}$") //仅初始化使用 compareNoPass = map[string]bool{} compareAB = map[string]bool{} compareAB2D = map[string]bool{} compareABD = map[string]bool{} compareAB2CD = map[string]bool{} compareABCD = map[string]bool{} ) func init() { config.Init("./common.toml") InitLog() MgoP = &mongodb.MongodbSim{ MongodbAddr: config.Conf.DB.MongoP.Addr, Size: config.Conf.DB.MongoP.Size, DbName: config.Conf.DB.MongoP.Dbname, UserName: config.Conf.DB.MongoP.User, Password: config.Conf.DB.MongoP.Password, } MgoP.InitPool() MgoB = &mongodb.MongodbSim{ MongodbAddr: config.Conf.DB.MongoB.Addr, Size: config.Conf.DB.MongoB.Size, DbName: config.Conf.DB.MongoB.Dbname, UserName: config.Conf.DB.MongoB.User, Password: config.Conf.DB.MongoB.Password, } MgoB.InitPool() MgoS = &mongodb.MongodbSim{ MongodbAddr: config.Conf.DB.MongoS.Addr, Size: config.Conf.DB.MongoS.Size, DbName: config.Conf.DB.MongoS.Dbname, UserName: config.Conf.DB.MongoS.User, Password: config.Conf.DB.MongoS.Password, } MgoS.InitPool() BiddingColl = config.Conf.DB.MongoB.Coll ProjectColl = config.Conf.DB.MongoP.Coll BackupColl = config.Conf.DB.MongoP.Coll + "_back" SiteColl = config.Conf.Serve.SiteColl Thread = config.Conf.Serve.Thread udpclient = udp.UdpClient{Local: config.Conf.Serve.Udp, BufSize: 1024} udpclient.Listen(processUdpMsg) log.Info("udp init port:" + udpclient.Local) RedisProject = "project" P_KEY = "project_detail_%s" RedisBuyer = "qyxy_buyer" B_KEY = "project_buyer_%s" redis.InitRedis(config.Conf.DB.Redis.Addr) // 采购单位与中标单位初次合作项目 cof := make(map[string]interface{}) util.ReadConfig(&cof) BlackList = cof["rp_blacklist"].([]interface{}) BlaskListMap = make(map[string]bool) for _, v := range BlackList { BlaskListMap[util.ObjToString(v)] = true } SkipSiteList = util.ObjArrToStringArr(cof["site_list"].([]interface{})) initWinnerRegexp(cof) initBuyerRegexp(cof) initAgencyRegexp(cof) //加载项目数据 //---不能通过 vm := []string{"C", "D"} for i := 0; i < 2; i++ { for j := 0; j < 2; j++ { for k := 0; k < 2; k++ { key := vm[i] + vm[j] + vm[k] compareNoPass[key] = true //fmt.Println(key) } } } //fmt.Println("-------------------") //三个元素一致 [AB][AB][AB],分值最高 vm = []string{"A", "B"} for i := 0; i < 2; i++ { for j := 0; j < 2; j++ { for k := 0; k < 2; k++ { key := vm[i] + vm[j] + vm[k] compareAB[key] = true //fmt.Println(key) } } } //fmt.Println("-------------------", len(compareAB)) //---至少两个一致,其他可能不存在 //[AB][AB][ABD] //[AB][ABD][AB] vm = []string{"A", "B"} vm2 := []string{"A", "B", "D"} for i := 0; i < 2; i++ { for j := 0; j < 2; j++ { for k := 0; k < 3; k++ { key := vm[i] + vm[j] + vm2[k] if !compareAB[key] { compareAB2D[key] = true //fmt.Println(key) } } } } for i := 0; i < 2; i++ { for j := 0; j < 3; j++ { for k := 0; k < 2; k++ { key := vm[i] + vm2[j] + vm[k] if !compareAB[key] { compareAB2D[key] = true //fmt.Println(key) } } } } //fmt.Println("-------------------", len(compareAB2D)) //---至少一个一致,其他可能不存在 //[ABD][ABD][ABD] //已经删除DDD vm = []string{"A", "B", "D"} for i := 0; i < 3; i++ { for j := 0; j < 3; j++ { for k := 0; k < 3; k++ { key := vm[i] + vm[j] + vm[k] if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] { compareABD[key] = true //fmt.Println(key) } } } } //fmt.Println("-------------------", len(compareABD)) //[AB][ABCD][AB] //[AB][AB][ABCD] vm = []string{"A", "B"} vm2 = []string{"A", "B", "C", "D"} for i := 0; i < 2; i++ { for j := 0; j < 4; j++ { for k := 0; k < 2; k++ { key := vm[i] + vm2[j] + vm[k] if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] { compareAB2CD[key] = true //fmt.Println(key) } } } } for i := 0; i < 2; i++ { for j := 0; j < 2; j++ { for k := 0; k < 4; k++ { key := vm[i] + vm[j] + vm2[k] if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] { compareAB2CD[key] = true //fmt.Println(key) } } } } //fmt.Println("-------------------", len(compareAB2CD)) //[ABECD][ABECD][ABECD] //已经删除[CD][CD][CD] //这个要重点讨论 vm = []string{"A", "B", "C", "D"} for i := 0; i < 4; i++ { for j := 0; j < 4; j++ { for k := 0; k < 4; k++ { key := vm[i] + vm[j] + vm[k] if !compareAB[key] && !compareAB2D[key] && !compareABD[key] && !compareNoPass[key] && !compareAB2CD[key] { compareABCD[key] = true //fmt.Println(key) } } } } } func CheckHanAndNum(str string) (b bool) { return nreg1.MatchString(str) && hreg1.MatchString(str) } func CheckZimuAndNum(str string) (b bool) { return zreg1.MatchString(str) && nreg1.MatchString(str) } type KeyMap struct { Lock sync.Mutex Map map[string]*Key } type ID struct { Id string Lock sync.Mutex P *ProjectCache } type Key struct { Arr []string Lock sync.Mutex } type IdAndLock struct { Id string Lock sync.Mutex } func NewKeyMap() *KeyMap { return &KeyMap{ Map: map[string]*Key{}, Lock: sync.Mutex{}, } } // 招标信息实体类 type Info struct { Id string `json:"_id"` Href string `json:"href"` //源地址 Publishtime int64 `json:"publishtime"` Comeintime int64 `json:"comeintime"` Title string `json:"title"` TopType string `json:"toptype"` SubType string `json:"subtype"` ProjectName string `json:"projectname"` ProjectCode string `json:"projectcode"` ProjectScope string `json:"projectscope"` ContractCode string `json:"contractcode"` Buyer string `json:"buyer"` Buyerperson string `json:"buyerperson"` Buyertel string `json:"buyertel"` Agency string `json:"agency"` Area string `json:"area"` City string `json:"city"` District string `json:"district"` Infoformat int `json:"infoformat"` ReviewExperts string `json:"review_experts"` Purchasing string `json:"purchasing"` WinnerOrder []map[string]interface{} `json:"winnerorder"` ProjectScale string `json:"project_scale"` ProjectDuration int `json:"project_duration"` ProjectTimeUnit string `json:"project_timeunit"` ProjectStartDate int64 `json:"project_startdate"` ProjectCompleteDate int64 `json:"project_completedate"` Payway string `json:"payway"` ContractGuarantee bool `json:"contract_guarantee"` BidGuarantee bool `json:"bid_guarantee"` Qualifies []map[string]interface{} `json:"qualifies"` EntIdList []string `json:"entidlist"` HasPackage bool // `json:"haspackage"` Package map[string]interface{} `json:"package"` Topscopeclass []string `json:"topscopeclass"` Subscopeclass []string `json:"subscopeclass"` Buyerclass string `json:"buyerclass"` BidOpenTime int64 `json:"bidopentime"` BidEndTime int64 `json:"bidendtime"` Budget float64 `json:"budget"` Bidamount float64 `json:"bidamount"` Winners []string dealtype int PTC string //从标题中抽的项目编号 pnbval int //项目名称、编号、采购单位存在的个数 LenPC int //项目编号长度 LenPN int //项目名称长度 LenPTC int //标题抽的项目编号长度 //以下三个元素做对比,计算包含时候使用 PNBH int //0初始,+包含,-被包含 PCBH int PTCBH int // JgtimeFirst int64 `json:"jgtime_first"` } // 内存 项目信息 type ProjectCache struct { Id primitive.ObjectID `json:"_id"` Ids []string `json:"ids,omitempty"` Size int `json:"size"` FirstTime int64 `json:"firsttime,omitempty"` //项目的最早时间 LastTime int64 `json:"lasttime,omitempty"` //项目的最后时间 ProjectName string `json:"projectname,omitempty"` //项目名称 ProjectCode string `json:"projectcode,omitempty"` //项目代码唯一(纯数字的权重低) Buyer string `json:"buyer,omitempty"` //采购单位唯一 Agency string `json:"agency"` //代理机构 Area string `json:"area"` //地区 City string `json:"city"` //地市 District string `json:"district"` //区县 Bidamount float64 `json:"bidamount,omitempty"` //中标金额 Budget float64 `json:"budget,omitempty"` //预算 Bidstatus string `json:"bidstatus"` // Bidtype string `json:"bidtype"` // score int comStr string MPN []string `json:"mpn,omitempty"` //合并后多余的项目名称 MPC []string `json:"mpc,omitempty"` //合并后多余的项目编号 resVal, pjVal int } // 项目信息 type Project struct { Id primitive.ObjectID `json:"_id"` Ids []string `json:"ids,omitempty"` FirstTime int64 `json:"firsttime,omitempty"` //项目的最早时间 LastTime int64 `json:"lasttime,omitempty"` //项目的最后时间 ProjectName string `json:"projectname,omitempty"` //项目名称 ProjectCode string `json:"projectcode,omitempty"` //项目代码唯一(纯数字的权重低) Buyer string `json:"buyer,omitempty"` //采购单位唯一 Agency string `json:"agency"` //代理机构 Area string `json:"area"` //地区 City string `json:"city"` //地市 District string `json:"district"` //区县 Bidamount float64 `json:"bidamount,omitempty"` //中标金额 Budget float64 `json:"budget,omitempty"` //预算 score int comStr string MPN []string `json:"mpn,omitempty"` //合并后多余的项目名称 MPC []string `json:"mpc,omitempty"` //合并后多余的项目编号 resVal, pjVal int Topscopeclass []string `json:"topscopeclass,omitempty"` Subscopeclass []string `json:"subscopeclass,omitempty"` //子行业分类 Winners string `json:"s_winner,omitempty"` //中标人 ContractCode string `json:"contractcode,omitempty"` //合同编号 Buyerperson string `json:"buyerperson"` //采购联系人 Buyertel string `json:"buyertel"` //采购联系人电话 Bidstatus string `json:"bidstatus"` // Bidtype string `json:"bidtype"` // ReviewExperts string `json:"review_experts"` // 项目评审专家 Purchasing string `json:"purchasing"` // 标的物 Package map[string]interface{} `json:"package,omitempty"` //分包的对比对象 Buyerclass string `json:"buyerclass"` //采购单位分类 BidOpenTime int64 `json:"bidopentime,omitempty"` //开标时间 BidEndTime int64 `json:"bidendtime,omitempty"` //开标时间 Jgtime int64 `json:"jgtime"` //结果中标时间 Zbtime int64 `json:"zbtime"` //招标时间 Winnerorder []string `json:"winnerorder"` //中标候选人 ProjectScale string `json:"project_scale"` //项目规模 ProjectDuration int `json:"project_duration"` //工期时长 ProjectTimeunit string `json:"project_timeunit"` //工期时长单位 ProjectStartDate int64 `json:"project_startdate"` //开工日期 ProjctCompleteDate int64 `json:"projct_completedate"` //竣工日期 Payway string `json:"payway"` //付款方式 ContractGuarantee bool `json:"contract_guarantee"` //履约保证金 是否支持包含 BidGuarantee bool `json:"bid_guarantee"` //投标保证金 是否支持包含 Qualifies string `json:"qualifies"` //资质条件 EntIdList []string `json:"entidlist"` //企业id //FirstCooperation []string `json:"first_cooperation"` //first_cooperation //InfoFiled map[string]InfoField `json:"infofield"` //逻辑处理需要的info字段 JgtimeFirst int64 `json:"jgtime_first"` } // 存储部分招标信息字段,业务逻辑处理需要 type InfoField struct { Budget float64 `json:"budget"` Bidamount float64 `json:"bidamount"` ContractCode string `json:"contractcode"` ProjectName string `json:"projectname"` ProjectCode string `json:"projectcode"` Bidstatus string `json:"bidstatus"` } // 站点信息 type Site struct { Id string `json:"_id"` Site string `json:"site"` //站点名字 Area string `json:"area"` //省 City string `json:"city"` //市 District string `json:"district"` //区、县 Domain string `json:"domain"` //地址 Status int `json:"status"` // } // 二分字符串查找 func BinarySearch(s []string, k string) int { sort.Strings(s) lo, hi := 0, len(s)-1 for lo <= hi { m := (lo + hi) >> 1 if s[m] < k { lo = m + 1 } else if s[m] > k { hi = m - 1 } else { return m } } return -1 } func Duplicate(a interface{}) (ret []interface{}) { va := reflect.ValueOf(a) for i := 0; i < va.Len(); i++ { if i > 0 && reflect.DeepEqual(va.Index(i-1).Interface(), va.Index(i).Interface()) { continue } ret = append(ret, va.Index(i).Interface()) } return ret } // 计算文本相似度 func CosineSimilar(srcWords1, dstWords1 string) float64 { srcWords, dstWords := strings.Split(srcWords1, ""), strings.Split(dstWords1, "") // get all words allWordsMap := make(map[string]int, 0) for _, word := range srcWords { if _, found := allWordsMap[word]; !found { allWordsMap[word] = 1 } else { allWordsMap[word] += 1 } } for _, word := range dstWords { if _, found := allWordsMap[word]; !found { allWordsMap[word] = 1 } else { allWordsMap[word] += 1 } } // stable the sort allWordsSlice := make([]string, 0) for word, _ := range allWordsMap { allWordsSlice = append(allWordsSlice, word) } // assemble vector srcVector := make([]int, len(allWordsSlice)) dstVector := make([]int, len(allWordsSlice)) for _, word := range srcWords { if index := BinarySearch(allWordsSlice, word); index != -1 { srcVector[index] += 1 } } for _, word := range dstWords { if index := BinarySearch(allWordsSlice, word); index != -1 { dstVector[index] += 1 } } // calc cos numerator := float64(0) srcSq := 0 dstSq := 0 for i, srcCount := range srcVector { dstCount := dstVector[i] numerator += float64(srcCount * dstCount) srcSq += srcCount * srcCount dstSq += dstCount * dstCount } denominator := math.Sqrt(float64(srcSq * dstSq)) v1 := numerator / denominator // if v1 > 0.6 { // log.Println(v1, srcWords1, dstWords1) // } return v1 } func initWinnerRegexp(cof map[string]interface{}) { winRegMap := cof["winner"].(map[string]interface{}) //preRegexps := winRegMap["pre_regexp"].([]interface{}) //backRegexps := winRegMap["back_regexp"].([]interface{}) //backRepRegexps := winRegMap["back_rep_regexp"].([]interface{}) backBlack := winRegMap["blacklist"].([]interface{}) //var winPreRegexps []*regexp.Regexp //for _, v := range preRegexps { // reg := regexp.MustCompile("^" + v.(string)) // winPreRegexps = append(winPreRegexps, reg) //} //PreRegexp["winner"] = winPreRegexps //var winBackRegexps []*regexp.Regexp //for _, v := range backRegexps { // reg := regexp.MustCompile(v.(string)) // winBackRegexps = append(winBackRegexps, reg) //} //BackRegexp["winner"] = winBackRegexps //var winBackRepRegexps []RegexpInfo //for _, v := range backRepRegexps { // reps := strings.Split(v.(string), "#") // if len(reps) > 1 { // reg := RegexpInfo{ // regs: regexp.MustCompile(reps[0]), // repstr: reps[1], // } // winBackRepRegexps = append(winBackRepRegexps, reg) // } //} //BackRepRegexp["winner"] = winBackRepRegexps var winBlackRegexps []*regexp.Regexp for _, v := range backBlack { reg := regexp.MustCompile(v.(string)) winBlackRegexps = append(winBlackRegexps, reg) } BlackRegexp["winner"] = winBlackRegexps } func initBuyerRegexp(cof map[string]interface{}) { buyRegMap := cof["buyer"].(map[string]interface{}) //preRegexps := buyRegMap["pre_regexp"].([]interface{}) //backRegexps := buyRegMap["back_regexp"].([]interface{}) //backRepRegexps := buyRegMap["back_rep_regexp"].([]interface{}) backBlack := buyRegMap["blacklist"].([]interface{}) //var winPreRegexps []*regexp.Regexp //for _, v := range preRegexps { // reg := regexp.MustCompile("^" + v.(string)) // winPreRegexps = append(winPreRegexps, reg) //} //PreRegexp["buyer"] = winPreRegexps //var winBackRegexps []*regexp.Regexp //for _, v := range backRegexps { // reg := regexp.MustCompile(v.(string)) // winBackRegexps = append(winBackRegexps, reg) //} //BackRegexp["buyer"] = winBackRegexps //var winBackRepRegexps []RegexpInfo //for _, v := range backRepRegexps { // reps := strings.Split(v.(string), "#") // if len(reps) > 1 { // reg := RegexpInfo{ // regs: regexp.MustCompile(reps[0]), // repstr: reps[1], // } // winBackRepRegexps = append(winBackRepRegexps, reg) // } //} //BackRepRegexp["buyer"] = winBackRepRegexps var winBlackRegexps []*regexp.Regexp for _, v := range backBlack { reg := regexp.MustCompile(v.(string)) winBlackRegexps = append(winBlackRegexps, reg) } BlackRegexp["buyer"] = winBlackRegexps } func initAgencyRegexp(cof map[string]interface{}) { buyRegMap := cof["agency"].(map[string]interface{}) //preRegexps := buyRegMap["pre_regexp"].([]interface{}) //backRegexps := buyRegMap["back_regexp"].([]interface{}) //backRepRegexps := buyRegMap["back_rep_regexp"].([]interface{}) backBlack := buyRegMap["blacklist"].([]interface{}) //var winPreRegexps []*regexp.Regexp //for _, v := range preRegexps { // reg := regexp.MustCompile("^" + v.(string)) // winPreRegexps = append(winPreRegexps, reg) //} //PreRegexp["agency"] = winPreRegexps //var winBackRegexps []*regexp.Regexp //for _, v := range backRegexps { // reg := regexp.MustCompile(v.(string)) // winBackRegexps = append(winBackRegexps, reg) //} //BackRegexp["agency"] = winBackRegexps //var winBackRepRegexps []RegexpInfo //for _, v := range backRepRegexps { // reps := strings.Split(v.(string), "#") // if len(reps) > 1 { // reg := RegexpInfo{ // regs: regexp.MustCompile(reps[0]), // repstr: reps[1], // } // winBackRepRegexps = append(winBackRepRegexps, reg) // } //} //BackRepRegexp["agency"] = winBackRepRegexps var winBlackRegexps []*regexp.Regexp for _, v := range backBlack { reg := regexp.MustCompile(v.(string)) winBlackRegexps = append(winBlackRegexps, reg) } BlackRegexp["agency"] = winBlackRegexps } func InitLog() { logcfg := config.Conf.Log err := log.InitLog( log.Path(logcfg.LogPath), log.Level(logcfg.LogLevel), log.Compress(logcfg.Compress), log.MaxSize(logcfg.MaxSize), log.MaxBackups(logcfg.MaxBackups), log.MaxAge(logcfg.MaxAge), log.Format(logcfg.Format), ) if err != nil { fmt.Printf("InitLog failed: %v\n", err) os.Exit(1) } }