package main import ( "log" "math" mu "mfw/util" "qfw/util" "regexp" "sort" "strings" "sync" "go.mongodb.org/mongo-driver/bson/primitive" ) const ( ProjectCache = "info" //存放每条项目信息,key为项目ID ) var ( Sysconfig map[string]interface{} //读取配置文件 MongoTool *MongodbSim //mongodb连接 ExtractColl, ProjectColl, BackupColl, SiteColl string //抽取表、项目表、项目快照表、站点表 Thread int //配置项线程数 //NextNode []interface{} ) var ( //判断是日期 _datereg = regexp.MustCompile("20[0-2][0-9][年-][0-9]{1,2}[月-][0-9]{1,2}[日-]([0-9]{1,2}时[0-9]{0,2})?") _numreg1 = regexp.MustCompile("^[0-9-]{1,8}$") _zimureg1 = regexp.MustCompile("^[a-zA-Z-]{1,7}$") _nzreg = regexp.MustCompile("^[0-9a-zA-Z-]+$") _hanreg = regexp.MustCompile(`^[\p{Han}::【】\\[\\]()()--、]+$`) replaceStr = regexp.MustCompile("(工程|采购|项目|[?!、【】()—()--]|栏标价|中标候选人|招标代理)") //判断带有分包、等特定词的 pStr = regexp.MustCompile("(勘察|监理|施工|设计|验收|标段|分包|子包|[0-9A-Z]包|[一二三四五六七八九十0-9]批)") //判断包含数值 nreg1 = regexp.MustCompile("[0-9]{2,}") //判断包含字母 zreg1 = regexp.MustCompile("[a-zA-Z]{1,}") //判断包含汉字 hreg1 = regexp.MustCompile(`[\p{Han}]+`) //判断项目编号是在10以内的纯数字结构 numCheckPc = regexp.MustCompile("^[0-9-]{1,10}$") //仅初始化使用 compareNoPass = map[string]bool{} compareAB = map[string]bool{} compareAB2D = map[string]bool{} compareABD = map[string]bool{} compareAB2CD = map[string]bool{} compareABCD = map[string]bool{} ) func init() { util.ReadConfig(&Sysconfig) MongoTool = &MongodbSim{ MongodbAddr: Sysconfig["mongodbServers"].(string), Size: util.IntAll(Sysconfig["mongodbPoolSize"]), DbName: Sysconfig["mongodbName"].(string), } MongoTool.InitPool() ExtractColl = Sysconfig["extractColl"].(string) ProjectColl = Sysconfig["projectColl"].(string) BackupColl = Sysconfig["projectColl"].(string) + "_back" SiteColl = Sysconfig["siteColl"].(string) Thread = util.IntAll(Sysconfig["thread"]) //NextNode = Sysconfig["nextNode"].([]interface{}) udpport, _ := Sysconfig["udpport"].(string) udpclient = mu.UdpClient{Local: udpport, BufSize: 1024} udpclient.Listen(processUdpMsg) log.Println("Udp服务监听", udpport) //加载项目数据 //---不能通过 vm := []string{"C", "D"} for i := 0; i < 2; i++ { for j := 0; j < 2; j++ { for k := 0; k < 2; k++ { key := vm[i] + vm[j] + vm[k] compareNoPass[key] = true //fmt.Println(key) } } } //fmt.Println("-------------------") //三个元素一致 [AB][AB][AB],分值最高 vm = []string{"A", "B"} for i := 0; i < 2; i++ { for j := 0; j < 2; j++ { for k := 0; k < 2; k++ { key := vm[i] + vm[j] + vm[k] compareAB[key] = true //fmt.Println(key) } } } //fmt.Println("-------------------", len(compareAB)) //---至少两个一致,其他可能不存在 //[AB][AB][ABD] //[AB][ABD][AB] vm = []string{"A", "B"} vm2 := []string{"A", "B", "D"} for i := 0; i < 2; i++ { for j := 0; j < 2; j++ { for k := 0; k < 3; k++ { key := vm[i] + vm[j] + vm2[k] if !compareAB[key] { compareAB2D[key] = true //fmt.Println(key) } } } } for i := 0; i < 2; i++ { for j := 0; j < 3; j++ { for k := 0; k < 2; k++ { key := vm[i] + vm2[j] + vm[k] if !compareAB[key] { compareAB2D[key] = true //fmt.Println(key) } } } } //fmt.Println("-------------------", len(compareAB2D)) //---至少一个一致,其他可能不存在 //[ABD][ABD][ABD] //已经删除DDD vm = []string{"A", "B", "D"} for i := 0; i < 3; i++ { for j := 0; j < 3; j++ { for k := 0; k < 3; k++ { key := vm[i] + vm[j] + vm[k] if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] { compareABD[key] = true //fmt.Println(key) } } } } //fmt.Println("-------------------", len(compareABD)) //[AB][ABCD][AB] //[AB][AB][ABCD] vm = []string{"A", "B"} vm2 = []string{"A", "B", "C", "D"} for i := 0; i < 2; i++ { for j := 0; j < 4; j++ { for k := 0; k < 2; k++ { key := vm[i] + vm2[j] + vm[k] if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] { compareAB2CD[key] = true //fmt.Println(key) } } } } for i := 0; i < 2; i++ { for j := 0; j < 2; j++ { for k := 0; k < 4; k++ { key := vm[i] + vm[j] + vm2[k] if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] { compareAB2CD[key] = true //fmt.Println(key) } } } } //fmt.Println("-------------------", len(compareAB2CD)) //[ABECD][ABECD][ABECD] //已经删除[CD][CD][CD] //这个要重点讨论 vm = []string{"A", "B", "C", "D"} for i := 0; i < 4; i++ { for j := 0; j < 4; j++ { for k := 0; k < 4; k++ { key := vm[i] + vm[j] + vm[k] if !compareAB[key] && !compareAB2D[key] && !compareABD[key] && !compareNoPass[key] && !compareAB2CD[key] { compareABCD[key] = true //fmt.Println(key) } } } } } func CheckHanAndNum(str string) (b bool) { return nreg1.MatchString(str) && hreg1.MatchString(str) } func CheckZimuAndNum(str string) (b bool) { return zreg1.MatchString(str) && nreg1.MatchString(str) } type KeyMap struct { Lock sync.Mutex Map map[string]*Key } type ID struct { Id string Lock sync.Mutex P *ProjectInfo } type Key struct { Arr []string Lock sync.Mutex } type IdAndLock struct { Id string Lock sync.Mutex } func NewKeyMap() *KeyMap { return &KeyMap{ Map: map[string]*Key{}, Lock: sync.Mutex{}, } } //招标信息实体类 type Info struct { Id string `json:"_id"` Href string `json:"href"` //源地址 Publishtime int64 `json:"publishtime"` Comeintime int64 `json:"comeintime"` Title string `json:"title"` TopType string `json:"toptype"` SubType string `json:"subtype"` ProjectName string `json:"projectname"` ProjectCode string `json:"projectcode"` ProjectScope string `json:"projectscope"` ContractCode string `json:"contractcode"` Buyer string `json:"buyer"` Buyerperson string `json:"buyerperson"` Buyertel string `json:"buyertel"` Agency string `json:"agency"` Area string `json:"area"` City string `json:"city"` District string `json:"district"` Infoformat int `json:"infoformat"` HasPackage bool // `json:"haspackage"` Package map[string]interface{} `json:"package"` //PNum string `json:"pnum"` Topscopeclass []string `json:"topscopeclass"` Subscopeclass []string `json:"subscopeclass"` Buyerclass string `json:"buyerclass"` Bidopentime int64 `json:"bidopentime"` Budget float64 `json:"budget"` Bidamount float64 `json:"bidamount"` Winners []string dealtype int Winnerorder []string PTC string //从标题中抽的项目编号 pnbval int //项目名称、编号、采购单位存在的个数 LenPC int //项目编号长度 LenPN int //项目名称长度 LenPTC int //标题抽的项目编号长度 //以下三个元素做对比,计算包含时候使用 PNBH int //0初始,+包含,-被包含 PCBH int PTCBH int } //项目实体类 type ProjectInfo struct { Id primitive.ObjectID `json:"_id"` FirstTime int64 `json:"firsttime,omitempty"` //项目的最早时间 LastTime int64 `json:"lasttime,omitempty"` //项目的最后时间 Ids []string `json:"ids,omitempty"` Topscopeclass []string `json:"topscopeclass,omitempty"` Subscopeclass []string `json:"subscopeclass,omitempty"` //子行业分类 Winners []string `json:"s_winner,omitempty"` //中标人 ProjectName string `json:"projectname,omitempty"` //项目名称 ProjectCode string `json:"projectcode,omitempty"` //项目代码唯一(纯数字的权重低) ContractCode string `json:"contractcode,omitempty"` //项目编号 Buyer string `json:"buyer,omitempty"` //采购单位唯一 MPN []string `json:"mpn,omitempty"` //合并后多余的项目名称 MPC []string `json:"mpc,omitempty"` //合并后多余的项目编号 Buyerperson string `json:"buyerperson"` //采购联系人 Buyertel string `json:"buyertel"` //采购联系人电话 Agency string `json:"agency"` //代理机构 Area string `json:"area"` //地区 City string `json:"city"` //地市 District string `json:"district"` //区县 Bidstatus string `json:"bidstatus"` // Bidtype string `json:"bidtype"` // //HasPackage bool `json:"haspackage"` //是否有分包 Package map[string]interface{} `json:"package,omitempty"` //分包的对比对象 Buyerclass string `json:"buyerclass"` //采购单位分类 Bidopentime int64 `json:"bidopentime,omitempty"` //开标时间 // Zbtime int64 `json:"zbtime"` //招标时间 Jgtime int64 `json:"jgtime"` //结果中标时间 Zbtime int64 `json:"zbtime"` //招标时间 Bidamount float64 `json:"bidamount,omitempty"` //中标金额 Budget float64 `json:"budget,omitempty"` //预算 //Winnerorder []string `json:"winnerorder"` //中标候选人 score int comStr string resVal, pjVal int InfoFiled map[string]InfoField `json:"infofiled"` //逻辑处理需要的info字段 Budgettag int `json:"budgettag"` //预算是否有效标记 Bidamounttag int `json:"bidamounttag"` //中标金额是否有效标记 } //存储部分招标信息字段,业务逻辑处理需要 type InfoField struct { Budget float64 `json:"budget"` Bidamount float64 `json:"bidamount"` ContractCode string `json:"contractcode"` ProjectName string `json:"projectname"` ProjectCode string `json:"projectcode"` Bidstatus string `json:"bidstatus"` } //站点信息 type Site struct { Id string `json:"_id"` Site string `json:"site"` //站点名字 Area string `json:"area"` //省 City string `json:"city"` //市 District string `json:"district"` //区、县 Domain string `json:"domain"` //地址 } //二分字符串查找 func BinarySearch(s []string, k string) int { sort.Strings(s) lo, hi := 0, len(s)-1 for lo <= hi { m := (lo + hi) >> 1 if s[m] < k { lo = m + 1 } else if s[m] > k { hi = m - 1 } else { return m } } return -1 } //计算文本相似度 func CosineSimilar(srcWords1, dstWords1 string) float64 { srcWords, dstWords := strings.Split(srcWords1, ""), strings.Split(dstWords1, "") // get all words allWordsMap := make(map[string]int, 0) for _, word := range srcWords { if _, found := allWordsMap[word]; !found { allWordsMap[word] = 1 } else { allWordsMap[word] += 1 } } for _, word := range dstWords { if _, found := allWordsMap[word]; !found { allWordsMap[word] = 1 } else { allWordsMap[word] += 1 } } // stable the sort allWordsSlice := make([]string, 0) for word, _ := range allWordsMap { allWordsSlice = append(allWordsSlice, word) } // assemble vector srcVector := make([]int, len(allWordsSlice)) dstVector := make([]int, len(allWordsSlice)) for _, word := range srcWords { if index := BinarySearch(allWordsSlice, word); index != -1 { srcVector[index] += 1 } } for _, word := range dstWords { if index := BinarySearch(allWordsSlice, word); index != -1 { dstVector[index] += 1 } } // calc cos numerator := float64(0) srcSq := 0 dstSq := 0 for i, srcCount := range srcVector { dstCount := dstVector[i] numerator += float64(srcCount * dstCount) srcSq += srcCount * srcCount dstSq += dstCount * dstCount } denominator := math.Sqrt(float64(srcSq * dstSq)) v1 := numerator / denominator // if v1 > 0.6 { // log.Println(v1, srcWords1, dstWords1) // } return v1 }