123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468 |
- package main
- import (
- "go.mongodb.org/mongo-driver/bson/primitive"
- "math"
- "mongodb"
- "qfw/util"
- "reflect"
- "regexp"
- "sort"
- "strings"
- "sync"
- )
- var (
- Sysconfig map[string]interface{} //读取配置文件
- MongoTool *mongodb.MongodbSim //mongodb连接
- ExtractColl, ProjectColl string //抽取表、项目表、项目快照表、站点表
- Thread int //配置项线程数
- operators []string // 运营商
- )
// Precompiled patterns used when comparing project names and codes.
var (
	// _datereg matches a date such as "2021年3月5日", optionally followed by an hour part.
	_datereg = regexp.MustCompile("20[0-2][0-9][年-][0-9]{1,2}[月-][0-9]{1,2}[日-]([0-9]{1,2}时[0-9]{0,2})?")
	// _numreg1 matches a whole string of 1 to 8 digits or hyphens.
	_numreg1 = regexp.MustCompile("^[0-9-]{1,8}$")
	// _zimureg1 matches a whole string of 1 to 7 ASCII letters or hyphens.
	_zimureg1 = regexp.MustCompile("^[a-zA-Z-]{1,7}$")
	// _nzreg matches a whole string made only of ASCII letters, digits and hyphens.
	_nzreg = regexp.MustCompile("^[0-9a-zA-Z-]+$")
	// NOTE(review): this is a raw string, so `\\[` is an escaped backslash followed
	// by a literal '[' and the first bare ']' closes the character class earlier
	// than the layout suggests. Confirm the intended pattern before relying on it.
	_hanreg = regexp.MustCompile(`^[\p{Han}::【】\\[\\]()()--、]+$`)
	// replaceStr matches generic words and punctuation listed in the alternation.
	replaceStr = regexp.MustCompile("(工程|采购|项目|[?!、【】()—()--]|栏标价|中标候选人|招标代理)")

	// pStr detects specific keywords such as survey, supervision, construction,
	// lot, sub-package or batch markers.
	pStr = regexp.MustCompile("(勘察|监理|施工|设计|验收|标段|分包|子包|[0-9A-Z]包|[一二三四五六七八九十0-9]批)")

	// nreg1 detects a run of at least two digits.
	nreg1 = regexp.MustCompile("[0-9]{2,}")

	// zreg1 detects at least one ASCII letter.
	zreg1 = regexp.MustCompile("[a-zA-Z]{1,}")

	// hreg1 detects at least one Han character.
	hreg1 = regexp.MustCompile(`[\p{Han}]+`)

	// numCheckPc matches a project code made only of digits or hyphens, at most 10 characters long.
	numCheckPc = regexp.MustCompile("^[0-9-]{1,10}$")
)

// Sets of three-letter comparison keys. They start empty and are filled once by init.
var (
	compareNoPass = make(map[string]bool)
	compareAB     = make(map[string]bool)
	compareAB2D   = make(map[string]bool)
	compareABD    = make(map[string]bool)
	compareAB2CD  = make(map[string]bool)
	compareABCD   = make(map[string]bool)
)
- func init() {
- util.ReadConfig(&Sysconfig)
- MongoTool = &mongodb.MongodbSim{
- MongodbAddr: Sysconfig["mongodbServers"].(string),
- Size: util.IntAll(Sysconfig["mongodbPoolSize"]),
- DbName: Sysconfig["mongodbName"].(string),
- //UserName: "root",
- //Password: "root",
- }
- MongoTool.InitPool()
- ExtractColl = Sysconfig["extractColl"].(string)
- ProjectColl = Sysconfig["projectColl"].(string)
- Thread = util.IntAll(Sysconfig["thread"])
- operators = strings.Split(util.ObjToString(Sysconfig["operators"]), ",")
- //加载项目数据
- //---不能通过
- vm := []string{"C", "D"}
- for i := 0; i < 2; i++ {
- for j := 0; j < 2; j++ {
- for k := 0; k < 2; k++ {
- key := vm[i] + vm[j] + vm[k]
- compareNoPass[key] = true
- //fmt.Println(key)
- }
- }
- }
- //fmt.Println("-------------------")
- //三个元素一致 [AB][AB][AB],分值最高
- vm = []string{"A", "B"}
- for i := 0; i < 2; i++ {
- for j := 0; j < 2; j++ {
- for k := 0; k < 2; k++ {
- key := vm[i] + vm[j] + vm[k]
- compareAB[key] = true
- //fmt.Println(key)
- }
- }
- }
- //fmt.Println("-------------------", len(compareAB))
- //---至少两个一致,其他可能不存在
- //[AB][AB][ABD]
- //[AB][ABD][AB]
- vm = []string{"A", "B"}
- vm2 := []string{"A", "B", "D"}
- for i := 0; i < 2; i++ {
- for j := 0; j < 2; j++ {
- for k := 0; k < 3; k++ {
- key := vm[i] + vm[j] + vm2[k]
- if !compareAB[key] {
- compareAB2D[key] = true
- //fmt.Println(key)
- }
- }
- }
- }
- for i := 0; i < 2; i++ {
- for j := 0; j < 3; j++ {
- for k := 0; k < 2; k++ {
- key := vm[i] + vm2[j] + vm[k]
- if !compareAB[key] {
- compareAB2D[key] = true
- //fmt.Println(key)
- }
- }
- }
- }
- //fmt.Println("-------------------", len(compareAB2D))
- //---至少一个一致,其他可能不存在
- //[ABD][ABD][ABD] //已经删除DDD
- vm = []string{"A", "B", "D"}
- for i := 0; i < 3; i++ {
- for j := 0; j < 3; j++ {
- for k := 0; k < 3; k++ {
- key := vm[i] + vm[j] + vm[k]
- if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] {
- compareABD[key] = true
- //fmt.Println(key)
- }
- }
- }
- }
- //fmt.Println("-------------------", len(compareABD))
- //[AB][ABCD][AB]
- //[AB][AB][ABCD]
- vm = []string{"A", "B"}
- vm2 = []string{"A", "B", "C", "D"}
- for i := 0; i < 2; i++ {
- for j := 0; j < 4; j++ {
- for k := 0; k < 2; k++ {
- key := vm[i] + vm2[j] + vm[k]
- if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] {
- compareAB2CD[key] = true
- //fmt.Println(key)
- }
- }
- }
- }
- for i := 0; i < 2; i++ {
- for j := 0; j < 2; j++ {
- for k := 0; k < 4; k++ {
- key := vm[i] + vm[j] + vm2[k]
- if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] {
- compareAB2CD[key] = true
- //fmt.Println(key)
- }
- }
- }
- }
- //fmt.Println("-------------------", len(compareAB2CD))
- //[ABECD][ABECD][ABECD] //已经删除[CD][CD][CD] //这个要重点讨论
- vm = []string{"A", "B", "C", "D"}
- for i := 0; i < 4; i++ {
- for j := 0; j < 4; j++ {
- for k := 0; k < 4; k++ {
- key := vm[i] + vm[j] + vm[k]
- if !compareAB[key] && !compareAB2D[key] && !compareABD[key] && !compareNoPass[key] && !compareAB2CD[key] {
- compareABCD[key] = true
- //fmt.Println(key)
- }
- }
- }
- }
- }
- //项目合并对象
- type ProjectTask struct {
- InitMinTime int64 //最小时间,小于0的处理一次
- name string
- thread int //线程数
- //查找锁
- findLock sync.Mutex
- wg sync.WaitGroup
- //map锁
- AllIdsMapLock sync.Mutex
- //对应的id
- AllIdsMap map[string]*ID
- //采购单位、项目名称、项目编号
- mapPb, mapPn, mapPc map[string]*Key
- //bidtype、bidstatus 锁
- mapBidLock sync.Mutex
- //更新或新增通道
- updatePool chan []map[string]interface{}
- //savePool chan map[string]interface{}
- //saveSign, updateSign chan bool
- //表名
- coll string
- //当前状态是全量还是增量
- currentType string //当前是跑全量还是跑增量
- //
- clearContimes int
- //当前时间
- currentTime int64
- //保存长度
- saveSize int
- pici int64
- validTime int64
- statusTime int64
- //结果时间的更新 最近两天的公告不再更新jgtime
- jgTime int64
- // LockPool chan *sync.Mutex
- // LockPoolLock sync.Mutex
- // m1, m23, m4 map[int]int
- // l1, l23, l4 map[int]*sync.Mutex
- Brun bool
- }
- func CheckHanAndNum(str string) (b bool) {
- return nreg1.MatchString(str) && hreg1.MatchString(str)
- }
- func CheckZimuAndNum(str string) (b bool) {
- return zreg1.MatchString(str) && nreg1.MatchString(str)
- }
- type KeyMap struct {
- Lock sync.Mutex
- Map map[string]*Key
- }
- type ID struct {
- Id string
- Lock sync.Mutex
- P *ProjectInfo
- }
// Key is a list of strings together with a mutex.
type Key struct {
	Arr  []string
	Lock sync.Mutex
}
// IdAndLock is an id string together with a mutex.
type IdAndLock struct {
	Id   string
	Lock sync.Mutex
}
- func NewKeyMap() *KeyMap {
- return &KeyMap{
- Map: map[string]*Key{},
- Lock: sync.Mutex{},
- }
- }
// Info is the tender-announcement entity.
type Info struct {
	Id string `json:"_id"`
	// Href is the source URL.
	Href        string `json:"href"`
	Publishtime int64  `json:"publishtime"`
	Comeintime  int64  `json:"comeintime"`
	Title       string `json:"title"`
	TopType     string `json:"toptype"`
	SubType     string `json:"subtype"`

	ProjectName  string `json:"projectname"`
	ProjectCode  string `json:"projectcode"`
	ProjectScope string `json:"projectscope"`
	ContractCode string `json:"contractcode"`

	Buyer       string `json:"buyer"`
	Buyerperson string `json:"buyerperson"`
	Buyertel    string `json:"buyertel"`
	Agency      string `json:"agency"`
	Area        string `json:"area"`
	City        string `json:"city"`
	District    string `json:"district"`

	Infoformat    int                      `json:"infoformat"`
	ReviewExperts []string                 `json:"review_experts"`
	Purchasing    string                   `json:"purchasing"`
	WinnerOrder   []map[string]interface{} `json:"winnerorder"`

	ProjectScale        string `json:"project_scale"`
	ProjectDuration     int    `json:"project_duration"`
	ProjectTimeUnit     string `json:"project_timeunit"`
	ProjectStartDate    int64  `json:"project_startdate"`
	ProjectCompleteDate int64  `json:"project_completedate"`
	Payway              string `json:"payway"`

	ContractGuarantee bool                     `json:"contract_guarantee"`
	BidGuarantee      bool                     `json:"bid_guarantee"`
	Qualifies         []map[string]interface{} `json:"qualifies"`
	EntIdList         []string                 `json:"entidlist"`
	// HasPackage carries no JSON tag.
	HasPackage bool
	Package    map[string]interface{} `json:"package"`

	Topscopeclass []string `json:"topscopeclass"`
	Subscopeclass []string `json:"subscopeclass"`
	Buyerclass    string   `json:"buyerclass"`
	Bidopentime   int64    `json:"bidopentime"`
	Budget        float64  `json:"budget"`
	Bidamount     float64  `json:"bidamount"`
	TagRule       string   `json:"tag_rule"`
	TopBuyerclass string   `json:"top_buyerclass"`
	FirstTag      string   `json:"first_tag"`
	SecondTag     string   `json:"second_tag"`

	Winners  []string
	dealtype int
	// PTC is the project code extracted from the title.
	PTC string
	// pnbval counts how many of project name, project code and buyer are present.
	pnbval int
	// LenPC is the length of the project code.
	LenPC int
	// LenPN is the length of the project name.
	LenPN int
	// LenPTC is the length of the project code extracted from the title.
	LenPTC int

	// The three fields below are used when comparing for containment:
	// 0 is the initial value, positive means "contains", negative means
	// "is contained".
	PNBH  int
	PCBH  int
	PTCBH int
}
- //项目实体类
- type ProjectInfo struct {
- Id primitive.ObjectID `json:"_id"`
- FirstTime int64 `json:"firsttime,omitempty"` //项目的最早时间
- LastTime int64 `json:"lasttime,omitempty"` //项目的最后时间
- Ids []string `json:"ids,omitempty"`
- Topscopeclass []string `json:"topscopeclass,omitempty"`
- Subscopeclass []string `json:"subscopeclass,omitempty"` //子行业分类
- Winners []string `json:"s_winner,omitempty"` //中标人
- ProjectName string `json:"projectname,omitempty"` //项目名称
- ProjectCode string `json:"projectcode,omitempty"` //项目代码唯一(纯数字的权重低)
- ContractCode string `json:"contractcode,omitempty"` //项目编号
- Buyer string `json:"buyer,omitempty"` //采购单位唯一
- MPN []string `json:"mpn,omitempty"` //合并后多余的项目名称
- MPC []string `json:"mpc,omitempty"` //合并后多余的项目编号
- Buyerperson string `json:"buyerperson"` //采购联系人
- Buyertel string `json:"buyertel"` //采购联系人电话
- Agency string `json:"agency"` //代理机构
- Area string `json:"area"` //地区
- City string `json:"city"` //地市
- District string `json:"district"` //区县
- Bidstatus string `json:"bidstatus"` //
- Bidtype string `json:"bidtype"` //
- ReviewExperts []string `json:"review_experts"` // 项目评审专家
- Purchasing string `json:"purchasing"` // 标的物
- Package map[string]interface{} `json:"package,omitempty"` //分包的对比对象
- Buyerclass string `json:"buyerclass"` //采购单位分类
- Bidopentime int64 `json:"bidopentime,omitempty"` //开标时间
- Jgtime int64 `json:"jgtime"` //结果中标时间
- Zbtime int64 `json:"zbtime"` //招标时间
- Bidamount float64 `json:"bidamount,omitempty"` //中标金额
- Budget float64 `json:"budget,omitempty"` //预算
- Winnerorder []string `json:"winnerorder"` //中标候选人
- ProjectScale string `json:"project_scale"` //项目规模
- ProjectDuration int `json:"project_duration"` //工期时长
- ProjectTimeunit string `json:"project_timeunit"` //工期时长单位
- ProjectStartDate int64 `json:"project_startdate"` //开工日期
- ProjctCompleteDate int64 `json:"projct_completedate"` //竣工日期
- Payway string `json:"payway"` //付款方式
- ContractGuarantee bool `json:"contract_guarantee"` //履约保证金 是否支持包含
- BidGuarantee bool `json:"bid_guarantee"` //投标保证金 是否支持包含
- Qualifies string `json:"qualifies"` //资质条件
- TagRule string `json:"tag_rule"` //数据标签
- TopBuyerclass string `json:"top_buyerclass"`
- FirstTag string `json:"first_tag"`
- SecondTag string `json:"second_tag"`
- IsOperators bool `json:"isOperators"` //是否是运营商
- EntIdList []string `json:"entidlist"` //企业id
- score int
- comStr string
- resVal, pjVal int
- InfoFiled map[string]InfoField `json:"infofield"` //逻辑处理需要的info字段
- Budgettag int `json:"budgettag"` //预算是否有效标记
- Bidamounttag int `json:"bidamounttag"` //中标金额是否有效标记
- }
// InfoField stores a subset of tender-announcement fields that the business
// logic needs.
type InfoField struct {
	Budget       float64 `json:"budget"`
	Bidamount    float64 `json:"bidamount"`
	ContractCode string  `json:"contractcode"`
	ProjectName  string  `json:"projectname"`
	ProjectCode  string  `json:"projectcode"`
	Bidstatus    string  `json:"bidstatus"`
}
// BinarySearch looks k up in s with a binary search and returns an index i
// such that s[i] == k, or -1 when k is not present.
//
// The search needs ascending order, so s is sorted in place when it is not
// already sorted; callers must expect their slice to be reordered and the
// returned index to refer to the sorted order. A slice that is already sorted
// is only read, which avoids re-sorting on every lookup.
func BinarySearch(s []string, k string) int {
	if !sort.StringsAreSorted(s) {
		sort.Strings(s)
	}
	lo, hi := 0, len(s)-1
	for lo <= hi {
		// Same midpoint as (lo+hi)/2, written so the sum cannot overflow.
		m := lo + (hi-lo)/2
		switch {
		case s[m] < k:
			lo = m + 1
		case s[m] > k:
			hi = m - 1
		default:
			return m
		}
	}
	return -1
}
// Duplicate collapses runs of adjacent equal elements in a, keeping the first
// element of each run. Equality is decided with reflect.DeepEqual, so only
// neighbouring duplicates are removed; sort the input first to remove all of
// them.
//
// a must be a slice, an array or a string (whose elements are its bytes).
// For nil, for any other kind of value and for an empty input the result is
// nil.
func Duplicate(a interface{}) (ret []interface{}) {
	va := reflect.ValueOf(a)
	switch va.Kind() {
	case reflect.Slice, reflect.Array, reflect.String:
	default:
		// Len and Index would panic on these kinds (including a nil interface).
		return nil
	}
	n := va.Len()
	if n == 0 {
		return nil
	}
	ret = make([]interface{}, 0, n)
	var prev interface{}
	for i := 0; i < n; i++ {
		cur := va.Index(i).Interface()
		if i == 0 || !reflect.DeepEqual(prev, cur) {
			ret = append(ret, cur)
		}
		// Always compare against the immediate predecessor, kept or not.
		prev = cur
	}
	return ret
}
// CosineSimilar computes the cosine similarity of two texts.
//
// Each text is split with strings.Split(text, "") and turned into a frequency
// vector over the resulting one-character tokens; the result is the cosine of
// the angle between the two vectors, in the range [0, 1]. Identical character
// distributions give 1, texts with no character in common give 0.
//
// When either text is empty its vector has zero length and the cosine is
// undefined; 0 is returned in that case instead of NaN.
func CosineSimilar(srcWords1, dstWords1 string) float64 {
	// frequencies counts how often each token occurs in text.
	frequencies := func(text string) map[string]int {
		tokens := strings.Split(text, "")
		freq := make(map[string]int, len(tokens))
		for _, tok := range tokens {
			freq[tok]++
		}
		return freq
	}
	srcFreq, dstFreq := frequencies(srcWords1), frequencies(dstWords1)

	// The sums are whole numbers held in float64, so the map iteration order
	// does not affect the result.
	var dot, srcSq, dstSq float64
	for tok, n := range srcFreq {
		srcSq += float64(n * n)
		// Tokens missing from dstFreq contribute 0 to the dot product.
		dot += float64(n * dstFreq[tok])
	}
	for _, n := range dstFreq {
		dstSq += float64(n * n)
	}

	if srcSq == 0 || dstSq == 0 {
		return 0
	}
	// Multiply as floats so the product of the two norms cannot overflow int.
	return dot / math.Sqrt(srcSq*dstSq)
}
|