init.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. package main
  2. import (
  3. "go.mongodb.org/mongo-driver/bson/primitive"
  4. "math"
  5. "mongodb"
  6. "qfw/util"
  7. "reflect"
  8. "regexp"
  9. "sort"
  10. "strings"
  11. "sync"
  12. )
var (
	// Sysconfig holds the configuration values; init fills it via util.ReadConfig.
	Sysconfig map[string]interface{}
	// MongoTool is the MongoDB connection.
	MongoTool *mongodb.MongodbSim
	// ExtractColl and ProjectColl are the extraction and project collection names.
	// (The original note also lists project-snapshot and site collections, which
	// are not declared here.)
	ExtractColl, ProjectColl string
	// Thread is the thread count taken from the configuration.
	Thread int
	// operators lists the operators (carriers); init splits it from the
	// comma-separated "operators" configuration value.
	operators []string
)
var (
	// _datereg detects a date: a year 2000-2029, month and day separated by
	// 年/月/日 or '-', optionally followed by an hour part ending in 时.
	_datereg = regexp.MustCompile("20[0-2][0-9][年-][0-9]{1,2}[月-][0-9]{1,2}[日-]([0-9]{1,2}时[0-9]{0,2})?")
	// _numreg1 matches a whole string of 1 to 8 digits or hyphens.
	_numreg1 = regexp.MustCompile("^[0-9-]{1,8}$")
	// _zimureg1 matches a whole string of 1 to 7 ASCII letters or hyphens.
	_zimureg1 = regexp.MustCompile("^[a-zA-Z-]{1,7}$")
	// _nzreg matches a whole string made only of ASCII digits, letters and hyphens.
	_nzreg = regexp.MustCompile("^[0-9a-zA-Z-]+$")
	// _hanreg is presumably meant to match a whole string of Han characters and
	// a few punctuation marks.
	// NOTE(review): inside this raw string `\\[\\]` is a literal backslash, '[',
	// a literal backslash and then ']' which appears to close the character class
	// early, leaving `()()--、]+$` outside it — confirm the intended pattern.
	_hanreg    = regexp.MustCompile(`^[\p{Han}::【】\\[\\]()()--、]+$`)
	// replaceStr matches generic words and punctuation (project, purchase, etc.);
	// judging by the name it is used for replacement — usage is outside this view.
	replaceStr = regexp.MustCompile("(工程|采购|项目|[?!、【】()—()--]|栏标价|中标候选人|招标代理)")
	// pStr detects specific terms such as survey, supervision, construction,
	// design, acceptance, lot, sub-package or batch.
	pStr = regexp.MustCompile("(勘察|监理|施工|设计|验收|标段|分包|子包|[0-9A-Z]包|[一二三四五六七八九十0-9]批)")
	// nreg1 detects a numeric value: at least two consecutive digits.
	nreg1 = regexp.MustCompile("[0-9]{2,}")
	// zreg1 detects at least one ASCII letter.
	zreg1 = regexp.MustCompile("[a-zA-Z]{1,}")
	// hreg1 detects at least one Han character.
	hreg1 = regexp.MustCompile(`[\p{Han}]+`)
	// numCheckPc checks that a project code is a purely numeric structure
	// (digits and hyphens) of at most 10 characters.
	numCheckPc = regexp.MustCompile("^[0-9-]{1,10}$")
	// Lookup tables of three-letter keys. The original note says they are for
	// initialization use only; they are filled in init.
	compareNoPass = map[string]bool{}
	compareAB     = map[string]bool{}
	compareAB2D   = map[string]bool{}
	compareABD    = map[string]bool{}
	compareAB2CD  = map[string]bool{}
	compareABCD   = map[string]bool{}
)
  46. func init() {
  47. util.ReadConfig(&Sysconfig)
  48. MongoTool = &mongodb.MongodbSim{
  49. MongodbAddr: Sysconfig["mongodbServers"].(string),
  50. Size: util.IntAll(Sysconfig["mongodbPoolSize"]),
  51. DbName: Sysconfig["mongodbName"].(string),
  52. //UserName: "root",
  53. //Password: "root",
  54. }
  55. MongoTool.InitPool()
  56. ExtractColl = Sysconfig["extractColl"].(string)
  57. ProjectColl = Sysconfig["projectColl"].(string)
  58. Thread = util.IntAll(Sysconfig["thread"])
  59. operators = strings.Split(util.ObjToString(Sysconfig["operators"]), ",")
  60. //加载项目数据
  61. //---不能通过
  62. vm := []string{"C", "D"}
  63. for i := 0; i < 2; i++ {
  64. for j := 0; j < 2; j++ {
  65. for k := 0; k < 2; k++ {
  66. key := vm[i] + vm[j] + vm[k]
  67. compareNoPass[key] = true
  68. //fmt.Println(key)
  69. }
  70. }
  71. }
  72. //fmt.Println("-------------------")
  73. //三个元素一致 [AB][AB][AB],分值最高
  74. vm = []string{"A", "B"}
  75. for i := 0; i < 2; i++ {
  76. for j := 0; j < 2; j++ {
  77. for k := 0; k < 2; k++ {
  78. key := vm[i] + vm[j] + vm[k]
  79. compareAB[key] = true
  80. //fmt.Println(key)
  81. }
  82. }
  83. }
  84. //fmt.Println("-------------------", len(compareAB))
  85. //---至少两个一致,其他可能不存在
  86. //[AB][AB][ABD]
  87. //[AB][ABD][AB]
  88. vm = []string{"A", "B"}
  89. vm2 := []string{"A", "B", "D"}
  90. for i := 0; i < 2; i++ {
  91. for j := 0; j < 2; j++ {
  92. for k := 0; k < 3; k++ {
  93. key := vm[i] + vm[j] + vm2[k]
  94. if !compareAB[key] {
  95. compareAB2D[key] = true
  96. //fmt.Println(key)
  97. }
  98. }
  99. }
  100. }
  101. for i := 0; i < 2; i++ {
  102. for j := 0; j < 3; j++ {
  103. for k := 0; k < 2; k++ {
  104. key := vm[i] + vm2[j] + vm[k]
  105. if !compareAB[key] {
  106. compareAB2D[key] = true
  107. //fmt.Println(key)
  108. }
  109. }
  110. }
  111. }
  112. //fmt.Println("-------------------", len(compareAB2D))
  113. //---至少一个一致,其他可能不存在
  114. //[ABD][ABD][ABD] //已经删除DDD
  115. vm = []string{"A", "B", "D"}
  116. for i := 0; i < 3; i++ {
  117. for j := 0; j < 3; j++ {
  118. for k := 0; k < 3; k++ {
  119. key := vm[i] + vm[j] + vm[k]
  120. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] {
  121. compareABD[key] = true
  122. //fmt.Println(key)
  123. }
  124. }
  125. }
  126. }
  127. //fmt.Println("-------------------", len(compareABD))
  128. //[AB][ABCD][AB]
  129. //[AB][AB][ABCD]
  130. vm = []string{"A", "B"}
  131. vm2 = []string{"A", "B", "C", "D"}
  132. for i := 0; i < 2; i++ {
  133. for j := 0; j < 4; j++ {
  134. for k := 0; k < 2; k++ {
  135. key := vm[i] + vm2[j] + vm[k]
  136. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] {
  137. compareAB2CD[key] = true
  138. //fmt.Println(key)
  139. }
  140. }
  141. }
  142. }
  143. for i := 0; i < 2; i++ {
  144. for j := 0; j < 2; j++ {
  145. for k := 0; k < 4; k++ {
  146. key := vm[i] + vm[j] + vm2[k]
  147. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] {
  148. compareAB2CD[key] = true
  149. //fmt.Println(key)
  150. }
  151. }
  152. }
  153. }
  154. //fmt.Println("-------------------", len(compareAB2CD))
  155. //[ABECD][ABECD][ABECD] //已经删除[CD][CD][CD] //这个要重点讨论
  156. vm = []string{"A", "B", "C", "D"}
  157. for i := 0; i < 4; i++ {
  158. for j := 0; j < 4; j++ {
  159. for k := 0; k < 4; k++ {
  160. key := vm[i] + vm[j] + vm[k]
  161. if !compareAB[key] && !compareAB2D[key] && !compareABD[key] && !compareNoPass[key] && !compareAB2CD[key] {
  162. compareABCD[key] = true
  163. //fmt.Println(key)
  164. }
  165. }
  166. }
  167. }
  168. }
// ProjectTask is the project-merging object.
type ProjectTask struct {
	InitMinTime int64 // minimum time; per the original note, values below 0 are processed once
	name        string
	thread      int // number of threads
	// lock used for lookups
	findLock sync.Mutex
	wg       sync.WaitGroup
	// map lock
	AllIdsMapLock sync.Mutex
	// the corresponding ids
	AllIdsMap map[string]*ID
	// buyer, project name, project code
	mapPb, mapPn, mapPc map[string]*Key
	// lock for bidtype / bidstatus
	mapBidLock sync.Mutex
	// channel for updates or inserts
	updatePool chan []map[string]interface{}
	//savePool chan map[string]interface{}
	//saveSign, updateSign chan bool
	// collection name
	coll string
	// whether the current run is full or incremental
	currentType string // whether the current run is full or incremental
	//
	clearContimes int
	// current time
	currentTime int64
	// save size
	saveSize   int
	pici       int64
	validTime  int64
	statusTime int64
	// result-time update: announcements from the last two days no longer update jgtime
	jgTime int64
	// LockPool chan *sync.Mutex
	// LockPoolLock sync.Mutex
	// m1, m23, m4 map[int]int
	// l1, l23, l4 map[int]*sync.Mutex
	Brun bool
}
  210. func CheckHanAndNum(str string) (b bool) {
  211. return nreg1.MatchString(str) && hreg1.MatchString(str)
  212. }
  213. func CheckZimuAndNum(str string) (b bool) {
  214. return zreg1.MatchString(str) && nreg1.MatchString(str)
  215. }
// KeyMap pairs a string-keyed map of *Key values with a mutex.
// NOTE(review): Lock presumably guards Map — confirm against the callers.
type KeyMap struct {
	Lock sync.Mutex
	Map  map[string]*Key
}
// ID bundles an id string, a mutex and a pointer to a ProjectInfo.
// It contains a sync.Mutex, so it should be passed by pointer, not copied.
type ID struct {
	Id   string
	Lock sync.Mutex
	P    *ProjectInfo
}
// Key holds a slice of strings together with a mutex.
// NOTE(review): Arr presumably holds ids sharing one key value — confirm
// against the callers.
type Key struct {
	Arr  []string
	Lock sync.Mutex
}
// IdAndLock pairs an id string with a mutex.
type IdAndLock struct {
	Id   string
	Lock sync.Mutex
}
  233. func NewKeyMap() *KeyMap {
  234. return &KeyMap{
  235. Map: map[string]*Key{},
  236. Lock: sync.Mutex{},
  237. }
  238. }
// Info is the tender-information entity.
type Info struct {
	Id                  string                   `json:"_id"`
	Href                string                   `json:"href"` // source URL
	Publishtime         int64                    `json:"publishtime"`
	Comeintime          int64                    `json:"comeintime"`
	Title               string                   `json:"title"`
	TopType             string                   `json:"toptype"`
	SubType             string                   `json:"subtype"`
	ProjectName         string                   `json:"projectname"`
	ProjectCode         string                   `json:"projectcode"`
	ProjectScope        string                   `json:"projectscope"`
	ContractCode        string                   `json:"contractcode"`
	Buyer               string                   `json:"buyer"`
	Buyerperson         string                   `json:"buyerperson"`
	Buyertel            string                   `json:"buyertel"`
	Agency              string                   `json:"agency"`
	Area                string                   `json:"area"`
	City                string                   `json:"city"`
	District            string                   `json:"district"`
	Infoformat          int                      `json:"infoformat"`
	ReviewExperts       []string                 `json:"review_experts"`
	Purchasing          string                   `json:"purchasing"`
	WinnerOrder         []map[string]interface{} `json:"winnerorder"`
	ProjectScale        string                   `json:"project_scale"`
	ProjectDuration     int                      `json:"project_duration"`
	ProjectTimeUnit     string                   `json:"project_timeunit"`
	ProjectStartDate    int64                    `json:"project_startdate"`
	ProjectCompleteDate int64                    `json:"project_completedate"`
	Payway              string                   `json:"payway"`
	ContractGuarantee   bool                     `json:"contract_guarantee"`
	BidGuarantee        bool                     `json:"bid_guarantee"`
	Qualifies           []map[string]interface{} `json:"qualifies"`
	EntIdList           []string                 `json:"entidlist"`
	HasPackage          bool                     // `json:"haspackage"` (tag is commented out, so no JSON tag applies)
	Package             map[string]interface{}   `json:"package"`
	Topscopeclass       []string                 `json:"topscopeclass"`
	Subscopeclass       []string                 `json:"subscopeclass"`
	Buyerclass          string                   `json:"buyerclass"`
	Bidopentime         int64                    `json:"bidopentime"`
	Budget              float64                  `json:"budget"`
	Bidamount           float64                  `json:"bidamount"`
	TagRule             string                   `json:"tag_rule"`
	Winners             []string
	dealtype            int
	PTC                 string // project code extracted from the title
	pnbval              int    // how many of project name, project code and buyer are present
	LenPC               int    // length of the project code
	LenPN               int    // length of the project name
	LenPTC              int    // length of the project code extracted from the title
	// The three fields below are used for comparison when computing containment.
	PNBH  int // 0 initial, + contains, - is contained
	PCBH  int
	PTCBH int
}
// ProjectInfo is the project entity.
type ProjectInfo struct {
	Id                 primitive.ObjectID     `json:"_id"`
	FirstTime          int64                  `json:"firsttime,omitempty"` // earliest time of the project
	LastTime           int64                  `json:"lasttime,omitempty"` // latest time of the project
	Ids                []string               `json:"ids,omitempty"`
	Topscopeclass      []string               `json:"topscopeclass,omitempty"`
	Subscopeclass      []string               `json:"subscopeclass,omitempty"` // sub-industry classification
	Winners            []string               `json:"s_winner,omitempty"` // winning bidders
	ProjectName        string                 `json:"projectname,omitempty"` // project name
	ProjectCode        string                 `json:"projectcode,omitempty"` // project code, unique (purely numeric codes carry lower weight)
	ContractCode       string                 `json:"contractcode,omitempty"` // contract code (the original comment labels it "project number")
	Buyer              string                 `json:"buyer,omitempty"` // buyer (purchasing entity), unique
	MPN                []string               `json:"mpn,omitempty"` // extra project names left over after merging
	MPC                []string               `json:"mpc,omitempty"` // extra project codes left over after merging
	Buyerperson        string                 `json:"buyerperson"` // buyer contact person
	Buyertel           string                 `json:"buyertel"` // buyer contact phone
	Agency             string                 `json:"agency"` // agency
	Area               string                 `json:"area"` // region
	City               string                 `json:"city"` // city
	District           string                 `json:"district"` // district / county
	Bidstatus          string                 `json:"bidstatus"` //
	Bidtype            string                 `json:"bidtype"` //
	ReviewExperts      []string               `json:"review_experts"` // project review experts
	Purchasing         string                 `json:"purchasing"` // subject matter of the purchase
	Package            map[string]interface{} `json:"package,omitempty"` // comparison object for sub-packages
	Buyerclass         string                 `json:"buyerclass"` // buyer classification
	Bidopentime        int64                  `json:"bidopentime,omitempty"` // bid opening time
	Jgtime             int64                  `json:"jgtime"` // result (award) time
	Zbtime             int64                  `json:"zbtime"` // tender time
	Bidamount          float64                `json:"bidamount,omitempty"` // winning bid amount
	Budget             float64                `json:"budget,omitempty"` // budget
	Winnerorder        []string               `json:"winnerorder"` // winning-bid candidates
	ProjectScale       string                 `json:"project_scale"` // project scale
	ProjectDuration    int                    `json:"project_duration"` // construction period length
	ProjectTimeunit    string                 `json:"project_timeunit"` // unit of the construction period
	ProjectStartDate   int64                  `json:"project_startdate"` // start date
	ProjctCompleteDate int64                  `json:"projct_completedate"` // completion date (field name and JSON key are spelled "Projct"/"projct")
	Payway             string                 `json:"payway"` // payment method
	ContractGuarantee  bool                   `json:"contract_guarantee"` // performance bond: whether supported/included
	BidGuarantee       bool                   `json:"bid_guarantee"` // bid bond: whether supported/included
	Qualifies          string                 `json:"qualifies"` // qualification requirements
	TagRule            string                 `json:"tag_rule"` // data tag
	IsOperators        bool                   `json:"isOperators"` // whether this is an operator (carrier)
	EntIdList          []string               `json:"entidlist"` // enterprise ids
	score              int
	comStr             string
	resVal, pjVal      int
	InfoFiled          map[string]InfoField   `json:"infofield"` // info fields needed by the processing logic
	Budgettag          int                    `json:"budgettag"` // flag: whether the budget is valid
	Bidamounttag       int                    `json:"bidamounttag"` // flag: whether the winning bid amount is valid
}
// InfoField stores a subset of the tender-information fields that the
// business logic needs.
type InfoField struct {
	Budget       float64 `json:"budget"`
	Bidamount    float64 `json:"bidamount"`
	ContractCode string  `json:"contractcode"`
	ProjectName  string  `json:"projectname"`
	ProjectCode  string  `json:"projectcode"`
	Bidstatus    string  `json:"bidstatus"`
}
  355. //二分字符串查找
  356. func BinarySearch(s []string, k string) int {
  357. sort.Strings(s)
  358. lo, hi := 0, len(s)-1
  359. for lo <= hi {
  360. m := (lo + hi) >> 1
  361. if s[m] < k {
  362. lo = m + 1
  363. } else if s[m] > k {
  364. hi = m - 1
  365. } else {
  366. return m
  367. }
  368. }
  369. return -1
  370. }
  371. func Duplicate(a interface{}) (ret []interface{}) {
  372. va := reflect.ValueOf(a)
  373. for i := 0; i < va.Len(); i++ {
  374. if i > 0 && reflect.DeepEqual(va.Index(i-1).Interface(), va.Index(i).Interface()) {
  375. continue
  376. }
  377. ret = append(ret, va.Index(i).Interface())
  378. }
  379. return ret
  380. }
  381. //计算文本相似度
  382. func CosineSimilar(srcWords1, dstWords1 string) float64 {
  383. srcWords, dstWords := strings.Split(srcWords1, ""), strings.Split(dstWords1, "")
  384. // get all words
  385. allWordsMap := make(map[string]int, 0)
  386. for _, word := range srcWords {
  387. if _, found := allWordsMap[word]; !found {
  388. allWordsMap[word] = 1
  389. } else {
  390. allWordsMap[word] += 1
  391. }
  392. }
  393. for _, word := range dstWords {
  394. if _, found := allWordsMap[word]; !found {
  395. allWordsMap[word] = 1
  396. } else {
  397. allWordsMap[word] += 1
  398. }
  399. }
  400. // stable the sort
  401. allWordsSlice := make([]string, 0)
  402. for word, _ := range allWordsMap {
  403. allWordsSlice = append(allWordsSlice, word)
  404. }
  405. // assemble vector
  406. srcVector := make([]int, len(allWordsSlice))
  407. dstVector := make([]int, len(allWordsSlice))
  408. for _, word := range srcWords {
  409. if index := BinarySearch(allWordsSlice, word); index != -1 {
  410. srcVector[index] += 1
  411. }
  412. }
  413. for _, word := range dstWords {
  414. if index := BinarySearch(allWordsSlice, word); index != -1 {
  415. dstVector[index] += 1
  416. }
  417. }
  418. // calc cos
  419. numerator := float64(0)
  420. srcSq := 0
  421. dstSq := 0
  422. for i, srcCount := range srcVector {
  423. dstCount := dstVector[i]
  424. numerator += float64(srcCount * dstCount)
  425. srcSq += srcCount * srcCount
  426. dstSq += dstCount * dstCount
  427. }
  428. denominator := math.Sqrt(float64(srcSq * dstSq))
  429. v1 := numerator / denominator
  430. // if v1 > 0.6 {
  431. // log.Println(v1, srcWords1, dstWords1)
  432. // }
  433. return v1
  434. }