init.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. package main
  2. import (
  3. "log"
  4. "math"
  5. mu "mfw/util"
  6. "qfw/util"
  7. "regexp"
  8. "sort"
  9. "strings"
  10. "sync"
  11. "go.mongodb.org/mongo-driver/bson/primitive"
  12. )
  13. const (
  14. ProjectCache = "info" //存放每条项目信息,key为项目ID
  15. )
  16. var (
  17. Sysconfig map[string]interface{} //读取配置文件
  18. MongoTool *MongodbSim //mongodb连接
  19. ExtractColl, ProjectColl, BackupColl, SiteColl string //抽取表、项目表、项目快照表、站点表
  20. UpdateColl string // 金额修改数据表
  21. Thread int //配置项线程数
  22. //NextNode []interface{}
  23. BlackList []interface{}
  24. BlaskListMap map[string]bool
  25. )
  26. var (
  27. //判断是日期
  28. _datereg = regexp.MustCompile("20[0-2][0-9][年-][0-9]{1,2}[月-][0-9]{1,2}[日-]([0-9]{1,2}时[0-9]{0,2})?")
  29. _numreg1 = regexp.MustCompile("^[0-9-]{1,8}$")
  30. _zimureg1 = regexp.MustCompile("^[a-zA-Z-]{1,7}$")
  31. _nzreg = regexp.MustCompile("^[0-9a-zA-Z-]+$")
  32. _hanreg = regexp.MustCompile(`^[\p{Han}::【】\\[\\]()()--、]+$`)
  33. replaceStr = regexp.MustCompile("(工程|采购|项目|[?!、【】()—()--]|栏标价|中标候选人|招标代理)")
  34. //判断带有分包、等特定词的
  35. pStr = regexp.MustCompile("(勘察|监理|施工|设计|验收|标段|分包|子包|[0-9A-Z]包|[一二三四五六七八九十0-9]批)")
  36. //判断包含数值
  37. nreg1 = regexp.MustCompile("[0-9]{2,}")
  38. //判断包含字母
  39. zreg1 = regexp.MustCompile("[a-zA-Z]{1,}")
  40. //判断包含汉字
  41. hreg1 = regexp.MustCompile(`[\p{Han}]+`)
  42. //判断项目编号是在10以内的纯数字结构
  43. numCheckPc = regexp.MustCompile("^[0-9-]{1,10}$")
  44. //仅初始化使用
  45. compareNoPass = map[string]bool{}
  46. compareAB = map[string]bool{}
  47. compareAB2D = map[string]bool{}
  48. compareABD = map[string]bool{}
  49. compareAB2CD = map[string]bool{}
  50. compareABCD = map[string]bool{}
  51. )
  52. func init() {
  53. util.ReadConfig(&Sysconfig)
  54. MongoTool = &MongodbSim{
  55. MongodbAddr: Sysconfig["mongodbServers"].(string),
  56. Size: util.IntAll(Sysconfig["mongodbPoolSize"]),
  57. DbName: Sysconfig["mongodbName"].(string),
  58. }
  59. MongoTool.InitPool()
  60. ExtractColl = Sysconfig["extractColl"].(string)
  61. ProjectColl = Sysconfig["projectColl"].(string)
  62. UpdateColl = Sysconfig["updateColl"].(string)
  63. BackupColl = Sysconfig["projectColl"].(string) + "_back"
  64. SiteColl = Sysconfig["siteColl"].(string)
  65. Thread = util.IntAll(Sysconfig["thread"])
  66. //NextNode = Sysconfig["nextNode"].([]interface{})
  67. udpport, _ := Sysconfig["udpport"].(string)
  68. udpclient = mu.UdpClient{Local: udpport, BufSize: 1024}
  69. udpclient.Listen(processUdpMsg)
  70. log.Println("Udp服务监听", udpport)
  71. BlackList = Sysconfig["blacklist"].([]interface{})
  72. BlaskListMap = make(map[string]bool)
  73. for _, v := range BlackList {
  74. BlaskListMap[util.ObjToString(v)] = true
  75. }
  76. //加载项目数据
  77. //---不能通过
  78. vm := []string{"C", "D"}
  79. for i := 0; i < 2; i++ {
  80. for j := 0; j < 2; j++ {
  81. for k := 0; k < 2; k++ {
  82. key := vm[i] + vm[j] + vm[k]
  83. compareNoPass[key] = true
  84. //fmt.Println(key)
  85. }
  86. }
  87. }
  88. //fmt.Println("-------------------")
  89. //三个元素一致 [AB][AB][AB],分值最高
  90. vm = []string{"A", "B"}
  91. for i := 0; i < 2; i++ {
  92. for j := 0; j < 2; j++ {
  93. for k := 0; k < 2; k++ {
  94. key := vm[i] + vm[j] + vm[k]
  95. compareAB[key] = true
  96. //fmt.Println(key)
  97. }
  98. }
  99. }
  100. //fmt.Println("-------------------", len(compareAB))
  101. //---至少两个一致,其他可能不存在
  102. //[AB][AB][ABD]
  103. //[AB][ABD][AB]
  104. vm = []string{"A", "B"}
  105. vm2 := []string{"A", "B", "D"}
  106. for i := 0; i < 2; i++ {
  107. for j := 0; j < 2; j++ {
  108. for k := 0; k < 3; k++ {
  109. key := vm[i] + vm[j] + vm2[k]
  110. if !compareAB[key] {
  111. compareAB2D[key] = true
  112. //fmt.Println(key)
  113. }
  114. }
  115. }
  116. }
  117. for i := 0; i < 2; i++ {
  118. for j := 0; j < 3; j++ {
  119. for k := 0; k < 2; k++ {
  120. key := vm[i] + vm2[j] + vm[k]
  121. if !compareAB[key] {
  122. compareAB2D[key] = true
  123. //fmt.Println(key)
  124. }
  125. }
  126. }
  127. }
  128. //fmt.Println("-------------------", len(compareAB2D))
  129. //---至少一个一致,其他可能不存在
  130. //[ABD][ABD][ABD] //已经删除DDD
  131. vm = []string{"A", "B", "D"}
  132. for i := 0; i < 3; i++ {
  133. for j := 0; j < 3; j++ {
  134. for k := 0; k < 3; k++ {
  135. key := vm[i] + vm[j] + vm[k]
  136. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] {
  137. compareABD[key] = true
  138. //fmt.Println(key)
  139. }
  140. }
  141. }
  142. }
  143. //fmt.Println("-------------------", len(compareABD))
  144. //[AB][ABCD][AB]
  145. //[AB][AB][ABCD]
  146. vm = []string{"A", "B"}
  147. vm2 = []string{"A", "B", "C", "D"}
  148. for i := 0; i < 2; i++ {
  149. for j := 0; j < 4; j++ {
  150. for k := 0; k < 2; k++ {
  151. key := vm[i] + vm2[j] + vm[k]
  152. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] {
  153. compareAB2CD[key] = true
  154. //fmt.Println(key)
  155. }
  156. }
  157. }
  158. }
  159. for i := 0; i < 2; i++ {
  160. for j := 0; j < 2; j++ {
  161. for k := 0; k < 4; k++ {
  162. key := vm[i] + vm[j] + vm2[k]
  163. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] {
  164. compareAB2CD[key] = true
  165. //fmt.Println(key)
  166. }
  167. }
  168. }
  169. }
  170. //fmt.Println("-------------------", len(compareAB2CD))
  171. //[ABECD][ABECD][ABECD] //已经删除[CD][CD][CD] //这个要重点讨论
  172. vm = []string{"A", "B", "C", "D"}
  173. for i := 0; i < 4; i++ {
  174. for j := 0; j < 4; j++ {
  175. for k := 0; k < 4; k++ {
  176. key := vm[i] + vm[j] + vm[k]
  177. if !compareAB[key] && !compareAB2D[key] && !compareABD[key] && !compareNoPass[key] && !compareAB2CD[key] {
  178. compareABCD[key] = true
  179. //fmt.Println(key)
  180. }
  181. }
  182. }
  183. }
  184. }
  185. func CheckHanAndNum(str string) (b bool) {
  186. return nreg1.MatchString(str) && hreg1.MatchString(str)
  187. }
  188. func CheckZimuAndNum(str string) (b bool) {
  189. return zreg1.MatchString(str) && nreg1.MatchString(str)
  190. }
  191. type KeyMap struct {
  192. Lock sync.Mutex
  193. Map map[string]*Key
  194. }
  195. type ID struct {
  196. Id string
  197. Lock sync.Mutex
  198. P *ProjectInfo
  199. }
  200. type Key struct {
  201. Arr []string
  202. Lock sync.Mutex
  203. }
  204. type IdAndLock struct {
  205. Id string
  206. Lock sync.Mutex
  207. }
  208. func NewKeyMap() *KeyMap {
  209. return &KeyMap{
  210. Map: map[string]*Key{},
  211. Lock: sync.Mutex{},
  212. }
  213. }
  214. //招标信息实体类
  215. type Info struct {
  216. Id string `json:"_id"`
  217. Href string `json:"href"` //源地址
  218. Publishtime int64 `json:"publishtime"`
  219. Comeintime int64 `json:"comeintime"`
  220. Title string `json:"title"`
  221. TopType string `json:"toptype"`
  222. SubType string `json:"subtype"`
  223. ProjectName string `json:"projectname"`
  224. ProjectCode string `json:"projectcode"`
  225. ProjectScope string `json:"projectscope"`
  226. ContractCode string `json:"contractcode"`
  227. Buyer string `json:"buyer"`
  228. Buyerperson string `json:"buyerperson"`
  229. Buyertel string `json:"buyertel"`
  230. Agency string `json:"agency"`
  231. Area string `json:"area"`
  232. City string `json:"city"`
  233. District string `json:"district"`
  234. Infoformat int `json:"infoformat"`
  235. ReviewExperts []string `json:"review_experts"`
  236. Purchasing string `json:"purchasing"`
  237. WinnerOrder []map[string]interface{} `json:"winnerorder"`
  238. HasPackage bool // `json:"haspackage"`
  239. Package map[string]interface{} `json:"package"`
  240. //PNum string `json:"pnum"`
  241. Topscopeclass []string `json:"topscopeclass"`
  242. Subscopeclass []string `json:"subscopeclass"`
  243. Buyerclass string `json:"buyerclass"`
  244. Bidopentime int64 `json:"bidopentime"`
  245. Budget float64 `json:"budget"`
  246. Bidamount float64 `json:"bidamount"`
  247. Winners []string
  248. dealtype int
  249. PTC string //从标题中抽的项目编号
  250. pnbval int //项目名称、编号、采购单位存在的个数
  251. LenPC int //项目编号长度
  252. LenPN int //项目名称长度
  253. LenPTC int //标题抽的项目编号长度
  254. //以下三个元素做对比,计算包含时候使用
  255. PNBH int //0初始,+包含,-被包含
  256. PCBH int
  257. PTCBH int
  258. }
  259. //项目实体类
  260. type ProjectInfo struct {
  261. Id primitive.ObjectID `json:"_id"`
  262. FirstTime int64 `json:"firsttime,omitempty"` //项目的最早时间
  263. LastTime int64 `json:"lasttime,omitempty"` //项目的最后时间
  264. Ids []string `json:"ids,omitempty"`
  265. Topscopeclass []string `json:"topscopeclass,omitempty"`
  266. Subscopeclass []string `json:"subscopeclass,omitempty"` //子行业分类
  267. Winners []string `json:"s_winner,omitempty"` //中标人
  268. ProjectName string `json:"projectname,omitempty"` //项目名称
  269. ProjectCode string `json:"projectcode,omitempty"` //项目代码唯一(纯数字的权重低)
  270. ContractCode string `json:"contractcode,omitempty"` //项目编号
  271. Buyer string `json:"buyer,omitempty"` //采购单位唯一
  272. MPN []string `json:"mpn,omitempty"` //合并后多余的项目名称
  273. MPC []string `json:"mpc,omitempty"` //合并后多余的项目编号
  274. Buyerperson string `json:"buyerperson"` //采购联系人
  275. Buyertel string `json:"buyertel"` //采购联系人电话
  276. Agency string `json:"agency"` //代理机构
  277. Area string `json:"area"` //地区
  278. City string `json:"city"` //地市
  279. District string `json:"district"` //区县
  280. Bidstatus string `json:"bidstatus"` //
  281. Bidtype string `json:"bidtype"` //
  282. ReviewExperts []string `json:"review_experts"` // 项目评审专家
  283. Purchasing string `json:"purchasing"` // 标的物
  284. //HasPackage bool `json:"haspackage"` //是否有分包
  285. Package map[string]interface{} `json:"package,omitempty"` //分包的对比对象
  286. Buyerclass string `json:"buyerclass"` //采购单位分类
  287. Bidopentime int64 `json:"bidopentime,omitempty"` //开标时间
  288. // Zbtime int64 `json:"zbtime"` //招标时间
  289. Jgtime int64 `json:"jgtime"` //结果中标时间
  290. Zbtime int64 `json:"zbtime"` //招标时间
  291. Bidamount float64 `json:"bidamount,omitempty"` //中标金额
  292. Budget float64 `json:"budget,omitempty"` //预算
  293. Winnerorder []string `json:"winnerorder"` //中标候选人
  294. score int
  295. comStr string
  296. resVal, pjVal int
  297. InfoFiled map[string]InfoField `json:"infofield"` //逻辑处理需要的info字段
  298. Budgettag int `json:"budgettag"` //预算是否有效标记
  299. Bidamounttag int `json:"bidamounttag"` //中标金额是否有效标记
  300. }
  301. //存储部分招标信息字段,业务逻辑处理需要
  302. type InfoField struct {
  303. Budget float64 `json:"budget"`
  304. Bidamount float64 `json:"bidamount"`
  305. ContractCode string `json:"contractcode"`
  306. ProjectName string `json:"projectname"`
  307. ProjectCode string `json:"projectcode"`
  308. Bidstatus string `json:"bidstatus"`
  309. }
  310. //站点信息
  311. type Site struct {
  312. Id string `json:"_id"`
  313. Site string `json:"site"` //站点名字
  314. Area string `json:"area"` //省
  315. City string `json:"city"` //市
  316. District string `json:"district"` //区、县
  317. Domain string `json:"domain"` //地址
  318. Status int `json:"status"` //
  319. }
  320. //二分字符串查找
  321. func BinarySearch(s []string, k string) int {
  322. sort.Strings(s)
  323. lo, hi := 0, len(s)-1
  324. for lo <= hi {
  325. m := (lo + hi) >> 1
  326. if s[m] < k {
  327. lo = m + 1
  328. } else if s[m] > k {
  329. hi = m - 1
  330. } else {
  331. return m
  332. }
  333. }
  334. return -1
  335. }
  336. //计算文本相似度
  337. func CosineSimilar(srcWords1, dstWords1 string) float64 {
  338. srcWords, dstWords := strings.Split(srcWords1, ""), strings.Split(dstWords1, "")
  339. // get all words
  340. allWordsMap := make(map[string]int, 0)
  341. for _, word := range srcWords {
  342. if _, found := allWordsMap[word]; !found {
  343. allWordsMap[word] = 1
  344. } else {
  345. allWordsMap[word] += 1
  346. }
  347. }
  348. for _, word := range dstWords {
  349. if _, found := allWordsMap[word]; !found {
  350. allWordsMap[word] = 1
  351. } else {
  352. allWordsMap[word] += 1
  353. }
  354. }
  355. // stable the sort
  356. allWordsSlice := make([]string, 0)
  357. for word, _ := range allWordsMap {
  358. allWordsSlice = append(allWordsSlice, word)
  359. }
  360. // assemble vector
  361. srcVector := make([]int, len(allWordsSlice))
  362. dstVector := make([]int, len(allWordsSlice))
  363. for _, word := range srcWords {
  364. if index := BinarySearch(allWordsSlice, word); index != -1 {
  365. srcVector[index] += 1
  366. }
  367. }
  368. for _, word := range dstWords {
  369. if index := BinarySearch(allWordsSlice, word); index != -1 {
  370. dstVector[index] += 1
  371. }
  372. }
  373. // calc cos
  374. numerator := float64(0)
  375. srcSq := 0
  376. dstSq := 0
  377. for i, srcCount := range srcVector {
  378. dstCount := dstVector[i]
  379. numerator += float64(srcCount * dstCount)
  380. srcSq += srcCount * srcCount
  381. dstSq += dstCount * dstCount
  382. }
  383. denominator := math.Sqrt(float64(srcSq * dstSq))
  384. v1 := numerator / denominator
  385. // if v1 > 0.6 {
  386. // log.Println(v1, srcWords1, dstWords1)
  387. // }
  388. return v1
  389. }