init.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. package main
  2. import (
  3. "log"
  4. "math"
  5. mu "mfw/util"
  6. "qfw/util"
  7. //"qfw/util/mongodb"
  8. "regexp"
  9. "sort"
  10. "strings"
  11. "sync"
  12. //"gopkg.in/mgo.v2/bson"
  13. //"go.mongodb.org/mongo-driver/bson"
  14. "go.mongodb.org/mongo-driver/bson/primitive"
  15. )
  // ProjectCache is the cache name that stores each project's information,
  // keyed by project ID.
  const ProjectCache = "info"
  19. var (
  20. Sysconfig map[string]interface{} //读取配置文件
  21. MongoTool *MongodbSim //mongodb连接
  22. ExtractColl, ProjectColl string //抽取表、项目表
  23. //NextNode []interface{}
  24. )
  // Pre-compiled patterns used when classifying project names and codes.
  var (
  	// _datereg recognises a date such as 2019年1月2日 (optionally followed by an hour).
  	_datereg = regexp.MustCompile("20[0-2][0-9][年-][0-9]{1,2}[月-][0-9]{1,2}[日-]([0-9]{1,2}时[0-9]{0,2})?")
  	// _numreg1 matches a whole string of 1-8 digits/hyphens.
  	_numreg1 = regexp.MustCompile("^[0-9-]{1,8}$")
  	// _zimureg1 matches a whole string of 1-7 ASCII letters/hyphens.
  	_zimureg1 = regexp.MustCompile("^[a-zA-Z-]{1,7}$")
  	// _nzreg matches a whole string made only of ASCII digits, letters and hyphens.
  	_nzreg = regexp.MustCompile("^[0-9a-zA-Z-]+$")
  	// NOTE(review): this is a raw string, so `\\[` and `\\]` reach the regexp
  	// engine as an escaped backslash followed by a bracket; the first `]` then
  	// appears to close the character class early. Confirm the intended pattern.
  	_hanreg = regexp.MustCompile(`^[\p{Han}::【】\\[\\]()()--、]+$`)
  	// replaceStr matches generic words and punctuation.
  	// NOTE(review): presumably these are stripped before comparison - confirm at the call sites.
  	replaceStr = regexp.MustCompile("(工程|采购|项目|[?!、【】()—()--]|栏标价|中标候选人|招标代理)")

  	// pStr detects specific words such as sub-package / section / batch markers.
  	pStr = regexp.MustCompile("(勘察|监理|施工|设计|验收|标段|分包|子包|[0-9A-Z]包|[一二三四五六七八九十0-9]批)")
  	// nreg1 detects a run of two or more digits.
  	nreg1 = regexp.MustCompile("[0-9]{2,}")
  	// zreg1 detects at least one ASCII letter.
  	zreg1 = regexp.MustCompile("[a-zA-Z]{1,}")
  	// hreg1 detects at least one Han character.
  	hreg1 = regexp.MustCompile(`[\p{Han}]+`)
  	// numCheckPc matches a project code consisting only of digits/hyphens, at most 10 long.
  	numCheckPc = regexp.MustCompile("^[0-9-]{1,10}$")
  )

  // Lookup tables of three-letter comparison keys. They start out empty and are
  // only populated during initialisation.
  var (
  	compareNoPass = make(map[string]bool)
  	compareAB     = make(map[string]bool)
  	compareAB2D   = make(map[string]bool)
  	compareABD    = make(map[string]bool)
  	compareAB2CD  = make(map[string]bool)
  	compareABCD   = make(map[string]bool)
  )
  51. func init() {
  52. util.ReadConfig(&Sysconfig)
  53. MongoTool = &MongodbSim{
  54. MongodbAddr: Sysconfig["mongodbServers"].(string),
  55. Size: util.IntAll(Sysconfig["mongodbPoolSize"]),
  56. DbName: Sysconfig["mongodbName"].(string),
  57. }
  58. MongoTool.InitPool()
  59. ExtractColl = Sysconfig["extractColl"].(string)
  60. ProjectColl = Sysconfig["projectColl"].(string)
  61. //NextNode = Sysconfig["nextNode"].([]interface{})
  62. udpport, _ := Sysconfig["udpport"].(string)
  63. udpclient = mu.UdpClient{Local: udpport, BufSize: 1024}
  64. udpclient.Listen(processUdpMsg)
  65. log.Println("Udp服务监听", udpport)
  66. //加载项目数据
  67. //---不能通过
  68. vm := []string{"C", "D"}
  69. for i := 0; i < 2; i++ {
  70. for j := 0; j < 2; j++ {
  71. for k := 0; k < 2; k++ {
  72. key := vm[i] + vm[j] + vm[k]
  73. compareNoPass[key] = true
  74. //fmt.Println(key)
  75. }
  76. }
  77. }
  78. //fmt.Println("-------------------")
  79. //三个元素一致 [AB][AB][AB],分值最高
  80. vm = []string{"A", "B"}
  81. for i := 0; i < 2; i++ {
  82. for j := 0; j < 2; j++ {
  83. for k := 0; k < 2; k++ {
  84. key := vm[i] + vm[j] + vm[k]
  85. compareAB[key] = true
  86. //fmt.Println(key)
  87. }
  88. }
  89. }
  90. //fmt.Println("-------------------", len(compareAB))
  91. //---至少两个一致,其他可能不存在
  92. //[AB][AB][ABD]
  93. //[AB][ABD][AB]
  94. vm = []string{"A", "B"}
  95. vm2 := []string{"A", "B", "D"}
  96. for i := 0; i < 2; i++ {
  97. for j := 0; j < 2; j++ {
  98. for k := 0; k < 3; k++ {
  99. key := vm[i] + vm[j] + vm2[k]
  100. if !compareAB[key] {
  101. compareAB2D[key] = true
  102. //fmt.Println(key)
  103. }
  104. }
  105. }
  106. }
  107. for i := 0; i < 2; i++ {
  108. for j := 0; j < 3; j++ {
  109. for k := 0; k < 2; k++ {
  110. key := vm[i] + vm2[j] + vm[k]
  111. if !compareAB[key] {
  112. compareAB2D[key] = true
  113. //fmt.Println(key)
  114. }
  115. }
  116. }
  117. }
  118. //fmt.Println("-------------------", len(compareAB2D))
  119. //---至少一个一致,其他可能不存在
  120. //[ABD][ABD][ABD] //已经删除DDD
  121. vm = []string{"A", "B", "D"}
  122. for i := 0; i < 3; i++ {
  123. for j := 0; j < 3; j++ {
  124. for k := 0; k < 3; k++ {
  125. key := vm[i] + vm[j] + vm[k]
  126. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] {
  127. compareABD[key] = true
  128. //fmt.Println(key)
  129. }
  130. }
  131. }
  132. }
  133. //fmt.Println("-------------------", len(compareABD))
  134. //[AB][ABCD][AB]
  135. //[AB][AB][ABCD]
  136. vm = []string{"A", "B"}
  137. vm2 = []string{"A", "B", "C", "D"}
  138. for i := 0; i < 2; i++ {
  139. for j := 0; j < 4; j++ {
  140. for k := 0; k < 2; k++ {
  141. key := vm[i] + vm2[j] + vm[k]
  142. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] {
  143. compareAB2CD[key] = true
  144. //fmt.Println(key)
  145. }
  146. }
  147. }
  148. }
  149. for i := 0; i < 2; i++ {
  150. for j := 0; j < 2; j++ {
  151. for k := 0; k < 4; k++ {
  152. key := vm[i] + vm[j] + vm2[k]
  153. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] {
  154. compareAB2CD[key] = true
  155. //fmt.Println(key)
  156. }
  157. }
  158. }
  159. }
  160. //fmt.Println("-------------------", len(compareAB2CD))
  161. //[ABECD][ABECD][ABECD] //已经删除[CD][CD][CD] //这个要重点讨论
  162. vm = []string{"A", "B", "C", "D"}
  163. for i := 0; i < 4; i++ {
  164. for j := 0; j < 4; j++ {
  165. for k := 0; k < 4; k++ {
  166. key := vm[i] + vm[j] + vm[k]
  167. if !compareAB[key] && !compareAB2D[key] && !compareABD[key] && !compareNoPass[key] && !compareAB2CD[key] {
  168. compareABCD[key] = true
  169. //fmt.Println(key)
  170. }
  171. }
  172. }
  173. }
  174. }
  175. func CheckHanAndNum(str string) (b bool) {
  176. return nreg1.MatchString(str) && hreg1.MatchString(str)
  177. }
  178. func CheckZimuAndNum(str string) (b bool) {
  179. return zreg1.MatchString(str) && nreg1.MatchString(str)
  180. }
  181. type KeyMap struct {
  182. Lock sync.Mutex
  183. Map map[string]*Key
  184. }
  185. type ID struct {
  186. Id string
  187. Lock sync.Mutex
  188. pos int
  189. P *ProjectInfo
  190. }
  // Key holds a list of strings together with its own mutex.
  // NOTE(review): presumably Lock guards Arr - confirm at the call sites.
  type Key struct {
  	Arr  []string
  	Lock sync.Mutex
  }
  // IdAndLock couples an identifier string with a mutex.
  type IdAndLock struct {
  	Id   string
  	Lock sync.Mutex
  }
  199. func NewKeyMap() *KeyMap {
  200. return &KeyMap{
  201. Map: map[string]*Key{},
  202. Lock: sync.Mutex{},
  203. }
  204. }
  // Info is the entity for a single tender (bidding) notice.
  type Info struct {
  	Id          string `json:"_id"`
  	Href        string `json:"href"` // source URL
  	Publishtime int64  `json:"publishtime"`
  	Title       string `json:"title"`
  	TopType     string `json:"toptype"`
  	SubType     string `json:"subtype"`
  	ProjectName string `json:"projectname"`
  	ProjectCode string `json:"projectcode"`
  	Buyer       string `json:"buyer"`
  	Buyerperson string `json:"buyerperson"`
  	Buyertel    string `json:"buyertel"`
  	Agency      string `json:"agency"`
  	Area        string `json:"area"`
  	City        string `json:"city"`
  	District    string `json:"district"`

  	HasPackage bool                   // `json:"haspackage"`
  	Package    map[string]interface{} `json:"package"`
  	//PNum string `json:"pnum"`

  	Topscopeclass []string `json:"topscopeclass"`
  	Subscopeclass []string `json:"subscopeclass"`
  	Buyerclass    string   `json:"buyerclass"`
  	Bidopentime   int64    `json:"bidopentime"`
  	Budget        float64  `json:"budget"`
  	Bidamount     float64  `json:"bidamount"`

  	Winners     []string
  	dealtype    int
  	Winnerorder []string

  	PTC    string // project code extracted from the title
  	pnbval int    // how many of project name, project code and buyer are present

  	// Lengths of the project code, the project name and the title-extracted
  	// project code, in that order.
  	LenPC, LenPN, LenPTC int

  	// The next three are used when comparing for containment:
  	// 0 = initial, positive = contains, negative = is contained.
  	PNBH, PCBH, PTCBH int
  }
  244. //项目实体类
  245. type ProjectInfo struct {
  246. Id primitive.ObjectID `json:"_id"`
  247. FirstTime int64 `json:"firsttime,omitempty"` //项目的最早时间
  248. LastTime int64 `json:"lasttime,omitempty"` //项目的最后时间
  249. Ids []string `json:"ids,omitempty"`
  250. Topscopeclass []string `json:"topscopeclass,omitempty"`
  251. Subscopeclass []string `json:"subscopeclass,omitempty"` //子行业分类
  252. Winners []string `json:"winners,omitempty"` //中标人
  253. ProjectName string `json:"projectname,omitempty"` //项目名称
  254. ProjectCode string `json:"projectcode,omitempty"` //项目代码唯一(纯数字的权重低)
  255. Buyer string `json:"buyer,omitempty"` //采购单位唯一
  256. MPN []string `json:"mpn,omitempty"` //合并后多余的项目名称
  257. MPC []string `json:"mpc,omitempty"` //合并后多余的项目编号
  258. Buyerperson string `json:"buyerperson"` //采购联系人
  259. Buyertel string `json:"buyertel"` //采购联系人电话
  260. Agency string `json:"agency"` //代理机构
  261. Area string `json:"area"` //地区
  262. City string `json:"city"` //地市
  263. District string `json:"district"` //区县
  264. //HasPackage bool `json:"haspackage"` //是否有分包
  265. Package map[string]interface{} `json:"package,omitempty"` //分包的对比对象
  266. Buyerclass string `json:"buyerclass"` //采购单位分类
  267. Bidopentime int64 `json:"bidopentime,omitempty"` //开标时间
  268. // Zbtime int64 `json:"zbtime"` //招标时间
  269. // Jgtime int64 `json:"jgtime"` //结果中标时间
  270. Bidamount float64 `json:"bidamount,omitempty"` //中标金额
  271. Budget float64 `json:"budget,omitempty"` //预算
  272. //Winnerorder []string `json:"winnerorder"` //中标候选人
  273. score int
  274. comStr string
  275. resVal, pjVal int
  276. }
  // BinarySearch sorts s in place in ascending order and then binary-searches
  // it for k. It returns an index at which the sorted s holds k, or -1 when k
  // is absent. Because the sort happens on every call, the caller's slice is
  // reordered as a side effect.
  func BinarySearch(s []string, k string) int {
  	sort.Strings(s)
  	for low, high := 0, len(s)-1; low <= high; {
  		mid := (low + high) >> 1
  		switch cur := s[mid]; {
  		case cur < k:
  			low = mid + 1
  		case cur > k:
  			high = mid - 1
  		default:
  			return mid
  		}
  	}
  	return -1
  }
  // CosineSimilar computes the cosine similarity of two texts, treating each
  // text as a bag of characters (every UTF-8 character is one "word").
  //
  // The result is dot(src, dst) / sqrt(|src|^2 * |dst|^2), where the vectors are
  // per-character occurrence counts. It is 0 when the texts share no character.
  // When either text is empty the denominator is zero; 0 is returned in that
  // case instead of NaN.
  func CosineSimilar(srcWords1, dstWords1 string) float64 {
  	// Count how often each character occurs in each text.
  	srcCounts := make(map[string]int)
  	for _, ch := range strings.Split(srcWords1, "") {
  		srcCounts[ch]++
  	}
  	dstCounts := make(map[string]int)
  	for _, ch := range strings.Split(dstWords1, "") {
  		dstCounts[ch]++
  	}

  	// Characters missing from one side contribute 0 to the dot product, so it
  	// is enough to walk one map and look the character up in the other.
  	dot, srcSq, dstSq := 0, 0, 0
  	for ch, n := range srcCounts {
  		dot += n * dstCounts[ch]
  		srcSq += n * n
  	}
  	for _, n := range dstCounts {
  		dstSq += n * n
  	}

  	denominator := math.Sqrt(float64(srcSq * dstSq))
  	if denominator == 0 {
  		// At least one text is empty; avoid 0/0.
  		return 0
  	}
  	return float64(dot) / denominator
  }