init.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. package main
  2. import (
  3. "log"
  4. "math"
  5. mu "mfw/util"
  6. "qfw/util"
  7. "qfw/util/mongodb"
  8. "regexp"
  9. "sort"
  10. "strings"
  11. "sync"
  12. "gopkg.in/mgo.v2/bson"
  13. )
  14. const (
  15. ProjectCache = "info" //存放每条项目信息,key为项目ID
  16. )
  17. var (
  18. Sysconfig map[string]interface{} //读取配置文件
  19. MongoTool mongodb.MongodbSim //mongodb连接
  20. ExtractColl, ProjectColl string //抽取表、项目表
  21. )
  22. var (
  23. //判断是日期
  24. _datereg = regexp.MustCompile("20[0-2][0-9][年-][0-9]{1,2}[月-][0-9]{1,2}[日-]([0-9]{1,2}时[0-9]{0,2})?")
  25. _numreg1 = regexp.MustCompile("^[0-9-]{1,8}$")
  26. _zimureg1 = regexp.MustCompile("^[a-zA-Z-]{1,7}$")
  27. _nzreg = regexp.MustCompile("^[0-9a-zA-Z-]+$")
  28. _hanreg = regexp.MustCompile(`^[\p{Han}::【】\\[\\]()()--、]+$`)
  29. replaceStr = regexp.MustCompile("(工程|采购|项目|[?!、【】()—()--]|栏标价|中标候选人|招标代理)")
  30. //判断带有分包、等特定词的
  31. pStr = regexp.MustCompile("(勘察|监理|施工|设计|验收|标段|分包|子包|[0-9A-Z]包|[一二三四五六七八九十0-9]批)")
  32. //判断包含数值
  33. nreg1 = regexp.MustCompile("[0-9]{2,}")
  34. //判断包含字母
  35. zreg1 = regexp.MustCompile("[a-zA-Z]{1,}")
  36. //判断包含汉字
  37. hreg1 = regexp.MustCompile(`[\p{Han}]+`)
  38. //判断项目编号是在10以内的纯数字结构
  39. numCheckPc = regexp.MustCompile("^[0-9-]{1,10}$")
  40. //仅初始化使用
  41. compareNoPass = map[string]bool{}
  42. compareAB = map[string]bool{}
  43. compareAB2D = map[string]bool{}
  44. compareABD = map[string]bool{}
  45. compareAB2CD = map[string]bool{}
  46. compareABCD = map[string]bool{}
  47. )
  48. func init() {
  49. util.ReadConfig(&Sysconfig)
  50. MongoTool = mongodb.MongodbSim{
  51. MongodbAddr: Sysconfig["mongodbServers"].(string),
  52. Size: util.IntAll(Sysconfig["mongodbPoolSize"]),
  53. DbName: Sysconfig["mongodbName"].(string),
  54. }
  55. MongoTool.InitPool()
  56. ExtractColl = Sysconfig["extractColl"].(string)
  57. ProjectColl = Sysconfig["projectColl"].(string)
  58. udpport, _ := Sysconfig["udpport"].(string)
  59. udpclient = mu.UdpClient{Local: udpport, BufSize: 1024}
  60. udpclient.Listen(processUdpMsg)
  61. log.Println("Udp服务监听", udpport)
  62. //加载项目数据
  63. //---不能通过
  64. vm := []string{"C", "D"}
  65. for i := 0; i < 2; i++ {
  66. for j := 0; j < 2; j++ {
  67. for k := 0; k < 2; k++ {
  68. key := vm[i] + vm[j] + vm[k]
  69. compareNoPass[key] = true
  70. //fmt.Println(key)
  71. }
  72. }
  73. }
  74. //fmt.Println("-------------------")
  75. //三个元素一致 [AB][AB][AB],分值最高
  76. vm = []string{"A", "B"}
  77. for i := 0; i < 2; i++ {
  78. for j := 0; j < 2; j++ {
  79. for k := 0; k < 2; k++ {
  80. key := vm[i] + vm[j] + vm[k]
  81. compareAB[key] = true
  82. //fmt.Println(key)
  83. }
  84. }
  85. }
  86. //fmt.Println("-------------------", len(compareAB))
  87. //---至少两个一致,其他可能不存在
  88. //[AB][AB][ABD]
  89. //[AB][ABD][AB]
  90. vm = []string{"A", "B"}
  91. vm2 := []string{"A", "B", "D"}
  92. for i := 0; i < 2; i++ {
  93. for j := 0; j < 2; j++ {
  94. for k := 0; k < 3; k++ {
  95. key := vm[i] + vm[j] + vm2[k]
  96. if !compareAB[key] {
  97. compareAB2D[key] = true
  98. //fmt.Println(key)
  99. }
  100. }
  101. }
  102. }
  103. for i := 0; i < 2; i++ {
  104. for j := 0; j < 3; j++ {
  105. for k := 0; k < 2; k++ {
  106. key := vm[i] + vm2[j] + vm[k]
  107. if !compareAB[key] {
  108. compareAB2D[key] = true
  109. //fmt.Println(key)
  110. }
  111. }
  112. }
  113. }
  114. //fmt.Println("-------------------", len(compareAB2D))
  115. //---至少一个一致,其他可能不存在
  116. //[ABD][ABD][ABD] //已经删除DDD
  117. vm = []string{"A", "B", "D"}
  118. for i := 0; i < 3; i++ {
  119. for j := 0; j < 3; j++ {
  120. for k := 0; k < 3; k++ {
  121. key := vm[i] + vm[j] + vm[k]
  122. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] {
  123. compareABD[key] = true
  124. //fmt.Println(key)
  125. }
  126. }
  127. }
  128. }
  129. //fmt.Println("-------------------", len(compareABD))
  130. //[AB][ABCD][AB]
  131. //[AB][AB][ABCD]
  132. vm = []string{"A", "B"}
  133. vm2 = []string{"A", "B", "C", "D"}
  134. for i := 0; i < 2; i++ {
  135. for j := 0; j < 4; j++ {
  136. for k := 0; k < 2; k++ {
  137. key := vm[i] + vm2[j] + vm[k]
  138. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] {
  139. compareAB2CD[key] = true
  140. //fmt.Println(key)
  141. }
  142. }
  143. }
  144. }
  145. for i := 0; i < 2; i++ {
  146. for j := 0; j < 2; j++ {
  147. for k := 0; k < 4; k++ {
  148. key := vm[i] + vm[j] + vm2[k]
  149. if !compareAB[key] && !compareAB2D[key] && !compareNoPass[key] && !compareABD[key] {
  150. compareAB2CD[key] = true
  151. //fmt.Println(key)
  152. }
  153. }
  154. }
  155. }
  156. //fmt.Println("-------------------", len(compareAB2CD))
  157. //[ABECD][ABECD][ABECD] //已经删除[CD][CD][CD] //这个要重点讨论
  158. vm = []string{"A", "B", "C", "D"}
  159. for i := 0; i < 4; i++ {
  160. for j := 0; j < 4; j++ {
  161. for k := 0; k < 4; k++ {
  162. key := vm[i] + vm[j] + vm[k]
  163. if !compareAB[key] && !compareAB2D[key] && !compareABD[key] && !compareNoPass[key] && !compareAB2CD[key] {
  164. compareABCD[key] = true
  165. //fmt.Println(key)
  166. }
  167. }
  168. }
  169. }
  170. }
  171. func CheckHanAndNum(str string) (b bool) {
  172. return nreg1.MatchString(str) && hreg1.MatchString(str)
  173. }
  174. func CheckZimuAndNum(str string) (b bool) {
  175. return zreg1.MatchString(str) && nreg1.MatchString(str)
  176. }
  177. type KeyMap struct {
  178. Lock sync.Mutex
  179. Map map[string]*Key
  180. }
  181. type ID struct {
  182. Id string
  183. Lock sync.Mutex
  184. pos int
  185. P *ProjectInfo
  186. }
  187. type Key struct {
  188. Arr []string
  189. Lock sync.Mutex
  190. }
  191. type IdAndLock struct {
  192. Id string
  193. Lock sync.Mutex
  194. }
  195. func NewKeyMap() *KeyMap {
  196. return &KeyMap{
  197. Map: map[string]*Key{},
  198. Lock: sync.Mutex{},
  199. }
  200. }
  201. //招标信息实体类
  202. type Info struct {
  203. Id string `json:"_id"`
  204. Href string `json:"href"` //源地址
  205. Publishtime int64 `json:"publishtime"`
  206. Title string `json:"title"`
  207. TopType string `json:"toptype"`
  208. SubType string `json:"subtype"`
  209. ProjectName string `json:"projectname"`
  210. ProjectCode string `json:"projectcode"`
  211. Buyer string `json:"buyer"`
  212. Buyerperson string `json:"buyerperson"`
  213. Buyertel string `json:"buyertel"`
  214. Agency string `json:"agency"`
  215. Area string `json:"area"`
  216. City string `json:"city"`
  217. District string `json:"district"`
  218. HasPackage bool `json:"haspackage"`
  219. Package map[string]interface{} `json:"package"`
  220. PNum string `json:"pnum"`
  221. Topscopeclass []string `json:"topscopeclass"`
  222. Subscopeclass []string `json:"subscopeclass"`
  223. Buyerclass string `json:"buyerclass"`
  224. Bidopentime int64 `json:"bidopentime"`
  225. Budget float64 `json:"budget"`
  226. Bidamount float64 `json:"bidamount"`
  227. Winners []string
  228. dealtype int
  229. Winnerorder []string
  230. PTC string //从标题中抽的项目编号
  231. pnbval int //项目名称、编号、采购单位存在的个数
  232. LenPC int //项目编号长度
  233. LenPN int //项目名称长度
  234. LenPTC int //标题抽的项目编号长度
  235. //以下三个元素做对比,计算包含时候使用
  236. PNBH int //0初始,+包含,-被包含
  237. PCBH int
  238. PTCBH int
  239. }
  240. //项目实体类
  241. type ProjectInfo struct {
  242. Id bson.ObjectId `bson:"_id"`
  243. FirstTime int64 `json:"firsttime"` //项目的最早时间
  244. LastTime int64 `json:"lasttime"` //项目的最后时间
  245. Ids []string `json:"ids"`
  246. Topscopeclass []string `json:"topscopeclass"`
  247. Subscopeclass []string `json:"subscopeclass"` //子行业分类
  248. Winners []string `json:"winners"` //中标人
  249. ProjectName string `json:"projectname"` //项目名称
  250. ProjectCode string `json:"projectcode"` //项目代码唯一(纯数字的权重低)
  251. Buyer string `json:"buyer"` //采购单位唯一
  252. MPN []string `json:"mpn"` //合并后多余的项目名称
  253. MPC []string `json:"mpc"` //合并后多余的项目编号
  254. Buyerperson string `json:"buyerperson"` //采购联系人
  255. Buyertel string `json:"buyertel"` //采购联系人电话
  256. Agency string `json:"agency"` //代理机构
  257. Area string `json:"area"` //地区
  258. City string `json:"city"` //地市
  259. District string `json:"district"` //区县
  260. HasPackage bool `json:"haspackage"` //是否有分包
  261. Package map[string]interface{} `json:"package"` //分包的对比对象
  262. Buyerclass string `json:"buyerclass"` //采购单位分类
  263. Bidopentime int64 `json:"bidopentime"` //开标时间
  264. // Zbtime int64 `json:"zbtime"` //招标时间
  265. // Jgtime int64 `json:"jgtime"` //结果中标时间
  266. Bidamount float64 `json:"bidamount"` //中标金额
  267. Budget float64 `json:"budget"` //预算
  268. //Winnerorder []string `json:"winnerorder"` //中标候选人
  269. score int
  270. comStr string
  271. }
  272. //二分字符串查找
  273. func BinarySearch(s []string, k string) int {
  274. sort.Strings(s)
  275. lo, hi := 0, len(s)-1
  276. for lo <= hi {
  277. m := (lo + hi) >> 1
  278. if s[m] < k {
  279. lo = m + 1
  280. } else if s[m] > k {
  281. hi = m - 1
  282. } else {
  283. return m
  284. }
  285. }
  286. return -1
  287. }
  288. //计算文本相似度
  289. func CosineSimilar(srcWords1, dstWords1 string) float64 {
  290. srcWords, dstWords := strings.Split(srcWords1, ""), strings.Split(dstWords1, "")
  291. // get all words
  292. allWordsMap := make(map[string]int, 0)
  293. for _, word := range srcWords {
  294. if _, found := allWordsMap[word]; !found {
  295. allWordsMap[word] = 1
  296. } else {
  297. allWordsMap[word] += 1
  298. }
  299. }
  300. for _, word := range dstWords {
  301. if _, found := allWordsMap[word]; !found {
  302. allWordsMap[word] = 1
  303. } else {
  304. allWordsMap[word] += 1
  305. }
  306. }
  307. // stable the sort
  308. allWordsSlice := make([]string, 0)
  309. for word, _ := range allWordsMap {
  310. allWordsSlice = append(allWordsSlice, word)
  311. }
  312. // assemble vector
  313. srcVector := make([]int, len(allWordsSlice))
  314. dstVector := make([]int, len(allWordsSlice))
  315. for _, word := range srcWords {
  316. if index := BinarySearch(allWordsSlice, word); index != -1 {
  317. srcVector[index] += 1
  318. }
  319. }
  320. for _, word := range dstWords {
  321. if index := BinarySearch(allWordsSlice, word); index != -1 {
  322. dstVector[index] += 1
  323. }
  324. }
  325. // calc cos
  326. numerator := float64(0)
  327. srcSq := 0
  328. dstSq := 0
  329. for i, srcCount := range srcVector {
  330. dstCount := dstVector[i]
  331. numerator += float64(srcCount * dstCount)
  332. srcSq += srcCount * srcCount
  333. dstSq += dstCount * dstCount
  334. }
  335. denominator := math.Sqrt(float64(srcSq * dstSq))
  336. v1 := numerator / denominator
  337. // if v1 > 0.6 {
  338. // log.Println(v1, srcWords1, dstWords1)
  339. // }
  340. return v1
  341. }