dataMethod.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. package main
  2. import (
  3. qutil "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  4. "math"
  5. "regexp"
  6. "strings"
  7. )
  // Patterns used to normalize announcement titles and project names.
  var (
  	// cleanNameReg_0 matches one bracket, colon, or whitespace character.
  	cleanNameReg_0 = regexp.MustCompile("([(())::\\s ])")

  	// cleanNameReg_1 matches, at the end of a name, "项目" followed by up to
  	// five arbitrary characters, an announcement keyword, and an optional "公告".
  	cleanNameReg_1 = regexp.MustCompile("(项目)(.{0,5})(招标|中标|中标结果|成交|候选人|竞谈|竞争性磋商)(公告)?$")

  	// cleanNameReg_2 matches a trailing "公告", "公示", or "公告公告".
  	cleanNameReg_2 = regexp.MustCompile("(公告|公示|公告公告)$")

  	// cleanNameReg_3 matches "公开" or "的" directly followed by a
  	// procurement-method word (比选 / 招标 / 单一来源).
  	cleanNameReg_3 = regexp.MustCompile("(公开|的)(比选|招标|单一来源)")

  	// un_cleanNameReg_1 matches names ending in "项目<N>次<keyword>[公告]",
  	// where N is one of 一..九 or 1-9; cleanNameFilterRedundant skips
  	// cleanNameReg_1 for names that match it.
  	un_cleanNameReg_1 = regexp.MustCompile("(项目[一二三四五六七八九1-9][次](招标|中标|中标结果|成交|候选人|竞谈|竞争性磋商)(公告)?)$")
  )
  // convertArabicNumeralsAndLetters normalizes the ASCII letters and digits in
  // data so that differently written labels compare equal:
  //
  //   - lowercase letters a-z are converted to upper case;
  //   - digits 0-9 are replaced by the Chinese numerals 零一二三四五六七八九.
  //
  // Every other byte is copied through unchanged. The conversion is a single
  // pass over the bytes and does not compile any regular expression.
  func convertArabicNumeralsAndLetters(data string) string {
  	numerals := [...]string{"零", "一", "二", "三", "四", "五", "六", "七", "八", "九"}

  	var out strings.Builder
  	out.Grow(len(data))
  	for i := 0; i < len(data); i++ {
  		c := data[i]
  		switch {
  		case 'a' <= c && c <= 'z':
  			// ASCII letters are single bytes, so byte arithmetic is safe
  			// even inside multi-byte UTF-8 text.
  			out.WriteByte(c - 'a' + 'A')
  		case '0' <= c && c <= '9':
  			out.WriteString(numerals[c-'0'])
  		default:
  			out.WriteByte(c)
  		}
  	}
  	return out.String()
  }
  // dealWithSpecialPhrases normalizes special phrases in both inputs and
  // returns the results in the same order. Currently every occurrence of
  // "重新招标" is shortened to "重招".
  //
  // The phrase is a fixed literal, so a plain string replacement is used
  // instead of compiling a regular expression on every call.
  func dealWithSpecialPhrases(str1 string, str2 string) (string, string) {
  	const (
  		phrase      = "重新招标"
  		replacement = "重招"
  	)
  	return strings.Replace(str1, phrase, replacement, -1),
  		strings.Replace(str2, phrase, replacement, -1)
  }
  44. // 关键词数量v
  45. func dealWithSpecialWordNumber(info *Info, v *Info) int {
  46. okNum := 0
  47. if info.titleSpecialWord || info.specialWord {
  48. okNum++
  49. }
  50. if v.titleSpecialWord || v.specialWord {
  51. okNum++
  52. }
  53. return okNum
  54. }
  55. // 关键词再次判断
  56. func againRepeat(v *Info, info *Info, site bool) bool {
  57. if isPublishtimeInterval(info.publishtime, v.publishtime) && site {
  58. return true
  59. }
  60. if isBidopentimeInterval(info.bidopentime, v.bidopentime) {
  61. return true
  62. }
  63. if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
  64. return true
  65. }
  66. if isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0 && info.bidamount != 0 {
  67. return true
  68. }
  69. if v.winner != info.winner && v.winner != "" && info.winner != "" {
  70. return true
  71. }
  72. if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
  73. return true
  74. }
  75. if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
  76. return true
  77. }
  78. if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
  79. return true
  80. }
  81. if v.c_title != "" && info.c_title != "" && v.c_title != info.c_title {
  82. if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
  83. return true
  84. }
  85. }
  86. return false
  87. }
  88. // 均含有关键词再次判断
  89. func againContainSpecialWord(v *Info, info *Info) bool {
  90. if isBidopentimeInterval(info.bidopentime, v.bidopentime) {
  91. return true
  92. }
  93. if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
  94. return true
  95. }
  96. if isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0 && info.bidamount != 0 {
  97. return true
  98. }
  99. if v.winner != info.winner && v.winner != "" && info.winner != "" {
  100. return true
  101. }
  102. if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
  103. return true
  104. }
  105. if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
  106. return true
  107. }
  108. //提取标题-标段号处理
  109. if dealTitleSpecial(v.title, info.title) {
  110. return true
  111. }
  112. return false
  113. }
  114. // 提取标题-标段号处理
  115. func dealTitleSpecial(title1 string, title2 string) bool {
  116. regular1 := "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789]+[))]?"
  117. regular2 := "[0-9a-zA-Z一二三四五六七八九十零123456789]+(包|标段|标包)"
  118. regx1_1, _ := regexp.Compile(regular1)
  119. str1 := regx1_1.FindString(title1)
  120. if str1 == "" {
  121. regx1_2, _ := regexp.Compile(regular2)
  122. str1 = regx1_2.FindString(title1)
  123. }
  124. regx2_1, _ := regexp.Compile(regular1)
  125. str2 := regx2_1.FindString(title2)
  126. if str2 == "" {
  127. regx2_2, _ := regexp.Compile(regular2)
  128. str2 = regx2_2.FindString(title2)
  129. }
  130. //根据提取的结果,在进行清洗
  131. if str1 != "" {
  132. str1 = deleteExtraSpaceName(str1)
  133. str1 = cleanNameReg_0.ReplaceAllString(str1, "")
  134. str1 = convertArabicNumeralsAndLetters(str1)
  135. }
  136. if str2 != "" {
  137. str2 = deleteExtraSpaceName(str2)
  138. str2 = cleanNameReg_0.ReplaceAllString(str2, "")
  139. str2 = convertArabicNumeralsAndLetters(str2)
  140. }
  141. if str1 != str2 {
  142. return true
  143. } else {
  144. return false
  145. }
  146. }
  // extraSpaceReg matches a run of two or more whitespace characters.
  var extraSpaceReg = regexp.MustCompile("\\s{2,}")

  // deleteExtraSpaceName removes redundant whitespace (including tabs) from s.
  // Tabs are first converted to spaces; then every run of two or more
  // whitespace characters is reduced to the first character of that run.
  // Single whitespace characters are left in place.
  func deleteExtraSpaceName(s string) string {
  	// Convert tabs to spaces so a lone tab is normalized as well.
  	spaced := strings.Replace(s, "\t", " ", -1)
  	// \s only matches ASCII whitespace, so slicing the first byte of the
  	// matched run is safe.
  	return extraSpaceReg.ReplaceAllStringFunc(spaced, func(run string) string {
  		return run[:1]
  	})
  }
  // isBidWinningAmount reports whether two bid amounts are genuinely
  // different. Amounts are treated as the same when they are equal or when
  // one is exactly 10000 times the other (the same figure stated in a unit
  // 10000 times larger); in those cases the result is false.
  func isBidWinningAmount(f1 float64, f2 float64) bool {
  	const unitRatio = 10000
  	equal := f1 == f2
  	scaled := f1*unitRatio == f2 || f2*unitRatio == f1
  	return !(equal || scaled)
  }
  // isTimeIntervalPeriod reports whether two Unix timestamps (in seconds) are
  // strictly less than 48 hours (172800 seconds) apart. A gap of exactly 48
  // hours or more yields false.
  func isTimeIntervalPeriod(i1 int64, i2 int64) bool {
  	gap := math.Abs(float64(i1 - i2))
  	return gap < 172800.0
  }
  177. // 开标时间区间为一天
  178. func isBidopentimeInterval(i1 int64, i2 int64) bool {
  179. if i1 == 0 || i2 == 0 {
  180. return false
  181. }
  182. //不在同一天-或者同一天间隔超过六小时,属于不相等返回true
  183. timeOne, timeTwo := i1, i2
  184. day1 := qutil.FormatDateByInt64(&timeOne, qutil.Date_yyyyMMdd)
  185. day2 := qutil.FormatDateByInt64(&timeTwo, qutil.Date_yyyyMMdd)
  186. if day1 == day2 {
  187. //是否间隔超过十二小时
  188. if math.Abs(float64(i1-i2)) > 43200.0 {
  189. return true
  190. } else {
  191. return false
  192. }
  193. } else {
  194. return true
  195. }
  196. }
  197. // 发布时间区间为一天
  198. func isPublishtimeInterval(i1 int64, i2 int64) bool {
  199. if i1 == 0 || i2 == 0 {
  200. return false
  201. }
  202. //不在同一天-或者同一天间隔超过12小时,属于不相等返回true
  203. timeOne, timeTwo := i1, i2
  204. day1 := qutil.FormatDateByInt64(&timeOne, qutil.Date_yyyyMMdd)
  205. day2 := qutil.FormatDateByInt64(&timeTwo, qutil.Date_yyyyMMdd)
  206. if day1 == day2 {
  207. //是否间隔超过十二小时
  208. if math.Abs(float64(i1-i2)) >= 43200.0 {
  209. return true
  210. } else {
  211. return false
  212. }
  213. } else {
  214. return true
  215. }
  216. }
  217. // 时间区间为一天
  218. func isTheSameDay(i1 int64, i2 int64) bool {
  219. if i1 == 0 || i2 == 0 {
  220. return false
  221. }
  222. timeOne, timeTwo := i1, i2
  223. day1 := qutil.FormatDateByInt64(&timeOne, qutil.Date_yyyyMMdd)
  224. day2 := qutil.FormatDateByInt64(&timeTwo, qutil.Date_yyyyMMdd)
  225. if day1 == day2 {
  226. return true
  227. }
  228. return false
  229. }
  230. // 前置0 五要素均相等认为重复
  231. func leadingElementSame(v *Info, info *Info) bool {
  232. isok := 0
  233. if info.projectname != "" && v.projectname == info.projectname {
  234. isok++
  235. }
  236. if info.buyer != "" && v.buyer == info.buyer {
  237. isok++
  238. }
  239. if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
  240. if info.contractnumber != "" && v.contractnumber == info.contractnumber {
  241. isok++
  242. }
  243. } else {
  244. if info.projectcode != "" && v.projectcode == info.projectcode {
  245. isok++
  246. }
  247. }
  248. if info.title != "" && v.title == info.title {
  249. isok++
  250. }
  251. if v.agency == info.agency {
  252. isok++
  253. }
  254. if v.winner == info.winner && info.winner != "" {
  255. isok++
  256. }
  257. if isok >= 5 { //加一层金额单位的逻辑校验
  258. if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
  259. return false
  260. }
  261. if isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0.0 && info.bidamount != 0.0 {
  262. return false
  263. }
  264. if v.winner != "" && info.winner != "" && v.winner != info.winner {
  265. return false
  266. }
  267. return true
  268. }
  269. return false
  270. }
  271. // 前置0 竞品要素简易计算
  272. func jingPinElementSame(v *Info, info *Info) bool {
  273. if info.projectname != "" && v.projectname != info.projectname {
  274. return false
  275. }
  276. if info.buyer != "" && v.buyer != info.buyer {
  277. return false
  278. }
  279. if info.projectcode != "" && v.projectcode != info.projectcode {
  280. return false
  281. }
  282. if v.agency != info.agency {
  283. return false
  284. }
  285. return true
  286. }
  287. // buyer的优先级
  288. func buyerIsContinue(v *Info, info *Info) bool {
  289. if !isTheSameDay(info.publishtime, v.publishtime) {
  290. return true
  291. }
  292. if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
  293. return true
  294. }
  295. if v.c_title != "" && info.c_title != "" && v.c_title != info.c_title {
  296. if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
  297. return true
  298. }
  299. }
  300. if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
  301. return true
  302. }
  303. if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
  304. return true
  305. }
  306. return false
  307. }
  // judgeIsReplaceInfo decides whether record data needs to be replaced based
  // on the two links: it returns true when s_href contains
  // "https://www.jianyu360.cn" while i_href is non-empty and does not.
  func judgeIsReplaceInfo(s_href string, i_href string) bool {
  	const siteURL = "https://www.jianyu360.cn"
  	if !strings.Contains(s_href, siteURL) {
  		return false
  	}
  	if i_href == "" {
  		return false
  	}
  	return !strings.Contains(i_href, siteURL)
  }
  316. // 查询抽取表数据
  317. func confrimExtractData(source_id string, info_id string) (bool, map[string]interface{}, map[string]interface{}) {
  318. source_data := map[string]interface{}{}
  319. info_data := map[string]interface{}{}
  320. isvalid := false
  321. source_data = data_mgo.FindById(extract, source_id)
  322. info_data = data_mgo.FindById(extract, info_id)
  323. if len(source_data) > 2 && len(info_data) > 2 {
  324. isvalid = true
  325. ts_id := source_data["_id"]
  326. ti_id := info_data["_id"]
  327. source_data["_id"] = ti_id
  328. info_data["_id"] = ts_id
  329. }
  330. return isvalid, info_data, source_data
  331. }
  332. // 查询历史抽取表数据
  333. func confrimHistoryExtractData(source_id string, info_id string) (bool, bool, map[string]interface{}, map[string]interface{}) {
  334. source_data := map[string]interface{}{}
  335. info_data := map[string]interface{}{}
  336. isvalid := false
  337. isexists := false
  338. if judgeIsCurIds(gtid, lteid, source_id) {
  339. isexists = true
  340. source_data = data_mgo.FindById(extract, source_id)
  341. } else {
  342. source_data = data_mgo.FindById(extract_back, source_id)
  343. }
  344. info_data = data_mgo.FindById(extract, info_id)
  345. if len(source_data) > 2 && len(info_data) > 2 {
  346. isvalid = true
  347. ts_id := source_data["_id"]
  348. ti_id := info_data["_id"]
  349. source_data["_id"] = ti_id
  350. info_data["_id"] = ts_id
  351. }
  352. return isvalid, isexists, info_data, source_data
  353. }
  354. // 查询bidding表数据
  355. func confrimBiddingData(source_id string, info_id string) (bool, map[string]interface{}, map[string]interface{}) {
  356. source_data := map[string]interface{}{}
  357. info_data := map[string]interface{}{}
  358. isvalid := false
  359. source_data = task_mgo.FindById(task_bidding, source_id)
  360. info_data = task_mgo.FindById(task_bidding, info_id)
  361. if len(source_data) > 2 && len(info_data) > 2 {
  362. isvalid = true
  363. ts_id := source_data["_id"]
  364. ti_id := info_data["_id"]
  365. source_data["_id"] = ti_id
  366. info_data["_id"] = ts_id
  367. }
  368. return isvalid, info_data, source_data
  369. }
  // IsJpHref reports whether href is non-empty and contains "www.jianyu360".
  // The file labels such links as competitor ("竞品") links.
  func IsJpHref(href string) bool {
  	if href == "" {
  		return false
  	}
  	return strings.Contains(href, "www.jianyu360")
  }
  377. // 验证竞品是否重复
  378. func confirmJingPinIsRepeatData(v *Info, info *Info) bool {
  379. //标题验证~是否有关联~是否需要清洗数据-长度需要考虑
  380. if v.c_title != "" && info.c_title != "" { //标题相似判断
  381. if !(strings.Contains(v.c_title, info.c_title) || strings.Contains(info.c_title, v.c_title)) {
  382. if !jingPinElementSame(v, info) {
  383. return false
  384. }
  385. }
  386. if !isTheSameDay(v.publishtime, info.publishtime) {
  387. return false
  388. }
  389. if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
  390. return false
  391. }
  392. if isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0.0 && info.bidamount != 0.0 {
  393. return false
  394. }
  395. if v.winner != "" && info.winner != "" && v.winner != info.winner {
  396. return false
  397. }
  398. if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
  399. return false
  400. }
  401. if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
  402. return false
  403. }
  404. return true
  405. }
  406. return false
  407. }
  408. // 通用清洗~清洗名称~过滤冗余~
  409. func cleanNameFilterRedundant(name string) string {
  410. new_name := name
  411. new_name = cleanNameReg_0.ReplaceAllString(new_name, "")
  412. if !un_cleanNameReg_1.MatchString(new_name) {
  413. new_name = cleanNameReg_1.ReplaceAllString(new_name, "${1}${3}")
  414. }
  415. new_name = cleanNameReg_2.ReplaceAllString(new_name, "")
  416. new_name = cleanNameReg_3.ReplaceAllString(new_name, "${2}")
  417. return new_name
  418. }