package main

import (
	"math"
	"regexp"
	"strings"

	qutil "jygit.jydev.jianyu360.cn/data_processing/common_utils"
)

// Name-cleaning patterns used by cleanNameFilterRedundant and dealTitleSpecial.
var (
	// cleanNameReg_0 matches brackets, colons and whitespace, which are stripped from names.
	cleanNameReg_0 = regexp.MustCompile("([(())::\\s ])")
	// cleanNameReg_1 matches a trailing "项目 … <notice type>[公告]" suffix.
	cleanNameReg_1 = regexp.MustCompile("(项目)(.{0,5})(招标|中标|中标结果|成交|候选人|竞谈|竞争性磋商)(公告)?$")
	// cleanNameReg_2 matches a trailing announcement word.
	cleanNameReg_2 = regexp.MustCompile("(公告|公示|公告公告)$")
	// cleanNameReg_3 matches "公开"/"的" directly followed by a procurement-method word.
	cleanNameReg_3 = regexp.MustCompile("(公开|的)(比选|招标|单一来源)")
	// un_cleanNameReg_1 matches names whose suffix carries a round number ("项目二次招标…");
	// such names are exempt from the cleanNameReg_1 rewrite.
	un_cleanNameReg_1 = regexp.MustCompile("(项目[一二三四五六七八九1-9][次](招标|中标|中标结果|成交|候选人|竞谈|竞争性磋商)(公告)?)$")
)

// Patterns and replacers compiled once at package scope instead of on every call.
var (
	asciiLettersReg = regexp.MustCompile("[a-zA-Z]+")
	// lotPrefixReg matches a lot/section keyword followed by its number, e.g. "标段3".
	lotPrefixReg = regexp.MustCompile("(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789]+[))]?")
	// lotSuffixReg matches a number followed by a lot/section keyword, e.g. "3标段".
	lotSuffixReg  = regexp.MustCompile("[0-9a-zA-Z一二三四五六七八九十零123456789]+(包|标段|标包)")
	multiSpaceReg = regexp.MustCompile("\\s{2,}")
	// digitToChinese rewrites every ASCII digit as the matching Chinese numeral.
	digitToChinese = strings.NewReplacer(
		"0", "零", "1", "一", "2", "二", "3", "三", "4", "四",
		"5", "五", "6", "六", "7", "七", "8", "八", "9", "九",
	)
)

const (
	// halfDaySeconds is 12 hours expressed in seconds.
	halfDaySeconds = 43200.0
	// twoDaysSeconds is 48 hours expressed in seconds.
	twoDaysSeconds = 172800.0
)

// bothSetAndDiffer reports whether a and b are both non-empty and unequal.
func bothSetAndDiffer(a, b string) bool {
	return a != "" && b != "" && a != b
}

// budgetConflict reports whether both records carry a non-zero budget and the budgets differ.
func budgetConflict(v *Info, info *Info) bool {
	return v.budget != 0 && info.budget != 0 && v.budget != info.budget
}

// bidAmountConflict reports whether both records carry a non-zero bid amount and the
// amounts differ according to isBidWinningAmount.
func bidAmountConflict(v *Info, info *Info) bool {
	return v.bidamount != 0 && info.bidamount != 0 && isBidWinningAmount(v.bidamount, info.bidamount)
}

// sameCalendarDay reports whether both timestamps format to the same yyyyMMdd string.
func sameCalendarDay(i1 int64, i2 int64) bool {
	day1 := qutil.FormatDateByInt64(&i1, qutil.Date_yyyyMMdd)
	day2 := qutil.FormatDateByInt64(&i2, qutil.Date_yyyyMMdd)
	return day1 == day2
}

// convertArabicNumeralsAndLetters normalises a string for duplicate detection:
// runs of ASCII letters are upper-cased and every ASCII digit is replaced by the
// corresponding Chinese numeral. All other characters are left untouched.
func convertArabicNumeralsAndLetters(data string) string {
	upper := asciiLettersReg.ReplaceAllStringFunc(data, strings.ToUpper)
	return digitToChinese.Replace(upper)
}

// dealWithSpecialPhrases shortens the phrase "重新招标" to "重招" in both inputs
// and returns the two rewritten strings in the same order.
func dealWithSpecialPhrases(str1 string, str2 string) (string, string) {
	const long, short = "重新招标", "重招"
	return strings.Replace(str1, long, short, -1), strings.Replace(str2, long, short, -1)
}

// dealWithSpecialWordNumber counts how many of the two records (0, 1 or 2) have
// either their titleSpecialWord or specialWord flag set.
func dealWithSpecialWordNumber(info *Info, v *Info) int {
	okNum := 0
	if info.titleSpecialWord || info.specialWord {
		okNum++
	}
	if v.titleSpecialWord || v.specialWord {
		okNum++
	}
	return okNum
}

// againRepeat is the second-pass keyword check. It returns true as soon as any
// compared field indicates that the two records differ:
// publish time (only when site is true), bid-open time, budget, bid amount,
// winner, contract number, project code or cleaned project name.
func againRepeat(v *Info, info *Info, site bool) bool {
	switch {
	case isPublishtimeInterval(info.publishtime, v.publishtime) && site:
		return true
	case isBidopentimeInterval(info.bidopentime, v.bidopentime):
		return true
	case budgetConflict(v, info):
		return true
	case bidAmountConflict(v, info):
		return true
	case bothSetAndDiffer(v.winner, info.winner):
		return true
	case bothSetAndDiffer(v.contractnumber, info.contractnumber):
		return true
	case bothSetAndDiffer(v.projectcode, info.projectcode):
		return true
	case bothSetAndDiffer(v.c_projectname, info.c_projectname):
		return true
	}
	// NOTE(review): a further c_title comparison used to follow here, but it only
	// returned true under the same c_projectname condition tested above, so it
	// could never change the result.
	return false
}

// againContainSpecialWord is the second-pass check used when both records contain
// a special keyword. It returns true as soon as bid-open time, budget, bid amount,
// winner, contract number, project code or the lot/section tag extracted from the
// titles indicates a difference.
func againContainSpecialWord(v *Info, info *Info) bool {
	switch {
	case isBidopentimeInterval(info.bidopentime, v.bidopentime):
		return true
	case budgetConflict(v, info):
		return true
	case bidAmountConflict(v, info):
		return true
	case bothSetAndDiffer(v.winner, info.winner):
		return true
	case bothSetAndDiffer(v.contractnumber, info.contractnumber):
		return true
	case bothSetAndDiffer(v.projectcode, info.projectcode):
		return true
	}
	// Compare the lot/section tag extracted from the titles.
	return dealTitleSpecial(v.title, info.title)
}

// extractLotTag returns the normalised lot/section tag found in title, or "" when
// the title has none. The keyword-first form is tried before the number-first form.
func extractLotTag(title string) string {
	tag := lotPrefixReg.FindString(title)
	if tag == "" {
		tag = lotSuffixReg.FindString(title)
	}
	if tag == "" {
		return ""
	}
	// Clean the extracted fragment so that formatting differences do not matter.
	tag = deleteExtraSpaceName(tag)
	tag = cleanNameReg_0.ReplaceAllString(tag, "")
	return convertArabicNumeralsAndLetters(tag)
}

// dealTitleSpecial extracts the lot/section tag from each title and reports
// whether the two normalised tags differ. Two titles without a tag compare equal.
func dealTitleSpecial(title1 string, title2 string) bool {
	return extractLotTag(title1) != extractLotTag(title2)
}

// deleteExtraSpaceName removes redundant whitespace (including tabs) from s:
// tabs become spaces, then every run of two or more whitespace characters is
// reduced to the first character of that run.
func deleteExtraSpaceName(s string) string {
	// Tabs are normalised to spaces first. A space-for-space replacement in this
	// position would be a no-op and would leave lone tabs in place.
	spaced := strings.Replace(s, "\t", " ", -1)
	// A run is bounded by non-whitespace on both sides, so one pass is enough.
	return multiSpaceReg.ReplaceAllStringFunc(spaced, func(run string) string {
		return run[:1]
	})
}

// isBidWinningAmount reports whether two bid amounts differ. Amounts are treated
// as equal when they are identical or when one is exactly 10000 times the other.
func isBidWinningAmount(f1 float64, f2 float64) bool {
	return !(f1 == f2 || f1*10000 == f2 || f2*10000 == f1)
}

// isTimeIntervalPeriod reports whether the two timestamps are less than 48 hours
// (172800 seconds) apart.
func isTimeIntervalPeriod(i1 int64, i2 int64) bool {
	return math.Abs(float64(i1-i2)) < twoDaysSeconds
}

// isBidopentimeInterval reports whether two bid-open timestamps count as
// different. It returns false when either timestamp is 0. Otherwise it returns
// true when the timestamps fall on different days, or on the same day but
// strictly more than 12 hours apart.
func isBidopentimeInterval(i1 int64, i2 int64) bool {
	if i1 == 0 || i2 == 0 {
		return false
	}
	if !sameCalendarDay(i1, i2) {
		return true
	}
	return math.Abs(float64(i1-i2)) > halfDaySeconds
}

// isPublishtimeInterval reports whether two publish timestamps count as
// different. It returns false when either timestamp is 0. Otherwise it returns
// true when the timestamps fall on different days, or on the same day and at
// least 12 hours apart (note: inclusive, unlike isBidopentimeInterval).
func isPublishtimeInterval(i1 int64, i2 int64) bool {
	if i1 == 0 || i2 == 0 {
		return false
	}
	if !sameCalendarDay(i1, i2) {
		return true
	}
	return math.Abs(float64(i1-i2)) >= halfDaySeconds
}

// isTheSameDay reports whether both timestamps are non-zero and format to the
// same yyyyMMdd day.
func isTheSameDay(i1 int64, i2 int64) bool {
	if i1 == 0 || i2 == 0 {
		return false
	}
	return sameCalendarDay(i1, i2)
}

// leadingElementSame is the pre-check that treats two records as duplicates when
// at least five of the following match: project name, buyer, contract number
// (for subtypes 合同/验收/违规) or project code (all other subtypes), title,
// agency, and winner. Agency counts as matching even when both are empty; every
// other element must be non-empty on info. A budget, bid-amount or winner
// conflict vetoes the match.
func leadingElementSame(v *Info, info *Info) bool {
	matched := 0
	if info.projectname != "" && v.projectname == info.projectname {
		matched++
	}
	if info.buyer != "" && v.buyer == info.buyer {
		matched++
	}
	switch info.subtype {
	case "合同", "验收", "违规":
		if info.contractnumber != "" && v.contractnumber == info.contractnumber {
			matched++
		}
	default:
		if info.projectcode != "" && v.projectcode == info.projectcode {
			matched++
		}
	}
	if info.title != "" && v.title == info.title {
		matched++
	}
	if v.agency == info.agency {
		matched++
	}
	if info.winner != "" && v.winner == info.winner {
		matched++
	}
	if matched < 5 {
		return false
	}
	// Extra sanity layer: amounts and winner must not contradict each other.
	if budgetConflict(v, info) || bidAmountConflict(v, info) || bothSetAndDiffer(v.winner, info.winner) {
		return false
	}
	return true
}

// jingPinElementSame is the simplified element comparison for competitor data.
// It returns false when project name, buyer or project code is set on info and
// differs from v, or when the agencies differ; otherwise it returns true.
func jingPinElementSame(v *Info, info *Info) bool {
	switch {
	case info.projectname != "" && v.projectname != info.projectname:
		return false
	case info.buyer != "" && v.buyer != info.buyer:
		return false
	case info.projectcode != "" && v.projectcode != info.projectcode:
		return false
	case v.agency != info.agency:
		return false
	}
	return true
}

// buyerIsContinue is the buyer-priority check. It returns true when the records
// were not published on the same day, or when their cleaned project names,
// contract numbers or project codes are both set and differ.
func buyerIsContinue(v *Info, info *Info) bool {
	if !isTheSameDay(info.publishtime, v.publishtime) {
		return true
	}
	// NOTE(review): a c_title comparison used to sit between the project-name and
	// contract-number checks, but it only returned true under the same
	// c_projectname condition tested here, so it could never change the result.
	return bothSetAndDiffer(v.c_projectname, info.c_projectname) ||
		bothSetAndDiffer(v.contractnumber, info.contractnumber) ||
		bothSetAndDiffer(v.projectcode, info.projectcode)
}

// judgeIsReplaceInfo reports whether s_href contains "https://www.jianyu360.cn"
// while i_href is non-empty and does not contain it.
func judgeIsReplaceInfo(s_href string, i_href string) bool {
	const jyPrefix = "https://www.jianyu360.cn"
	return strings.Contains(s_href, jyPrefix) &&
		i_href != "" &&
		!strings.Contains(i_href, jyPrefix)
}

// swapRecordIDs exchanges the "_id" values of the two documents when both hold
// more than two keys, and reports whether the exchange took place.
func swapRecordIDs(sourceData, infoData map[string]interface{}) bool {
	if len(sourceData) <= 2 || len(infoData) <= 2 {
		return false
	}
	sourceData["_id"], infoData["_id"] = infoData["_id"], sourceData["_id"]
	return true
}

// confrimExtractData loads both records from the extract collection. When both
// documents have more than two keys their "_id" values are swapped and the first
// result is true. The documents are returned as (valid, info_data, source_data).
func confrimExtractData(source_id string, info_id string) (bool, map[string]interface{}, map[string]interface{}) {
	var sourceData, infoData map[string]interface{}
	sourceData = data_mgo.FindById(extract, source_id)
	infoData = data_mgo.FindById(extract, info_id)
	valid := swapRecordIDs(sourceData, infoData)
	return valid, infoData, sourceData
}

// confrimHistoryExtractData is the history variant of confrimExtractData. The
// source record is read from extract when judgeIsCurIds(gtid, lteid, source_id)
// is true (reported through the second result) and from extract_back otherwise;
// the info record always comes from extract. When both documents have more than
// two keys their "_id" values are swapped and the first result is true.
// Results are (valid, exists, info_data, source_data).
func confrimHistoryExtractData(source_id string, info_id string) (bool, bool, map[string]interface{}, map[string]interface{}) {
	var sourceData, infoData map[string]interface{}
	exists := judgeIsCurIds(gtid, lteid, source_id)
	if exists {
		sourceData = data_mgo.FindById(extract, source_id)
	} else {
		sourceData = data_mgo.FindById(extract_back, source_id)
	}
	infoData = data_mgo.FindById(extract, info_id)
	valid := swapRecordIDs(sourceData, infoData)
	return valid, exists, infoData, sourceData
}

// confrimBiddingData loads both records from the bidding collection. When both
// documents have more than two keys their "_id" values are swapped and the first
// result is true. The documents are returned as (valid, info_data, source_data).
func confrimBiddingData(source_id string, info_id string) (bool, map[string]interface{}, map[string]interface{}) {
	var sourceData, infoData map[string]interface{}
	sourceData = task_mgo.FindById(task_bidding, source_id)
	infoData = task_mgo.FindById(task_bidding, info_id)
	valid := swapRecordIDs(sourceData, infoData)
	return valid, infoData, sourceData
}

// IsJpHref reports whether href contains "www.jianyu360". An empty href never
// matches.
func IsJpHref(href string) bool {
	return strings.Contains(href, "www.jianyu360")
}

// confirmJingPinIsRepeatData verifies whether two competitor records are
// duplicates. Both cleaned titles must be non-empty. The titles must either
// contain one another or the records must pass jingPinElementSame. The records
// must also share a publish day and must not conflict on budget, bid amount,
// winner, contract number or project code.
func confirmJingPinIsRepeatData(v *Info, info *Info) bool {
	if v.c_title == "" || info.c_title == "" {
		return false
	}
	// Title similarity: one title containing the other is enough; otherwise fall
	// back to the element comparison.
	titlesRelated := strings.Contains(v.c_title, info.c_title) || strings.Contains(info.c_title, v.c_title)
	if !titlesRelated && !jingPinElementSame(v, info) {
		return false
	}
	if !isTheSameDay(v.publishtime, info.publishtime) {
		return false
	}
	switch {
	case budgetConflict(v, info):
		return false
	case bidAmountConflict(v, info):
		return false
	case bothSetAndDiffer(v.winner, info.winner):
		return false
	case bothSetAndDiffer(v.contractnumber, info.contractnumber):
		return false
	case bothSetAndDiffer(v.projectcode, info.projectcode):
		return false
	}
	return true
}

// cleanNameFilterRedundant applies the generic name clean-up: it strips brackets,
// colons and whitespace; collapses a trailing "项目…<notice type>[公告]" suffix to
// "项目<notice type>" unless the name carries a round number matched by
// un_cleanNameReg_1; drops a trailing announcement word; and removes the
// "公开"/"的" prefix in front of a procurement-method word.
func cleanNameFilterRedundant(name string) string {
	cleaned := cleanNameReg_0.ReplaceAllString(name, "")
	if !un_cleanNameReg_1.MatchString(cleaned) {
		cleaned = cleanNameReg_1.ReplaceAllString(cleaned, "${1}${3}")
	}
	cleaned = cleanNameReg_2.ReplaceAllString(cleaned, "")
	cleaned = cleanNameReg_3.ReplaceAllString(cleaned, "${2}")
	return cleaned
}