|
@@ -7,9 +7,10 @@ import (
|
|
|
"strings"
|
|
|
)
|
|
|
|
|
|
-var cleanNameReg_0 = regexp.MustCompile("(项目)(.{0,5})(招标公告)$")
|
|
|
-var cleanNameReg_1 = regexp.MustCompile("([(())])")
|
|
|
// Regular expressions used when normalising titles and project names before
// they are compared. They are compiled once, at package initialisation.
var (
	// cleanNameReg_0 matches one parenthesis, colon or whitespace character.
	// NOTE(review): several characters are listed twice in the class;
	// presumably the duplicates were full-width variants that were lost in a
	// transcoding step - confirm against the original file.
	cleanNameReg_0 = regexp.MustCompile(`([(())::\s ])`)

	// cleanNameReg_1 matches, at the end of the text, "项目" followed by up to
	// five arbitrary characters, an announcement-type keyword (group 3) and an
	// optional trailing "公告".
	cleanNameReg_1 = regexp.MustCompile(`(项目)(.{0,5})(招标|中标|中标结果|成交|候选人|竞谈|竞争性磋商)(公告)?$`)

	// cleanNameReg_2 matches a trailing "公告" or "公告公告".
	cleanNameReg_2 = regexp.MustCompile(`(公告|公告公告)$`)

	// cleanNameReg_3 matches "公开" immediately followed by "比选" or "招标";
	// the second word is captured as group 2.
	cleanNameReg_3 = regexp.MustCompile(`(公开)(比选|招标)`)
)
|
|
|
|
|
|
//完善判重数据检测-前置条件
|
|
|
func convertArabicNumeralsAndLetters(data string) string {
|
|
@@ -30,6 +31,7 @@ func convertArabicNumeralsAndLetters(data string) string {
|
|
|
return newData
|
|
|
}
|
|
|
|
|
|
+//特殊词处理
|
|
|
func dealWithSpecialPhrases(str1 string, str2 string) (string, string) {
|
|
|
newStr1 := str1
|
|
|
newStr2 := str2
|
|
@@ -78,22 +80,19 @@ func againRepeat(v *Info, info *Info, site bool) bool {
|
|
|
if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
|
|
|
return true
|
|
|
}
|
|
|
-
|
|
|
- if judgeNameIsDifferent(v.title, info.title) {
|
|
|
- if judgeNameIsDifferent(v.projectname, info.projectname) {
|
|
|
+ if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ if v.c_title != "" && info.c_title != "" && v.c_title != info.c_title {
|
|
|
+ if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
|
|
|
return true
|
|
|
}
|
|
|
}
|
|
|
- if judgeNameIsDifferent(v.projectname, info.projectname) {
|
|
|
- return true
|
|
|
- }
|
|
|
-
|
|
|
return false
|
|
|
}
|
|
|
|
|
|
//均含有关键词再次判断
|
|
|
func againContainSpecialWord(v *Info, info *Info) bool {
|
|
|
-
|
|
|
if isBidopentimeInterval(info.bidopentime, v.bidopentime) {
|
|
|
return true
|
|
|
}
|
|
@@ -122,58 +121,34 @@ func againContainSpecialWord(v *Info, info *Info) bool {
|
|
|
|
|
|
//提取标题-标段号处理
|
|
|
func dealTitleSpecial(title1 string, title2 string) bool {
|
|
|
-
|
|
|
regular1 := "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789]+[))]?"
|
|
|
regular2 := "[0-9a-zA-Z一二三四五六七八九十零123456789]+(包|标段|标包)"
|
|
|
regx1_1, _ := regexp.Compile(regular1)
|
|
|
str1 := regx1_1.FindString(title1)
|
|
|
- if str1 != "" {
|
|
|
- //log.Println("标题1,规则一提取:",str1)
|
|
|
- } else {
|
|
|
+ if str1 == "" {
|
|
|
regx1_2, _ := regexp.Compile(regular2)
|
|
|
str1 = regx1_2.FindString(title1)
|
|
|
- if str1 != "" {
|
|
|
- //log.Println("标题1,规则二提取:",str1)
|
|
|
- }
|
|
|
}
|
|
|
-
|
|
|
regx2_1, _ := regexp.Compile(regular1)
|
|
|
str2 := regx2_1.FindString(title2)
|
|
|
- if str2 != "" {
|
|
|
- //log.Println("标题2,规则一提取:",str2)
|
|
|
- } else {
|
|
|
+ if str2 == "" {
|
|
|
regx2_2, _ := regexp.Compile(regular2)
|
|
|
str2 = regx2_2.FindString(title2)
|
|
|
- if str2 != "" {
|
|
|
- //log.Println("标题2,规则二提取:",str2)
|
|
|
- }
|
|
|
}
|
|
|
-
|
|
|
//根据提取的结果,在进行清洗
|
|
|
if str1 != "" {
|
|
|
str1 = deleteExtraSpace(str1)
|
|
|
- str1 = strings.Replace(str1, "(", "", -1)
|
|
|
- str1 = strings.Replace(str1, "(", "", -1)
|
|
|
- str1 = strings.Replace(str1, ")", "", -1)
|
|
|
- str1 = strings.Replace(str1, ")", "", -1)
|
|
|
+ str1 = cleanNameReg_0.ReplaceAllString(str1, "")
|
|
|
str1 = convertArabicNumeralsAndLetters(str1)
|
|
|
}
|
|
|
-
|
|
|
if str2 != "" {
|
|
|
str2 = deleteExtraSpace(str2)
|
|
|
- str2 = strings.Replace(str2, "(", "", -1)
|
|
|
- str2 = strings.Replace(str2, "(", "", -1)
|
|
|
- str2 = strings.Replace(str2, ")", "", -1)
|
|
|
- str2 = strings.Replace(str2, ")", "", -1)
|
|
|
+ str2 = cleanNameReg_0.ReplaceAllString(str2, "")
|
|
|
str2 = convertArabicNumeralsAndLetters(str2)
|
|
|
}
|
|
|
-
|
|
|
- //log.Println("最终:",str1,str2)
|
|
|
if str1 != str2 {
|
|
|
- //log.Println("不一致")
|
|
|
return true
|
|
|
} else {
|
|
|
- //log.Println("一致")
|
|
|
return false
|
|
|
}
|
|
|
}
|
|
@@ -196,7 +171,6 @@ func deleteExtraSpace(s string) string {
|
|
|
|
|
|
//中标金额倍率:10000
|
|
|
func isBidWinningAmount(f1 float64, f2 float64) bool {
|
|
|
-
|
|
|
if f1 == f2 || f1*10000 == f2 || f2*10000 == f1 {
|
|
|
return false
|
|
|
}
|
|
@@ -205,7 +179,6 @@ func isBidWinningAmount(f1 float64, f2 float64) bool {
|
|
|
|
|
|
//时间间隔周期
|
|
|
func isTimeIntervalPeriod(i1 int64, i2 int64) bool {
|
|
|
-
|
|
|
if math.Abs(float64(i1-i2)) < 172800.0 {
|
|
|
return true
|
|
|
} else {
|
|
@@ -255,7 +228,7 @@ func isPublishtimeInterval(i1 int64, i2 int64) bool {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-//开标时间区间为一天
|
|
|
+//时间区间为一天
|
|
|
func isTheSameDay(i1 int64, i2 int64) bool {
|
|
|
if i1 == 0 || i2 == 0 {
|
|
|
return false
|
|
@@ -266,15 +239,11 @@ func isTheSameDay(i1 int64, i2 int64) bool {
|
|
|
if day1 == day2 {
|
|
|
return true
|
|
|
}
|
|
|
- //if math.Abs(float64(i1-i2)) <=86400.0 {
|
|
|
- // return true
|
|
|
- //}
|
|
|
return false
|
|
|
}
|
|
|
|
|
|
//前置0 五要素均相等认为重复
|
|
|
func leadingElementSame(v *Info, info *Info) bool {
|
|
|
-
|
|
|
isok := 0
|
|
|
if info.projectname != "" && v.projectname == info.projectname {
|
|
|
isok++
|
|
@@ -313,70 +282,20 @@ func buyerIsContinue(v *Info, info *Info) bool {
|
|
|
if !isTheSameDay(info.publishtime, v.publishtime) {
|
|
|
return true
|
|
|
}
|
|
|
- if judgeNameIsDifferent(v.title, info.title) {
|
|
|
- if judgeNameIsDifferent(v.projectname, info.projectname) {
|
|
|
+ if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ if v.c_title != "" && info.c_title != "" && v.c_title != info.c_title {
|
|
|
+ if v.c_projectname != "" && info.c_projectname != "" && v.c_projectname != info.c_projectname {
|
|
|
return true
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
- if judgeNameIsDifferent(v.projectname, info.projectname) {
|
|
|
- return true
|
|
|
- }
|
|
|
-
|
|
|
if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
|
|
|
return true
|
|
|
}
|
|
|
if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
|
|
|
return true
|
|
|
}
|
|
|
-
|
|
|
- return false
|
|
|
-}
|
|
|
-
|
|
|
-//采用通用清洗~判断是否为有效不同
|
|
|
-func judgeNameIsDifferent(name_1 string, name_2 string) bool {
|
|
|
- if name_1 == "" || name_2 == "" {
|
|
|
- return false
|
|
|
- }
|
|
|
- if name_1 == name_2 {
|
|
|
- return false
|
|
|
- }
|
|
|
- //通用清洗
|
|
|
- new_name_1, new_name_2 := cleanNameFilterRedundant(name_1), cleanNameFilterRedundant(name_2)
|
|
|
- if new_name_1 == new_name_2 {
|
|
|
- return false
|
|
|
- }
|
|
|
- return true
|
|
|
-}
|
|
|
-
|
|
|
-//临时清洗名称过滤冗余
|
|
|
-func cleanNameFilterRedundant(name string) string {
|
|
|
- new_name := name
|
|
|
- new_name = cleanNameReg_0.ReplaceAllString(new_name, "${1}${3}")
|
|
|
- new_name = cleanNameReg_1.ReplaceAllString(new_name, "")
|
|
|
- new_name = cleanNameReg_2.ReplaceAllString(new_name, "")
|
|
|
-
|
|
|
- return new_name
|
|
|
-}
|
|
|
-
|
|
|
-//无效数据
|
|
|
-func invalidData(d1 string, d2 string, d3 string, d4 string) bool {
|
|
|
- var n int
|
|
|
- if d1 != "" {
|
|
|
- n++
|
|
|
- }
|
|
|
- if d2 != "" {
|
|
|
- n++
|
|
|
- }
|
|
|
- if d3 != "" {
|
|
|
- n++
|
|
|
- }
|
|
|
- if d4 != "" {
|
|
|
- n++
|
|
|
- }
|
|
|
- if n == 0 {
|
|
|
- return true
|
|
|
- }
|
|
|
return false
|
|
|
}
|
|
|
|
|
@@ -446,28 +365,31 @@ func confrimBiddingData(source_id string, info_id string) (bool, map[string]inte
|
|
|
return isvalid, info_data, source_data
|
|
|
}
|
|
|
|
|
|
-func IsJingPinData(s_href string, i_href string) bool {
|
|
|
- if strings.Contains(s_href, "www.jianyu360") ||
|
|
|
- strings.Contains(i_href, "www.jianyu360") {
|
|
|
// IsJpHref reports whether href is a competitor-site link, i.e. whether it
// contains the fragment "www.jianyu360". An empty href yields false, because
// an empty string cannot contain the fragment.
func IsJpHref(href string) bool {
	return strings.Contains(href, "www.jianyu360")
}
|
|
|
|
|
|
+//验证竞品是否重复
|
|
|
func confirmJingPinIsRepeatData(v *Info, info *Info) bool {
|
|
|
- //先验证标题
|
|
|
- if (strings.Contains(info.title, v.title) || strings.Contains(v.title, info.title)) &&
|
|
|
- v.title != "" && info.title != "" {
|
|
|
+ //标题验证~是否有关联~是否需要清洗数据
|
|
|
+ if v.c_title != "" && info.c_title != "" { //标题相似判断
|
|
|
+ if !(strings.Contains(v.c_title, info.c_title) || strings.Contains(info.c_title, v.c_title)) {
|
|
|
+ return false
|
|
|
+ }
|
|
|
if !isTheSameDay(v.publishtime, info.publishtime) {
|
|
|
return false
|
|
|
}
|
|
|
if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
|
|
|
return false
|
|
|
}
|
|
|
- if isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0 && info.bidamount != 0 {
|
|
|
+ if isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0.0 && info.bidamount != 0.0 {
|
|
|
return false
|
|
|
}
|
|
|
- if deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "" {
|
|
|
+ if v.winner != "" && info.winner != "" && deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) {
|
|
|
return false
|
|
|
}
|
|
|
if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
|
|
@@ -479,4 +401,15 @@ func confirmJingPinIsRepeatData(v *Info, info *Info) bool {
|
|
|
return true
|
|
|
}
|
|
|
return false
|
|
|
-}
|
|
|
+}
|
|
|
+
|
|
|
+//通用清洗~清洗名称~过滤冗余~
|
|
|
+func cleanNameFilterRedundant(name string) string {
|
|
|
+ new_name := name
|
|
|
+ new_name = cleanNameReg_0.ReplaceAllString(new_name, "")
|
|
|
+ new_name = cleanNameReg_1.ReplaceAllString(new_name, "${1}${3}")
|
|
|
+ new_name = cleanNameReg_2.ReplaceAllString(new_name, "")
|
|
|
+ new_name = cleanNameReg_3.ReplaceAllString(new_name, "${2}")
|
|
|
+
|
|
|
+ return new_name
|
|
|
+}
|