apple 4 роки тому
батько
коміт
4f2e1c91f3

+ 12 - 3
udpdataclear/udpSensitiveWords/main.go

@@ -1,19 +1,28 @@
 package main
 
 import (
+	"log"
 	"sensitiveWords.udp/util"
+	"time"
 )
 
 func init() {
-	//return
+	return
 	util.InitC()
 }
 func main() {
 
 	//go util.AddTaskSensitiveWordsData() //增量
+	start := int(time.Now().Unix())
 
-	//util.TemporaryTestNewData()
-	//return
+	util.TemporaryDataQYXY()
+	util.TemporaryReadyEsData()
+	util.TemporaryTest()
+	util.TemporaryTestNewData()
+
+	log.Println("全部结束-耗时:",int(time.Now().Unix())-start,"秒")
+
+	return
 	// 主函数中添加
 	util.ExtractUdp() //udp通知抽取
 	lock := make(chan bool)

+ 5 - 1
udpdataclear/udpSensitiveWords/util/udpdata.go

@@ -362,7 +362,7 @@ var reg_alias = regexp.MustCompile("(税务局|工商行政管理局|文化广
 	"卫生和计划生育局|民事政务局|公众安全局|交通管理局|人力资源和社会保障局|劳动和社会保障局|" +
 	"住房和城乡建设局|就业服务局|文物管理局|环境保护局|粮食和物资储备局|教育体育局|" +
 	"体育局|教育局|招商局|农业局|农机局|水务局|林业局|财政局|审计局|统计局|商务局)$")
-var reglen *regexp.Regexp = regexp.MustCompile("^(.{1,5}|.{40,})$")
+var reglen *regexp.Regexp = regexp.MustCompile("^(.{1,3}|.{40,})$")
 var strReg *regexp.Regexp = regexp.MustCompile("^(.{0,3}工程队|.{0,3}总公司|_+|.{0,2}设备安装公司|.{0,2}装[饰修潢]公司|.{0,2}开发公司|.{0,4}有限公司|.{0,4}有限责任公司|.{0,4}设计院|建筑设计研?究?院|省文物考古研究所|经济开发区|省.*|镇人民政府|.{0,2}服务公司|" +
 	".{0,2}工程质量监督站|.{0,3}经[营销]部|.{0,3}事务所|.{0,4}工程公司|.{0,4}责任公司|.*勘测|.{0,4}研究院|.*能源建|.{0,2}安装工程|.*[市省]{1}|.{0,4}中心|.*区.?|" +
 	".{0,3}税务局|.{0,3}财政局|.{0,3}商行|.{0,2}公安处|.{0,2}测绘院|.{0,3}开发|.{0,2}建设局|.{0,2}经销部|.{0,3}委员会|.{0,2}分公司|.{0,2}管理站|.{0,2}事务管理局|" +
@@ -393,5 +393,9 @@ var startWordReg_1 *regexp.Regexp = regexp.MustCompile("^(.{1,5})(省|市|县|
 var startWordReg_2 *regexp.Regexp = regexp.MustCompile("^(北京|天津|重庆|上海|河北|山西|" +
 	"浙江|江西|湖北|吉林|海南|甘肃|广东|陕西|辽宁|山东|河南|云南|黑龙江|福建|贵州|江苏|安徽|" +
 	"湖南|四川|青海|台湾|新疆|内蒙古|宁夏|西藏|广西|澳门|香港)")
+var startWordReg_3 *regexp.Regexp = regexp.MustCompile("^(北京市|天津市|重庆市|上海市|河北省|山西省|" +
+	"浙江省|江西省|湖北省|吉林省|海南省|甘肃省|广东省|陕西省|辽宁省|山东省|河南省|云南省|黑龙江省|福建省|贵州省|江苏省|安徽省|" +
+	"湖南省|四川省|青海省|台湾省|新疆维吾尔自治区|内蒙古自治区|宁夏回族自治区|西藏自治区|广西壮族自治区|澳门特别行政区|香港特别行政区)")
+
 
 var endWordReg *regexp.Regexp = regexp.MustCompile("(有限公司|有限责任公司)$")

+ 112 - 20
udpdataclear/udpSensitiveWords/util/words.go

@@ -64,8 +64,14 @@ func dealWithNameScoreRules(name string) (string, bool, float64 , []map[string]i
 			if proportion >= 1.0 {
 				isok = true
 			} else {
-				if float64(hit)/float64(total) >= 0.8 && new_score > standard_score {
 
+				//前置规则-与分数无关  江苏凤凰出版社--江苏凤凰出版社有限公司
+				//吉林省彩虹城市建设工程有限公司--吉林彩虹城市建设工程有限公司
+				if dealWithPreRule(query_name,new_name) {
+					return new_name, true,new_score, res
+				}
+
+				if float64(hit)/float64(total) >= 0.8 && new_score > standard_score {
 					str1,str2:=startWordReg_1.FindString(name),startWordReg_1.FindString(new_name)
 					if str1!="" && str2!="" {
 						if strings.Contains(str1,str2)||strings.Contains(str2,str1) {
@@ -81,33 +87,116 @@ func dealWithNameScoreRules(name string) (string, bool, float64 , []map[string]i
 						}
 					}
 					isok = true
-				}else if new_score > standard_score {
-					str1,str2:=name,new_name
-					str1 = strings.ReplaceAll(str1,"责任","")
-					str2 = strings.ReplaceAll(str2,"责任","")
+				}else {}
+			}
+		}
+		return new_name, isok,new_score, res
+	}
+	return new_name,isok,new_score,nil
+}
+
+func dealWithPreRule(name string , new_name string) bool {
+	log.Println("规则时:",name,new_name)
+	endstr := endWordReg.FindString(new_name)
+	if endstr !="" {
+		new_name = strings.ReplaceAll(new_name,endstr,"")
+	}
+	if name==new_name {
+		return true
+	}
+	//去掉开头 全程简称
+	str1,str2 := startWordReg_2.FindString(name),startWordReg_2.FindString(new_name)
+	if str1!="" && str2!="" && str1==str2 {
+		//在清全程简称
+		start_str_all_1 := startWordReg_3.FindString(name)
+		if start_str_all_1 !="" {
+			name = strings.ReplaceAll(name,start_str_all_1,"")
+		}else {
+			start_str_sim_1 := startWordReg_2.FindString(name)
+			name = strings.ReplaceAll(name,start_str_sim_1,"")
+		}
 
-					str1 = strings.ReplaceAll(str1,"有限","")
-					str2 = strings.ReplaceAll(str2,"有限","")
+		start_str_all_2 := startWordReg_3.FindString(new_name)
+		if start_str_all_2 !="" {
+			new_name = strings.ReplaceAll(new_name,start_str_all_2,"")
+		}else {
+			start_str_sim_2 := startWordReg_2.FindString(new_name)
+			new_name = strings.ReplaceAll(new_name,start_str_sim_2,"")
+		}
+		if name==new_name {
+			return true
+		}
+	}
 
-					str1 = strings.ReplaceAll(str1,"科技","")
-					str2 = strings.ReplaceAll(str2,"科技","")
 
-					str1 = strings.ReplaceAll(str1,"工程","")
-					str2 = strings.ReplaceAll(str2,"工程","")
-					if str1==str2 {
-						return new_name, true,new_score, res
-					}
+	//去掉指定维文字
+	name = strings.ReplaceAll(name,"科技","")
+	new_name = strings.ReplaceAll(new_name,"科技","")
 
-				}else {
+	name = strings.ReplaceAll(name,"建筑工程","")
+	new_name = strings.ReplaceAll(new_name,"建筑工程","")
 
-				}
-			}
-		}
-		return new_name, isok,new_score, res
+	name = strings.ReplaceAll(name,"工程","")
+	new_name = strings.ReplaceAll(new_name,"工程","")
+
+	name = strings.ReplaceAll(name,"标识","")
+	new_name = strings.ReplaceAll(new_name,"标识","")
+
+	name = strings.ReplaceAll(name,"工业","")
+	new_name = strings.ReplaceAll(new_name,"工业","")
+
+	name = strings.ReplaceAll(name,"公司","")
+	new_name = strings.ReplaceAll(new_name,"公司","")
+
+	name = strings.ReplaceAll(name,"(","")
+	new_name = strings.ReplaceAll(new_name,"(","")
+
+	name = strings.ReplaceAll(name,")","")
+	new_name = strings.ReplaceAll(new_name,")","")
+
+	name = strings.ReplaceAll(name,"(","")
+	new_name = strings.ReplaceAll(new_name,"(","")
+
+	name = strings.ReplaceAll(name,")","")
+	new_name = strings.ReplaceAll(new_name,")","")
+
+
+	name = strings.ReplaceAll(name,"信息技术","信息")
+	new_name = strings.ReplaceAll(new_name,"信息技术","信息")
+
+	name = strings.ReplaceAll(name,"电子科技","电子")
+	new_name = strings.ReplaceAll(new_name,"电子科技","电子")
+
+	name = strings.ReplaceAll(name,"电子技术","电子")
+	new_name = strings.ReplaceAll(new_name,"电子技术","电子")
+
+	name = strings.ReplaceAll(name,"建设集团","建设")
+	new_name = strings.ReplaceAll(new_name,"建设集团","建设")
+
+
+
+	log.Println("最终清理后-",name,new_name)
+	if name==new_name {
+		return true
 	}
-	return new_name,isok,new_score,nil
+
+
+
+
+	return false
 }
 
+
+
+
+
+
+
+
+
+
+
+
 //击中数量以及比例
 func dealWithWordsRules(info_name string, source_name string) (int, int) {
 	total, hit := 0, 0
@@ -120,10 +209,13 @@ func dealWithWordsRules(info_name string, source_name string) (int, int) {
 	info_name = strings.ReplaceAll(info_name, ")", "")
 	info_name = strings.ReplaceAll(info_name, "(", "")
 	info_name = strings.ReplaceAll(info_name, ")", "")
+	info_name = strings.ReplaceAll(info_name, "〉", "")
+
 	source_name = strings.ReplaceAll(source_name, "(", "")
 	source_name = strings.ReplaceAll(source_name, ")", "")
 	source_name = strings.ReplaceAll(source_name, "(", "")
 	source_name = strings.ReplaceAll(source_name, ")", "")
+	source_name = strings.ReplaceAll(source_name, "〉", "")
 
 	nameArr, _ := calculateWordCount(info_name)
 	_, total = calculateWordCount(source_name)