|
@@ -0,0 +1,378 @@
|
|
|
+package util
|
|
|
+
|
|
|
+import (
|
|
|
+ "encoding/json"
|
|
|
+ "log"
|
|
|
+ qu "qfw/util"
|
|
|
+ "regexp"
|
|
|
+ "strings"
|
|
|
+ "unicode"
|
|
|
+ "unicode/utf8"
|
|
|
+)
|
|
|
+
|
|
|
+//处理-打分分词
|
|
|
+func dealWithNameScoreRules(name string) (string, bool, float64 , []map[string]interface{}) {
|
|
|
+ new_name, new_score, isok := "", float64(0), false
|
|
|
+ old_name := escapeNew(name)
|
|
|
+ if old_name == "" {
|
|
|
+ return "", false, new_score ,nil
|
|
|
+ }
|
|
|
+ //标准分:
|
|
|
+ standard_score := float64(4.0)
|
|
|
+ query_name := old_name
|
|
|
+ endstr := endWordReg.FindString(query_name)
|
|
|
+ if endstr !="" {
|
|
|
+ standard_score = float64(3.0)
|
|
|
+ query_name = strings.ReplaceAll(query_name,endstr,"")
|
|
|
+ }
|
|
|
+
|
|
|
+ query := `{"query":{"bool":{"must":[{"query_string":{"default_field":"unique_qy.name_word","query":"` + query_name + `"}}],"must_not":[],"should":[]}},"from":"0","size":"200"}`
|
|
|
+ tmp := make(map[string]interface{})
|
|
|
+ json.Unmarshal([]byte(query), &tmp)
|
|
|
+ searchResult, err := Client_Es.Search().Index(Es_index).Type(Es_type).Source(tmp).Do()
|
|
|
+ if err != nil {
|
|
|
+ log.Println("ES查询出错", name, old_name,err)
|
|
|
+ return "", false,new_score, nil
|
|
|
+ }
|
|
|
+ if searchResult.Hits!= nil{
|
|
|
+ resNum := len(searchResult.Hits.Hits)
|
|
|
+ res := make([]map[string]interface{}, resNum)
|
|
|
+ if searchResult.Hits != nil {
|
|
|
+ if resNum < 1000 {
|
|
|
+ for i, hit := range searchResult.Hits.Hits {
|
|
|
+ data := make(map[string]interface{}, 0)
|
|
|
+ json.Unmarshal(*hit.Source, &data)
|
|
|
+ res[i] = map[string]interface{}{
|
|
|
+ "name": data["name"],
|
|
|
+ "score": *hit.Score,
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ log.Println("查询结果太多,查询到:", resNum, "条")
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if len(res) > 0 {
|
|
|
+ //针对特殊情况-特殊处理 公司结尾
|
|
|
+ new_name = qu.ObjToString(res[0]["name"])
|
|
|
+ new_score = qu.Float64All(res[0]["score"])
|
|
|
+ if endComReg.MatchString(name) {
|
|
|
+ new_name,new_score = dealWithSpecialName(name,res)
|
|
|
+ if new_name=="" {
|
|
|
+ new_name = qu.ObjToString(res[0]["name"])
|
|
|
+ new_score = qu.Float64All(res[0]["score"])
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if new_name != "" { //分析hit比例
|
|
|
+ total, hit := dealWithWordsRules(name, new_name)
|
|
|
+ proportion := float64(hit) / float64(total)
|
|
|
+ if proportion >= 1.0 {
|
|
|
+ isok = true
|
|
|
+ } else {
|
|
|
+
|
|
|
+ //前置规则-与分数无关 江苏凤凰出版社--江苏凤凰出版社有限公司
|
|
|
+ //吉林省彩虹城市建设工程有限公司--吉林彩虹城市建设工程有限公司
|
|
|
+ if dealWithPreRule(query_name,new_name) {
|
|
|
+ return new_name, true,new_score, res
|
|
|
+ }
|
|
|
+
|
|
|
+ if float64(hit)/float64(total) >= 0.8 && new_score > standard_score {
|
|
|
+ str1,str2:=startWordReg_1.FindString(name),startWordReg_1.FindString(new_name)
|
|
|
+ if str1!="" && str2!="" {
|
|
|
+ if strings.Contains(str1,str2)||strings.Contains(str2,str1) {
|
|
|
+
|
|
|
+ }else {
|
|
|
+ return new_name, false,new_score, res
|
|
|
+ }
|
|
|
+ }
|
|
|
+ str1,str2 = startWordReg_2.FindString(name),startWordReg_2.FindString(new_name)
|
|
|
+ if str1!="" && str2!=""{
|
|
|
+ if str1 != str2 {
|
|
|
+ return new_name, false,new_score, res
|
|
|
+ }
|
|
|
+ }
|
|
|
+ isok = true
|
|
|
+ }else {}
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return new_name, isok,new_score, res
|
|
|
+ }
|
|
|
+ return new_name,isok,new_score,nil
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+func dealWithPreRule(name string , new_name string) bool {
|
|
|
+ //log.Println("规则时:",name,new_name)
|
|
|
+ endstr := endWordReg.FindString(new_name)
|
|
|
+ if endstr !="" {
|
|
|
+ new_name = strings.ReplaceAll(new_name,endstr,"")
|
|
|
+ }
|
|
|
+ if name==new_name {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ //去掉开头 全程简称
|
|
|
+ str1,str2 := startWordReg_2.FindString(name),startWordReg_2.FindString(new_name)
|
|
|
+ if str1!="" && str2!="" && str1==str2 {
|
|
|
+ //在清全程简称
|
|
|
+ start_str_all_1 := startWordReg_3.FindString(name)
|
|
|
+ if start_str_all_1 !="" {
|
|
|
+ name = strings.ReplaceAll(name,start_str_all_1,"")
|
|
|
+ }else {
|
|
|
+ start_str_sim_1 := startWordReg_2.FindString(name)
|
|
|
+ name = strings.ReplaceAll(name,start_str_sim_1,"")
|
|
|
+ }
|
|
|
+
|
|
|
+ start_str_all_2 := startWordReg_3.FindString(new_name)
|
|
|
+ if start_str_all_2 !="" {
|
|
|
+ new_name = strings.ReplaceAll(new_name,start_str_all_2,"")
|
|
|
+ }else {
|
|
|
+ start_str_sim_2 := startWordReg_2.FindString(new_name)
|
|
|
+ new_name = strings.ReplaceAll(new_name,start_str_sim_2,"")
|
|
|
+ }
|
|
|
+ if name==new_name {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ //去掉指定维文字
|
|
|
+ name = strings.ReplaceAll(name,"科技","")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"科技","")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,"建筑工程","")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"建筑工程","")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,"工程","")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"工程","")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,"标识","")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"标识","")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,"工业","")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"工业","")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,"公司","")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"公司","")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,"(","")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"(","")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,")","")
|
|
|
+ new_name = strings.ReplaceAll(new_name,")","")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,"(","")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"(","")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,")","")
|
|
|
+ new_name = strings.ReplaceAll(new_name,")","")
|
|
|
+
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,"信息技术","信息")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"信息技术","信息")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,"电子科技","电子")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"电子科技","电子")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,"电子技术","电子")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"电子技术","电子")
|
|
|
+
|
|
|
+ name = strings.ReplaceAll(name,"建设集团","建设")
|
|
|
+ new_name = strings.ReplaceAll(new_name,"建设集团","建设")
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ //log.Println("最终清理后-",name,new_name)
|
|
|
+ if name==new_name {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ return false
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+//击中数量以及比例
|
|
|
+func dealWithWordsRules(info_name string, source_name string) (int, int) {
|
|
|
+ total, hit := 0, 0
|
|
|
+ //字符串处理,替换指定字符
|
|
|
+ source_name = strings.ReplaceAll(source_name, "(微型企业)", "")
|
|
|
+ source_name = strings.ReplaceAll(source_name, "(有限合伙)", "")
|
|
|
+ source_name = strings.ReplaceAll(source_name, "(普通合伙)", "")
|
|
|
+
|
|
|
+ info_name = strings.ReplaceAll(info_name, "(", "")
|
|
|
+ info_name = strings.ReplaceAll(info_name, ")", "")
|
|
|
+ info_name = strings.ReplaceAll(info_name, "(", "")
|
|
|
+ info_name = strings.ReplaceAll(info_name, ")", "")
|
|
|
+ info_name = strings.ReplaceAll(info_name, "〉", "")
|
|
|
+
|
|
|
+ source_name = strings.ReplaceAll(source_name, "(", "")
|
|
|
+ source_name = strings.ReplaceAll(source_name, ")", "")
|
|
|
+ source_name = strings.ReplaceAll(source_name, "(", "")
|
|
|
+ source_name = strings.ReplaceAll(source_name, ")", "")
|
|
|
+ source_name = strings.ReplaceAll(source_name, "〉", "")
|
|
|
+
|
|
|
+ nameArr, _ := calculateWordCount(info_name)
|
|
|
+ _, total = calculateWordCount(source_name)
|
|
|
+ for _, v1 := range nameArr {
|
|
|
+ if strings.Contains(source_name, v1) {
|
|
|
+ hit++
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return total, hit
|
|
|
+}
|
|
|
+
|
|
|
// calculateWordCount splits name into overlapping windows of two runes
// ("abc" yields "ab", "bc") and returns the windows together with their
// count. A name shorter than two runes yields an empty, non-nil slice and 0.
func calculateWordCount(name string) ([]string, int) {
	const window = 2

	count := utf8.RuneCountInString(name) - window + 1
	if count <= 0 {
		return []string{}, 0
	}

	runes := []rune(name)
	grams := make([]string, 0, count)
	// Slide the window by moving its exclusive end over the rune slice.
	for end := window; end <= len(runes); end++ {
		grams = append(grams, string(runes[end-window:end]))
	}
	return grams, len(grams)
}
|
|
|
+
|
|
|
+//func escape(s string) string {
|
|
|
+// news := ""
|
|
|
+// s = strings.ReplaceAll(s," ","")
|
|
|
+// for _, c := range s {
|
|
|
+// //if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
|
|
|
+// // news = news + string(c)
|
|
|
+// //}else if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
|
|
|
+// // a := string([]rune{os.PathSeparator, '\\'})
|
|
|
+// // news = news + a + string(c)
|
|
|
+// //} else {
|
|
|
+// // return ""
|
|
|
+// //}
|
|
|
+// if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {
|
|
|
+// a := string([]rune{os.PathSeparator,'\\'})
|
|
|
+// //news = news + a + `\` + string(c)
|
|
|
+// news = news + a + string(c)
|
|
|
+// } else {
|
|
|
+// news = news + string(c)
|
|
|
+// }
|
|
|
+//
|
|
|
+// }
|
|
|
+// return news
|
|
|
+//}
|
|
|
+
|
|
|
// escapeNew returns s reduced to the runes that are Han characters, numbers
// or letters; everything else (spaces, punctuation, symbols) is dropped.
//
// The result is built with a strings.Builder instead of repeated string
// concatenation, and the separate space-removal pass is gone because the
// filter already rejects spaces. The output is unchanged.
func escapeNew(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	for _, c := range s {
		if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
			b.WriteRune(c)
		}
	}
	return b.String()
}
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+//处理特殊情况-分析最终
|
|
|
+func dealWithSpecialName(name string,res []map[string]interface{})(string ,float64) {
|
|
|
+ //log.Println("特殊...")
|
|
|
+ new_name ,new_score,proportion := "",float64(0),float64(0)
|
|
|
+ for k,v:=range res {
|
|
|
+ tmp_name := qu.ObjToString(v["name"])
|
|
|
+ tmp_score :=qu.Float64All(v["score"])
|
|
|
+ if k>=20 || tmp_score<3.0 {
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ if endComReg.MatchString(tmp_name) {
|
|
|
+ if endFComReg.MatchString(name) {
|
|
|
+ if !endFComReg.MatchString(tmp_name) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ }else {
|
|
|
+ if endFComReg.MatchString(tmp_name) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }else {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ total, hit := dealWithWordsRules(tmp_name, name)
|
|
|
+ tmp_proportion := float64(hit) / float64(total)
|
|
|
+ //log.Println(tmp_proportion,tmp_name,name)
|
|
|
+ if tmp_proportion > proportion {
|
|
|
+ proportion = tmp_proportion
|
|
|
+ new_name = tmp_name
|
|
|
+ new_score = tmp_score
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return new_name,float64(new_score)
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
// Package-level patterns, compiled once at init. Only some of them are used
// by the functions visible in this part of the file.
var (
	// endComReg matches names ending in "公司".
	endComReg = regexp.MustCompile("(公司)$")
	// endFComReg matches names ending in "分公司".
	endFComReg = regexp.MustCompile("(分公司)$")

	// reg_alias matches names ending in one of the listed bureau names.
	reg_alias = regexp.MustCompile("(税务局|工商行政管理局|文化广播电视新闻出版局|外国专家局|" +
		"中医药管理局|市场监督管理局|广播电视局|医疗保障局|机关事务管理局|粮食和物资储备局|" +
		"监狱管理局|畜牧兽医局|食品药品监督管理局|城市管理行政执法局|城市管理局|国家保密局|密码管理局|" +
		"地方金融监督管理局|住房保障和房屋管理局|质量技术监督局|人力资源与社会保障局|公路管理局|国土资源局|" +
		"卫生和计划生育局|民事政务局|公众安全局|交通管理局|人力资源和社会保障局|劳动和社会保障局|" +
		"住房和城乡建设局|就业服务局|文物管理局|环境保护局|粮食和物资储备局|教育体育局|" +
		"体育局|教育局|招商局|农业局|农机局|水务局|林业局|财政局|审计局|统计局|商务局)$")

	// reglen matches whole strings of 1-3 or of 40 and more characters.
	reglen = regexp.MustCompile("^(.{1,3}|.{40,})$")

	// strReg matches whole strings that fit one of the listed generic shapes.
	// NOTE(review): "[,,]" lists the same comma twice; one of them was
	// possibly a full-width comma originally.
	strReg = regexp.MustCompile("^(.{0,3}工程队|.{0,3}总公司|_+|.{0,2}设备安装公司|.{0,2}装[饰修潢]公司|.{0,2}开发公司|.{0,4}有限公司|.{0,4}有限责任公司|.{0,4}设计院|建筑设计研?究?院|省文物考古研究所|经济开发区|省.*|镇人民政府|.{0,2}服务公司|" +
		".{0,2}工程质量监督站|.{0,3}经[营销]部|.{0,3}事务所|.{0,4}工程公司|.{0,4}责任公司|.*勘测|.{0,4}研究院|.*能源建|.{0,2}安装工程|.*[市省]{1}|.{0,4}中心|.*区.?|" +
		".{0,3}税务局|.{0,3}财政局|.{0,3}商行|.{0,2}公安处|.{0,2}测绘院|.{0,3}开发|.{0,2}建设局|.{0,2}经销部|.{0,3}委员会|.{0,2}分公司|.{0,2}管理站|.{0,2}事务管理局|" +
		".*资料|.{0,2}办公用品.{1,2}|.*唯亭|.*设备|.+安装|.{0,2}技术服务|市.+[台院社局司]|城?区.+[府局室院]|县.+[院台局]|.{0,2}发展公司|经济技术开发|" +
		"发展和改革局|贵州有色地质|铝塑门窗加工|生产力促进中心|特殊普通合伙|工业集团公司|人民调解协会|人民政府办公厅|机电设备公司|房地产开发有限公司|.{0,4}商店|中等专业学校|" +
		"农村信用联社|.{0,4}经营部|.{0,4}销售部|驾驶员培训学校|.{2}县.{2}镇|保安服务总公司|住房和城乡建设局|地产评估事务所|生产资料门市部|×+|.{0,3}[0-9]{15}|.*[0-9]+|.*路|.*无字号名称.*|.*车|.*[,,]{1}.*|.*个体工商户|.*运输户)$")

	// unstart_strReg matches a string whose first character lies in
	// U+4E00-U+9FA5. The original note reads "non-Chinese start"; presumably
	// callers negate the match.
	unstart_strReg = regexp.MustCompile("^([\u4e00-\u9fa5])")

	// start_strReg matches the listed leading words, or one or two ASCII
	// letters followed by at least six characters in U+4E00-U+9FA5.
	start_strReg = regexp.MustCompile("^([a-zA-Z]{1,2}[\u4e00-\u9fa5]{6,}|省|市|县|区|业绩|资格|中标|项目|预算单位)")

	// end_strReg lists trailing words.
	// NOTE(review): the last alternative "/*" also matches the empty string,
	// so as written this pattern matches every input; and "(株)" is a group
	// matching "株", not literal parentheses. Confirm both against the
	// intended behavior.
	end_strReg = regexp.MustCompile("(\\.|\\.\\.|餐馆|店|腻子|肉庄|画社|美发屋|发廊|网吧|网咖|零售点|新街|包子铺|奶茶铺|(株)|先生|女士|小姐|" +
		"资格|业绩|中标|项目|预算单位|摊位号|号|厅|室|部|点|馆|场|厂|床|所|处|站|行|中心|合作社|ATMS|" +
		"吧|楼|摊|摊位|廊|茶社|坊|圃|汤锅|园|民宿|美容院|房|排挡|府|庄|栈|队|批发|苑|养殖户|棋牌|农家乐|货运|" +
		"城|社|基地|会|服务|娱乐|种植|百货|汽修|农家菜|亭|小吃|快餐|粮库|卫生院|书画院|面|门窗|鸡排|屋|橱|堂|肉铺|服务|服饰|/*)$")

	// con_strReg matches strings containing one of the listed fragments, or a
	// run of five or more ASCII letters.
	// The second alternative used to be a bare "?", which is not a valid
	// pattern (a repetition operator with nothing to repeat) and made
	// MustCompile panic during package initialization. It is now the
	// full-width question mark U+FF1F, presumably the character originally
	// intended; the ASCII question mark is covered by the first alternative.
	con_strReg = regexp.MustCompile("(\\?|\uff1f|%|代码标识|删除|错误|吊销|注销|发起人|待清理|&#|护照号|身份证号|" +
		"法人| |国家拨入|借款|积累资金|单位自有|认股人|--|、|&|`|美元|[\u4e00-\u9fa5]{2,6}·[\u4e00-\u9fa5]{2,6})|" +
		"[a-zA-Z]{5,}")

	// uncon_strReg matches strings containing one of the listed words.
	uncon_strReg = regexp.MustCompile("(园|政府|集团|公司|有限|合伙|企|院|学|局|处)")

	// startWordReg_1 matches one to five leading characters followed by an
	// administrative-division word.
	startWordReg_1 = regexp.MustCompile("^(.{1,5})(省|市|县|州|自治区|特别行政区)")
	// startWordReg_2 matches a leading short province-level region name.
	startWordReg_2 = regexp.MustCompile("^(北京|天津|重庆|上海|河北|山西|" +
		"浙江|江西|湖北|吉林|海南|甘肃|广东|陕西|辽宁|山东|河南|云南|黑龙江|福建|贵州|江苏|安徽|" +
		"湖南|四川|青海|台湾|新疆|内蒙古|宁夏|西藏|广西|澳门|香港)")
	// startWordReg_3 matches a leading full province-level region name.
	startWordReg_3 = regexp.MustCompile("^(北京市|天津市|重庆市|上海市|河北省|山西省|" +
		"浙江省|江西省|湖北省|吉林省|海南省|甘肃省|广东省|陕西省|辽宁省|山东省|河南省|云南省|黑龙江省|福建省|贵州省|江苏省|安徽省|" +
		"湖南省|四川省|青海省|台湾省|新疆维吾尔自治区|内蒙古自治区|宁夏回族自治区|西藏自治区|广西壮族自治区|澳门特别行政区|香港特别行政区)")

	// endWordReg matches a trailing "有限公司" or "有限责任公司".
	endWordReg = regexp.MustCompile("(有限公司|有限责任公司)$")
)
|
|
|
+
|