Browse Source

城市抽取备份 03-12

zhengkun 1 year ago
parent
commit
1c7aca28ba

+ 9 - 14
src/jy/extract/extractcity_new.go

@@ -532,7 +532,6 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	if *area == "" || *area == "全国" {
 	if *area == "" || *area == "全国" {
 		fullProvinceArr := e.SensitiveFullProvince.FindAll(detail)
 		fullProvinceArr := e.SensitiveFullProvince.FindAll(detail)
 		if len(fullProvinceArr) == 1 { //再次计算
 		if len(fullProvinceArr) == 1 { //再次计算
-			fullProvinceArr = e.findAmbiguityRegion(detail, fullProvinceArr[0])
 			for _, v := range fullProvinceArr {
 			for _, v := range fullProvinceArr {
 				if sim_province := e.ProvinceMap[v]; sim_province != "" {
 				if sim_province := e.ProvinceMap[v]; sim_province != "" {
 					*area = sim_province
 					*area = sim_province
@@ -544,7 +543,6 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//全称城市
 	//全称城市
 	fullCityArr := e.SensitiveFullCity.FindAll(detail)
 	fullCityArr := e.SensitiveFullCity.FindAll(detail)
 	if len(fullCityArr) == 1 {
 	if len(fullCityArr) == 1 {
-		fullCityArr = e.findAmbiguityRegion(detail, fullCityArr[0])
 		for _, v := range fullCityArr {
 		for _, v := range fullCityArr {
 			if cityMap := e.CityFullMap[v]; cityMap != nil {
 			if cityMap := e.CityFullMap[v]; cityMap != nil {
 				if *area == "" || *area == "全国" || cityMap.P.Brief == *area {
 				if *area == "" || *area == "全国" || cityMap.P.Brief == *area {
@@ -558,7 +556,6 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//全称区县
 	//全称区县
 	fullDistrictArr := e.SensitiveFullDistrict.FindAll(detail)
 	fullDistrictArr := e.SensitiveFullDistrict.FindAll(detail)
 	if len(fullDistrictArr) == 1 {
 	if len(fullDistrictArr) == 1 {
-		fullDistrictArr = e.findAmbiguityRegion(detail, fullDistrictArr[0])
 		for _, v := range fullDistrictArr {
 		for _, v := range fullDistrictArr {
 			if citys := e.DistrictCityMap[v]; len(citys) == 1 {
 			if citys := e.DistrictCityMap[v]; len(citys) == 1 {
 				if *area == "" || *area == "全国" || citys[0].P.Brief == *area {
 				if *area == "" || *area == "全国" || citys[0].P.Brief == *area {
@@ -573,7 +570,6 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//简称城市
 	//简称城市
 	simCityArr := e.SensitiveSimCity.FindAll(detail)
 	simCityArr := e.SensitiveSimCity.FindAll(detail)
 	if len(simCityArr) == 1 {
 	if len(simCityArr) == 1 {
-		simCityArr = e.findAmbiguityRegion(detail, simCityArr[0])
 		for _, v := range simCityArr {
 		for _, v := range simCityArr {
 			if cityMap := e.CityBriefMap[v]; cityMap != nil {
 			if cityMap := e.CityBriefMap[v]; cityMap != nil {
 				if *area == "" || *area == "全国" {
 				if *area == "" || *area == "全国" {
@@ -595,7 +591,6 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	if *area == "" || *area == "全国" {
 	if *area == "" || *area == "全国" {
 		simProvinceArr := e.SensitiveSimProvince.FindAll(detail)
 		simProvinceArr := e.SensitiveSimProvince.FindAll(detail)
 		if len(simProvinceArr) == 1 {
 		if len(simProvinceArr) == 1 {
-			simProvinceArr = e.findAmbiguityRegion(detail, simProvinceArr[0])
 			for _, v := range simProvinceArr {
 			for _, v := range simProvinceArr {
 				if v != "" {
 				if v != "" {
 					*area = v
 					*area = v
@@ -689,15 +684,15 @@ func (e *ExtractTask) GetRegionByTentativeOperator(winner string, all_regions *m
 }
 }
 
 
 // 通用方法找到指定地域有效词组
 // 通用方法找到指定地域有效词组
-func (e *ExtractTask) findAmbiguityRegion(text string, key string) []string {
-	wordsArr := e.Seg_Full.Cut(text, true)
-	for _, word := range wordsArr {
-		if word == key {
-			return []string{key}
-		}
-	}
-	return []string{}
-}
+//func (e *ExtractTask) findAmbiguityRegion(text string, key string) []string {
+//	wordsArr := e.Seg_Full.Cut(text, true)
+//	for _, word := range wordsArr {
+//		if word == key {
+//			return []string{key}
+//		}
+//	}
+//	return []string{}
+//}
 
 
 //初步确认~站点
 //初步确认~站点
 //func (e *ExtractTask) GetRegionByTentativeSite(j *ju.Job, all_regions *map[string]map[string]map[string]string) {
 //func (e *ExtractTask) GetRegionByTentativeSite(j *ju.Job, all_regions *map[string]map[string]map[string]string) {

+ 1 - 1
src/jy/extract/extractcity_other.go

@@ -37,7 +37,7 @@ func (e *ExtractTask) GetMatchScores(j *ju.Job) {
 	j.SimAreaScore, j.SimCityScore, j.SimDistrictScore = map[string]float64{}, map[string]float64{}, map[string]float64{}
 	j.SimAreaScore, j.SimCityScore, j.SimDistrictScore = map[string]float64{}, map[string]float64{}, map[string]float64{}
 	rf_area, rf_city, rf_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
 	rf_area, rf_city, rf_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
 	rs_area, rs_city, rs_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
 	rs_area, rs_city, rs_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
-	for _, text := range e.Seg_Full.Cut(j.Content, true) {
+	for _, text := range e.Seg_SV.Cut(j.Content, true) {
 		if text == "" {
 		if text == "" {
 			continue
 			continue
 		}
 		}

+ 22 - 17
src/jy/extract/extractcity_way.go

@@ -24,6 +24,8 @@ var OperatorReg = regexp.MustCompile("^中国(电信|联通|移动).*公司$")
 
 
 var CleanTitleReg1 = regexp.MustCompile("[((].*[))]")
 var CleanTitleReg1 = regexp.MustCompile("[((].*[))]")
 
 
+var CleanCutWords = regexp.MustCompile("^(范县|西区|东区|城区|矿区)$")
+
 // 取特殊类数据
 // 取特殊类数据
 func GetFilialeByBuyerInfo(buyer string) string {
 func GetFilialeByBuyerInfo(buyer string) string {
 	if FilialeReg1.MatchString(buyer) {
 	if FilialeReg1.MatchString(buyer) {
@@ -35,7 +37,6 @@ func GetFilialeByBuyerInfo(buyer string) string {
 	if FilialeReg3.MatchString(buyer) {
 	if FilialeReg3.MatchString(buyer) {
 		return FilialeReg3.ReplaceAllString(buyer, "${2}")
 		return FilialeReg3.ReplaceAllString(buyer, "${2}")
 	}
 	}
-
 	return ""
 	return ""
 }
 }
 
 
@@ -197,12 +198,11 @@ func (e *ExtractTask) GetRegionFromText(text string, regions *map[string]map[str
 		return regionValues
 		return regionValues
 	}
 	}
 	wordsArr := []string{}
 	wordsArr := []string{}
-	//if from == 1 {
-	//	wordsArr = e.Seg_PCD.Cut(text, true)
-	//} else if from == 2 {
-	//	wordsArr = e.Seg_SV.Cut(text, true)
-	//}
-	wordsArr = e.Seg_Full.Cut(text, true)
+	if from == 1 {
+		wordsArr = e.Seg_PCD.Cut(text, true)
+	} else if from == 2 {
+		wordsArr = e.Seg_SV.Cut(text, true)
+	}
 	//词组清洗
 	//词组清洗
 	wordsArr = CleanRegionTextWords(wordsArr)
 	wordsArr = CleanRegionTextWords(wordsArr)
 	regionsArr := []map[string]string{}
 	regionsArr := []map[string]string{}
@@ -494,9 +494,9 @@ func CleanRegionTextWords(wordsArr []string) []string {
 		return newArr
 		return newArr
 	}
 	}
 
 
-	//清除特殊词组~城区
+	//清除特殊词组~城区,范县等
 	for _, v := range wordsArr {
 	for _, v := range wordsArr {
-		if v == "城区" {
+		if CleanCutWords.MatchString(v) {
 			continue
 			continue
 		}
 		}
 		newArr = append(newArr, v)
 		newArr = append(newArr, v)
@@ -507,8 +507,7 @@ func CleanRegionTextWords(wordsArr []string) []string {
 // 链路补充~全称类
 // 链路补充~全称类
 func (e *ExtractTask) LinkSpecialRuleFullStep(text string, area *string, city *string, district *string) {
 func (e *ExtractTask) LinkSpecialRuleFullStep(text string, area *string, city *string, district *string) {
 	regions := map[string]map[string]map[string]string{}
 	regions := map[string]map[string]map[string]string{}
-	//wordsArr := e.Seg_SV.Cut(text, true)
-	wordsArr := e.Seg_Full.Cut(text, true)
+	wordsArr := e.Seg_SV.Cut(text, true)
 	for _, word := range wordsArr {
 	for _, word := range wordsArr {
 		for pos_full, trie_full := range e.Trie_Fulls {
 		for pos_full, trie_full := range e.Trie_Fulls {
 			if pos_full == 3 {
 			if pos_full == 3 {
@@ -565,12 +564,11 @@ func (e *ExtractTask) LinkSpecialRuleBriefStep2(text string, area *string, city
 func (e *ExtractTask) FetchBriefRules(text string, cutype int) map[string]map[string]map[string]string {
 func (e *ExtractTask) FetchBriefRules(text string, cutype int) map[string]map[string]map[string]string {
 	regions := map[string]map[string]map[string]string{}
 	regions := map[string]map[string]map[string]string{}
 	wordsArr := []string{}
 	wordsArr := []string{}
-	//if cutype == 1 {
-	//	wordsArr = e.Seg_Full.Cut(text, true)
-	//} else {
-	//	wordsArr = e.Seg_SV.Cut(text, true)
-	//}
-	wordsArr = e.Seg_Full.Cut(text, true)
+	if cutype == 1 {
+		wordsArr = e.Seg_Full.Cut(text, true)
+	} else {
+		wordsArr = e.Seg_SV.Cut(text, true)
+	}
 	for _, word := range wordsArr {
 	for _, word := range wordsArr {
 		for pos_sim, trie_sim := range e.Trie_Sims {
 		for pos_sim, trie_sim := range e.Trie_Sims {
 			if pos_sim == 2 {
 			if pos_sim == 2 {
@@ -636,6 +634,13 @@ func LinkSpecialQyxyStep(buyer string, area *string, city *string, district *str
 	qyxy_arr, _ := ju.Qyxy_Mgo.Find("qyxy_std", map[string]interface{}{
 	qyxy_arr, _ := ju.Qyxy_Mgo.Find("qyxy_std", map[string]interface{}{
 		"company_name": buyer,
 		"company_name": buyer,
 	}, nil, nil)
 	}, nil, nil)
+	if qyxy_arr != nil && len(qyxy_arr) > 0 {
+
+	} else {
+		qyxy_arr, _ = ju.Qyxy_Mgo.Find("qyxy_std", map[string]interface{}{
+			"history_name": buyer,
+		}, nil, nil)
+	}
 	if qyxy_arr != nil && len(qyxy_arr) > 0 {
 	if qyxy_arr != nil && len(qyxy_arr) > 0 {
 		for _, v := range qyxy_arr {
 		for _, v := range qyxy_arr {
 			qy_area := qu.ObjToString(v["company_area"])
 			qy_area := qu.ObjToString(v["company_area"])

+ 19 - 19
src/jy/extract/extractinit.go

@@ -129,21 +129,21 @@ type ExtractTask struct {
 	AreaCodeMap map[string]*AreaCode //区号
 	AreaCodeMap map[string]*AreaCode //区号
 	InfoType    []map[string]interface{}
 	InfoType    []map[string]interface{}
 
 
-	Trie_Full_Province  *ju.Trie   //省全称 省、直辖市、自治区
-	Trie_Full_City      *ju.Trie   //市全称 地级市
-	Trie_Full_District  *ju.Trie   //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
-	Trie_Full_Street    *ju.Trie   //街道、乡镇全称  镇、乡、民族乡、县辖区、街道
-	Trie_Full_Community *ju.Trie   //村/委员会全称  村、居委会
-	Trie_Sim_Province   *ju.Trie   //省简称
-	Trie_Sim_City       *ju.Trie   //市简称
-	Trie_Sim_District   *ju.Trie   //县简称
-	Trie_Fulls          []*ju.Trie //所有全称
-	Trie_Sims           []*ju.Trie //所有简称
-	//Seg_PCD             *gse.Segmenter //分词
-	//Seg_SV              *gse.Segmenter //分词
-	Seg_Full  *gse.Segmenter //分词
-	Luacodes  *sync.Map      //站点规则
-	SiteMerge *sync.Map      //抽取合并
+	Trie_Full_Province  *ju.Trie       //省全称 省、直辖市、自治区
+	Trie_Full_City      *ju.Trie       //市全称 地级市
+	Trie_Full_District  *ju.Trie       //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
+	Trie_Full_Street    *ju.Trie       //街道、乡镇全称  镇、乡、民族乡、县辖区、街道
+	Trie_Full_Community *ju.Trie       //村/委员会全称  村、居委会
+	Trie_Sim_Province   *ju.Trie       //省简称
+	Trie_Sim_City       *ju.Trie       //市简称
+	Trie_Sim_District   *ju.Trie       //县简称
+	Trie_Fulls          []*ju.Trie     //所有全称
+	Trie_Sims           []*ju.Trie     //所有简称
+	Seg_PCD             *gse.Segmenter //分词
+	Seg_SV              *gse.Segmenter //分词
+	Seg_Full            *gse.Segmenter //分词
+	Luacodes            *sync.Map      //站点规则
+	SiteMerge           *sync.Map      //抽取合并
 }
 }
 
 
 type SiteCity struct {
 type SiteCity struct {
@@ -1333,11 +1333,11 @@ func (e *ExtractTask) InitVar() {
 	e.Trie_Sim_District = &ju.Trie{}
 	e.Trie_Sim_District = &ju.Trie{}
 
 
 	//初始化分词
 	//初始化分词
-	//e.Seg_PCD = &gse.Segmenter{}
-	//e.Seg_SV = &gse.Segmenter{}
+	e.Seg_PCD = &gse.Segmenter{}
+	e.Seg_SV = &gse.Segmenter{}
 	e.Seg_Full = &gse.Segmenter{}
 	e.Seg_Full = &gse.Segmenter{}
-	//e.Seg_PCD.LoadDict("./res/pcd.txt")
-	//e.Seg_SV.LoadDict("./res/sv.txt")
+	e.Seg_PCD.LoadDict("./res/pcd.txt")
+	e.Seg_SV.LoadDict("./res/sv.txt")
 	e.Seg_Full.LoadDict("./res/dictionary.txt")
 	e.Seg_Full.LoadDict("./res/dictionary.txt")
 
 
 	//初始化城市相关
 	//初始化城市相关