Ver código fonte

备份调整的城市版本

zhengkun 1 ano atrás
pai
commit
2beb293241

+ 1 - 1
extcity/src/ext/extRegion.go

@@ -221,7 +221,7 @@ func (e *ExtractTask) GetRegionByTentativeJsonData(jsondata map[string]interface
 	regions := map[string]map[string]map[string]string{}
 	if jsondata != nil {
 		if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
-			e.GetRegionFromText(a_c_d, &regions, false, false, 2)
+			e.GetRegionFromText(a_c_d, &regions, false, false, 1)
 		}
 	}
 	if len(regions) == 1 {

+ 14 - 14
src/jy/extract/extractcity_new.go

@@ -532,7 +532,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	if *area == "" || *area == "全国" {
 		fullProvinceArr := e.SensitiveFullProvince.FindAll(detail)
 		if len(fullProvinceArr) == 1 { //再次计算
-			fullProvinceArr = e.findAmbiguityRegion(detail, fullProvinceArr[0])
+			//fullProvinceArr = e.findAmbiguityRegion(detail, fullProvinceArr[0])
 			for _, v := range fullProvinceArr {
 				if sim_province := e.ProvinceMap[v]; sim_province != "" {
 					*area = sim_province
@@ -544,7 +544,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//全称城市
 	fullCityArr := e.SensitiveFullCity.FindAll(detail)
 	if len(fullCityArr) == 1 {
-		fullCityArr = e.findAmbiguityRegion(detail, fullCityArr[0])
+		//fullCityArr = e.findAmbiguityRegion(detail, fullCityArr[0])
 		for _, v := range fullCityArr {
 			if cityMap := e.CityFullMap[v]; cityMap != nil {
 				if *area == "" || *area == "全国" || cityMap.P.Brief == *area {
@@ -558,7 +558,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//全称区县
 	fullDistrictArr := e.SensitiveFullDistrict.FindAll(detail)
 	if len(fullDistrictArr) == 1 {
-		fullDistrictArr = e.findAmbiguityRegion(detail, fullDistrictArr[0])
+		//fullDistrictArr = e.findAmbiguityRegion(detail, fullDistrictArr[0])
 		for _, v := range fullDistrictArr {
 			if citys := e.DistrictCityMap[v]; len(citys) == 1 {
 				if *area == "" || *area == "全国" || citys[0].P.Brief == *area {
@@ -573,7 +573,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	//简称城市
 	simCityArr := e.SensitiveSimCity.FindAll(detail)
 	if len(simCityArr) == 1 {
-		simCityArr = e.findAmbiguityRegion(detail, simCityArr[0])
+		//simCityArr = e.findAmbiguityRegion(detail, simCityArr[0])
 		for _, v := range simCityArr {
 			if cityMap := e.CityBriefMap[v]; cityMap != nil {
 				if *area == "" || *area == "全国" {
@@ -595,7 +595,7 @@ func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *
 	if *area == "" || *area == "全国" {
 		simProvinceArr := e.SensitiveSimProvince.FindAll(detail)
 		if len(simProvinceArr) == 1 {
-			simProvinceArr = e.findAmbiguityRegion(detail, simProvinceArr[0])
+			//simProvinceArr = e.findAmbiguityRegion(detail, simProvinceArr[0])
 			for _, v := range simProvinceArr {
 				if v != "" {
 					*area = v
@@ -689,15 +689,15 @@ func (e *ExtractTask) GetRegionByTentativeOperator(winner string, all_regions *m
 }
 
 // 通用方法找到指定地域有效词组
-func (e *ExtractTask) findAmbiguityRegion(text string, key string) []string {
-	wordsArr := e.Seg_Full.Cut(text, true)
-	for _, word := range wordsArr {
-		if word == key {
-			return []string{key}
-		}
-	}
-	return []string{}
-}
+//func (e *ExtractTask) findAmbiguityRegion(text string, key string) []string {
+//	wordsArr := e.Seg_Full.Cut(text, true)
+//	for _, word := range wordsArr {
+//		if word == key {
+//			return []string{key}
+//		}
+//	}
+//	return []string{}
+//}
 
 //初步确认~站点
 //func (e *ExtractTask) GetRegionByTentativeSite(j *ju.Job, all_regions *map[string]map[string]map[string]string) {

+ 2 - 1
src/jy/extract/extractcity_other.go

@@ -37,7 +37,8 @@ func (e *ExtractTask) GetMatchScores(j *ju.Job) {
 	j.SimAreaScore, j.SimCityScore, j.SimDistrictScore = map[string]float64{}, map[string]float64{}, map[string]float64{}
 	rf_area, rf_city, rf_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
 	rs_area, rs_city, rs_district := map[string]bool{}, map[string]bool{}, map[string]bool{}
-	for _, text := range e.Seg_Full.Cut(j.Content, true) {
+	//for _, text := range e.Seg_Full.Cut(j.Content, true) {
+	for _, text := range e.Seg_SV.Cut(j.Content, true) {
 		if text == "" {
 			continue
 		}

+ 14 - 14
src/jy/extract/extractcity_way.go

@@ -197,12 +197,12 @@ func (e *ExtractTask) GetRegionFromText(text string, regions *map[string]map[str
 		return regionValues
 	}
 	wordsArr := []string{}
-	//if from == 1 {
-	//	wordsArr = e.Seg_PCD.Cut(text, true)
-	//} else if from == 2 {
-	//	wordsArr = e.Seg_SV.Cut(text, true)
-	//}
-	wordsArr = e.Seg_Full.Cut(text, true)
+	if from == 1 {
+		wordsArr = e.Seg_PCD.Cut(text, true)
+	} else if from == 2 {
+		wordsArr = e.Seg_SV.Cut(text, true)
+	}
+	//wordsArr = e.Seg_Full.Cut(text, true)
 	//词组清洗
 	wordsArr = CleanRegionTextWords(wordsArr)
 	regionsArr := []map[string]string{}
@@ -507,8 +507,8 @@ func CleanRegionTextWords(wordsArr []string) []string {
 // 链路补充~全称类
 func (e *ExtractTask) LinkSpecialRuleFullStep(text string, area *string, city *string, district *string) {
 	regions := map[string]map[string]map[string]string{}
-	//wordsArr := e.Seg_SV.Cut(text, true)
-	wordsArr := e.Seg_Full.Cut(text, true)
+	wordsArr := e.Seg_SV.Cut(text, true)
+	//wordsArr := e.Seg_Full.Cut(text, true)
 	for _, word := range wordsArr {
 		for pos_full, trie_full := range e.Trie_Fulls {
 			if pos_full == 3 {
@@ -565,12 +565,12 @@ func (e *ExtractTask) LinkSpecialRuleBriefStep2(text string, area *string, city
 func (e *ExtractTask) FetchBriefRules(text string, cutype int) map[string]map[string]map[string]string {
 	regions := map[string]map[string]map[string]string{}
 	wordsArr := []string{}
-	//if cutype == 1 {
-	//	wordsArr = e.Seg_Full.Cut(text, true)
-	//} else {
-	//	wordsArr = e.Seg_SV.Cut(text, true)
-	//}
-	wordsArr = e.Seg_Full.Cut(text, true)
+	if cutype == 1 {
+		wordsArr = e.Seg_Full.Cut(text, true)
+	} else {
+		wordsArr = e.Seg_SV.Cut(text, true)
+	}
+	//wordsArr = e.Seg_Full.Cut(text, true)
 	for _, word := range wordsArr {
 		for pos_sim, trie_sim := range e.Trie_Sims {
 			if pos_sim == 2 {

+ 19 - 19
src/jy/extract/extractinit.go

@@ -129,21 +129,21 @@ type ExtractTask struct {
 	AreaCodeMap map[string]*AreaCode //区号
 	InfoType    []map[string]interface{}
 
-	Trie_Full_Province  *ju.Trie   //省全称 省、直辖市、自治区
-	Trie_Full_City      *ju.Trie   //市全称 地级市
-	Trie_Full_District  *ju.Trie   //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
-	Trie_Full_Street    *ju.Trie   //街道、乡镇全称  镇、乡、民族乡、县辖区、街道
-	Trie_Full_Community *ju.Trie   //村/委员会全称  村、居委会
-	Trie_Sim_Province   *ju.Trie   //省简称
-	Trie_Sim_City       *ju.Trie   //市简称
-	Trie_Sim_District   *ju.Trie   //县简称
-	Trie_Fulls          []*ju.Trie //所有全称
-	Trie_Sims           []*ju.Trie //所有简称
-	//Seg_PCD             *gse.Segmenter //分词
-	//Seg_SV              *gse.Segmenter //分词
-	Seg_Full  *gse.Segmenter //分词
-	Luacodes  *sync.Map      //站点规则
-	SiteMerge *sync.Map      //抽取合并
+	Trie_Full_Province  *ju.Trie       //省全称 省、直辖市、自治区
+	Trie_Full_City      *ju.Trie       //市全称 地级市
+	Trie_Full_District  *ju.Trie       //县全称 市辖区、县(旗)、县级市、自治县(自治旗)、特区、林区
+	Trie_Full_Street    *ju.Trie       //街道、乡镇全称  镇、乡、民族乡、县辖区、街道
+	Trie_Full_Community *ju.Trie       //村/委员会全称  村、居委会
+	Trie_Sim_Province   *ju.Trie       //省简称
+	Trie_Sim_City       *ju.Trie       //市简称
+	Trie_Sim_District   *ju.Trie       //县简称
+	Trie_Fulls          []*ju.Trie     //所有全称
+	Trie_Sims           []*ju.Trie     //所有简称
+	Seg_PCD             *gse.Segmenter //分词
+	Seg_SV              *gse.Segmenter //分词
+	Seg_Full            *gse.Segmenter //分词
+	Luacodes            *sync.Map      //站点规则
+	SiteMerge           *sync.Map      //抽取合并
 }
 
 type SiteCity struct {
@@ -1333,11 +1333,11 @@ func (e *ExtractTask) InitVar() {
 	e.Trie_Sim_District = &ju.Trie{}
 
 	//初始化分词
-	//e.Seg_PCD = &gse.Segmenter{}
-	//e.Seg_SV = &gse.Segmenter{}
+	e.Seg_PCD = &gse.Segmenter{}
+	e.Seg_SV = &gse.Segmenter{}
 	e.Seg_Full = &gse.Segmenter{}
-	//e.Seg_PCD.LoadDict("./res/pcd.txt")
-	//e.Seg_SV.LoadDict("./res/sv.txt")
+	e.Seg_PCD.LoadDict("./res/pcd.txt")
+	e.Seg_SV.LoadDict("./res/sv.txt")
 	e.Seg_Full.LoadDict("./res/dictionary.txt")
 
 	//初始化城市相关

+ 23 - 1
src/jy/pretreated/division.go

@@ -831,6 +831,9 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 	if !flag {
 		return false, ""
 	}
+	//根据分包关键词判断是否同组过滤···
+	CleanOutPkgName(pkg)
+
 	//	util.Debug(con)
 	//	util.Debug(pkg)
 	//分包前面添加换行
@@ -939,7 +942,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 	}
 	//获取截取标识
 	surplusText, maxWarpCount, indexTextMap, indexWarpMap := interceptText(indexs, indexPkgMap, pkgIndexMap, startEndMap, con)
-	//查找分包内容,分kv
+	//查找分包内容,分kv...此方法有缺陷...不知哪行代码,无序过滤掉分包
 	for _, iv := range indexs {
 		text := indexTextMap[iv]
 		tmptext := text
@@ -1177,3 +1180,22 @@ func kvAfterDivideBlock(tp, text string, from int, ruleBlock *util.RuleBlock, is
 	}
 	return kvs
 }
+
// pkgGroupNameRe matches a group key made of "项目" followed by exactly one
// Chinese numeral from 一 to 九 (for example "项目一"). It is compiled once at
// package scope instead of on every CleanOutPkgName call.
var pkgGroupNameRe = regexp.MustCompile("^项目[一二三四五六七八九]$")

// CleanOutPkgName deletes, in place, every key of pkg that is a group key
// (see pkgGroupNameRe) immediately followed by one or more ASCII digits,
// provided that the group key itself is also present in pkg. For example,
// when pkg contains "项目一", the keys "项目一1" and "项目一23" are removed
// while "项目一" is kept.
//
// Every group key present in pkg is handled. The previous implementation
// stopped at the first group key returned by map iteration, so with several
// groups the outcome depended on Go's randomized map order.
//
// A nil or empty pkg is a no-op. Values are never inspected.
func CleanOutPkgName(pkg map[string][]string) {
	// Deleting entries while ranging over a map is permitted by the Go spec.
	// Group keys themselves never end in a digit, so they are never removed
	// and the parent lookup below stays valid throughout the loop.
	for name := range pkg {
		// Strip the trailing run of ASCII digits (the same set as `\d` in Go regexp).
		end := len(name)
		for end > 0 && name[end-1] >= '0' && name[end-1] <= '9' {
			end--
		}
		if end == len(name) {
			continue // no numeric suffix: not a numbered child
		}
		parent := name[:end]
		if !pkgGroupNameRe.MatchString(parent) {
			continue
		}
		if _, ok := pkg[parent]; ok {
			delete(pkg, name)
		}
	}
}