Prechádzať zdrojové kódy

封装城市抽取的方法

unknown 6 rokov pred
rodič
commit
6bb26e32b5
2 zmenil súbory, kde vykonal 414 pridanie a 1 odobranie
  1. 408 0
      src/jy/extract/extractcity2.go
  2. 6 1
      src/main_test.go

+ 408 - 0
src/jy/extract/extractcity2.go

@@ -0,0 +1,408 @@
+package extract
+
+import (
+	db "jy/mongodbutil"
+	"log"
+	qu "qfw/util"
+	"strings"
+)
+
+// Package-level lookup tables and DFA matchers. The maps are allocated and
+// filled by InitDFA2, which is also the function that adds words to the
+// matchers.
+var (
+	// ProvinceMap2 is filled by InitDFA2 with province alias -> province
+	// configuration key, and province full name -> province brief.
+	ProvinceMap2 map[string]string
+	// CityBrief2 maps a city brief to its City; it only needs to be loaded once.
+	CityBrief2 map[string]*City
+	// ProvinceBrief2 maps a province brief to its Province; loaded only once.
+	ProvinceBrief2 map[string]*Province
+	// AreaToCity2 maps a matched place name to its candidate cities
+	// (original note: shared by two files).
+	AreaToCity2 map[string][]*City
+	// DistrictCityMap2 maps a district or county name to the city it belongs to.
+	DistrictCityMap2 map[string]*City
+	// StreetDistrictMap2 maps a street name to the district it belongs to.
+	StreetDistrictMap2 map[string]*District
+
+	AreaGet2         DFA // full city names
+	AreaDistrict2    DFA // districts or counties
+	AreaProvinceGet2 DFA // provinces
+	AreaSimGet2      DFA // city short names
+	AreaStreet2      DFA // streets
+)
+
+// InitDFA2 loads the "v3.0" province, city, district and street configuration
+// (via InitProvince, InitCityAll and InitCitySim) into the package-level
+// lookup maps and DFA matchers used by ExtractProvinceCity2 and
+// ExtractDistrict2.
+//
+// The maps are allocated only when they are still nil; the matchers are reset
+// to empty values. A malformed configuration entry (missing "brief", unknown
+// province, unknown city, wrong value type) is logged and skipped instead of
+// panicking, so one bad entry no longer aborts the rest of the load through
+// the deferred qu.Catch.
+func InitDFA2() {
+	defer qu.Catch()
+	// Start the matchers from an empty state. AreaSimGet2 is reset further
+	// down, right after InitCitySim has been called.
+	AreaGet2 = DFA{}
+	AreaDistrict2 = DFA{}
+	AreaProvinceGet2 = DFA{}
+	AreaStreet2 = DFA{}
+	// Allocate the lookup maps on first use.
+	if ProvinceMap2 == nil {
+		ProvinceMap2 = make(map[string]string)
+	}
+	if CityBrief2 == nil {
+		CityBrief2 = make(map[string]*City)
+	}
+	if ProvinceBrief2 == nil {
+		ProvinceBrief2 = make(map[string]*Province)
+	}
+	if AreaToCity2 == nil {
+		AreaToCity2 = make(map[string][]*City)
+	}
+	if DistrictCityMap2 == nil {
+		DistrictCityMap2 = make(map[string]*City)
+	}
+	if StreetDistrictMap2 == nil {
+		StreetDistrictMap2 = make(map[string]*District)
+	}
+
+	// Provinces: every alias listed under a key maps back to that key.
+	fn1 := InitProvince("v3.0")
+	for k, v := range fn1 {
+		aliases, ok := v.([]interface{})
+		if !ok {
+			log.Println("InitDFA2: province alias list has unexpected type:", k)
+			continue
+		}
+		for _, alias := range aliases {
+			p1, ok := alias.(string)
+			if !ok {
+				// A non-string alias cannot be matched against text.
+				continue
+			}
+			AreaProvinceGet2.AddWord(p1)
+			ProvinceMap2[p1] = k
+		}
+	}
+
+	// Full names: province -> city -> district/county -> streets.
+	fn2 := InitCityAll("v3.0")
+	for k, v := range fn2 {
+		pbrief, ok := v["brief"].(string)
+		if !ok {
+			log.Println("InitDFA2: province without brief:", k)
+			continue
+		}
+		AreaProvinceGet2.AddWord(k) // province full name
+		p := &Province{}
+		p.Name = k
+		p.Brief = pbrief
+		// "captial" (sic) is the key spelled this way in the configuration.
+		p.Cap, _ = v["captial"].(string)
+		ProvinceMap2[k] = p.Brief
+		ProvinceBrief2[p.Brief] = p
+
+		cities, _ := v["city"].(map[string]interface{})
+		for k1, v1 := range cities {
+			v1m, _ := v1.(map[string]interface{})
+			cbrief, ok := v1m["brief"].(string)
+			if !ok {
+				log.Println("InitDFA2: city without brief:", k, k1)
+				continue
+			}
+			c := &City{}
+			c.Name = k1
+			c.Brief = cbrief
+			c.P = p
+			CityBrief2[c.Brief] = c
+			if c.Brief == p.Cap {
+				p.Captial = c
+			}
+			// Register the full city name; append works on a missing (nil) entry.
+			AreaGet2.AddWord(k1)
+			AreaToCity2[k1] = append(AreaToCity2[k1], c)
+
+			// Districts or counties, each with its list of streets.
+			districts, _ := v1m["area"].(map[string]interface{})
+			for district, streetarr := range districts {
+				d := &District{}
+				d.Name = district
+				d.C = c
+				AreaDistrict2.AddWord(district)
+				// The first city seen for a district name wins.
+				if DistrictCityMap2[district] == nil {
+					DistrictCityMap2[district] = c
+				}
+				streets, ok := streetarr.([]interface{})
+				if !ok {
+					continue
+				}
+				for _, s := range qu.ObjArrToStringArr(streets) {
+					AreaStreet2.AddWord(s)
+					// The first district seen for a street name wins.
+					if StreetDistrictMap2[s] == nil {
+						StreetDistrictMap2[s] = d
+					}
+				}
+			}
+		}
+	}
+
+	// Short names.
+	fn3 := InitCitySim("v3.0")
+	AreaSimGet2 = DFA{}
+	for k, v := range fn3 {
+		pb, ok := v["brief"].(string)
+		if !ok {
+			log.Println("InitDFA2: short-name province without brief:", k)
+			continue
+		}
+		p := ProvinceBrief2[pb]
+		if p == nil {
+			log.Println("InitDFA2: short-name province not found among full names:", k, pb)
+			continue
+		}
+		// The province key and its brief both resolve to the provincial capital.
+		for _, ss := range []string{k, pb} {
+			if p.Captial != nil {
+				// Never store a nil *City: readers dereference single matches.
+				AreaToCity2[ss] = append(AreaToCity2[ss], p.Captial)
+			}
+			AreaSimGet2.AddWord(ss)
+		}
+		cities, _ := v["city"].(map[string]interface{})
+		for k1, v1 := range cities {
+			v1m, _ := v1.(map[string]interface{})
+			cb, ok := v1m["brief"].(string)
+			if !ok {
+				log.Println("InitDFA2: short-name city without brief:", k, k1)
+				continue
+			}
+			known := AreaToCity2[k1]
+			if len(known) == 0 {
+				log.Println("InitDFA2: short-name city not found among full names:", k, k1)
+				continue
+			}
+			c := known[0]
+			// e.g. 杭州, 浙江省杭州, 浙江杭州
+			for _, ss := range []string{cb, k + cb, pb + cb} {
+				AreaSimGet2.AddWord(ss)
+				AreaToCity2[ss] = append(AreaToCity2[ss], c)
+			}
+			arr, _ := v1m["area"].([]interface{})
+			for _, k2 := range arr {
+				s, ok := k2.(string)
+				if !ok {
+					continue
+				}
+				// e.g. 淳安, 杭州淳安, 浙江淳安, 浙江省淳安
+				for n, ss := range []string{s, cb + s, pb + s, k + s} {
+					AreaSimGet2.AddWord(ss)
+					AreaToCity2[ss] = append(AreaToCity2[ss], c)
+					// Only the bare short name is registered as a district word.
+					if n == 0 {
+						AreaDistrict2.AddWord(ss)
+						if DistrictCityMap2[ss] == nil {
+							DistrictCityMap2[ss] = c
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
+// FindBuyer reads the "name" field of the records in the "buyer" collection
+// through db.Mgo.Find and, for every non-empty name, runs
+// ExtractProvinceCity2 followed by ExtractDistrict2 on it and logs the
+// outcome as: found flag, city, province, district.
+func FindBuyer() {
+	// The second return value of Find is discarded, so a nil result is the
+	// only failure signal available here.
+	list, _ := db.Mgo.Find("buyer", nil, nil, `{"name":1}`, false, -1, -1)
+	if list == nil {
+		log.Println("FindBuyer: no buyer records returned")
+		return
+	}
+	for _, l := range *list {
+		val := qu.ObjToString(l["name"])
+		if val == "" {
+			continue
+		}
+		id := qu.BsonIdToSId(l["_id"])
+		text := []string{val}
+		// Resolve city and province first.
+		bres, c, p := ExtractProvinceCity2("", "", id, text)
+		// Then the district or county. Note that ExtractDistrict2 returns
+		// the province before the city.
+		bres, p, c, d := ExtractDistrict2(text, bres, c, p, id)
+		log.Println(bres, c, p, d)
+	}
+}
+
+// ExtractProvinceCity2 resolves a city brief and a province brief from the
+// given hints and text.
+//
+// Parameters:
+//   - province, city: briefs already known for the record; either may be empty.
+//   - id: record identifier, used only in log output.
+//   - text: strings scanned with the province and city DFA matchers.
+//
+// Returns bres (true when a pair was resolved), c (city brief) and p
+// (province brief). When the supplied city exists and its province equals the
+// supplied province, nothing is extracted and the zero values
+// (false, "", "") are returned.
+//
+// Resolution order: full city names first, then short names. Within each
+// pass, a word that maps to exactly one city compatible with the province
+// wins immediately. Otherwise the candidate cities of all matched words are
+// counted: the city counted most often (more than once) wins; failing that, a
+// provincial capital among the candidates; failing that, any candidate. Ties
+// are broken by the smaller brief so the result does not depend on map
+// iteration order. If nothing matches, the capital of the known province is
+// returned.
+func ExtractProvinceCity2(province, city, id string, text []string) (bres bool, c, p string) {
+	defer qu.Catch()
+
+	// Locked readers for the shared lookup tables.
+	cityOf := func(brief string) *City {
+		lock.Lock()
+		defer lock.Unlock()
+		return CityBrief2[brief]
+	}
+	provinceOf := func(brief string) *Province {
+		lock.Lock()
+		defer lock.Unlock()
+		return ProvinceBrief2[brief]
+	}
+	citiesOf := func(word string) []*City {
+		lock.Lock()
+		defer lock.Unlock()
+		return AreaToCity2[word]
+	}
+
+	// Decide whether the city still has to be extracted.
+	needCity := true
+	if city != "" {
+		known := cityOf(city)
+		switch {
+		case known == nil: // unknown city brief
+			log.Println("city err:", city, id)
+		case known.P == nil || province != known.P.Brief: // province does not belong to the city
+			log.Println("province err:", city, province, id)
+		default: // city and province are consistent
+			needCity = false
+		}
+	}
+
+	// Province: accept a valid brief, otherwise look for a province word in text.
+	hasProvince := provinceOf(province) != nil
+	if !hasProvince {
+		for _, str := range text {
+			if word := AreaProvinceGet2.CheckSensitiveWord(str); word != "" {
+				lock.Lock()
+				province = ProvinceMap2[word]
+				lock.Unlock()
+				hasProvince = true
+				break
+			}
+		}
+	}
+	if !needCity {
+		return bres, c, p
+	}
+
+	// Pass 0 uses full city names, pass 1 uses short names.
+	for pos, matcher := range []DFA{AreaGet2, AreaSimGet2} {
+		// One slot per input string; a fixed size would overflow on longer input.
+		ws := make([]string, len(text))
+		for n, str := range text {
+			if str == "" {
+				continue
+			}
+			word := matcher.CheckSensitiveWord(str)
+			if pos == 1 && word != "" {
+				// A short name directly followed by "路" is a road name:
+				// drop that occurrence and match again.
+				if str1 := strings.Replace(str, word+"路", "", 1); str1 != str {
+					word = matcher.CheckSensitiveWord(str1)
+				}
+			}
+			ws[n] = word
+			if word == "" {
+				continue
+			}
+			res := citiesOf(word)
+			if len(res) != 1 || res[0] == nil || res[0].P == nil {
+				// Ambiguous or unusable entry: left to the counting step below.
+				continue
+			}
+			// A unique city is accepted when no province is known or it agrees.
+			if !hasProvince || province == res[0].P.Brief {
+				bres = true
+				c = res[0].Brief
+				p = res[0].P.Brief
+				break
+			}
+		}
+		if !bres {
+			// Count how often each compatible city is referenced.
+			mc := map[string]int{}
+			for _, w := range ws {
+				if w == "" {
+					continue
+				}
+				for _, ct := range citiesOf(w) {
+					if ct == nil {
+						continue
+					}
+					if !hasProvince || (ct.P != nil && ct.P.Brief == province) {
+						mc[ct.Brief]++
+					}
+				}
+			}
+			// Highest count above 1; ties go to the smaller brief.
+			best, v := 1, ""
+			for mk, mv := range mc {
+				if mv > best || (mv == best && v != "" && mk < v) {
+					best, v = mv, mk
+				}
+			}
+			if v != "" {
+				if ctb := cityOf(v); ctb != nil && ctb.P != nil {
+					bres = true
+					c = ctb.Brief
+					p = ctb.P.Brief
+				}
+			} else if len(mc) > 0 {
+				// No city counted twice: prefer a provincial capital, else any candidate.
+				capital, other := "", ""
+				for mk := range mc {
+					cb := cityOf(mk)
+					if cb == nil || cb.P == nil {
+						continue
+					}
+					if cb.P.Cap == mk {
+						if capital == "" || mk < capital {
+							capital = mk
+						}
+					} else if other == "" || mk < other {
+						other = mk
+					}
+				}
+				pick := capital
+				if pick == "" {
+					pick = other
+				}
+				if chosen := cityOf(pick); chosen != nil && chosen.P != nil {
+					bres = true
+					c = chosen.Brief
+					p = chosen.P.Brief
+				}
+			}
+		}
+		if bres {
+			break
+		}
+	}
+
+	if !bres {
+		// Fall back to the capital of the province.
+		if pbp := provinceOf(province); pbp != nil {
+			bres = true
+			c = pbp.Cap
+			p = province
+		}
+	}
+	return bres, c, p
+}
+// ExtractDistrict2 looks for a district/county word, then a street word, in
+// each string of field and uses the first word found to set the district and
+// to overwrite the city and province with those of the matched entry.
+//
+// bres, c and p are the result flag, city brief and province brief obtained
+// earlier. id is not used by the current body.
+//
+// The return order is (found, province, city, district): the province comes
+// before the city, the opposite of the parameter order. The function returns
+// as soon as any matcher reports a word, even when that word has no entry in
+// the corresponding lookup map; in that case the inputs are returned
+// unchanged with an empty district.
+func ExtractDistrict2(field []string, bres bool, c, p, id string) (bool, string, string, string) {
+	district := ""
+	// adopt takes over the city and province of the matched entry.
+	adopt := func(owner *City) {
+		cityBrief, provBrief := owner.Brief, owner.P.Brief
+		c, p, bres = cityBrief, provBrief, true
+	}
+	for _, candidate := range field {
+		// Index 0: district or county matcher. Index 1: street matcher.
+		for idx, matcher := range []DFA{AreaDistrict2, AreaStreet2} {
+			hit := matcher.CheckSensitiveWord(candidate)
+			if hit == "" {
+				continue
+			}
+			switch idx {
+			case 0:
+				lock.Lock()
+				owner := DistrictCityMap2[hit]
+				lock.Unlock()
+				if owner != nil {
+					district = hit
+					adopt(owner)
+				}
+			default:
+				lock.Lock()
+				parent := StreetDistrictMap2[hit]
+				lock.Unlock()
+				if parent != nil {
+					// A street reports the name of the district it belongs to.
+					district = parent.Name
+					adopt(parent.C)
+				}
+			}
+			return bres, p, c, district
+		}
+	}
+	return bres, p, c, district
+}

+ 6 - 1
src/main_test.go

@@ -18,7 +18,12 @@ func Test_task(t *testing.T) {
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)
 }
-
+// Test_extractcity assigns Mgo from MgoFactory, builds the matchers with
+// extract.InitDFA2 and then runs extract.FindBuyer. It makes no assertions.
+// NOTE(review): the MongoDB address and database name are hard-coded, so this
+// presumably only works where that host is reachable — confirm.
+func Test_extractcity(t *testing.T) {
+	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
+	extract.InitDFA2()
+	// Query the buyer (purchasing unit) records
+	extract.FindBuyer()
+}
 func Test_reghan(t *testing.T) {
 	context := `你好`
 	reg := regexp.MustCompile(`^[\p{Han}]+$`) //纯汉字