|
@@ -0,0 +1,408 @@
|
|
|
+package extract
|
|
|
+
|
|
|
+import (
|
|
|
+ db "jy/mongodbutil"
|
|
|
+ "log"
|
|
|
+ qu "qfw/util"
|
|
|
+ "strings"
|
|
|
+)
|
|
|
+
|
|
|
+var ProvinceMap2 map[string]string
|
|
|
+var CityBrief2 map[string]*City //只加载一次即可
|
|
|
+var ProvinceBrief2 map[string]*Province //只加载一次
|
|
|
+var AreaToCity2 map[string][]*City //两个文件共用
|
|
|
+var DistrictCityMap2 map[string]*City
|
|
|
+var StreetDistrictMap2 map[string]*District
|
|
|
+var AreaGet2 DFA //市全称
|
|
|
+var AreaDistrict2 DFA //区或县
|
|
|
+var AreaProvinceGet2 DFA //省
|
|
|
+var AreaSimGet2 DFA //市简称
|
|
|
+var AreaStreet2 DFA //街道
|
|
|
+
|
|
|
+func InitDFA2() {
|
|
|
+ defer qu.Catch()
|
|
|
+ AreaGet2 = DFA{}
|
|
|
+ AreaProvinceGet2 = DFA{}
|
|
|
+ AreaStreet2 = DFA{}
|
|
|
+ //初始化map
|
|
|
+ if ProvinceMap2 == nil {
|
|
|
+ ProvinceMap2 = make(map[string]string)
|
|
|
+ }
|
|
|
+ if CityBrief2 == nil {
|
|
|
+ CityBrief2 = make(map[string]*City)
|
|
|
+ }
|
|
|
+ if ProvinceBrief2 == nil {
|
|
|
+ ProvinceBrief2 = make(map[string]*Province)
|
|
|
+ }
|
|
|
+ if AreaToCity2 == nil {
|
|
|
+ AreaToCity2 = make(map[string][]*City)
|
|
|
+ }
|
|
|
+ if DistrictCityMap2 == nil {
|
|
|
+ DistrictCityMap2 = make(map[string]*City)
|
|
|
+ }
|
|
|
+ if StreetDistrictMap2 == nil {
|
|
|
+ StreetDistrictMap2 = make(map[string]*District)
|
|
|
+ }
|
|
|
+ //初始化省
|
|
|
+ fn1 := InitProvince("v3.0")
|
|
|
+ for k, v := range fn1 {
|
|
|
+ for _, p := range v.([]interface{}) {
|
|
|
+ p1, _ := p.(string)
|
|
|
+ AreaProvinceGet2.AddWord(p1)
|
|
|
+ ProvinceMap2[p1] = k
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ //初始化城市全称
|
|
|
+ fn2 := InitCityAll("v3.0")
|
|
|
+ for k, v := range fn2 {
|
|
|
+ AreaProvinceGet2.AddWord(k) //省全称
|
|
|
+ p := &Province{}
|
|
|
+ p.Name = k
|
|
|
+ p.Brief = v["brief"].(string)
|
|
|
+ ProvinceMap2[k] = p.Brief
|
|
|
+ //
|
|
|
+ ProvinceBrief2[p.Brief] = p
|
|
|
+ p.Cap = v["captial"].(string)
|
|
|
+ city, _ := v["city"].(map[string]interface{})
|
|
|
+ for k1, v1 := range city {
|
|
|
+ v1m, _ := v1.(map[string]interface{})
|
|
|
+ c := &City{}
|
|
|
+ c.Name = k1
|
|
|
+ // if v1m["brief"] == nil {
|
|
|
+ // }
|
|
|
+ c.Brief = v1m["brief"].(string)
|
|
|
+ //
|
|
|
+ CityBrief2[c.Brief] = c
|
|
|
+ c.P = p
|
|
|
+ if c.Brief == p.Cap {
|
|
|
+ p.Captial = c
|
|
|
+ }
|
|
|
+ //加入到城市map中
|
|
|
+ cs := AreaToCity2[k1]
|
|
|
+ AreaGet2.AddWord(k1) //市全称
|
|
|
+ if cs != nil {
|
|
|
+ cs = append(cs, c)
|
|
|
+ } else {
|
|
|
+ cs = []*City{c}
|
|
|
+ }
|
|
|
+ AreaToCity2[k1] = cs
|
|
|
+ //区县
|
|
|
+ districtmap := v1m["area"].(map[string]interface{}) //区或县
|
|
|
+ for district, streetarr := range districtmap {
|
|
|
+ d := &District{}
|
|
|
+ d.Name = district
|
|
|
+ d.C = c
|
|
|
+ AreaDistrict2.AddWord(district) //加入区或县敏感词
|
|
|
+ ctmp := DistrictCityMap2[district]
|
|
|
+ if ctmp == nil {
|
|
|
+ DistrictCityMap2[district] = c
|
|
|
+ }
|
|
|
+ //街道
|
|
|
+ for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
|
|
|
+ AreaStreet2.AddWord(s) //加入街道敏感词
|
|
|
+ dtmp := StreetDistrictMap2[s]
|
|
|
+ if dtmp == nil {
|
|
|
+ StreetDistrictMap2[s] = d
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //初始化城市简称
|
|
|
+ fn3 := InitCitySim("v3.0")
|
|
|
+ AreaSimGet2 = DFA{}
|
|
|
+ for k, v := range fn3 {
|
|
|
+ pb := v["brief"].(string)
|
|
|
+ p := ProvinceBrief2[pb]
|
|
|
+ //加载
|
|
|
+ for _, ss := range []string{k, pb} {
|
|
|
+ cs := AreaToCity2[ss]
|
|
|
+ if cs != nil {
|
|
|
+ cs = append(cs, p.Captial)
|
|
|
+ } else {
|
|
|
+ cs = []*City{p.Captial}
|
|
|
+ }
|
|
|
+ AreaToCity2[ss] = cs
|
|
|
+ AreaSimGet2.AddWord(ss) //省全称和省简称
|
|
|
+ }
|
|
|
+ city, _ := v["city"].(map[string]interface{})
|
|
|
+ for k1, v1 := range city {
|
|
|
+ v1m, _ := v1.(map[string]interface{})
|
|
|
+ if v1m["brief"] == nil {
|
|
|
+ }
|
|
|
+ cb := v1m["brief"].(string)
|
|
|
+ c := AreaToCity2[k1][0]
|
|
|
+ //加入到城市map中
|
|
|
+ for _, ss := range []string{cb, k + cb, pb + cb} { //杭州 浙江省杭州 浙江杭州
|
|
|
+ AreaSimGet2.AddWord(ss)
|
|
|
+ cs := AreaToCity2[ss]
|
|
|
+ if cs != nil {
|
|
|
+ cs = append(cs, c)
|
|
|
+ } else {
|
|
|
+ cs = []*City{c}
|
|
|
+ }
|
|
|
+ AreaToCity2[ss] = cs
|
|
|
+ }
|
|
|
+ arr := v1m["area"].([]interface{})
|
|
|
+ for _, k2 := range arr {
|
|
|
+ s := k2.(string)
|
|
|
+ for n, ss := range []string{s, cb + s, pb + s, k + s} { //淳安 杭州淳安 浙江淳安 浙江省淳安
|
|
|
+ cs := AreaToCity2[ss]
|
|
|
+ AreaSimGet2.AddWord(ss)
|
|
|
+ if cs != nil {
|
|
|
+ cs = append(cs, c)
|
|
|
+ } else {
|
|
|
+ cs = []*City{c}
|
|
|
+ }
|
|
|
+ AreaToCity2[ss] = cs
|
|
|
+
|
|
|
+ //只加入简称
|
|
|
+ if n == 0 {
|
|
|
+ d := &District{}
|
|
|
+ d.Name = ss
|
|
|
+ d.C = c
|
|
|
+ AreaDistrict2.AddWord(ss) //加入区或县简称敏感词
|
|
|
+ ctmp := DistrictCityMap2[ss]
|
|
|
+ if ctmp == nil {
|
|
|
+ DistrictCityMap2[ss] = c
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func FindBuyer() {
|
|
|
+ list, _ := db.Mgo.Find("buyer", nil, nil, `{"name":1}`, false, -1, -1)
|
|
|
+ for _, l := range *list {
|
|
|
+ val := qu.ObjToString(l["name"])
|
|
|
+ if val != "" {
|
|
|
+ //开始抽取城市省份
|
|
|
+ bres, c, p := ExtractProvinceCity2("", "", qu.BsonIdToSId(l["_id"]), []string{val})
|
|
|
+ bres, p, c, d := ExtractDistrict2([]string{val}, bres, c, p, qu.BsonIdToSId(l["_id"])) //抽取区或县
|
|
|
+ log.Println(bres, c, p, d)
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+//抽取城市、省份
|
|
|
+func ExtractProvinceCity2(province, city, id string, text []string) (bres bool, c, p string) {
|
|
|
+ defer qu.Catch()
|
|
|
+ bc := true //是否继续抽取
|
|
|
+ if city != "" {
|
|
|
+ lock.Lock()
|
|
|
+ citybrief := CityBrief2[city]
|
|
|
+ //log.Println("citybrief========", citybrief)
|
|
|
+ lock.Unlock()
|
|
|
+ if citybrief == nil { //简称不存在
|
|
|
+ log.Println("city err:", city, id)
|
|
|
+ } else { //简称存在
|
|
|
+ lock.Lock()
|
|
|
+ pbrief := CityBrief2[city].P.Brief
|
|
|
+ //log.Println("pbrief========", pbrief)
|
|
|
+ lock.Unlock()
|
|
|
+ if province != pbrief { //省份不配对
|
|
|
+ log.Println("province err:", city, province, id)
|
|
|
+ } else {
|
|
|
+ bc = false
|
|
|
+ //城市省份都正确
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //有省份
|
|
|
+ bp := false
|
|
|
+ lock.Lock()
|
|
|
+ provincebrief := ProvinceBrief2[province]
|
|
|
+ //log.Println("provincebrief========", provincebrief)
|
|
|
+ lock.Unlock()
|
|
|
+ if provincebrief != nil { //省份简称正确
|
|
|
+ bp = true
|
|
|
+ } else { //没有省份,先识别省份
|
|
|
+ for _, str := range text { //没有省的简称,从配置的字段信息中抽取省
|
|
|
+ word := AreaProvinceGet2.CheckSensitiveWord(str) //省全称DFA中匹配
|
|
|
+ if word != "" {
|
|
|
+ lock.Lock()
|
|
|
+ province = ProvinceMap2[word]
|
|
|
+ lock.Unlock()
|
|
|
+ bp = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //匹配城市
|
|
|
+ if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不配对,继续抽取
|
|
|
+ for pos, GET := range []DFA{AreaGet2, AreaSimGet2} { //AreaGet市全称,AreaSimGet省全称和简称
|
|
|
+ ws := make([]string, 5)
|
|
|
+ for n, str := range text {
|
|
|
+ if str != "" {
|
|
|
+ word := GET.CheckSensitiveWord(str)
|
|
|
+ if pos == 1 { //用简称 后辍为路、集团替换
|
|
|
+ str1 := strings.Replace(str, word+"路", "", 1)
|
|
|
+ if str1 != str {
|
|
|
+ word = GET.CheckSensitiveWord(str1)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ ws[n] = word
|
|
|
+ if word != "" {
|
|
|
+ lock.Lock()
|
|
|
+ res := AreaToCity2[word]
|
|
|
+ lock.Unlock()
|
|
|
+ if len(res) == 1 {
|
|
|
+ //判断省份
|
|
|
+ if !bp || province == res[0].P.Brief { //省份不存在或一致直接返回(!bp:省的简称)
|
|
|
+ bres = true
|
|
|
+ c = res[0].Brief
|
|
|
+ p = res[0].P.Brief
|
|
|
+ break
|
|
|
+ } else { //不一致时。。暂时不处理
|
|
|
+ }
|
|
|
+ } else { //多个时(出现这种情况是多个省中的市,市名相同。现在的配置文件中已经将市名,县名重复的全部去掉)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if !bres { //没有匹配到
|
|
|
+ mc := map[string]int{}
|
|
|
+ for _, w := range ws {
|
|
|
+ lock.Lock()
|
|
|
+ res := AreaToCity2[w]
|
|
|
+ lock.Unlock()
|
|
|
+ for _, ct := range res {
|
|
|
+ log.Println("ct===", ct)
|
|
|
+ if ct == nil {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ if bp { //有省份
|
|
|
+ if ct.P != nil && ct.P.Brief == province {
|
|
|
+ mc[ct.Brief]++
|
|
|
+ }
|
|
|
+ } else { //没有省份
|
|
|
+ mc[ct.Brief]++
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //计算mc中最大值且大于1
|
|
|
+ max := 1
|
|
|
+ v := ""
|
|
|
+ for mk, mv := range mc {
|
|
|
+ if mv > max {
|
|
|
+ v = mk
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if v != "" {
|
|
|
+ bres = true
|
|
|
+ lock.Lock()
|
|
|
+ ctb := CityBrief2[v]
|
|
|
+ lock.Unlock()
|
|
|
+ c = ctb.Brief
|
|
|
+ p = ctb.P.Brief
|
|
|
+ } else if len(mc) > 0 {
|
|
|
+ //取级别更大的
|
|
|
+ v := ""
|
|
|
+ for mk, _ := range mc {
|
|
|
+ lock.Lock()
|
|
|
+ cb := CityBrief2[mk]
|
|
|
+ lock.Unlock()
|
|
|
+ if cb.P.Cap == mk {
|
|
|
+ bres = true
|
|
|
+ c = cb.Brief
|
|
|
+ p = cb.P.Brief
|
|
|
+ break
|
|
|
+ } else {
|
|
|
+ v = mk
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if !bres {
|
|
|
+ bres = true
|
|
|
+ lock.Lock()
|
|
|
+ cbb := CityBrief2[v]
|
|
|
+ c = cbb.Brief
|
|
|
+ p = cbb.P.Brief
|
|
|
+ lock.Unlock()
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if bres {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ return
|
|
|
+ }
|
|
|
+ if !bres {
|
|
|
+ //取默认省会
|
|
|
+ lock.Lock()
|
|
|
+ pbp := ProvinceBrief2[province]
|
|
|
+ lock.Unlock()
|
|
|
+ if pbp != nil {
|
|
|
+ bres = true
|
|
|
+ c = pbp.Cap
|
|
|
+ p = province
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return
|
|
|
+}
|
|
|
+func ExtractDistrict2(field []string, bres bool, c, p, id string) (bool, string, string, string) {
|
|
|
+ d := ""
|
|
|
+ for _, str := range field {
|
|
|
+ //log.Println("field===========", str)
|
|
|
+ for pos, GET := range []DFA{AreaDistrict2, AreaStreet2} { //先匹配区或县再匹配街道
|
|
|
+ word := GET.CheckSensitiveWord(str)
|
|
|
+ //log.Println("word================", word)
|
|
|
+ if word != "" {
|
|
|
+ if pos == 0 { //区或县匹配
|
|
|
+ //log.Println("县直接匹配到====", word)
|
|
|
+ lock.Lock()
|
|
|
+ city := DistrictCityMap2[word]
|
|
|
+ lock.Unlock()
|
|
|
+ //log.Println("city================", city)
|
|
|
+ if city != nil {
|
|
|
+ d = word
|
|
|
+ ctmp := city.Brief
|
|
|
+ ptmp := city.P.Brief
|
|
|
+ //log.Println("ctmpptmp================", ptmp, ctmp)
|
|
|
+ if !bres { //城市省份没有抽到,通过区或县定位市和省
|
|
|
+ c = ctmp
|
|
|
+ p = ptmp
|
|
|
+ bres = true
|
|
|
+ } else { //对比抽到的城市省份是否一致
|
|
|
+ if c != ctmp || p != ptmp {
|
|
|
+ //log.Println("str---", str, "====", word)
|
|
|
+ //log.Println("district: City And Province, Inconsistent Before And After,Id:", id, c, p, ctmp, ptmp, d)
|
|
|
+ c = ctmp
|
|
|
+ p = ptmp
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else { //街道匹配
|
|
|
+ //log.Println("匹配到街道====", word)
|
|
|
+ lock.Lock()
|
|
|
+ district := StreetDistrictMap2[word]
|
|
|
+ lock.Unlock()
|
|
|
+ //log.Println("district================", district)
|
|
|
+ if district != nil {
|
|
|
+ d = district.Name
|
|
|
+ ctmp := district.C.Brief
|
|
|
+ ptmp := district.C.P.Brief
|
|
|
+ //log.Println("districtptmp================", ctmp, ptmp)
|
|
|
+ if !bres { //城市省份没有抽到,通过区或县定位市和省
|
|
|
+ c = ctmp
|
|
|
+ p = ptmp
|
|
|
+ bres = true
|
|
|
+ } else { //对比抽到的城市省份是否一致
|
|
|
+ if c != ctmp || p != ptmp {
|
|
|
+ //log.Println("street: City And Province, Inconsistent Before And After,Id:", id, c, p, ctmp, ptmp, d)
|
|
|
+ c = ctmp
|
|
|
+ p = ptmp
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return bres, p, c, d
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return bres, p, c, d
|
|
|
+}
|