|
@@ -1,784 +0,0 @@
|
|
|
-package ext
|
|
|
-
|
|
|
import (
	"regexp"
	"sort"
	"sync"

	log "github.com/donnie4w/go-logger/logger"
	qu "qfw/util"
)
|
|
|
// Job collects the region scores gathered while one document is analysed.
// Scores are kept per administrative level (province, city, district) and per
// name form: the Full* maps are fed by full-name matches, the Sim* maps by
// short-name matches. Each map is keyed by the region name.
type Job struct {
	FullAreaScore     map[string]float64 // province scores from full-name matches
	FullCityScore     map[string]float64 // city scores from full-name matches
	FullDistrictScore map[string]float64 // district scores from full-name matches
	SimAreaScore      map[string]float64 // province scores from short-name matches
	SimCityScore      map[string]float64 // city scores from short-name matches
	SimDistrictScore  map[string]float64 // district scores from short-name matches
}
|
|
|
// AgencyReg holds the patterns whose matches NewGetCityByDetail deletes from
// the detail text before the text is segmented.
var AgencyReg = []*regexp.Regexp{
	// An agency / winning-supplier label together with up to 30 following characters.
	regexp.MustCompile("(?s)(代理(机构|人|单位|公司)|中标供应商).{0,30}"),
	// 2 to 15 characters followed by an agency, consulting or government-procurement keyword.
	regexp.MustCompile(".{2,15}((招标)?代理|咨询|政府采购)"),
}
|
|
|
// SortMap is a string-keyed map that remembers the order in which its keys
// were first inserted.
//
// Keys lists the keys in insertion order, Index maps every key to its position
// in Keys, and Map holds the values. The methods take Lock before touching the
// three containers; code that reads the exported fields directly is not
// protected by it.
type SortMap struct {
	Index map[string]int
	Keys  []string
	Map   map[string]interface{}
	Lock  sync.Mutex
}

// NewSortMap returns an empty SortMap whose containers are ready for use.
func NewSortMap() *SortMap {
	return &SortMap{
		Index: map[string]int{},
		Keys:  []string{},
		Map:   map[string]interface{}{},
	}
}

// AddKey stores val under key. A key seen for the first time is appended to
// Keys; an existing key keeps its position and only has its value replaced.
func (s *SortMap) AddKey(key string, val interface{}) {
	s.Lock.Lock()
	defer s.Lock.Unlock()
	// Test for presence of the entry instead of comparing the stored value
	// with nil, so a key whose value is nil is not appended a second time.
	if _, known := s.Map[key]; !known {
		s.Index[key] = len(s.Keys)
		s.Keys = append(s.Keys, key)
	}
	s.Map[key] = val
}

// ReplaceKey puts key/val into the slot currently occupied by replacekey and
// forgets replacekey.
//
// Special cases:
//   - key == replacekey: only the value is updated.
//   - replacekey is not present: key is stored as AddKey would store it.
//   - key already has a slot of its own: that slot is kept and the slot of
//     replacekey is removed, so Keys never contains key twice.
func (s *SortMap) ReplaceKey(key string, val interface{}, replacekey string) {
	s.Lock.Lock()
	defer s.Lock.Unlock()

	pos, found := s.Index[replacekey]
	found = found && pos >= 0 && pos < len(s.Keys)
	_, present := s.Index[key]

	switch {
	case found && key == replacekey:
		// Same slot, same key: nothing to move.
	case !found:
		if !present {
			s.Index[key] = len(s.Keys)
			s.Keys = append(s.Keys, key)
		}
	case present:
		s.dropLocked(replacekey)
	default:
		s.Keys[pos] = key
		s.Index[key] = pos
		delete(s.Index, replacekey)
		delete(s.Map, replacekey)
	}
	s.Map[key] = val
}

// RemoveKey deletes key and its value. The positions recorded in Index for
// the remaining keys are renumbered so that they keep matching Keys.
func (s *SortMap) RemoveKey(key string) {
	s.Lock.Lock()
	defer s.Lock.Unlock()
	s.dropLocked(key)
}

// dropLocked removes key from Map, Index and Keys and renumbers Index.
// The caller must hold Lock.
func (s *SortMap) dropLocked(key string) {
	delete(s.Map, key)
	delete(s.Index, key)
	kept := s.Keys[:0]
	for _, k := range s.Keys {
		if k == key {
			continue
		}
		s.Index[k] = len(kept)
		kept = append(kept, k)
	}
	s.Keys = kept
}
|
|
|
// removeslice returns slice without any element equal to elem. The relative
// order of the remaining elements is preserved.
//
// The filtering is done in a single pass that reuses the backing array of
// slice, so the caller must use the returned slice and not the argument
// afterwards. An elem that is not a string never equals an element, which
// leaves the contents unchanged.
func removeslice(slice []string, elem interface{}) []string {
	kept := slice[:0]
	for _, v := range slice {
		if v != elem {
			kept = append(kept, v)
		}
	}
	return kept
}
|
|
|
-
|
|
|
-
|
|
|
-//抽取城市
|
|
|
-func (e *ExtractTask) NewExtractCityField(data map[string]interface{})( map[string]interface{}) {
|
|
|
- defer qu.Catch()
|
|
|
- //初始化
|
|
|
- j := &Job{
|
|
|
- map[string]float64{},
|
|
|
- map[string]float64{},
|
|
|
- map[string]float64{},
|
|
|
- map[string]float64{},
|
|
|
- map[string]float64{},
|
|
|
- map[string]float64{},
|
|
|
- }
|
|
|
- pscore := make(map[string]float64)
|
|
|
- cscore := make(map[string]float64)
|
|
|
- dscore := make(map[string]float64)
|
|
|
- sm := NewSortMap()
|
|
|
- for k,v := range data{
|
|
|
- sm.AddKey(k, qu.ObjToString(v))
|
|
|
- }
|
|
|
- e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)
|
|
|
- MergeFullSimScore(j)//全称简称得分合并
|
|
|
- MergeScores(j, &pscore, &cscore, &dscore)
|
|
|
-
|
|
|
- finishP := HighestScoreArr(j.FullAreaScore) //获取最高分的省
|
|
|
- e.RemoveCD(finishP, j)
|
|
|
- finishC := HighestScoreArr(j.FullCityScore)
|
|
|
- finishD := HighestScoreArr(j.FullDistrictScore)
|
|
|
- arearesult := ""
|
|
|
- cityresult := ""
|
|
|
- districtresult := ""
|
|
|
- tmpcity := []string{}
|
|
|
- if len(finishP) == 1 { //最高分一个
|
|
|
- arearesult = finishP[0] //抽取结果直接赋值
|
|
|
- cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
|
|
|
- cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
|
|
|
- } else if len(finishP) > 1 { //province最高分多个
|
|
|
- if len(finishC) == 1 {
|
|
|
- cityresult = finishC[0]
|
|
|
- if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
|
|
|
- arearesult = cfMap.P.Brief
|
|
|
- tmpcity = append(tmpcity, cityresult)
|
|
|
- cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
|
|
|
- }
|
|
|
- } else { //对应的city有多个(多个province和city)
|
|
|
- arearesult = "全国"
|
|
|
- }
|
|
|
- }
|
|
|
- if cityresult != "" && cityresult == districtresult {
|
|
|
- districtresult = ""
|
|
|
- }
|
|
|
- //直辖市
|
|
|
- if arearesult == "北京" {
|
|
|
- cityresult = "北京市"
|
|
|
- if districtresult == "北京朝阳" { //特殊情况(北京朝阳中西医结合急诊抢救中心:5a84079740d2d9bbe88bad90)
|
|
|
- districtresult = "朝阳区"
|
|
|
- }
|
|
|
- } else if arearesult == "天津" {
|
|
|
- cityresult = "天津市"
|
|
|
- } else if arearesult == "上海" {
|
|
|
- cityresult = "上海市"
|
|
|
- } else if arearesult == "重庆" {
|
|
|
- cityresult = "重庆市"
|
|
|
- }
|
|
|
- if arearesult == "" {
|
|
|
- arearesult = "全国"
|
|
|
- }
|
|
|
- resultTmp := map[string]interface{}{}
|
|
|
- if arearesult!="" {
|
|
|
- resultTmp["area"] = arearesult
|
|
|
- }
|
|
|
- if cityresult!="" {
|
|
|
- resultTmp["city"] = cityresult
|
|
|
- }
|
|
|
- if districtresult!="" {
|
|
|
- resultTmp["district"] = districtresult
|
|
|
- }
|
|
|
- return resultTmp
|
|
|
-}
|
|
|
-//单字段正文
|
|
|
-func (e *ExtractTask) NewExtractCityDetail(data map[string]interface{})( map[string]interface{}) {
|
|
|
- defer qu.Catch()
|
|
|
- //初始化
|
|
|
- j := &Job{
|
|
|
- map[string]float64{},
|
|
|
- map[string]float64{},
|
|
|
- map[string]float64{},
|
|
|
- map[string]float64{},
|
|
|
- map[string]float64{},
|
|
|
- map[string]float64{},
|
|
|
- }
|
|
|
- detail := qu.ObjToString(data["detail"])
|
|
|
- e.NewGetCityByDetail(j,detail)
|
|
|
- MergeFullSimScore(j) //合并detail的全简称
|
|
|
- finishP := HighestScoreArr(j.FullAreaScore) //获取最高分的省
|
|
|
- e.RemoveCD(finishP, j) //将city中所属干扰项省的city去除,同时去除district.5d2bd4aba5cb26b9b769d18e
|
|
|
- //获取结果
|
|
|
- finishC := HighestScoreArr(j.FullCityScore)
|
|
|
- finishD := HighestScoreArr(j.FullDistrictScore)
|
|
|
- arearesult := ""
|
|
|
- cityresult := ""
|
|
|
- districtresult := ""
|
|
|
- tmpcity := []string{}
|
|
|
- if len(finishP) == 1 { //最高分一个
|
|
|
- arearesult = finishP[0] //抽取结果直接赋值
|
|
|
- cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
|
|
|
- cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
|
|
|
- } else if len(finishP) > 1 { //province最高分多个
|
|
|
- if len(finishC) == 1 {
|
|
|
- cityresult = finishC[0]
|
|
|
- if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
|
|
|
- arearesult = cfMap.P.Brief
|
|
|
- tmpcity = append(tmpcity, cityresult)
|
|
|
- cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
|
|
|
- }
|
|
|
- } else { //对应的city有多个(多个province和city)
|
|
|
- arearesult = "全国"
|
|
|
- }
|
|
|
- }
|
|
|
- if cityresult != "" && cityresult == districtresult {
|
|
|
- districtresult = ""
|
|
|
- }
|
|
|
- //直辖市
|
|
|
- if arearesult == "北京" {
|
|
|
- cityresult = "北京市"
|
|
|
- if districtresult == "北京朝阳" { //特殊情况(北京朝阳中西医结合急诊抢救中心:5a84079740d2d9bbe88bad90)
|
|
|
- districtresult = "朝阳区"
|
|
|
- }
|
|
|
- } else if arearesult == "天津" {
|
|
|
- cityresult = "天津市"
|
|
|
- } else if arearesult == "上海" {
|
|
|
- cityresult = "上海市"
|
|
|
- } else if arearesult == "重庆" {
|
|
|
- cityresult = "重庆市"
|
|
|
- }
|
|
|
- if arearesult == "" {
|
|
|
- arearesult = "全国"
|
|
|
- }
|
|
|
- resultTmp := map[string]interface{}{}
|
|
|
- if arearesult!="" {
|
|
|
- resultTmp["area"] = arearesult
|
|
|
- }
|
|
|
- if cityresult!="" {
|
|
|
- resultTmp["city"] = cityresult
|
|
|
- }
|
|
|
- if districtresult!="" {
|
|
|
- resultTmp["district"] = districtresult
|
|
|
- }
|
|
|
- return resultTmp
|
|
|
-}
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-//不同情况的抽取方法
|
|
|
-func (e *ExtractTask) NewGetCityByDetail(j *Job,detail string) {
|
|
|
- repeatP_full := map[string]bool{}
|
|
|
- repeatC_full := map[string]bool{}
|
|
|
- repeatD_full := map[string]bool{}
|
|
|
- repeatP_sim := map[string]bool{}
|
|
|
- repeatC_sim := map[string]bool{}
|
|
|
- repeatD_sim := map[string]bool{}
|
|
|
- for _, reg := range AgencyReg {
|
|
|
- detail = reg.ReplaceAllString(detail, "")
|
|
|
- }
|
|
|
- for _, text := range e.Seg_SV.Cut(detail, true) {
|
|
|
- if len([]rune(text)) > 1 {
|
|
|
- //全称匹配
|
|
|
- for pos_full, trie_full := range e.Trie_Fulls {
|
|
|
- if trie_full.Get(text) {
|
|
|
- if pos_full == 0 { //省全称
|
|
|
- if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP_full[tmpPbrief] { //取简称
|
|
|
- PCDScore(j, "province", tmpPbrief, 1, true)
|
|
|
- repeatP_full[tmpPbrief] = true
|
|
|
- break
|
|
|
- }
|
|
|
- } else if pos_full == 1 { //市全称
|
|
|
- if cfMap := e.CityFullMap[text]; cfMap != nil {
|
|
|
- if !repeatP_full[cfMap.P.Brief] {
|
|
|
- PCDScore(j, "province", cfMap.P.Brief, 1, true)
|
|
|
- repeatP_full[cfMap.P.Brief] = true
|
|
|
- }
|
|
|
- if !repeatC_full[cfMap.Name] {
|
|
|
- PCDScore(j, "city", cfMap.Name, 1, true)
|
|
|
- repeatC_full[cfMap.Name] = true
|
|
|
- }
|
|
|
- break
|
|
|
- }
|
|
|
- } else if pos_full == 2 { //区全称
|
|
|
- citys := e.NewDistrictCityMap[text]
|
|
|
- if len(citys) > 0 {
|
|
|
- if !repeatD_full[text] {
|
|
|
- PCDScore(j, "district", text, 1, true)
|
|
|
- repeatD_full[text] = true
|
|
|
- }
|
|
|
- for _, c := range citys {
|
|
|
- if !repeatC_full[c.Name] {
|
|
|
- PCDScore(j, "city", c.Name, 1, true)
|
|
|
- repeatC_full[c.Name] = true
|
|
|
- }
|
|
|
- if !repeatP_full[c.P.Brief] {
|
|
|
- PCDScore(j, "province", c.P.Brief, 1, true)
|
|
|
- repeatP_full[c.P.Brief] = true
|
|
|
- }
|
|
|
- }
|
|
|
- break
|
|
|
- }
|
|
|
- } else if pos_full == 3 { //街道全称
|
|
|
- districts := e.NewStreetDistrictMap[text]
|
|
|
- if len(districts) == 1 {
|
|
|
- DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
|
|
|
- }
|
|
|
- } else if pos_full == 4 { //居委会全称
|
|
|
- districts := e.CommunityDistrictMap[text]
|
|
|
- if len(districts) == 1 {
|
|
|
- DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- //qu.Debug("detail 全称---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
- //简称匹配
|
|
|
- for pos_sim, trie_sim := range e.Trie_Sims {
|
|
|
- if trie_sim.Get(text) {
|
|
|
- if pos_sim == 0 && !repeatP_sim[text] { //省简称
|
|
|
- PCDScore(j, "province", text, 1, false)
|
|
|
- repeatP_sim[text] = true
|
|
|
- break
|
|
|
- } else if pos_sim == 1 { //市简称
|
|
|
- if cbMap := e.CityBriefMap[text]; cbMap != nil {
|
|
|
- if !repeatP_sim[cbMap.P.Brief] {
|
|
|
- PCDScore(j, "province", cbMap.P.Brief, 1, false)
|
|
|
- repeatP_sim[cbMap.P.Brief] = true
|
|
|
- }
|
|
|
- if !repeatC_sim[cbMap.Name] {
|
|
|
- PCDScore(j, "city", cbMap.Name, 1, false)
|
|
|
- repeatC_sim[cbMap.Name] = true
|
|
|
- }
|
|
|
- break
|
|
|
- }
|
|
|
- } else if pos_sim == 2 { //区简称
|
|
|
- dfull_citys := e.NewDistrictSimAndAll[text]
|
|
|
- if len(dfull_citys) == 1 {
|
|
|
- for _, dfull_city := range dfull_citys {
|
|
|
- for dfull, ctmp := range dfull_city { //dfull:简称对应的全称
|
|
|
- if !repeatD_sim[dfull] {
|
|
|
- PCDScore(j, "district", dfull, 1, false)
|
|
|
- repeatD_sim[dfull] = true
|
|
|
- }
|
|
|
- if ctmp == nil {
|
|
|
- continue
|
|
|
- }
|
|
|
- if !repeatC_sim[ctmp.Name] {
|
|
|
- PCDScore(j, "city", ctmp.Name, 1, false)
|
|
|
- repeatC_sim[ctmp.Name] = true
|
|
|
- }
|
|
|
- if !repeatP_sim[ctmp.P.Brief] {
|
|
|
- PCDScore(j, "province", ctmp.P.Brief, 1, false)
|
|
|
- repeatP_sim[ctmp.P.Brief] = true
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- //qu.Debug("detail 简称---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-func (e *ExtractTask) NewGetCityByOthers(j *Job, sm *SortMap, pscore, cscore, dscore *map[string]float64) {
|
|
|
- /*
|
|
|
- 1.对字段进行分词
|
|
|
- 2.省、市、区、街道、居委会全称进行匹配打分
|
|
|
- 3.省、市、区简称进行匹配打分
|
|
|
- */
|
|
|
- ts := 0.5
|
|
|
- for i, from := range sm.Keys { //buyer;buyeraddr;title;projectname
|
|
|
- if i > 1 {
|
|
|
- ts = 0.2
|
|
|
- }
|
|
|
- p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", "" //每个字段抽取的时候重新定义该字段抽取的province,city,district
|
|
|
- str, _ := sm.Map[from].(string)
|
|
|
- jbText := e.Seg_SV.Cut(str, true)
|
|
|
- for _, text := range jbText {
|
|
|
- if len([]rune(text)) == 1 {
|
|
|
- continue
|
|
|
- }
|
|
|
- //全称匹配
|
|
|
- //qu.Debug("text------", text)
|
|
|
- for pos_full, trie_full := range e.Trie_Fulls {
|
|
|
- if trie_full.Get(text) {
|
|
|
- if pos_full == 0 && p_full == "" { //省全称
|
|
|
- if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { //取简称
|
|
|
- p_full = tmpPbrief
|
|
|
- PCDScore(j, "province", p_full, 4+ts, true)
|
|
|
- break
|
|
|
- }
|
|
|
- } else if pos_full == 1 && c_full == "" { //市全称
|
|
|
- if cfMap := e.CityFullMap[text]; cfMap != nil {
|
|
|
- tmpPbrief := cfMap.P.Brief
|
|
|
- if p_full == "" {
|
|
|
- p_full = tmpPbrief
|
|
|
- c_full = cfMap.Name
|
|
|
- PCDScore(j, "province", p_full, 4+ts, true)
|
|
|
- PCDScore(j, "city", c_full, 4+ts, true)
|
|
|
- break
|
|
|
- } else if p_full == tmpPbrief {
|
|
|
- c_full = cfMap.Name
|
|
|
- PCDScore(j, "province", tmpPbrief, 4+ts, true) //
|
|
|
- PCDScore(j, "city", c_full, 4+ts, true)
|
|
|
- break
|
|
|
- } else if p_full != "" && p_full != tmpPbrief {
|
|
|
- //city不做处理
|
|
|
- }
|
|
|
- }
|
|
|
- } else if pos_full == 2 && d_full == "" { //区全称
|
|
|
- repeatPb := map[string]bool{}
|
|
|
- isOk := false
|
|
|
- districtOk := false
|
|
|
- citys := e.NewDistrictCityMap[text]
|
|
|
- for _, c := range citys {
|
|
|
- tmpPbrief := c.P.Brief
|
|
|
- if p_full == tmpPbrief { //省份一致
|
|
|
- d_full = text
|
|
|
- if c_full == "" {
|
|
|
- c_full = c.Name
|
|
|
- PCDScore(j, "city", c_full, 4+ts, true)
|
|
|
- PCDScore(j, "province", tmpPbrief, 4+ts, true) //
|
|
|
- }
|
|
|
- isOk = true
|
|
|
- districtOk = true
|
|
|
- } else if p_full == "" { //省份不存在
|
|
|
- districtOk = true
|
|
|
- if len(citys) == 1 { //对应一个city
|
|
|
- p_full = tmpPbrief
|
|
|
- c_full = c.Name
|
|
|
- d_full = text
|
|
|
- PCDScore(j, "province", p_full, 4+ts, true)
|
|
|
- PCDScore(j, "city", c_full, 4+ts, true)
|
|
|
- isOk = true
|
|
|
- } else { //多个city,只打分,不赋值
|
|
|
- if !repeatPb[tmpPbrief] {
|
|
|
- PCDScore(j, "province", tmpPbrief, 2+ts, true)
|
|
|
- repeatPb[tmpPbrief] = true
|
|
|
- }
|
|
|
- //PCDScore(j, "province", tmpPbrief, 2, true)
|
|
|
- PCDScore(j, "city", c.Name, 2+ts, true)
|
|
|
- }
|
|
|
- } else if p_full != "" && p_full != tmpPbrief { //干扰项减分
|
|
|
- if !repeatPb[tmpPbrief] {
|
|
|
- PCDScore(j, "province", tmpPbrief, -5, true)
|
|
|
- repeatPb[tmpPbrief] = true
|
|
|
- }
|
|
|
- //PCDScore(j, "province", tmpPbrief, -5, true)
|
|
|
- PCDScore(j, "city", c.Name, -5, true)
|
|
|
- }
|
|
|
- }
|
|
|
- if districtOk {
|
|
|
- PCDScore(j, "district", text, 4+ts, true)
|
|
|
- } else {
|
|
|
- PCDScore(j, "district", text, -5, true)
|
|
|
- }
|
|
|
- if isOk {
|
|
|
- break
|
|
|
- }
|
|
|
- } else if pos_full == 3 { //街道全称
|
|
|
- districts := e.NewStreetDistrictMap[text]
|
|
|
- if len(districts) == 1 { //街道唯一
|
|
|
- DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
|
|
|
- }
|
|
|
- } else if pos_full == 4 { //居委会全称
|
|
|
- districts := e.CommunityDistrictMap[text]
|
|
|
- if len(districts) == 1 { //居委会唯一
|
|
|
- DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- //qu.Debug("全称后--", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
|
|
|
- //简称匹配
|
|
|
- for pos_sim, trie_sim := range e.Trie_Sims {
|
|
|
- if trie_sim.Get(text) {
|
|
|
- if pos_sim == 0 && p_sim == "" { //省简称
|
|
|
- p_sim = text
|
|
|
- PCDScore(j, "province", p_sim, 3+ts, false)
|
|
|
- break
|
|
|
- } else if pos_sim == 1 { //市简称
|
|
|
- if cbMap := e.CityBriefMap[text]; cbMap != nil {
|
|
|
- tmpPbrief := cbMap.P.Brief
|
|
|
- if p_sim == "" {
|
|
|
- score := 2.0 + ts
|
|
|
- if tmpPbrief == p_full {
|
|
|
- score += 1.0
|
|
|
- }
|
|
|
- p_sim = tmpPbrief
|
|
|
- c_sim = cbMap.Brief
|
|
|
- PCDScore(j, "province", p_sim, score, false)
|
|
|
- PCDScore(j, "city", cbMap.Name, score, false)
|
|
|
- break
|
|
|
- } else if p_sim == tmpPbrief {
|
|
|
- c_sim = cbMap.Brief
|
|
|
- PCDScore(j, "city", cbMap.Name, 3+ts, false)
|
|
|
- PCDScore(j, "province", tmpPbrief, 3+ts, false)
|
|
|
- break
|
|
|
- } else if p_sim != "" && p_sim != tmpPbrief { //上海宝冶集团有限公司南京分公司 北京朝阳中西医结合急诊抢救中心
|
|
|
- delete(j.SimAreaScore, p_sim)
|
|
|
- c_sim = text //
|
|
|
- p_sim = tmpPbrief //
|
|
|
- PCDScore(j, "province", tmpPbrief, 3+ts, false)
|
|
|
- PCDScore(j, "city", cbMap.Name, 3+ts, false)
|
|
|
- }
|
|
|
- }
|
|
|
- } else if pos_sim == 2 && d_sim == "" { //区简称
|
|
|
- repeatPb := map[string]bool{}
|
|
|
- repeatDb := map[string]bool{}
|
|
|
- dfull_citys := e.NewDistrictSimAndAll[text]
|
|
|
- for _, dfull_city := range dfull_citys {
|
|
|
- for dfull, c := range dfull_city { //dfull:简称对应的全称
|
|
|
- if c == nil || c.P == nil {
|
|
|
- continue
|
|
|
- }
|
|
|
- tmpPbrief := c.P.Brief
|
|
|
- if p_sim == tmpPbrief { //省份一致
|
|
|
- d_sim = text
|
|
|
- PCDScore(j, "district", dfull, 2+ts, false)
|
|
|
- if c_sim == "" {
|
|
|
- c_sim = c.Brief
|
|
|
- PCDScore(j, "city", c.Name, 2+ts, false)
|
|
|
- }
|
|
|
- PCDScore(j, "province", tmpPbrief, 2+ts, false) //
|
|
|
- } else if p_sim == "" {
|
|
|
- if !repeatDb[dfull] {
|
|
|
- PCDScoreByDistrictSim("d", dfull, 1+ts, pscore, cscore, dscore)
|
|
|
- repeatDb[dfull] = true
|
|
|
- }
|
|
|
- if len(dfull_citys) == 1 {
|
|
|
- PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
|
|
|
- PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
|
|
|
- } else {
|
|
|
- if !repeatPb[tmpPbrief] {
|
|
|
- PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
|
|
|
- repeatPb[tmpPbrief] = true
|
|
|
- }
|
|
|
- PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
|
|
|
- }
|
|
|
- } else if p_sim != "" && p_sim != tmpPbrief {
|
|
|
- if !repeatPb[tmpPbrief] {
|
|
|
- PCDScoreByDistrictSim("p", tmpPbrief, ts, pscore, cscore, dscore)
|
|
|
- repeatPb[tmpPbrief] = true
|
|
|
- }
|
|
|
- PCDScoreByDistrictSim("c", c.Name, ts, pscore, cscore, dscore)
|
|
|
- PCDScoreByDistrictSim("d", dfull, ts, pscore, cscore, dscore)
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- //qu.Debug("简称后--", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-func MergeScores(j *Job, pscore, cscore, dscore *map[string]float64) {
|
|
|
- if len(j.FullAreaScore) > 0 {
|
|
|
- for pt, ps := range *pscore {
|
|
|
- j.FullAreaScore[pt] = j.FullAreaScore[pt] + ps
|
|
|
- }
|
|
|
- for ct, cs := range *cscore {
|
|
|
- j.FullCityScore[ct] = j.FullCityScore[ct] + cs
|
|
|
- }
|
|
|
- for dt, ds := range *dscore {
|
|
|
- j.FullDistrictScore[dt] = j.FullDistrictScore[dt] + ds
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-func MergeFullSimScore(j *Job) {
|
|
|
- if len(j.FullAreaScore) == 0 {
|
|
|
- j.FullAreaScore = j.SimAreaScore
|
|
|
- } else {
|
|
|
- for p_text, p_score := range j.FullAreaScore {
|
|
|
- j.FullAreaScore[p_text] = j.SimAreaScore[p_text] + p_score
|
|
|
- }
|
|
|
- }
|
|
|
- for c_text, c_score := range j.SimCityScore {
|
|
|
- j.FullCityScore[c_text] = j.FullCityScore[c_text] + c_score
|
|
|
- }
|
|
|
- for d_text, d_score := range j.SimDistrictScore {
|
|
|
- j.FullDistrictScore[d_text] = j.FullDistrictScore[d_text] + d_score
|
|
|
- }
|
|
|
- // if len(j.FullCityScore) == 0 {
|
|
|
- // j.FullCityScore = j.SimCityScore
|
|
|
- // } else {
|
|
|
- // for c_text, c_score := range j.FullCityScore {
|
|
|
- // j.FullCityScore[c_text] = j.SimCityScore[c_text] + c_score
|
|
|
- // }
|
|
|
- // }
|
|
|
- // if len(j.FullDistrictScore) == 0 {
|
|
|
- // j.FullDistrictScore = j.SimDistrictScore
|
|
|
- // } else {
|
|
|
- // for d_text, d_score := range j.FullDistrictScore {
|
|
|
- // j.FullDistrictScore[d_text] = j.SimDistrictScore[d_text] + d_score
|
|
|
- // }
|
|
|
- // }
|
|
|
-}
|
|
|
-func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
|
|
|
- for _, c := range finishC { //取最高分与province匹配的city
|
|
|
- if cfMap := e.CityFullMap[c]; cfMap != nil {
|
|
|
- if cfMap.P.Brief == area {
|
|
|
- // city = c
|
|
|
- // break
|
|
|
- tmpcity = append(tmpcity, c)
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- if len(tmpcity) == 1 {
|
|
|
- city = tmpcity[0]
|
|
|
- }
|
|
|
- return city, tmpcity
|
|
|
-}
|
|
|
-func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
|
|
|
- for _, d := range finishD { //取最高分与province匹配的district
|
|
|
- citys := e.NewDistrictCityMap[d]
|
|
|
- for _, c := range citys {
|
|
|
- if len(tmpcity) == 0 { //没有city
|
|
|
- if c.P.Brief == area {
|
|
|
- city = c.Name
|
|
|
- district = d
|
|
|
- return city, district
|
|
|
- }
|
|
|
- } else if len(tmpcity) == 1 { //一个city
|
|
|
- if c.Name == city && c.P.Brief == area {
|
|
|
- district = d
|
|
|
- return city, district
|
|
|
- }
|
|
|
- } else { //多个city
|
|
|
- for _, tc := range tmpcity { //多个city根据district最高分取
|
|
|
- if tc == c.Name && len(finishD) == 1 {
|
|
|
- city = c.Name
|
|
|
- district = d
|
|
|
- return city, district
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- return city, district
|
|
|
-}
|
|
|
// HighestScoreArr returns the keys of m that share the highest score.
//
// Entries with an empty key or a negative score are ignored, so the result is
// nil when m has no entry with a score of at least zero. Ties are returned in
// ascending key order: callers such as NewGetDistrict take the first matching
// candidate, and the iteration order of a Go map is not stable between runs.
func HighestScoreArr(m map[string]float64) []string {
	best := 0.0
	var winners []string
	for name, score := range m {
		if name == "" || !(best <= score) {
			continue
		}
		if score != best {
			// Strictly better score: the previous winners are beaten.
			best = score
			winners = winners[:0]
		}
		winners = append(winners, name)
	}
	sort.Strings(winners)
	return winners
}
|
|
|
-func (e *ExtractTask) RemoveCD(finishP []string, j *Job) {
|
|
|
- if len(j.FullDistrictScore) > 0 {
|
|
|
- for d, _ := range j.FullDistrictScore {
|
|
|
- tmpCitys := e.NewDistrictCityMap[d]
|
|
|
- for _, c := range tmpCitys {
|
|
|
- if j.FullCityScore[c.Name] != 0 {
|
|
|
- tmpPb := c.P.Brief
|
|
|
- //if j.FullAreaScore[tmpPb] != 0 {
|
|
|
- flag := false
|
|
|
- for _, p := range finishP {
|
|
|
- if tmpPb == p {
|
|
|
- flag = true
|
|
|
- break
|
|
|
- }
|
|
|
- }
|
|
|
- if !flag {
|
|
|
- delete(j.FullCityScore, c.Name)
|
|
|
- delete(j.FullDistrictScore, d)
|
|
|
- }
|
|
|
- //}
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- if len(j.FullCityScore) > 0 {
|
|
|
- for tmpcity, _ := range j.FullCityScore {
|
|
|
- c := e.CityFullMap[tmpcity]
|
|
|
- if c == nil {
|
|
|
- log.Debug("行政区划错误数据:", tmpcity)
|
|
|
- continue
|
|
|
- }
|
|
|
- tmpPb := c.P.Brief
|
|
|
- //if j.FullAreaScore[tmpPb] != 0 {
|
|
|
- flag := false
|
|
|
- for _, p := range finishP {
|
|
|
- if tmpPb == p {
|
|
|
- flag = true
|
|
|
- break
|
|
|
- }
|
|
|
- }
|
|
|
- if !flag {
|
|
|
- delete(j.FullCityScore, tmpcity)
|
|
|
- }
|
|
|
- //}
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
-}
|
|
|
-//计算province,city,district得分
|
|
|
-func PCDScore(j *Job, stype, text string, score float64, isfull bool) {
|
|
|
- defer qu.Catch()
|
|
|
- if text != "" {
|
|
|
- if stype == "district" {
|
|
|
- tmpdistrict := make(map[string]float64)
|
|
|
- if isfull {
|
|
|
- tmpdistrict = j.FullDistrictScore
|
|
|
- } else {
|
|
|
- tmpdistrict = j.SimDistrictScore
|
|
|
- }
|
|
|
- scoretmp := tmpdistrict[text]
|
|
|
- tmpdistrict[text] = scoretmp + score
|
|
|
- } else if stype == "city" {
|
|
|
- tmpcity := make(map[string]float64)
|
|
|
- if isfull {
|
|
|
- tmpcity = j.FullCityScore
|
|
|
- } else {
|
|
|
- tmpcity = j.SimCityScore
|
|
|
- }
|
|
|
- scoretmp := tmpcity[text]
|
|
|
- tmpcity[text] = scoretmp + score
|
|
|
- } else if stype == "province" {
|
|
|
- tmpprovince := make(map[string]float64)
|
|
|
- if isfull {
|
|
|
- tmpprovince = j.FullAreaScore
|
|
|
- } else {
|
|
|
- tmpprovince = j.SimAreaScore
|
|
|
- }
|
|
|
- scoretmp := tmpprovince[text]
|
|
|
- tmpprovince[text] = scoretmp + score
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-//街道、居委会对应多地市处理
|
|
|
-func DealMultipleDistrict(e *ExtractTask, j *Job, districts []*District, score float64, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
|
|
|
- if len(districts) == 1 {
|
|
|
- district := districts[0]
|
|
|
- city := district.C.Name
|
|
|
- tmpPbrief := district.C.P.Brief
|
|
|
- if pbrief != "" && tmpPbrief == pbrief {
|
|
|
- PCDScore(j, "province", tmpPbrief, score, true)
|
|
|
- PCDScore(j, "city", city, score, true)
|
|
|
- PCDScore(j, "district", district.Name, score, true)
|
|
|
- } else if pbrief == "" {
|
|
|
- if repeatP != nil && !(*repeatP)[tmpPbrief] {
|
|
|
- PCDScore(j, "province", tmpPbrief, score, true)
|
|
|
- (*repeatP)[tmpPbrief] = true
|
|
|
- } else if repeatP == nil {
|
|
|
- PCDScore(j, "province", tmpPbrief, score, true)
|
|
|
- }
|
|
|
- if repeatC != nil && !(*repeatC)[city] {
|
|
|
- PCDScore(j, "city", city, score, true)
|
|
|
- (*repeatC)[city] = true
|
|
|
- } else if repeatC == nil {
|
|
|
- PCDScore(j, "city", city, score, true)
|
|
|
- }
|
|
|
- if repeatD != nil && !(*repeatD)[tmpPbrief] {
|
|
|
- PCDScore(j, "district", district.Name, score, true)
|
|
|
- (*repeatD)[district.Name] = true
|
|
|
- } else if repeatD == nil {
|
|
|
- PCDScore(j, "district", district.Name, score, true)
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-//计算province,city,district区或县匹配的得分
|
|
|
-func PCDScoreByDistrictSim(stype, t string, score float64, ps, cs, ds *map[string]float64) {
|
|
|
- defer qu.Catch()
|
|
|
- if t != "" {
|
|
|
- if stype == "d" {
|
|
|
- tmpscore := (*ds)[t]
|
|
|
- (*ds)[t] = tmpscore + score
|
|
|
- } else if stype == "c" {
|
|
|
- tmpscore := (*cs)[t]
|
|
|
- (*cs)[t] = tmpscore + score
|
|
|
- } else if stype == "p" {
|
|
|
- tmpscore := (*ps)[t]
|
|
|
- (*ps)[t] = tmpscore + score
|
|
|
- }
|
|
|
- }
|
|
|
-}
|