12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994 |
- package extract
- import (
- . "jy/pretreated"
- ju "jy/util"
- qu "qfw/util"
- "strings"
- log "github.com/donnie4w/go-logger/logger"
- )
- //抽取city
- func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp *map[string]interface{}) {
- /*
- 高准确率:
- 1.爬虫数据jsondata
- 2.采购单位库
- 3.邮编
- 4.固话
- 5.site(todo)
- 低准确率:(全称库匹配到不走简称库)
- 1.city全称库(buyeraddr;title,projectname)
- 2.city简称库(buyeraddr;title,projectname)
- */
- defer qu.Catch()
- //初始化
- if j.FullAreaScore == nil {
- j.FullAreaScore = make(map[string]float64)
- }
- if j.FullCityScore == nil {
- j.FullCityScore = make(map[string]float64)
- }
- if j.FullDistrictScore == nil {
- j.FullDistrictScore = make(map[string]float64)
- }
- if j.SimAreaScore == nil {
- j.SimAreaScore = make(map[string]float64)
- }
- if j.SimCityScore == nil {
- j.SimCityScore = make(map[string]float64)
- }
- if j.SimDistrictScore == nil {
- j.SimDistrictScore = make(map[string]float64)
- }
- //记录区或县简称匹配的p、c、d的得分;如果全称匹配和p、c简称匹配的有结果,再将得分合并,否则舍弃
- pscore := make(map[string]float64)
- cscore := make(map[string]float64)
- dscore := make(map[string]float64)
- sm := NewSortMap()
- //1.jsondata抽取
- e.NewGetCityByJsonData(j)
- //2.site库抽取
- e.NewGetCityBySite(j)
- //3.采购单位库抽取(暂时没有采购单位库)
- //4.postcode邮编抽取
- buyerzipcode := qu.ObjToString((*resulttmp)["buyerzipcode"])
- e.NewGetCityByPostCode(j, buyerzipcode)
- //5.areacode固话区号抽取
- buyertel := qu.ObjToString((*resulttmp)["buyertel"])
- e.NewGetCityByAreaCode(j, buyertel)
- //6.buyeraddr,title,projectname抽取
- buyeraddr := qu.ObjToString((*resulttmp)["buyeraddr"])
- title := qu.ObjToString((*resulttmp)["title"])
- projectname := qu.ObjToString((*resulttmp)["projectname"])
- buyer := qu.ObjToString((*resulttmp)["buyer"])
- addressing := qu.ObjToString((*resulttmp)["addressing"])
- sm.AddKey("buyeraddr", buyeraddr)
- sm.AddKey("buyer", buyer)
- sm.AddKey("title", title)
- sm.AddKey("projectname", projectname)
- sm.AddKey("addressing", addressing) //新增地址辅助字段
- if projectaddr, isok := (*resulttmp)["projectaddr"].(string); isok {
- sm.AddKey("projectaddr", projectaddr)
- }
- if bidopenaddress, isok := (*resulttmp)["bidopenaddress"].(string); isok {
- sm.AddKey("bidopenaddress", bidopenaddress)
- }
- //7.buyeraddr buyer title projectname抽取
- e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)
- //qu.Debug("全称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
- //qu.Debug("简称打分后结果---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
- //全称简称得分合并
- MergeFullSimScore(j) //合并buyer buyeraddr title projectname全称简称
- //qu.Debug("全称简称合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
- //合并区简称得分
- //qu.Debug("pcd=====", pscore, cscore, dscore)
- MergeScores(j, &pscore, &cscore, &dscore) //合并区简称匹配的pcd
- //qu.Debug("合并区简称打分后结果---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
- j.SimAreaScore = map[string]float64{}
- j.SimCityScore = map[string]float64{}
- j.SimDistrictScore = map[string]float64{}
- //8.detail抽取
- if len(j.FullAreaScore) > 0 && len(j.FullCityScore) > 0 { //以上抽取有省有市再从detail中抽取进行判断
- e.NewGetCityByDetail(j)
- }
- //qu.Debug("detail打分后全称---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
- //qu.Debug("detail打分后简称---", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
- MergeFullSimScore(j) //合并detail的全简称
- //qu.Debug("detail合并后---", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
- finishP := HighestScoreArr(j.FullAreaScore) //获取最高分的省
- e.RemoveCD(finishP, j) //将city中所属干扰项省的city去除,同时去除district.5d2bd4aba5cb26b9b769d18e
- //qu.Debug("去除干扰项后的city和district得分---", finishP, j.FullCityScore, j.FullDistrictScore)
- //获取结果
- finishC := HighestScoreArr(j.FullCityScore)
- finishD := HighestScoreArr(j.FullDistrictScore)
- arearesult := ""
- cityresult := ""
- districtresult := ""
- tmpcity := []string{}
- if len(finishP) == 1 { //最高分一个
- arearesult = finishP[0] //抽取结果直接赋值
- cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
- cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
- } else if len(finishP) > 1 { //province最高分多个
- if len(finishC) == 1 {
- cityresult = finishC[0]
- if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
- arearesult = cfMap.P.Brief
- tmpcity = append(tmpcity, cityresult)
- cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
- }
- } else { //对应的city有多个(多个province和city)
- //arearesult = finishP[0] //抽取结果直接赋值
- //cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
- //cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
- arearesult = "全国"
- }
- }
- if cityresult != "" && cityresult == districtresult {
- districtresult = ""
- }
- //qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
- //直辖市
- if arearesult == "北京" {
- cityresult = "北京市"
- if districtresult == "北京朝阳" { //特殊情况(北京朝阳中西医结合急诊抢救中心:5a84079740d2d9bbe88bad90)
- districtresult = "朝阳区"
- }
- } else if arearesult == "天津" {
- cityresult = "天津市"
- } else if arearesult == "上海" {
- cityresult = "上海市"
- } else if arearesult == "重庆" {
- cityresult = "重庆市"
- }
- if arearesult == "" {
- arearesult = "全国"
- } /* else if cityresult == "" {
- if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
- cityresult = pbMap.Cap
- resulttmp["defaultpcap"] = true
- }
- }*/
- //qu.Debug("结果2===", arearesult, "--", cityresult, "--", districtresult)
- (*resulttmp)["area"] = arearesult
- (*resulttmp)["city"] = cityresult
- (*resulttmp)["district"] = districtresult
- //校验-映射新疆兵团
- if xjbtReg.MatchString(buyer) && cityresult == "" {
- a, c, d, ok := e.CheckingXjbtCity(buyer)
- if ok {
- (*resulttmp)["area"] = a
- (*resulttmp)["city"] = c
- (*resulttmp)["district"] = d
- }
- }
- //如果-仅有省份-敏感词-校验核对方法
- if arearesult != "全国" && cityresult == "" {
- sensitive_city := e.SensitiveCityData(qu.ObjToString((*j.Data)["detail"]), arearesult)
- if sensitive_city != "" {
- (*resulttmp)["city"] = sensitive_city
- (*resulttmp)["is_sensitive"] = 1
- }
- }
- }
- //jsondata中抽取城市
- func (e *ExtractTask) NewGetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
- defer qu.Catch()
- if j.Jsondata != nil {
- jsondata := *j.Jsondata
- //jsondata中获取province和city
- if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
- p, c, d = GetByACDFullJb(p, c, d, a_c_d, e, j) //全称匹配
- GetByACDSimJb(p, c, d, a_c_d, e, j) //简称匹配
- }
- city, _ = jsondata["city"].(string) //city全称或者简称
- province, _ = jsondata["area"].(string) //province简称
- district, _ = jsondata["district"].(string) //district全称
- }
- PCDScore(j, "district", district, 5, true) //district打分
- bp := false
- if province != "" {
- if e.ProvinceBriefMap[province] != nil { //判断爬虫的省份是否正确 (全国)
- bp = true //省份正确
- }
- }
- pbrief := ""
- if city != "" {
- cityfullmap := e.CityFullMap[city] //判断city全称是否正确
- if cityfullmap != nil {
- pbrief = cityfullmap.P.Brief //province简称
- } else {
- citybriefmap := e.CityBriefMap[city] //判断city简称是否正确
- if citybriefmap != nil {
- city = citybriefmap.Name //city简称替换为全称
- pbrief = citybriefmap.P.Brief
- }
- }
- }
- if bp {
- if pbrief == province { //爬虫的province和city匹配
- PCDScore(j, "city", city, 5, true)
- } else { //pbrief不匹配province(此时city为空或者错误)
- city = ""
- }
- PCDScore(j, "province", province, 5, true)
- } else { //省份错误或为空,取city的对应的pbrief为province
- if pbrief != "" {
- province = pbrief
- PCDScore(j, "province", province, 5, true)
- PCDScore(j, "city", city, 5, true)
- } else {
- province = ""
- city = ""
- }
- }
- return
- }
- //全称从area_city_district中抽城市
- func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) (string, string, string) {
- text := e.Seg_PCD.Cut(a_c_d, true)
- repeatPb := map[string]bool{}
- for _, full := range text {
- if e.Trie_Full_Province.Get(full) { //a_c_d有province全称
- if tmpPbrief := e.ProvinceMap[full]; tmpPbrief != "" {
- pbrief = tmpPbrief //省简称
- PCDScore(j, "province", pbrief, 5, true)
- }
- } else if e.Trie_Full_City.Get(full) { //a_c_d有city全称
- if cfMap := e.CityFullMap[full]; cfMap != nil {
- tmpcity := cfMap.Name //城市全称
- tmpPbrief := cfMap.P.Brief //省简称
- if pbrief != "" && pbrief == tmpPbrief { //已获取省简称
- city = tmpcity
- PCDScore(j, "city", city, 5, true)
- } else if pbrief == "" {
- city = tmpcity
- pbrief = tmpPbrief
- PCDScore(j, "city", city, 5, true)
- PCDScore(j, "province", pbrief, 5, true)
- }
- }
- } else if e.Trie_Full_District.Get(full) { //a_c_d有district全称(district可能对应多个城市)
- carr := e.DistrictCityMap[full]
- if len(carr) > 0 {
- district = full
- PCDScore(j, "district", district, 5, true)
- for _, c := range carr {
- tmpcity := c.Name //城市全称
- tmpPbrief := c.P.Brief //省简称
- if pbrief == "" { //之前没有匹配到省份
- PCDScore(j, "city", tmpcity, 5, true)
- if !repeatPb[tmpPbrief] {
- PCDScore(j, "province", tmpPbrief, 5, true)
- repeatPb[tmpPbrief] = true
- }
- } else { //已有省份
- if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
- PCDScore(j, "city", tmpcity, -5, true)
- PCDScore(j, "province", tmpPbrief, -5, true)
- } else { //与之前匹配结果一致
- if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
- PCDScore(j, "city", tmpcity, 5, true)
- }
- }
- }
- }
- }
- }
- }
- return pbrief, city, district
- }
- //简称从area_city_district中抽城市
- func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) {
- text := e.Seg_PCD.Cut(a_c_d, true)
- repeatPb := map[string]bool{}
- for _, sim := range text {
- if pbrief == "" && e.Trie_Sim_Province.Get(sim) { //全称未匹配到确定的province
- if pbMap := e.ProvinceBriefMap[sim]; pbMap != nil {
- pbrief = pbMap.Brief
- PCDScore(j, "province", pbrief, 5, true) //打分
- //PCDSubtractScore(e, j, "province", pbrief, -5) //减分(area_city_district:河南鼓楼区)
- }
- } else if city == "" && e.Trie_Sim_City.Get(sim) { //全称未匹配到确定的city
- if cbMap := e.CityBriefMap[sim]; cbMap != nil {
- tmpcity := cbMap.Name
- tmpPbrief := cbMap.P.Brief
- if pbrief != "" && pbrief == tmpPbrief {
- city = tmpcity
- PCDScore(j, "city", city, 5, true)
- } else if pbrief == "" {
- city = tmpcity
- pbrief = tmpPbrief
- PCDScore(j, "city", city, 5, true)
- PCDScore(j, "province", pbrief, 5, true)
- //PCDSubtractScore(e, j, "city", tmpcity, -5) //减分(area_city_district:开封鼓楼区)
- }
- }
- } else if district == "" && e.Trie_Sim_District.Get(sim) { //全称未匹配到确定的district
- dfullarr := e.DistrictSimAndAll[sim]
- if len(dfullarr) > 0 {
- PCDScore(j, "district", sim, 5, true)
- for _, dfullAndCity := range dfullarr { //district简称对应的所有全称
- for _, c := range dfullAndCity {
- if c == nil {
- continue
- }
- tmpcity := c.Name //城市全称
- tmpPbrief := c.P.Brief //省简称
- if pbrief == "" { //之前没有匹配到省份
- PCDScore(j, "city", tmpcity, 5, true)
- if !repeatPb[tmpPbrief] {
- PCDScore(j, "province", tmpPbrief, 5, true)
- repeatPb[tmpPbrief] = true
- }
- } else { //已有省份
- if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
- PCDScore(j, "city", tmpcity, -5, true)
- PCDScore(j, "province", tmpPbrief, -5, true)
- } else { //与之前匹配结果一致
- if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
- PCDScore(j, "city", tmpcity, 5, true)
- }
- }
- }
- }
- }
- }
- }
- }
- }
- //通过site提取城市
- func (e *ExtractTask) NewGetCityBySite(j *ju.Job) {
- site, _ := (*j.Data)["site"].(string)
- //qu.Debug("site--------", site)
- if scMap := e.SiteCityMap[site]; scMap != nil {
- if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
- PCDScore(j, "province", scMap.P, 5, true)
- }
- if scMap.C != "" && scMap.C != "null" {
- PCDScore(j, "city", scMap.C, 5, true)
- }
- if scMap.D != "" && scMap.D != "null" {
- PCDScore(j, "district", scMap.D, 5, true)
- }
- }
- }
- //通过邮编提取城市
- func (e *ExtractTask) NewGetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
- defer qu.Catch()
- pc := e.PostCodeMap[postcode]
- if pc != nil {
- province = pc.P
- city = pc.C
- districtTmp := pc.D //邮编可能对应多个区
- score := 3.0
- if len(districtTmp) == 1 && districtTmp[0] != "" {
- score = 5.0
- }
- for _, district := range districtTmp {
- PCDScore(j, "district", district, score, true)
- }
- PCDScore(j, "province", province, 5, true)
- PCDScore(j, "city", city, 5, true)
- }
- return
- }
- //固话区号提取城市
- func (e *ExtractTask) NewGetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
- defer qu.Catch()
- if len(buyertel) >= 11 {
- if strings.HasPrefix(buyertel, "0") { //区号除了澳门853其他都是以0开头
- n := 4
- L:
- areacode := buyertel[:n]
- ac := e.AreaCodeMap[areacode]
- if ac != nil {
- province = ac.P
- citytmp := ac.C
- if len(citytmp) == 1 { //对应多个city舍去
- city = citytmp[0]
- score := float64(5)
- if areacode == "0371" {
- score = float64(4)
- }
- PCDScore(j, "city", city, score, true)
- }
- PCDScore(j, "province", province, 5, true)
- } else {
- n = n - 1
- if n >= 3 {
- goto L
- }
- }
- } /* else if buyertel[:3] == "853" { //澳门
- province = "澳门"
- city = "澳门"
- PCDScore(j, "province", province, 5, true)
- PCDScore(j, "city", city, 5, true)
- }*/
- }
- return
- }
- func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore, dscore *map[string]float64) {
- /*
- 1.对字段进行分词
- 2.省、市、区、街道、居委会全称进行匹配打分
- 3.省、市、区简称进行匹配打分
- */
- ts := 0.5
- for i, from := range sm.Keys { //buyer;buyeraddr;title;projectname
- if i > 1 {
- ts = 0.2
- }
- p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", "" //每个字段抽取的时候重新定义该字段抽取的province,city,district
- str, _ := sm.Map[from].(string)
- jbText := e.Seg_SV.Cut(str, true)
- for jb_index, text := range jbText {
- if len([]rune(text)) == 1 {
- continue
- }
- //全称匹配
- //qu.Debug("text------", text)
- for pos_full, trie_full := range e.Trie_Fulls {
- if trie_full.Get(text) {
- if pos_full == 0 && p_full == "" { //省全称
- if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { //取简称
- p_full = tmpPbrief
- PCDScore(j, "province", p_full, 4+ts, true)
- break
- }
- } else if pos_full == 1 && c_full == "" { //市全称
- if cfMap := e.CityFullMap[text]; cfMap != nil {
- tmpPbrief := cfMap.P.Brief
- if p_full == "" {
- p_full = tmpPbrief
- c_full = cfMap.Name
- PCDScore(j, "province", p_full, 4+ts, true)
- PCDScore(j, "city", c_full, 4+ts, true)
- break
- } else if p_full == tmpPbrief {
- c_full = cfMap.Name
- PCDScore(j, "province", tmpPbrief, 4+ts, true) //
- PCDScore(j, "city", c_full, 4+ts, true)
- break
- } else if p_full != "" && p_full != tmpPbrief {
- //city不做处理
- }
- }
- } else if pos_full == 2 && d_full == "" { //区全称
- repeatPb := map[string]bool{}
- isOk := false
- districtOk := false
- citys := e.DistrictCityMap[text]
- for _, c := range citys {
- tmpPbrief := c.P.Brief
- if p_full == tmpPbrief { //省份一致
- d_full = text
- if c_full == "" {
- c_full = c.Name
- PCDScore(j, "city", c_full, 4+ts, true)
- PCDScore(j, "province", tmpPbrief, 4+ts, true) //
- }
- isOk = true
- districtOk = true
- } else if p_full == "" { //省份不存在
- districtOk = true
- if len(citys) == 1 { //对应一个city
- p_full = tmpPbrief
- c_full = c.Name
- d_full = text
- PCDScore(j, "province", p_full, 4+ts, true)
- PCDScore(j, "city", c_full, 4+ts, true)
- isOk = true
- } else { //多个city,只打分,不赋值
- if !repeatPb[tmpPbrief] {
- PCDScore(j, "province", tmpPbrief, 2+ts, true)
- repeatPb[tmpPbrief] = true
- }
- //PCDScore(j, "province", tmpPbrief, 2, true)
- PCDScore(j, "city", c.Name, 2+ts, true)
- }
- } else if p_full != "" && p_full != tmpPbrief { //干扰项减分
- if !repeatPb[tmpPbrief] {
- PCDScore(j, "province", tmpPbrief, -5, true)
- repeatPb[tmpPbrief] = true
- }
- //PCDScore(j, "province", tmpPbrief, -5, true)
- PCDScore(j, "city", c.Name, -5, true)
- }
- }
- if districtOk {
- PCDScore(j, "district", text, 4+ts, true)
- } else {
- PCDScore(j, "district", text, -5, true)
- }
- if isOk {
- break
- }
- } else if pos_full == 3 { //街道全称
- districts := e.StreetDistrictMap[text]
- if len(districts) == 1 { //街道唯一
- DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
- }
- } else if pos_full == 4 { //居委会全称
- //districts := e.CommunityDistrictMap[text]
- //if len(districts) == 1 { //居委会唯一
- // DealMultipleDistrict(e, j, districts, 2+ts, p_full, nil, nil, nil)
- //}
- }
- }
- }
- //qu.Debug("全称后--", j.FullAreaScore, j.FullCityScore, j.FullDistrictScore)
- //简称匹配
- for pos_sim, trie_sim := range e.Trie_Sims {
- if trie_sim.Get(text) {
- if pos_sim == 0 && p_sim == "" { //省简称
- p_sim = text
- PCDScore(j, "province", p_sim, 3+ts, false)
- break
- } else if pos_sim == 1 { //市简称
- if cbMap := e.CityBriefMap[text]; cbMap != nil {
- tmpPbrief := cbMap.P.Brief
- if p_sim == "" {
- score := 2.0 + ts
- if tmpPbrief == p_full {
- score += 1.0
- }
- p_sim = tmpPbrief
- c_sim = cbMap.Brief
- PCDScore(j, "province", p_sim, score, false)
- PCDScore(j, "city", cbMap.Name, score, false)
- break
- } else if p_sim == tmpPbrief {
- c_sim = cbMap.Brief
- PCDScore(j, "city", cbMap.Name, 3+ts, false)
- PCDScore(j, "province", tmpPbrief, 3+ts, false)
- break
- } else if p_sim != "" && p_sim != tmpPbrief { //上海宝冶集团有限公司南京分公司 北京朝阳中西医结合急诊抢救中心
- delete(j.SimAreaScore, p_sim)
- c_sim = text //
- p_sim = tmpPbrief //
- PCDScore(j, "province", tmpPbrief, 3+ts, false)
- PCDScore(j, "city", cbMap.Name, 3+ts, false)
- }
- }
- } else if pos_sim == 2 && d_sim == "" { //区简称
- repeatPb := map[string]bool{}
- repeatDb := map[string]bool{}
- dfull_citys := e.DistrictSimAndAll[text]
- for _, dfull_city := range dfull_citys {
- for dfull, c := range dfull_city { //dfull:简称对应的全称
- if c == nil || c.P == nil {
- continue
- }
- tmpPbrief := c.P.Brief
- if p_sim == tmpPbrief { //省份一致
- d_sim = text
- PCDScore(j, "district", dfull, 2+ts, false)
- if c_sim == "" {
- c_sim = c.Brief
- PCDScore(j, "city", c.Name, 2+ts, false)
- }
- PCDScore(j, "province", tmpPbrief, 2+ts, false) //
- } else if p_sim == "" { //暂未匹配到省
- if !repeatDb[dfull] {
- PCDScoreByDistrictSim("d", dfull, 1+ts, pscore, cscore, dscore)
- repeatDb[dfull] = true
- }
- if len(dfull_citys) == 1 {
- PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
- PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
- } else {
- if !repeatPb[tmpPbrief] {
- PCDScoreByDistrictSim("p", tmpPbrief, 1+ts, pscore, cscore, dscore)
- repeatPb[tmpPbrief] = true
- }
- PCDScoreByDistrictSim("c", c.Name, 1+ts, pscore, cscore, dscore)
- }
- //新增~特殊组情况下~津市高新区管委会~切词首"津市"~均未匹配到情况下
- if jb_index == 0 && len(dfull_citys) == 1 && len(j.FullAreaScore) == 0 && len(j.SimAreaScore) == 0 {
- PCDScore(j, "district", dfull, 0, false)
- PCDScore(j, "city", c.Name, 0, false)
- PCDScore(j, "province", tmpPbrief, 0, false) //
- }
- } else if p_sim != "" && p_sim != tmpPbrief {
- if !repeatPb[tmpPbrief] {
- PCDScoreByDistrictSim("p", tmpPbrief, ts, pscore, cscore, dscore)
- repeatPb[tmpPbrief] = true
- }
- PCDScoreByDistrictSim("c", c.Name, ts, pscore, cscore, dscore)
- PCDScoreByDistrictSim("d", dfull, ts, pscore, cscore, dscore)
- }
- }
- }
- }
- }
- }
- //qu.Debug("简称后--", j.SimAreaScore, j.SimCityScore, j.SimDistrictScore)
- }
- }
- }
- func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
- repeatP_full := map[string]bool{}
- repeatC_full := map[string]bool{}
- repeatD_full := map[string]bool{}
- repeatP_sim := map[string]bool{}
- repeatC_sim := map[string]bool{}
- repeatD_sim := map[string]bool{}
- detailRune := []rune(j.Content)
- detail := j.Content
- if len(detailRune) > 600 {
- start := detailRune[:300]
- end := detailRune[len(detailRune)-300:]
- detail = string(start) + string(end)
- }
- for _, reg := range AgencyReg {
- detail = reg.ReplaceAllString(detail, "")
- }
- for _, text := range e.Seg_SV.Cut(detail, true) {
- if len([]rune(text)) > 1 {
- //全称匹配
- for pos_full, trie_full := range e.Trie_Fulls {
- if trie_full.Get(text) {
- if pos_full == 0 { //省全称
- if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" && !repeatP_full[tmpPbrief] { //取简称
- PCDScore(j, "province", tmpPbrief, 1, true)
- repeatP_full[tmpPbrief] = true
- break
- }
- } else if pos_full == 1 { //市全称
- if cfMap := e.CityFullMap[text]; cfMap != nil {
- if !repeatP_full[cfMap.P.Brief] {
- PCDScore(j, "province", cfMap.P.Brief, 1, true)
- repeatP_full[cfMap.P.Brief] = true
- }
- if !repeatC_full[cfMap.Name] {
- PCDScore(j, "city", cfMap.Name, 1, true)
- repeatC_full[cfMap.Name] = true
- }
- break
- }
- } else if pos_full == 2 { //区全称
- citys := e.DistrictCityMap[text]
- if len(citys) > 0 {
- if !repeatD_full[text] {
- PCDScore(j, "district", text, 1, true)
- repeatD_full[text] = true
- }
- for _, c := range citys {
- if !repeatC_full[c.Name] {
- PCDScore(j, "city", c.Name, 1, true)
- repeatC_full[c.Name] = true
- }
- if !repeatP_full[c.P.Brief] {
- PCDScore(j, "province", c.P.Brief, 1, true)
- repeatP_full[c.P.Brief] = true
- }
- }
- break
- }
- } else if pos_full == 3 { //街道全称
- districts := e.StreetDistrictMap[text]
- if len(districts) == 1 {
- DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
- }
- } else if pos_full == 4 { //居委会全称
- //districts := e.CommunityDistrictMap[text]
- //if len(districts) == 1 {
- // DealMultipleDistrict(e, j, districts, 1, "", &repeatP_full, &repeatC_full, &repeatD_full)
- //}
- }
- }
- }
- //qu.Debug("detail 全称---", j.AreaScore, j.CityScore, j.DistrictScore)
- //简称匹配
- for pos_sim, trie_sim := range e.Trie_Sims {
- if trie_sim.Get(text) {
- if pos_sim == 0 && !repeatP_sim[text] { //省简称
- PCDScore(j, "province", text, 1, false)
- repeatP_sim[text] = true
- break
- } else if pos_sim == 1 { //市简称
- if cbMap := e.CityBriefMap[text]; cbMap != nil {
- if !repeatP_sim[cbMap.P.Brief] {
- PCDScore(j, "province", cbMap.P.Brief, 1, false)
- repeatP_sim[cbMap.P.Brief] = true
- }
- if !repeatC_sim[cbMap.Name] {
- PCDScore(j, "city", cbMap.Name, 1, false)
- repeatC_sim[cbMap.Name] = true
- }
- break
- }
- } else if pos_sim == 2 { //区简称
- dfull_citys := e.DistrictSimAndAll[text]
- if len(dfull_citys) == 1 {
- for _, dfull_city := range dfull_citys {
- for dfull, ctmp := range dfull_city { //dfull:简称对应的全称
- if !repeatD_sim[dfull] {
- PCDScore(j, "district", dfull, 1, false)
- repeatD_sim[dfull] = true
- }
- if ctmp == nil {
- continue
- }
- if !repeatC_sim[ctmp.Name] {
- PCDScore(j, "city", ctmp.Name, 1, false)
- repeatC_sim[ctmp.Name] = true
- }
- if !repeatP_sim[ctmp.P.Brief] {
- PCDScore(j, "province", ctmp.P.Brief, 1, false)
- repeatP_sim[ctmp.P.Brief] = true
- }
- }
- }
- }
- }
- }
- }
- //qu.Debug("detail 简称---", j.AreaScore, j.CityScore, j.DistrictScore)
- }
- }
- }
- //街道、居委会对应多地市处理
- func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score float64, pbrief string, repeatP, repeatC, repeatD *map[string]bool) {
- if len(districts) == 1 {
- district := districts[0]
- city := district.C.Name
- tmpPbrief := district.C.P.Brief
- if pbrief != "" && tmpPbrief == pbrief {
- PCDScore(j, "province", tmpPbrief, score, true)
- PCDScore(j, "city", city, score, true)
- PCDScore(j, "district", district.Name, score, true)
- } else if pbrief == "" {
- if repeatP != nil && !(*repeatP)[tmpPbrief] {
- PCDScore(j, "province", tmpPbrief, score, true)
- (*repeatP)[tmpPbrief] = true
- } else if repeatP == nil {
- PCDScore(j, "province", tmpPbrief, score, true)
- }
- if repeatC != nil && !(*repeatC)[city] {
- PCDScore(j, "city", city, score, true)
- (*repeatC)[city] = true
- } else if repeatC == nil {
- PCDScore(j, "city", city, score, true)
- }
- if repeatD != nil && !(*repeatD)[tmpPbrief] {
- PCDScore(j, "district", district.Name, score, true)
- (*repeatD)[district.Name] = true
- } else if repeatD == nil {
- PCDScore(j, "district", district.Name, score, true)
- }
- }
- }
- }
- func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
- for _, c := range finishC { //取最高分与province匹配的city
- if cfMap := e.CityFullMap[c]; cfMap != nil {
- if cfMap.P.Brief == area {
- // city = c
- // break
- tmpcity = append(tmpcity, c)
- }
- }
- }
- if len(tmpcity) == 1 {
- city = tmpcity[0]
- }
- return city, tmpcity
- }
- func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
- for _, d := range finishD { //取最高分与province匹配的district
- citys := e.DistrictCityMap[d]
- for _, c := range citys {
- if len(tmpcity) == 0 { //没有city
- if c.P.Brief == area {
- city = c.Name
- district = d
- return city, district
- }
- } else if len(tmpcity) == 1 { //一个city
- if c.Name == city && c.P.Brief == area {
- district = d
- return city, district
- }
- } else { //多个city
- for _, tc := range tmpcity { //多个city根据district最高分取
- if tc == c.Name && len(finishD) == 1 {
- city = c.Name
- district = d
- return city, district
- }
- }
- }
- }
- }
- return city, district
- }
- //计算province,city,district区或县匹配的得分
- func PCDScoreByDistrictSim(stype, t string, score float64, ps, cs, ds *map[string]float64) {
- defer qu.Catch()
- if t != "" {
- if stype == "d" {
- tmpscore := (*ds)[t]
- (*ds)[t] = tmpscore + score
- } else if stype == "c" {
- tmpscore := (*cs)[t]
- (*cs)[t] = tmpscore + score
- } else if stype == "p" {
- tmpscore := (*ps)[t]
- (*ps)[t] = tmpscore + score
- }
- }
- }
- func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]float64) {
- if len(j.FullAreaScore) > 0 {
- for pt, ps := range *pscore {
- j.FullAreaScore[pt] = j.FullAreaScore[pt] + ps
- }
- for ct, cs := range *cscore {
- j.FullCityScore[ct] = j.FullCityScore[ct] + cs
- }
- for dt, ds := range *dscore {
- j.FullDistrictScore[dt] = j.FullDistrictScore[dt] + ds
- }
- }
- }
- func MergeFullSimScore(j *ju.Job) {
- if len(j.FullAreaScore) == 0 {
- j.FullAreaScore = j.SimAreaScore
- } else {
- for p_text, p_score := range j.FullAreaScore {
- j.FullAreaScore[p_text] = j.SimAreaScore[p_text] + p_score
- }
- }
- for c_text, c_score := range j.SimCityScore {
- j.FullCityScore[c_text] = j.FullCityScore[c_text] + c_score
- }
- for d_text, d_score := range j.SimDistrictScore {
- j.FullDistrictScore[d_text] = j.FullDistrictScore[d_text] + d_score
- }
- // if len(j.FullCityScore) == 0 {
- // j.FullCityScore = j.SimCityScore
- // } else {
- // for c_text, c_score := range j.FullCityScore {
- // j.FullCityScore[c_text] = j.SimCityScore[c_text] + c_score
- // }
- // }
- // if len(j.FullDistrictScore) == 0 {
- // j.FullDistrictScore = j.SimDistrictScore
- // } else {
- // for d_text, d_score := range j.FullDistrictScore {
- // j.FullDistrictScore[d_text] = j.SimDistrictScore[d_text] + d_score
- // }
- // }
- }
- func (e *ExtractTask) RemoveCD(finishP []string, j *ju.Job) {
- if len(j.FullDistrictScore) > 0 {
- for d, _ := range j.FullDistrictScore {
- tmpCitys := e.DistrictCityMap[d]
- for _, c := range tmpCitys {
- if j.FullCityScore[c.Name] != 0 {
- tmpPb := c.P.Brief
- //if j.FullAreaScore[tmpPb] != 0 {
- flag := false
- for _, p := range finishP {
- if tmpPb == p {
- flag = true
- break
- }
- }
- if !flag {
- delete(j.FullCityScore, c.Name)
- delete(j.FullDistrictScore, d)
- }
- //}
- }
- }
- }
- }
- if len(j.FullCityScore) > 0 {
- for tmpcity, _ := range j.FullCityScore {
- c := e.CityFullMap[tmpcity]
- if c == nil {
- log.Debug("行政区划错误数据:", tmpcity, j.SourceMid)
- continue
- }
- tmpPb := c.P.Brief
- //if j.FullAreaScore[tmpPb] != 0 {
- flag := false
- for _, p := range finishP {
- if tmpPb == p {
- flag = true
- break
- }
- }
- if !flag {
- delete(j.FullCityScore, tmpcity)
- }
- //}
- }
- }
- }
// HighestScoreArr returns the non-empty keys of m that share the highest
// score. Only scores >= 0 qualify, so a table holding nothing but negative
// scores yields nil. The order of the result follows map iteration order and
// is therefore unspecified.
func HighestScoreArr(m map[string]float64) []string {
	best := 0.0
	var winners []string
	for name, score := range m {
		if name == "" || !(best <= score) {
			continue
		}
		if score != best {
			// Strictly better than anything seen so far: start over.
			winners = nil
		}
		winners = append(winners, name)
		best = score
	}
	return winners
}
- //计算province,city,district得分
- func PCDScore(j *ju.Job, stype, text string, score float64, isfull bool) {
- defer qu.Catch()
- if text != "" {
- if stype == "district" {
- tmpdistrict := make(map[string]float64)
- if isfull {
- tmpdistrict = j.FullDistrictScore
- } else {
- tmpdistrict = j.SimDistrictScore
- }
- scoretmp := tmpdistrict[text]
- tmpdistrict[text] = scoretmp + score
- } else if stype == "city" {
- tmpcity := make(map[string]float64)
- if isfull {
- tmpcity = j.FullCityScore
- } else {
- tmpcity = j.SimCityScore
- }
- scoretmp := tmpcity[text]
- tmpcity[text] = scoretmp + score
- } else if stype == "province" {
- tmpprovince := make(map[string]float64)
- if isfull {
- tmpprovince = j.FullAreaScore
- } else {
- tmpprovince = j.SimAreaScore
- }
- scoretmp := tmpprovince[text]
- tmpprovince[text] = scoretmp + score
- }
- }
- }
|