|
@@ -0,0 +1,772 @@
|
|
|
+package extract
|
|
|
+
|
|
|
+import (
|
|
|
+ . "jy/pretreated"
|
|
|
+ ju "jy/util"
|
|
|
+ qu "qfw/util"
|
|
|
+ "strings"
|
|
|
+)
|
|
|
+
|
|
|
+//抽取city
|
|
|
+func (e *ExtractTask) NewExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
|
|
|
+ /*
|
|
|
+ 高准确率:
|
|
|
+ 1.爬虫数据jsondata
|
|
|
+ 2.采购单位库
|
|
|
+ 3.邮编
|
|
|
+ 4.固话
|
|
|
+ 5.site(todo)
|
|
|
+ 低准确率:(全称库匹配到不走简称库)
|
|
|
+ 1.city全称库(buyeraddr;title,projectname)
|
|
|
+ 2.city简称库(buyeraddr;title,projectname)
|
|
|
+ */
|
|
|
+ defer qu.Catch()
|
|
|
+
|
|
|
+ //初始化
|
|
|
+ if j.AreaScore == nil {
|
|
|
+ j.AreaScore = make(map[string]int)
|
|
|
+ }
|
|
|
+ if j.CityScore == nil {
|
|
|
+ j.CityScore = make(map[string]int)
|
|
|
+ }
|
|
|
+ if j.DistrictScore == nil {
|
|
|
+ j.DistrictScore = make(map[string]int)
|
|
|
+ }
|
|
|
+ //记录区或县简称匹配的p、c、d的得分;如果全称匹配和p、c简称匹配的有结果,再将得分合并,否则舍弃
|
|
|
+ pscore := make(map[string]int)
|
|
|
+ cscore := make(map[string]int)
|
|
|
+ dscore := make(map[string]int)
|
|
|
+
|
|
|
+ sm := NewSortMap()
|
|
|
+ //1.jsondata抽取
|
|
|
+ e.NewGetCityByJsonData(j)
|
|
|
+ //qu.Debug("jsondata打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //2.site库抽取
|
|
|
+ e.NewGetCityBySite(j)
|
|
|
+ //qu.Debug("site打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //3.采购单位库抽取(暂时没有采购单位库)
|
|
|
+ //buyer, _ := resulttmp["buyer"].(string)
|
|
|
+ //4.postcode邮编抽取
|
|
|
+ buyerzipcode, _ := resulttmp["buyerzipcode"].(string)
|
|
|
+ e.NewGetCityByPostCode(j, buyerzipcode)
|
|
|
+ //qu.Debug("邮编打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //5.areacode固话区号抽取
|
|
|
+ buyertel, _ := resulttmp["buyertel"].(string)
|
|
|
+ e.NewGetCityByAreaCode(j, buyertel)
|
|
|
+ //qu.Debug("固话打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //6.buyeraddr,title,projectname抽取
|
|
|
+ buyeraddr, _ := resulttmp["buyeraddr"].(string)
|
|
|
+ title, _ := resulttmp["title"].(string)
|
|
|
+ projectname, _ := resulttmp["projectname"].(string)
|
|
|
+ buyer, _ := resulttmp["buyer"].(string)
|
|
|
+ //qu.Debug("buyeraddr--", buyeraddr, "--buyer--", buyer, "--title--", title, "--projectname--", projectname)
|
|
|
+ sm.AddKey("buyeraddr", buyeraddr)
|
|
|
+ sm.AddKey("title", title)
|
|
|
+ sm.AddKey("projectname", projectname)
|
|
|
+ sm.AddKey("buyer", buyer)
|
|
|
+ e.NewGetCityByOthers(j, sm, &pscore, &cscore, &dscore)
|
|
|
+ //qu.Debug("打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //7.detail抽取
|
|
|
+ if len(j.AreaScore) > 0 {
|
|
|
+ e.NewGetCityByDetail(j)
|
|
|
+ }
|
|
|
+ //qu.Debug("detail打分后---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //合并得分
|
|
|
+ //qu.Debug("pcd=====", pscore, cscore, dscore)
|
|
|
+ MergeScores(j, &pscore, &cscore, &dscore)
|
|
|
+ //qu.Debug("合并打分后结果---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+
|
|
|
+ finishP := HighestScoreArr(j.AreaScore)
|
|
|
+ finishC := HighestScoreArr(j.CityScore)
|
|
|
+ finishD := HighestScoreArr(j.DistrictScore)
|
|
|
+ arearesult := ""
|
|
|
+ cityresult := ""
|
|
|
+ districtresult := ""
|
|
|
+ tmpcity := []string{}
|
|
|
+ if len(finishP) == 1 { //最高分一个
|
|
|
+ arearesult = finishP[0] //抽取结果直接赋值
|
|
|
+ cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
|
|
|
+ cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
|
|
|
+ } else if len(finishP) > 1 { //province最高分多个
|
|
|
+ if len(finishC) == 1 {
|
|
|
+ cityresult = finishC[0]
|
|
|
+ if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
|
|
|
+ arearesult = cfMap.P.Brief
|
|
|
+ cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
|
|
|
+ }
|
|
|
+ } else { //对应的city有多个(多个province和city)
|
|
|
+ arearesult = finishP[0] //抽取结果直接赋值
|
|
|
+ cityresult, tmpcity = NewGetCity(arearesult, cityresult, e, finishC, tmpcity)
|
|
|
+ cityresult, districtresult = NewGetDistrict(arearesult, cityresult, districtresult, e, finishD, tmpcity)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
|
|
|
+ if arearesult == "" {
|
|
|
+ arearesult = "全国"
|
|
|
+ } else if cityresult == "" {
|
|
|
+ if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
|
|
|
+ cityresult = pbMap.Cap
|
|
|
+ resulttmp["defaultpcap"] = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //qu.Debug("结果2===", arearesult, "--", cityresult, "--", districtresult)
|
|
|
+ resulttmp["area"] = arearesult
|
|
|
+ resulttmp["city"] = cityresult
|
|
|
+ resulttmp["district"] = districtresult
|
|
|
+}
|
|
|
+
|
|
|
+//jsondata中抽取城市
|
|
|
+func (e *ExtractTask) NewGetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
|
|
|
+ defer qu.Catch()
|
|
|
+ jsondata := *j.Jsondata
|
|
|
+ if jsondata != nil { //jsondata中获取province和city
|
|
|
+ if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
|
|
|
+ p, c, d = GetByACDFullJb(p, c, d, a_c_d, e, j) //全称匹配
|
|
|
+ GetByACDSimJb(p, c, d, a_c_d, e, j) //简称匹配
|
|
|
+ }
|
|
|
+ city, _ = jsondata["city"].(string) //city全称或者简称
|
|
|
+ province, _ = jsondata["area"].(string) //province简称
|
|
|
+ district, _ = jsondata["district"].(string) //district全称
|
|
|
+ }
|
|
|
+ PCDScore(j, "district", district, 5) //district打分
|
|
|
+ bp := false
|
|
|
+ if province != "" {
|
|
|
+ if e.ProvinceBriefMap[province] != nil { //判断爬虫的省份是否正确 (全国)
|
|
|
+ bp = true //省份正确
|
|
|
+ }
|
|
|
+ }
|
|
|
+ pbrief := ""
|
|
|
+ if city != "" {
|
|
|
+ cityfullmap := e.CityFullMap[city] //判断city全称是否正确
|
|
|
+ if cityfullmap != nil {
|
|
|
+ pbrief = cityfullmap.P.Brief //province简称
|
|
|
+ } else {
|
|
|
+ citybriefmap := e.CityBriefMap[city] //判断city简称是否正确
|
|
|
+ if citybriefmap != nil {
|
|
|
+ city = citybriefmap.Name //city简称替换为全称
|
|
|
+ pbrief = citybriefmap.P.Brief
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if bp {
|
|
|
+ if pbrief == province { //爬虫的province和city匹配
|
|
|
+ PCDScore(j, "city", city, 5)
|
|
|
+ } else { //pbrief不匹配province(此时city为空或者错误)
|
|
|
+ city = ""
|
|
|
+ }
|
|
|
+ PCDScore(j, "province", province, 5)
|
|
|
+ } else { //省份错误或为空,取city的对应的pbrief为province
|
|
|
+ if pbrief != "" {
|
|
|
+ province = pbrief
|
|
|
+ PCDScore(j, "province", province, 5)
|
|
|
+ PCDScore(j, "city", city, 5)
|
|
|
+ } else {
|
|
|
+ province = ""
|
|
|
+ city = ""
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
+//全称从area_city_district中抽城市
|
|
|
+func GetByACDFullJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) (string, string, string) {
|
|
|
+ text := e.JB_PCD.Cut(a_c_d, true)
|
|
|
+ //qu.Debug("Full----", text)
|
|
|
+ repeatPb := map[string]bool{}
|
|
|
+ for _, full := range text {
|
|
|
+ if e.Trie_Full_Province.Get(full) { //a_c_d有province全称
|
|
|
+ if tmpPbrief := e.ProvinceMap[full]; tmpPbrief != "" {
|
|
|
+ pbrief = tmpPbrief //省简称
|
|
|
+ PCDScore(j, "province", pbrief, 5)
|
|
|
+ }
|
|
|
+ } else if e.Trie_Full_City.Get(full) { //a_c_d有city全称
|
|
|
+ if cfMap := e.CityFullMap[full]; cfMap != nil {
|
|
|
+ tmpcity := cfMap.Name //城市全称
|
|
|
+ tmpPbrief := cfMap.P.Brief //省简称
|
|
|
+ if pbrief != "" && pbrief == tmpPbrief { //已获取省简称
|
|
|
+ city = tmpcity
|
|
|
+ PCDScore(j, "city", city, 5)
|
|
|
+ } else if pbrief == "" {
|
|
|
+ city = tmpcity
|
|
|
+ pbrief = tmpPbrief
|
|
|
+ PCDScore(j, "city", city, 5)
|
|
|
+ PCDScore(j, "province", pbrief, 5)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else if e.Trie_Full_District.Get(full) { //a_c_d有district全称(district可能对应多个城市)
|
|
|
+ carr := e.NewDistrictCityMap[full]
|
|
|
+ if len(carr) > 0 {
|
|
|
+ district = full
|
|
|
+ PCDScore(j, "district", district, 5)
|
|
|
+ for _, c := range carr {
|
|
|
+ tmpcity := c.Name //城市全称
|
|
|
+ tmpPbrief := c.P.Brief //省简称
|
|
|
+ if pbrief == "" { //之前没有匹配到省份
|
|
|
+ PCDScore(j, "city", tmpcity, 5)
|
|
|
+ if !repeatPb[tmpPbrief] {
|
|
|
+ PCDScore(j, "province", tmpPbrief, 5)
|
|
|
+ repeatPb[tmpPbrief] = true
|
|
|
+ }
|
|
|
+ } else { //已有省份
|
|
|
+ if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
|
|
|
+ PCDScore(j, "city", tmpcity, -5)
|
|
|
+ PCDScore(j, "province", tmpPbrief, -5)
|
|
|
+ } else { //与之前匹配结果一致
|
|
|
+ if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
|
|
|
+ PCDScore(j, "city", tmpcity, 5)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return pbrief, city, district
|
|
|
+}
|
|
|
+
|
|
|
+//简称从area_city_district中抽城市
|
|
|
+func GetByACDSimJb(pbrief, city, district, a_c_d string, e *ExtractTask, j *ju.Job) {
|
|
|
+ text := e.JB_PCD.Cut(a_c_d, true)
|
|
|
+ repeatPb := map[string]bool{}
|
|
|
+ for _, sim := range text {
|
|
|
+ if pbrief == "" && e.Trie_Sim_Province.Get(sim) { //全称未匹配到确定的province
|
|
|
+ if pbMap := e.ProvinceBriefMap[sim]; pbMap != nil {
|
|
|
+ pbrief = pbMap.Brief
|
|
|
+ PCDScore(j, "province", pbrief, 5) //打分
|
|
|
+ //PCDSubtractScore(e, j, "province", pbrief, -5) //减分(area_city_district:河南鼓楼区)
|
|
|
+ }
|
|
|
+ } else if city == "" && e.Trie_Sim_City.Get(sim) { //全称未匹配到确定的city
|
|
|
+ if cbMap := e.CityBriefMap[sim]; cbMap != nil {
|
|
|
+ tmpcity := cbMap.Name
|
|
|
+ tmpPbrief := cbMap.P.Brief
|
|
|
+ if pbrief != "" && pbrief == tmpPbrief {
|
|
|
+ city = tmpcity
|
|
|
+ PCDScore(j, "city", city, 5)
|
|
|
+ } else if pbrief == "" {
|
|
|
+ city = tmpcity
|
|
|
+ pbrief = tmpPbrief
|
|
|
+ PCDScore(j, "city", city, 5)
|
|
|
+ PCDScore(j, "province", pbrief, 5)
|
|
|
+ //PCDSubtractScore(e, j, "city", tmpcity, -5) //减分(area_city_district:开封鼓楼区)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else if district == "" && e.Trie_Sim_District.Get(sim) { //全称未匹配到确定的district
|
|
|
+ dfullarr := e.NewDistrictSimAndAll[sim]
|
|
|
+ if len(dfullarr) > 0 {
|
|
|
+ PCDScore(j, "district", sim, 5)
|
|
|
+ for _, dfullAndCity := range dfullarr { //district简称对应的所有全称
|
|
|
+ for _, c := range dfullAndCity {
|
|
|
+ tmpcity := c.Name //城市全称
|
|
|
+ tmpPbrief := c.P.Brief //省简称
|
|
|
+ if pbrief == "" { //之前没有匹配到省份
|
|
|
+ PCDScore(j, "city", tmpcity, 5)
|
|
|
+ if !repeatPb[tmpPbrief] {
|
|
|
+ PCDScore(j, "province", tmpPbrief, 5)
|
|
|
+ repeatPb[tmpPbrief] = true
|
|
|
+ }
|
|
|
+ } else { //已有省份
|
|
|
+ if pbrief != tmpPbrief { //区对应的多个城市,与之前匹配结果不一致,认为是干扰项
|
|
|
+ PCDScore(j, "city", tmpcity, -5)
|
|
|
+ PCDScore(j, "province", tmpPbrief, -5)
|
|
|
+ } else { //与之前匹配结果一致
|
|
|
+ if city == "" { //这种情况是处理area_city_district:(河南省二七区),city在前两步匹配不到,在这里通过district补充
|
|
|
+ PCDScore(j, "city", tmpcity, 5)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+//通过site提取城市
|
|
|
+func (e *ExtractTask) NewGetCityBySite(j *ju.Job) {
|
|
|
+ site, _ := (*j.Data)["site"].(string)
|
|
|
+ //qu.Debug("site--------", site)
|
|
|
+ if scMap := e.SiteCityMap[site]; scMap != nil {
|
|
|
+ if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
|
|
|
+ PCDScore(j, "province", scMap.P, 5)
|
|
|
+ }
|
|
|
+ if scMap.C != "" && scMap.C != "null" {
|
|
|
+ PCDScore(j, "city", scMap.C, 5)
|
|
|
+ }
|
|
|
+ if scMap.D != "" && scMap.D != "null" {
|
|
|
+ PCDScore(j, "district", scMap.D, 5)
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+//通过邮编提取城市
|
|
|
+func (e *ExtractTask) NewGetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
|
|
|
+ defer qu.Catch()
|
|
|
+ pc := e.PostCodeMap[postcode]
|
|
|
+ if pc != nil {
|
|
|
+ province = pc.P
|
|
|
+ city = pc.C
|
|
|
+ districtTmp := pc.D //邮编可能对应多个区
|
|
|
+ score := 3
|
|
|
+ if len(districtTmp) == 1 && districtTmp[0] != "" {
|
|
|
+ score = 5
|
|
|
+ }
|
|
|
+ for _, district := range districtTmp {
|
|
|
+ PCDScore(j, "district", district, score)
|
|
|
+ }
|
|
|
+ PCDScore(j, "province", province, 5)
|
|
|
+ PCDScore(j, "city", city, 5)
|
|
|
+ }
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
+//固话区号提取城市
|
|
|
+func (e *ExtractTask) NewGetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
|
|
|
+ defer qu.Catch()
|
|
|
+ if len(buyertel) >= 11 {
|
|
|
+ if strings.HasPrefix(buyertel, "0") { //区号除了澳门853其他都是以0开头
|
|
|
+ n := 4
|
|
|
+ L:
|
|
|
+ areacode := buyertel[:n]
|
|
|
+ ac := e.AreaCodeMap[areacode]
|
|
|
+ if ac != nil {
|
|
|
+ province = ac.P
|
|
|
+ citytmp := ac.C
|
|
|
+ if len(citytmp) == 1 { //对应多个city舍去
|
|
|
+ city = citytmp[0]
|
|
|
+ PCDScore(j, "city", city, 5)
|
|
|
+ }
|
|
|
+ PCDScore(j, "province", province, 5)
|
|
|
+ } else {
|
|
|
+ n = n - 1
|
|
|
+ if n >= 3 {
|
|
|
+ goto L
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else if buyertel[:3] == "853" { //澳门
|
|
|
+ province = "澳门"
|
|
|
+ city = "澳门"
|
|
|
+ PCDScore(j, "province", province, 5)
|
|
|
+ PCDScore(j, "city", city, 5)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
+func (e *ExtractTask) NewGetCityByOthers(j *ju.Job, sm *SortMap, pscore, cscore, dscore *map[string]int) {
|
|
|
+ /*
|
|
|
+ 1.对字段进行分词
|
|
|
+ 2.省、市、区、街道、居委会全称进行匹配打分
|
|
|
+ 3.省、市、区简称进行匹配打分
|
|
|
+ */
|
|
|
+ for _, from := range sm.Keys { //buyeraddr;title;projectname
|
|
|
+ p_full, c_full, d_full, p_sim, c_sim, d_sim := "", "", "", "", "", "" //每个字段抽取的时候重新定义该字段抽取的province,city,district
|
|
|
+ str, _ := sm.Map[from].(string)
|
|
|
+ //qu.Debug(str, "---分词结果---", e.JB_SV.Cut(str, true), p_full, c_full, d_full, p_sim, c_sim, d_sim)
|
|
|
+ jbText := e.JB_SV.Cut(str, true)
|
|
|
+ for _, text := range jbText { //结巴分词
|
|
|
+ if len([]rune(text)) == 1 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ //全称匹配
|
|
|
+ //qu.Debug("text------", text)
|
|
|
+ for pos_full, trie_full := range e.Trie_Fulls {
|
|
|
+ if trie_full.Get(text) {
|
|
|
+ if pos_full == 0 && p_full == "" { //省全称
|
|
|
+ if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { //取简称
|
|
|
+ p_full = tmpPbrief
|
|
|
+ PCDScore(j, "province", p_full, 4)
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else if pos_full == 1 && c_full == "" { //市全称
|
|
|
+ if cfMap := e.CityFullMap[text]; cfMap != nil {
|
|
|
+ tmpPbrief := cfMap.P.Brief
|
|
|
+ //qu.Debug("市--------", text, tmpPbrief, p_full)
|
|
|
+ if p_full == "" {
|
|
|
+ p_full = tmpPbrief
|
|
|
+ c_full = cfMap.Name
|
|
|
+ PCDScore(j, "province", p_full, 4)
|
|
|
+ PCDScore(j, "city", c_full, 4)
|
|
|
+ break
|
|
|
+ } else if p_full == tmpPbrief {
|
|
|
+ c_full = cfMap.Name
|
|
|
+ PCDScore(j, "city", c_full, 4)
|
|
|
+ break
|
|
|
+ } else if p_full != "" && p_full != tmpPbrief {
|
|
|
+ //city不做处理
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else if pos_full == 2 && d_full == "" { //区全称
|
|
|
+ //qu.Debug("区全称===========")
|
|
|
+ repeatPb := map[string]bool{}
|
|
|
+ isOk := false
|
|
|
+ districtOk := false
|
|
|
+ citys := e.NewDistrictCityMap[text]
|
|
|
+ for _, c := range citys {
|
|
|
+ tmpPbrief := c.P.Brief
|
|
|
+ if p_full == tmpPbrief { //省份一致
|
|
|
+ d_full = text
|
|
|
+ if c_full == "" {
|
|
|
+ c_full = c.Name
|
|
|
+ PCDScore(j, "city", c_full, 4)
|
|
|
+ }
|
|
|
+ isOk = true
|
|
|
+ districtOk = true
|
|
|
+ } else if p_full == "" { //省份不存在
|
|
|
+ districtOk = true
|
|
|
+ if len(citys) == 1 { //对应一个city
|
|
|
+ p_full = tmpPbrief
|
|
|
+ c_full = c.Name
|
|
|
+ d_full = text
|
|
|
+ PCDScore(j, "province", p_full, 4)
|
|
|
+ PCDScore(j, "city", c_full, 4)
|
|
|
+ isOk = true
|
|
|
+ } else { //多个city,只打分,不赋值
|
|
|
+ if !repeatPb[tmpPbrief] {
|
|
|
+ PCDScore(j, "province", tmpPbrief, 2)
|
|
|
+ repeatPb[tmpPbrief] = true
|
|
|
+ }
|
|
|
+ //PCDScore(j, "province", tmpPbrief, 2)
|
|
|
+ PCDScore(j, "city", c.Name, 2)
|
|
|
+ }
|
|
|
+ } else if p_full != "" && p_full != tmpPbrief { //干扰项减分
|
|
|
+ if !repeatPb[tmpPbrief] {
|
|
|
+ PCDScore(j, "province", tmpPbrief, -5)
|
|
|
+ repeatPb[tmpPbrief] = true
|
|
|
+ }
|
|
|
+ //PCDScore(j, "province", tmpPbrief, -5)
|
|
|
+ PCDScore(j, "city", c.Name, -5)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if districtOk {
|
|
|
+ PCDScore(j, "district", text, 4)
|
|
|
+ } else {
|
|
|
+ PCDScore(j, "district", text, -5)
|
|
|
+ }
|
|
|
+ if isOk {
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else if pos_full == 3 { //街道全称
|
|
|
+ districts := e.NewStreetDistrictMap[text]
|
|
|
+ DealMultipleDistrict(e, j, districts, 2)
|
|
|
+ } else if pos_full == 4 { //居委会全称
|
|
|
+ districts := e.CommunityDistrictMap[text]
|
|
|
+ DealMultipleDistrict(e, j, districts, 2)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //qu.Debug("全称后--", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //简称匹配
|
|
|
+ for pos_sim, trie_sim := range e.Trie_Sims {
|
|
|
+ if trie_sim.Get(text) {
|
|
|
+ if pos_sim == 0 && p_sim == "" { //省简称
|
|
|
+ p_sim = text
|
|
|
+ PCDScore(j, "province", p_sim, 3)
|
|
|
+ break
|
|
|
+ } else if pos_sim == 1 && c_sim == "" { //市简称
|
|
|
+ if cbMap := e.CityBriefMap[text]; cbMap != nil {
|
|
|
+ tmpPbrief := cbMap.P.Brief
|
|
|
+ if p_sim == "" {
|
|
|
+ p_sim = tmpPbrief
|
|
|
+ c_sim = cbMap.Brief
|
|
|
+ PCDScore(j, "province", p_sim, 2)
|
|
|
+ PCDScore(j, "city", cbMap.Name, 2)
|
|
|
+ break
|
|
|
+ } else if p_sim == tmpPbrief {
|
|
|
+ c_sim = cbMap.Brief
|
|
|
+ PCDScore(j, "city", cbMap.Name, 3)
|
|
|
+ break
|
|
|
+ } else if p_sim != "" && p_sim != tmpPbrief {
|
|
|
+ //city不做处理
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else if pos_sim == 2 && d_sim == "" { //区简称
|
|
|
+ repeatPb := map[string]bool{}
|
|
|
+ repeatDb := map[string]bool{}
|
|
|
+ dfull_citys := e.NewDistrictSimAndAll[text]
|
|
|
+ //qu.Debug(text, dfull_citys, p_sim)
|
|
|
+ for _, dfull_city := range dfull_citys {
|
|
|
+ for dfull, c := range dfull_city { //dfull:简称对应的全称
|
|
|
+ tmpPbrief := c.P.Brief
|
|
|
+ if p_sim == tmpPbrief { //省份一致
|
|
|
+ d_sim = text
|
|
|
+ //PCDScoreByDistrictSim("d", dfull, 1, pscore, cscore, dscore)
|
|
|
+ PCDScore(j, "district", dfull, 2)
|
|
|
+ if c_sim == "" {
|
|
|
+ c_sim = c.Brief
|
|
|
+ //PCDScoreByDistrictSim("c", c.Name, 2, pscore, cscore, dscore)
|
|
|
+ PCDScore(j, "city", c.Name, 2)
|
|
|
+ }
|
|
|
+ } else if p_sim == "" {
|
|
|
+ if !repeatDb[dfull] {
|
|
|
+ PCDScoreByDistrictSim("d", dfull, 1, pscore, cscore, dscore)
|
|
|
+ //PCDScore(j, "district", dfull, 1)
|
|
|
+ repeatDb[dfull] = true
|
|
|
+ }
|
|
|
+ if len(dfull_citys) == 1 {
|
|
|
+ //p_sim = tmpPbrief
|
|
|
+ //c_sim = c.Brief
|
|
|
+ //d_sim = text
|
|
|
+ PCDScoreByDistrictSim("p", tmpPbrief, 1, pscore, cscore, dscore)
|
|
|
+ PCDScoreByDistrictSim("c", c.Name, 1, pscore, cscore, dscore)
|
|
|
+ //PCDScore(j, "province", p_sim, 2)
|
|
|
+ //PCDScore(j, "city", c.Name, 2)
|
|
|
+ } else {
|
|
|
+ if !repeatPb[tmpPbrief] {
|
|
|
+ PCDScoreByDistrictSim("p", tmpPbrief, 1, pscore, cscore, dscore)
|
|
|
+ //PCDScore(j, "province", tmpPbrief, 1)
|
|
|
+ repeatPb[tmpPbrief] = true
|
|
|
+ }
|
|
|
+ //PCDScore(j, "city", c.Name, 1)
|
|
|
+ PCDScoreByDistrictSim("c", c.Name, 1, pscore, cscore, dscore)
|
|
|
+ }
|
|
|
+ } else if p_sim != "" && p_sim != tmpPbrief {
|
|
|
+ if !repeatPb[tmpPbrief] {
|
|
|
+ PCDScoreByDistrictSim("p", tmpPbrief, -5, pscore, cscore, dscore)
|
|
|
+ //PCDScore(j, "province", tmpPbrief, -5)
|
|
|
+ repeatPb[tmpPbrief] = true
|
|
|
+ }
|
|
|
+ PCDScoreByDistrictSim("c", c.Name, -5, pscore, cscore, dscore)
|
|
|
+ //PCDScore(j, "city", c.Name, -5)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //qu.Debug("简称后--", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func (e *ExtractTask) NewGetCityByDetail(j *ju.Job) {
|
|
|
+ detailRune := []rune(j.Content)
|
|
|
+ detail := j.Content
|
|
|
+ if len(detailRune) > 600 {
|
|
|
+ start := detailRune[:300]
|
|
|
+ end := detailRune[len(detailRune)-300:]
|
|
|
+ detail = string(start) + string(end)
|
|
|
+ }
|
|
|
+ for _, text := range e.JB_SV.Cut(detail, true) {
|
|
|
+ if len([]rune(text)) > 1 {
|
|
|
+ //qu.Debug("text---", text)
|
|
|
+ //全称匹配
|
|
|
+ for pos_full, trie_full := range e.Trie_Fulls {
|
|
|
+ if trie_full.Get(text) {
|
|
|
+ if pos_full == 0 { //省全称
|
|
|
+ if tmpPbrief := e.ProvinceMap[text]; tmpPbrief != "" { //取简称
|
|
|
+ PCDScore(j, "province", tmpPbrief, 1)
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else if pos_full == 1 { //市全称
|
|
|
+ if cfMap := e.CityFullMap[text]; cfMap != nil {
|
|
|
+ PCDScore(j, "province", cfMap.P.Brief, 1)
|
|
|
+ PCDScore(j, "city", cfMap.Name, 1)
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else if pos_full == 2 { //区全称
|
|
|
+ citys := e.NewDistrictCityMap[text]
|
|
|
+ if len(citys) > 0 {
|
|
|
+ repeatPb := map[string]bool{}
|
|
|
+ PCDScore(j, "district", text, 1)
|
|
|
+ for _, c := range citys {
|
|
|
+ PCDScore(j, "city", c.Name, 1)
|
|
|
+ if !repeatPb[text] {
|
|
|
+ PCDScore(j, "province", c.P.Brief, 1)
|
|
|
+ repeatPb[text] = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else if pos_full == 3 { //街道全称
|
|
|
+ districts := e.NewStreetDistrictMap[text]
|
|
|
+ DealMultipleDistrict(e, j, districts, 1)
|
|
|
+ } else if pos_full == 4 { //居委会全称
|
|
|
+ districts := e.CommunityDistrictMap[text]
|
|
|
+ DealMultipleDistrict(e, j, districts, 1)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //qu.Debug("detail 全称---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ //简称匹配
|
|
|
+ for pos_sim, trie_sim := range e.Trie_Sims {
|
|
|
+ if trie_sim.Get(text) {
|
|
|
+ if pos_sim == 0 { //省简称
|
|
|
+ PCDScore(j, "province", text, 1)
|
|
|
+ break
|
|
|
+ } else if pos_sim == 1 { //市简称
|
|
|
+ if cbMap := e.CityBriefMap[text]; cbMap != nil {
|
|
|
+ PCDScore(j, "city", cbMap.Name, 1)
|
|
|
+ PCDScore(j, "province", cbMap.P.Brief, 1)
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } /* else if pos_sim == 2 { //区简称
|
|
|
+ repeatDb := map[string]bool{}
|
|
|
+ dfull_citys := e.NewDistrictSimAndAll[text]
|
|
|
+ for _, dfull_city := range dfull_citys {
|
|
|
+ for dfull, _ := range dfull_city { //dfull:简称对应的全称
|
|
|
+ if !repeatDb[dfull] {
|
|
|
+ PCDScore(j, "district", dfull, 1)
|
|
|
+ repeatDb[dfull] = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }*/
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //qu.Debug("detail 简称---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+//街道、居委会对应多地市处理
|
|
|
+func DealMultipleDistrict(e *ExtractTask, j *ju.Job, districts []*District, score int) {
|
|
|
+ repeatPb := map[string]bool{}
|
|
|
+ repeatCb := map[string]bool{}
|
|
|
+ repeatDb := map[string]bool{}
|
|
|
+ for _, district := range districts {
|
|
|
+ tmpDistrict := district.Name
|
|
|
+ if !repeatDb[tmpDistrict] {
|
|
|
+ PCDScore(j, "district", tmpDistrict, score)
|
|
|
+ repeatDb[tmpDistrict] = true
|
|
|
+ }
|
|
|
+ citys := e.NewDistrictCityMap[tmpDistrict]
|
|
|
+ for _, c := range citys {
|
|
|
+ tmpCity := c.Name
|
|
|
+ tmpPbrief := c.P.Brief
|
|
|
+ if !repeatPb[tmpPbrief] {
|
|
|
+ PCDScore(j, "province", tmpPbrief, score)
|
|
|
+ repeatPb[tmpPbrief] = true
|
|
|
+ }
|
|
|
+ if !repeatCb[tmpCity] {
|
|
|
+ PCDScore(j, "city", tmpCity, score)
|
|
|
+ repeatCb[tmpCity] = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func NewGetCity(area, city string, e *ExtractTask, finishC, tmpcity []string) (string, []string) {
|
|
|
+ for _, c := range finishC { //取最高分与province匹配的city
|
|
|
+ if cfMap := e.CityFullMap[c]; cfMap != nil {
|
|
|
+ if cfMap.P.Brief == area {
|
|
|
+ // city = c
|
|
|
+ // break
|
|
|
+ tmpcity = append(tmpcity, c)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if len(tmpcity) == 1 {
|
|
|
+ city = tmpcity[0]
|
|
|
+ }
|
|
|
+ return city, tmpcity
|
|
|
+}
|
|
|
+func NewGetDistrict(area, city, district string, e *ExtractTask, finishD, tmpcity []string) (string, string) {
|
|
|
+ for _, d := range finishD { //取最高分与province匹配的district
|
|
|
+ citys := e.NewDistrictCityMap[d]
|
|
|
+ for _, c := range citys {
|
|
|
+ if len(tmpcity) == 0 { //没有city
|
|
|
+ if c.P.Brief == area {
|
|
|
+ city = c.Name
|
|
|
+ district = d
|
|
|
+ return city, district
|
|
|
+ }
|
|
|
+ } else if len(tmpcity) == 1 { //一个city
|
|
|
+ if c.Name == city && c.P.Brief == area {
|
|
|
+ district = d
|
|
|
+ return city, district
|
|
|
+ }
|
|
|
+ } else { //多个city
|
|
|
+ for _, tc := range tmpcity {
|
|
|
+ if tc == c.Name {
|
|
|
+ city = c.Name
|
|
|
+ district = d
|
|
|
+ return city, district
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // if len(citys) == 1 { //区对应一个市
|
|
|
+ // if c.P.Brief == area {
|
|
|
+ // district = d
|
|
|
+ // city = c.Name
|
|
|
+ // return city, district
|
|
|
+ // }
|
|
|
+ // } else {
|
|
|
+ // if c.P.Brief == area && c.Name == city {
|
|
|
+ // district = d
|
|
|
+ // return city, district
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return city, district
|
|
|
+}
|
|
|
+
|
|
|
+//计算province,city,district区或县匹配的得分
|
|
|
+func PCDScoreByDistrictSim(stype, t string, score int, ps, cs, ds *map[string]int) {
|
|
|
+ defer qu.Catch()
|
|
|
+ if t != "" {
|
|
|
+ if stype == "d" {
|
|
|
+ tmpscore := (*ds)[t]
|
|
|
+ (*ds)[t] = tmpscore + score
|
|
|
+ } else if stype == "c" {
|
|
|
+ tmpscore := (*cs)[t]
|
|
|
+ (*cs)[t] = tmpscore + score
|
|
|
+ } else if stype == "p" {
|
|
|
+ tmpscore := (*ps)[t]
|
|
|
+ (*ps)[t] = tmpscore + score
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func MergeScores(j *ju.Job, pscore, cscore, dscore *map[string]int) {
|
|
|
+ if len(j.AreaScore) > 0 {
|
|
|
+ for pt, ps := range *pscore {
|
|
|
+ j.AreaScore[pt] = j.AreaScore[pt] + ps
|
|
|
+ }
|
|
|
+ for ct, cs := range *cscore {
|
|
|
+ j.CityScore[ct] = j.CityScore[ct] + cs
|
|
|
+ }
|
|
|
+ for dt, ds := range *dscore {
|
|
|
+ j.DistrictScore[dt] = j.DistrictScore[dt] + ds
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+//province,city,district干扰项减分
|
|
|
+//func PCDSubtractScore(e *ExtractTask, j *ju.Job, stype, text string, score int) {
|
|
|
+// defer qu.Catch()
|
|
|
+// if text != "" {
|
|
|
+// if stype == "city" {
|
|
|
+// for cn, cscore := range j.CityScore {
|
|
|
+// if cn != text {
|
|
|
+// j.CityScore[cn] = cscore + score
|
|
|
+// //错误的city减分后对应的province也减分
|
|
|
+// for pb, pscore := range j.AreaScore {
|
|
|
+// if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
|
|
|
+// j.AreaScore[pb] = pscore + score
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// } else if stype == "province" {
|
|
|
+// for pb, pscore := range j.AreaScore {
|
|
|
+// if pb != text {
|
|
|
+// j.AreaScore[pb] = pscore + score
|
|
|
+// //错误的province减分后对应的city也要减分
|
|
|
+// for cn, cscore := range j.CityScore {
|
|
|
+// if cfMap := e.CityFullMap[cn]; cfMap != nil && cfMap.P.Brief == pb {
|
|
|
+// j.CityScore[cn] = cscore + score
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// }
|
|
|
+// // for name, tmpscore := range *whichMap {
|
|
|
+// // if name != text {
|
|
|
+// // (*whichMap)[name] = tmpscore + score
|
|
|
+// // }
|
|
|
+// // }
|
|
|
+// }
|
|
|
+//}
|