|
@@ -1,13 +1,7 @@
|
|
package extract
|
|
package extract
|
|
|
|
|
|
-import (
|
|
|
|
- . "jy/pretreated"
|
|
|
|
- ju "jy/util"
|
|
|
|
- qu "qfw/util"
|
|
|
|
- "strings"
|
|
|
|
-)
|
|
|
|
|
|
+import "regexp"
|
|
|
|
|
|
-//省
|
|
|
|
type Province struct {
|
|
type Province struct {
|
|
Name string
|
|
Name string
|
|
Brief string
|
|
Brief string
|
|
@@ -62,596 +56,10 @@ type AreaCode struct {
|
|
C []string
|
|
C []string
|
|
}
|
|
}
|
|
|
|
|
|
-//抽取city
|
|
|
|
-func (e *ExtractTask) ExtractCity(j *ju.Job, resulttmp map[string]interface{}, id string) {
|
|
|
|
- /*
|
|
|
|
- 高准确率:
|
|
|
|
- 1.爬虫数据jsondata
|
|
|
|
- 2.采购单位库
|
|
|
|
- 3.邮编
|
|
|
|
- 4.固话
|
|
|
|
- 5.site(todo)
|
|
|
|
- 低准确率:(全称库匹配到不走简称库)
|
|
|
|
- 1.city全称库(buyeraddr;title,projectname)
|
|
|
|
- 2.city简称库(buyeraddr;title,projectname)
|
|
|
|
- */
|
|
|
|
- defer qu.Catch()
|
|
|
|
- //初始化
|
|
|
|
- if j.FullAreaScore == nil {
|
|
|
|
- j.FullAreaScore = make(map[string]float64)
|
|
|
|
- }
|
|
|
|
- if j.FullCityScore == nil {
|
|
|
|
- j.FullCityScore = make(map[string]float64)
|
|
|
|
- }
|
|
|
|
- if j.FullDistrictScore == nil {
|
|
|
|
- j.FullDistrictScore = make(map[string]float64)
|
|
|
|
- }
|
|
|
|
- sm := NewSortMap()
|
|
|
|
- //高精度抽取city
|
|
|
|
- //存储每个流程的抽取结果
|
|
|
|
- area1 := make([]map[string]string, 4)
|
|
|
|
- city1 := make([]map[string]string, 4)
|
|
|
|
- district1 := make([]map[string]string, 4)
|
|
|
|
-
|
|
|
|
- //jsondata
|
|
|
|
- p0, c0, d0, p, c, d := e.GetCityByJsonData(j)
|
|
|
|
- area1 = append(area1, map[string]string{"a_c_d": p})
|
|
|
|
- city1 = append(city1, map[string]string{"a_c_d": c})
|
|
|
|
- district1 = append(district1, map[string]string{"a_c_d": d})
|
|
|
|
- area1[0] = map[string]string{"jsondata": p0}
|
|
|
|
- city1[0] = map[string]string{"jsondata": c0}
|
|
|
|
- district1[0] = map[string]string{"jsondata": d0}
|
|
|
|
- //qu.Debug("=====jsondata打分---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
|
- //采购单位库
|
|
|
|
- buyer, _ := resulttmp["buyer"].(string)
|
|
|
|
- p1, c1, d1 := e.GetCityByBuyer(j, buyer)
|
|
|
|
- //qu.Debug("buyer p--", p1, "c--", c1, "d--", d1)
|
|
|
|
- area1[1] = map[string]string{"buyer": p1}
|
|
|
|
- city1[1] = map[string]string{"buyer": c1}
|
|
|
|
- district1[1] = map[string]string{"buyer": d1}
|
|
|
|
- //qu.Debug("=====采购单位库打分---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
|
- //postcode邮编
|
|
|
|
- buyerzipcode, _ := resulttmp["buyerzipcode"].(string)
|
|
|
|
- p2, c2, d2 := e.GetCityByPostCode(j, buyerzipcode)
|
|
|
|
- //qu.Debug("postcode p--", p2, "c--", c2, "d--", d2)
|
|
|
|
- area1[2] = map[string]string{"postcode": p2}
|
|
|
|
- city1[2] = map[string]string{"postcode": c2}
|
|
|
|
- district1[2] = map[string]string{"postcode": d2}
|
|
|
|
- //qu.Debug("=====postcode邮编打分---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
|
- //areacode固话区号
|
|
|
|
- buyertel, _ := resulttmp["buyertel"].(string)
|
|
|
|
- p3, c3, d3 := e.GetCityByAreaCode(j, buyertel)
|
|
|
|
- //qu.Debug("areacode p--", p3, "c--", c3, "d--", d3, buyertel)
|
|
|
|
- area1[3] = map[string]string{"areacode": p3}
|
|
|
|
- city1[3] = map[string]string{"areacode": c3}
|
|
|
|
- district1[3] = map[string]string{"areacode": d3}
|
|
|
|
- //qu.Debug("=====areacode固话区号打分---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
|
- HighPreCity := make(map[string]interface{})
|
|
|
|
- HighPreCity["area"] = area1
|
|
|
|
- HighPreCity["city"] = city1
|
|
|
|
- HighPreCity["district"] = district1
|
|
|
|
- //低精度抽取city
|
|
|
|
- //buyeraddr,title,projectname
|
|
|
|
- buyeraddr, _ := resulttmp["buyeraddr"].(string)
|
|
|
|
- title, _ := resulttmp["title"].(string)
|
|
|
|
- projectname, _ := resulttmp["projectname"].(string)
|
|
|
|
- //qu.Debug(buyeraddr, "--", buyer, "--", title, "--", projectname)
|
|
|
|
- sm.AddKey("buyeraddr", buyeraddr)
|
|
|
|
- sm.AddKey("buyer", buyer)
|
|
|
|
- sm.AddKey("title", title)
|
|
|
|
- sm.AddKey("projectname", projectname)
|
|
|
|
- area2, city2, district2 := e.GetCityByOthers(j, sm)
|
|
|
|
- LowPreCity := make(map[string]interface{})
|
|
|
|
- LowPreCity["area"] = area2
|
|
|
|
- LowPreCity["city"] = city2
|
|
|
|
- LowPreCity["district"] = district2
|
|
|
|
- // resulttmp["highprecity"] = HighPreCity
|
|
|
|
- // resulttmp["lowprecity"] = LowPreCity
|
|
|
|
- //qu.Debug("最终打分---", j.AreaScore, j.CityScore, j.DistrictScore)
|
|
|
|
- //最终抽取结果
|
|
|
|
- finishP := HighestScoreArr(j.FullAreaScore)
|
|
|
|
- finishC := HighestScoreArr(j.FullCityScore)
|
|
|
|
- finishD := HighestScoreArr(j.FullDistrictScore)
|
|
|
|
-
|
|
|
|
- // area, _ := resulttmp["area"].(string)
|
|
|
|
- // city, _ := resulttmp["city"].(string)
|
|
|
|
- // district, _ := resulttmp["district"].(string)
|
|
|
|
- // qu.Debug("之前结果结果===", area, city, district)
|
|
|
|
- arearesult := ""
|
|
|
|
- cityresult := ""
|
|
|
|
- districtresult := ""
|
|
|
|
-
|
|
|
|
- if len(finishP) == 1 { //最高分一个
|
|
|
|
- arearesult = finishP[0] //抽取结果直接赋值
|
|
|
|
- cityresult = GetCity(arearesult, cityresult, e, finishC)
|
|
|
|
- cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
|
|
|
|
- } else if len(finishP) > 1 { //province最高分多个
|
|
|
|
- if len(finishC) == 1 {
|
|
|
|
- cityresult = finishC[0]
|
|
|
|
- if cfMap := e.CityFullMap[cityresult]; cfMap != nil {
|
|
|
|
- arearesult = cfMap.P.Brief
|
|
|
|
- cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
|
|
|
|
- }
|
|
|
|
- } else { //对应的city有多个(多个province和city)
|
|
|
|
- arearesult = finishP[0] //抽取结果直接赋值
|
|
|
|
- cityresult = GetCity(arearesult, cityresult, e, finishC)
|
|
|
|
- cityresult, districtresult = GetDistrict(arearesult, cityresult, districtresult, e, finishD)
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- //qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
|
|
|
|
- if arearesult == "" {
|
|
|
|
- arearesult = "全国"
|
|
|
|
- } else if cityresult == "" {
|
|
|
|
- if pbMap := e.ProvinceBriefMap[arearesult]; pbMap != nil {
|
|
|
|
- cityresult = pbMap.Cap
|
|
|
|
- resulttmp["defaultpcap"] = true
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- //qu.Debug("结果===", arearesult, "--", cityresult, "--", districtresult)
|
|
|
|
- resulttmp["area"] = arearesult
|
|
|
|
- resulttmp["city"] = cityresult
|
|
|
|
- resulttmp["district"] = districtresult
|
|
|
|
-}
|
|
|
|
-func (e *ExtractTask) GetCityByJsonData(j *ju.Job) (province, city, district, p, c, d string) {
|
|
|
|
- defer qu.Catch()
|
|
|
|
- jsondata := *j.Jsondata
|
|
|
|
- if jsondata != nil { //jsondata中获取province和city
|
|
|
|
- if acd, ok := jsondata["area_city_district"].(string); ok && acd != "" {
|
|
|
|
- flag := false
|
|
|
|
- p, flag = GetPCDByAreaDFA(p, acd, e, j, flag)
|
|
|
|
- if !flag {
|
|
|
|
- p, c, flag = GetPCDByCityDFA(p, c, acd, e, j, flag)
|
|
|
|
- }
|
|
|
|
- if !flag {
|
|
|
|
- p, city, c = GetPCDByDistrictDFA(p, c, d, acd, e, j)
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- city, _ = jsondata["city"].(string) //city全称或者简称
|
|
|
|
- province, _ = jsondata["area"].(string) //province简称
|
|
|
|
- district, _ = jsondata["district"].(string) //district全称
|
|
|
|
- }
|
|
|
|
- PCDScore(j, "district", district, 5, true) //district打分
|
|
|
|
- bp := false
|
|
|
|
- if province != "" {
|
|
|
|
- if e.ProvinceBriefMap[province] != nil { //判断爬虫的省份是否正确 (全国)
|
|
|
|
- bp = true //省份正确
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- pbrief := ""
|
|
|
|
- if city != "" {
|
|
|
|
- cityfullmap := e.CityFullMap[city] //判断city全称是否正确
|
|
|
|
- if cityfullmap != nil {
|
|
|
|
- pbrief = cityfullmap.P.Brief //province简称
|
|
|
|
- } else {
|
|
|
|
- citybriefmap := e.CityBriefMap[city] //判断city简称是否正确
|
|
|
|
- if citybriefmap != nil {
|
|
|
|
- city = citybriefmap.Name //city简称替换为全称
|
|
|
|
- pbrief = citybriefmap.P.Brief
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- if bp {
|
|
|
|
- if pbrief == province { //爬虫的province和city匹配
|
|
|
|
- PCDScore(j, "city", city, 5, true)
|
|
|
|
- } else { //pbrief不匹配province(此时city为空或者错误)
|
|
|
|
- city = ""
|
|
|
|
- }
|
|
|
|
- PCDScore(j, "province", province, 5, true)
|
|
|
|
- } else { //省份错误或为空,取city的对应的pbrief为province
|
|
|
|
- if pbrief != "" {
|
|
|
|
- province = pbrief
|
|
|
|
- PCDScore(j, "province", province, 5, true)
|
|
|
|
- PCDScore(j, "city", city, 5, true)
|
|
|
|
- } else {
|
|
|
|
- province = ""
|
|
|
|
- city = ""
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- return
|
|
|
|
-
|
|
|
|
-}
|
|
|
|
-func (e *ExtractTask) GetCityByBuyer(j *ju.Job, buyer string) (province, city, district string) {
|
|
|
|
- defer qu.Catch()
|
|
|
|
- return
|
|
|
|
-}
|
|
|
|
-func (e *ExtractTask) GetCityByPostCode(j *ju.Job, postcode string) (province, city, district string) {
|
|
|
|
- defer qu.Catch()
|
|
|
|
- pc := e.PostCodeMap[postcode]
|
|
|
|
- if pc != nil {
|
|
|
|
- province = pc.P
|
|
|
|
- city = pc.C
|
|
|
|
- districtTmp := pc.D
|
|
|
|
- if len(districtTmp) == 1 { //对应多个district舍去
|
|
|
|
- district = districtTmp[0]
|
|
|
|
- PCDScore(j, "district", district, 5, true)
|
|
|
|
- }
|
|
|
|
- PCDScore(j, "province", province, 5, true)
|
|
|
|
- PCDScore(j, "city", city, 5, true)
|
|
|
|
- }
|
|
|
|
- return
|
|
|
|
-}
|
|
|
|
-func (e *ExtractTask) GetCityByAreaCode(j *ju.Job, buyertel string) (province, city, district string) {
|
|
|
|
- defer qu.Catch()
|
|
|
|
- if len(buyertel) >= 11 {
|
|
|
|
- if strings.HasPrefix(buyertel, "0") { //区号除了澳门853其他都是以0开头
|
|
|
|
- n := 4
|
|
|
|
- L:
|
|
|
|
- areacode := buyertel[:n]
|
|
|
|
- ac := e.AreaCodeMap[areacode]
|
|
|
|
- if ac != nil {
|
|
|
|
- province = ac.P
|
|
|
|
- citytmp := ac.C
|
|
|
|
- if len(citytmp) == 1 { //对应多个city舍去
|
|
|
|
- city = citytmp[0]
|
|
|
|
- PCDScore(j, "city", city, 5, true)
|
|
|
|
- }
|
|
|
|
- PCDScore(j, "province", province, 5, true)
|
|
|
|
- } else {
|
|
|
|
- n = n - 1
|
|
|
|
- if n >= 3 {
|
|
|
|
- goto L
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- } else if buyertel[:3] == "853" { //澳门
|
|
|
|
- province = "澳门"
|
|
|
|
- city = "澳门"
|
|
|
|
- PCDScore(j, "province", province, 5, true)
|
|
|
|
- PCDScore(j, "city", city, 5, true)
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- return
|
|
|
|
-}
|
|
|
|
-func (e *ExtractTask) GetCityByOthers(j *ju.Job, sm *SortMap) ([]map[string]string, []map[string]string, []map[string]string) {
|
|
|
|
- //存储每个流程的抽取结果
|
|
|
|
- area2 := []map[string]string{}
|
|
|
|
- city2 := []map[string]string{}
|
|
|
|
- district2 := []map[string]string{}
|
|
|
|
- isExtPC := false
|
|
|
|
- for _, from := range sm.Keys { //buyeraddr;title;projectname
|
|
|
|
- str, _ := sm.Map[from].(string)
|
|
|
|
- //分别记录buyeraddr;title;projectname全称匹配的打分情况
|
|
|
|
- pscore1 := make(map[string]int)
|
|
|
|
- cscore1 := make(map[string]int)
|
|
|
|
- dscore1 := make(map[string]int)
|
|
|
|
- //优先province,city,district,street全称匹配
|
|
|
|
- for pos, GET := range []*ju.DFA{e.ProvinceAllGet, e.CityAllGet, e.DistrictAllGet, e.StreetGet} {
|
|
|
|
- word := GET.CheckSensitiveWord(str)
|
|
|
|
- if word != "" {
|
|
|
|
- if pos == 0 { //province
|
|
|
|
- pbrief := e.ProvinceMap[word] //取province简称
|
|
|
|
- OtherScore("p", []string{pbrief}, &pscore1, &cscore1, &dscore1)
|
|
|
|
- } else if pos == 1 { //city
|
|
|
|
- p := ""
|
|
|
|
- cityfullmap := e.CityFullMap[word]
|
|
|
|
- if cityfullmap != nil {
|
|
|
|
- p = cityfullmap.P.Brief //取province简称
|
|
|
|
- }
|
|
|
|
- OtherScore("c", []string{p, word}, &pscore1, &cscore1, &dscore1)
|
|
|
|
- } else if pos == 2 { //district
|
|
|
|
- p, c := "", ""
|
|
|
|
- dcitymap := e.DistrictCityMap[word] //区对应的city
|
|
|
|
-
|
|
|
|
- if dcitymap != nil {
|
|
|
|
- c = dcitymap.Name //city全称
|
|
|
|
- p = dcitymap.P.Brief //province简称
|
|
|
|
- }
|
|
|
|
- tmpArr := []string{p, c, word}
|
|
|
|
- if word == c { //河南济源市
|
|
|
|
- tmpArr = []string{p, c}
|
|
|
|
- }
|
|
|
|
- OtherScore("d", tmpArr, &pscore1, &cscore1, &dscore1)
|
|
|
|
- } else if pos == 3 { //street
|
|
|
|
- p, c, d := "", "", ""
|
|
|
|
- sdmap := e.StreetDistrictMap[word] //对应的区
|
|
|
|
-
|
|
|
|
- if sdmap != nil {
|
|
|
|
- d = sdmap.Name
|
|
|
|
- c = sdmap.C.Name
|
|
|
|
- p = sdmap.C.P.Brief
|
|
|
|
- }
|
|
|
|
- tmpArr := []string{p, c, d}
|
|
|
|
- if c == d { //河南济源市
|
|
|
|
- tmpArr = []string{p, c}
|
|
|
|
- }
|
|
|
|
- OtherScore("d", tmpArr, &pscore1, &cscore1, &dscore1)
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- //取最高分的province,city,district
|
|
|
|
- ph1 := HighestScore(pscore1)
|
|
|
|
- ch1 := HighestScore(cscore1)
|
|
|
|
- dh1 := HighestScore(dscore1)
|
|
|
|
- isMatch := IsMatch(ph1, ch1, e) //最高分p和最高分c可能不对应
|
|
|
|
- if ch1 != "" && ph1 != "" && isMatch {
|
|
|
|
- isExtPC = true
|
|
|
|
- }
|
|
|
|
- //是否相互匹配
|
|
|
|
- area2 = append(area2, map[string]string{from + "_all": ph1})
|
|
|
|
- city2 = append(city2, map[string]string{from + "_all": ch1})
|
|
|
|
- district2 = append(district2, map[string]string{from + "_all": dh1})
|
|
|
|
- //buyeraddr,title,projectname匹配对应的结果加入最终得分
|
|
|
|
- if isMatch {
|
|
|
|
- if from == "buyeraddr" || from == "buyer" { //全称匹配,buyeraddr和buyer3分,title和projectname2分
|
|
|
|
- PCDScore(j, "province", ph1, 3, true)
|
|
|
|
- PCDScore(j, "city", ch1, 3, true)
|
|
|
|
- PCDScore(j, "district", dh1, 3, true)
|
|
|
|
- } else {
|
|
|
|
- PCDScore(j, "province", ph1, 2, true)
|
|
|
|
- PCDScore(j, "city", ch1, 2, true)
|
|
|
|
- PCDScore(j, "district", dh1, 2, true)
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- }
|
|
|
|
- //判断全称是否抽出了province和city,一个未抽出走简称抽取
|
|
|
|
- if !isExtPC {
|
|
|
|
- for _, from := range sm.Keys { //buyeraddr;title;projectname
|
|
|
|
- str, _ := sm.Map[from].(string)
|
|
|
|
- pscore2 := make(map[string]int)
|
|
|
|
- cscore2 := make(map[string]int)
|
|
|
|
- dscore2 := make(map[string]int)
|
|
|
|
- for pos, GET := range []*ju.DFA{e.ProvinceSimGet, e.CitySimGet, e.DistrictSimGet} {
|
|
|
|
- word := GET.CheckSensitiveWord(str)
|
|
|
|
- if word != "" {
|
|
|
|
- if pos == 0 { //province
|
|
|
|
- OtherScore("p", []string{word}, &pscore2, &cscore2, &dscore2)
|
|
|
|
- } else if pos == 1 { //city
|
|
|
|
- p, c := "", ""
|
|
|
|
- citybriefmap := e.CityBriefMap[word]
|
|
|
|
- if citybriefmap != nil {
|
|
|
|
- p = citybriefmap.P.Brief
|
|
|
|
- c = citybriefmap.Name
|
|
|
|
- }
|
|
|
|
- OtherScore("c", []string{p, c}, &pscore2, &cscore2, &dscore2)
|
|
|
|
- } else if pos == 2 { //district
|
|
|
|
- p, c := "", ""
|
|
|
|
- d := e.DistrictSimAndAll[word]
|
|
|
|
-
|
|
|
|
- dcitymap := e.DistrictCityMap[word]
|
|
|
|
- if dcitymap != nil {
|
|
|
|
- c = dcitymap.Name
|
|
|
|
- p = dcitymap.P.Brief
|
|
|
|
- }
|
|
|
|
- OtherScore("d", []string{p, c, d}, &pscore2, &cscore2, &dscore2)
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- //取最高分的province,city,district
|
|
|
|
- ph2 := HighestScore(pscore2)
|
|
|
|
- ch2 := HighestScore(cscore2)
|
|
|
|
- dh2 := HighestScore(dscore2)
|
|
|
|
- area2 = append(area2, map[string]string{from + "_sim": ph2})
|
|
|
|
- city2 = append(city2, map[string]string{from + "_sim": ch2})
|
|
|
|
- district2 = append(district2, map[string]string{from + "_sim": dh2})
|
|
|
|
- //buyeraddr,title,projectname匹配对应的结果加入最终得分
|
|
|
|
- if from == "buyeraddr" {
|
|
|
|
- PCDScore(j, "province", ph2, 2, true)
|
|
|
|
- PCDScore(j, "city", ch2, 2, true)
|
|
|
|
- PCDScore(j, "district", dh2, 2, true)
|
|
|
|
- } else {
|
|
|
|
- PCDScore(j, "province", ph2, 1, true)
|
|
|
|
- PCDScore(j, "city", ch2, 1, true)
|
|
|
|
- PCDScore(j, "district", dh2, 1, true)
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- return area2, city2, district2
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-func IsMatch(p, c string, e *ExtractTask) bool {
|
|
|
|
- ism := false
|
|
|
|
- if p != "" && c == "" {
|
|
|
|
- return true
|
|
|
|
- }
|
|
|
|
- if cfMap := e.CityFullMap[c]; cfMap != nil {
|
|
|
|
- if cfMap.P.Brief == p {
|
|
|
|
- ism = true
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- return ism
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-//计算province,city,district得分
|
|
|
|
-func PCDScore(j *ju.Job, stype, text string, score float64, isfull bool) {
|
|
|
|
- defer qu.Catch()
|
|
|
|
- if text != "" {
|
|
|
|
- if stype == "district" {
|
|
|
|
- tmpdistrict := make(map[string]float64)
|
|
|
|
- if isfull {
|
|
|
|
- tmpdistrict = j.FullDistrictScore
|
|
|
|
- } else {
|
|
|
|
- tmpdistrict = j.SimDistrictScore
|
|
|
|
- }
|
|
|
|
- scoretmp := tmpdistrict[text]
|
|
|
|
- tmpdistrict[text] = scoretmp + score
|
|
|
|
- } else if stype == "city" {
|
|
|
|
- tmpcity := make(map[string]float64)
|
|
|
|
- if isfull {
|
|
|
|
- tmpcity = j.FullCityScore
|
|
|
|
- } else {
|
|
|
|
- tmpcity = j.SimCityScore
|
|
|
|
- }
|
|
|
|
- scoretmp := tmpcity[text]
|
|
|
|
- tmpcity[text] = scoretmp + score
|
|
|
|
- } else if stype == "province" {
|
|
|
|
- tmpprovince := make(map[string]float64)
|
|
|
|
- if isfull {
|
|
|
|
- tmpprovince = j.FullAreaScore
|
|
|
|
- } else {
|
|
|
|
- tmpprovince = j.SimAreaScore
|
|
|
|
- }
|
|
|
|
- scoretmp := tmpprovince[text]
|
|
|
|
- tmpprovince[text] = scoretmp + score
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-func OtherScore(stype string, text []string, ps, cs, ds *map[string]int) {
|
|
|
|
- defer qu.Catch()
|
|
|
|
- for i, t := range text {
|
|
|
|
- if t != "" {
|
|
|
|
- if i == 0 { //p
|
|
|
|
- tmpscore := (*ps)[t]
|
|
|
|
- (*ps)[t] = tmpscore + 1
|
|
|
|
- } else if i == 1 { //c
|
|
|
|
- tmpscore := (*cs)[t]
|
|
|
|
- (*cs)[t] = tmpscore + 1
|
|
|
|
- } else if i == 2 { //d
|
|
|
|
- tmpscore := (*ds)[t]
|
|
|
|
- (*ds)[t] = tmpscore + 1
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- }
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-func HighestScore(m map[string]int) string {
|
|
|
|
- result := ""
|
|
|
|
- tmpscore := 0
|
|
|
|
- for str, score := range m {
|
|
|
|
- if str != "" && tmpscore < score {
|
|
|
|
- result = str
|
|
|
|
- tmpscore = score
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- return result
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-func HighestScoreArr(m map[string]float64) []string {
|
|
|
|
- result := make(map[float64][]string)
|
|
|
|
- tmpscore := 0.0
|
|
|
|
- for str, score := range m {
|
|
|
|
- if str != "" && tmpscore <= score {
|
|
|
|
- if result[tmpscore] != nil && tmpscore != score {
|
|
|
|
- delete(result, tmpscore)
|
|
|
|
- }
|
|
|
|
- if r := result[score]; r != nil {
|
|
|
|
- r = append(r, str)
|
|
|
|
- result[score] = r
|
|
|
|
- } else {
|
|
|
|
- result[score] = []string{str}
|
|
|
|
- }
|
|
|
|
- tmpscore = score
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- return result[tmpscore]
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-func GetCity(area, city string, e *ExtractTask, finishC []string) string {
|
|
|
|
- for _, c := range finishC { //取最高分与province匹配的city
|
|
|
|
- if cfMap := e.CityFullMap[c]; cfMap != nil {
|
|
|
|
- if cfMap.P.Brief == area {
|
|
|
|
- city = c
|
|
|
|
- break
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- return city
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-func GetDistrict(area, city, district string, e *ExtractTask, finishD []string) (string, string) {
|
|
|
|
- for _, d := range finishD { //取最高分与province匹配的district
|
|
|
|
- if dcMap := e.DistrictCityMap[d]; dcMap != nil {
|
|
|
|
- if dcMap.P.Brief == area {
|
|
|
|
- district = d
|
|
|
|
- tmpcity := dcMap.Name
|
|
|
|
- if city != tmpcity {
|
|
|
|
- if cfMap := e.CityFullMap[tmpcity]; cfMap != nil {
|
|
|
|
- if cfMap.P.Brief == area {
|
|
|
|
- city = tmpcity
|
|
|
|
- break
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- return city, district
|
|
|
|
|
|
+var AgencyReg = []*regexp.Regexp{
|
|
|
|
+ regexp.MustCompile("(?s)(代理(机构|人|单位|公司)|中标供应商).{0,30}"),
|
|
|
|
+ regexp.MustCompile(".{2,15}((招标)?代理|咨询|政府采购)"),
|
|
}
|
|
}
|
|
|
|
|
|
-func GetPCDByAreaDFA(province, acd string, e *ExtractTask, j *ju.Job, flag bool) (string, bool) {
|
|
|
|
- if word := e.ProvinceSimGet.CheckSensitiveWord(acd); word != "" { //取省
|
|
|
|
- if pbMap := e.ProvinceBriefMap[word]; pbMap != nil {
|
|
|
|
- province = pbMap.Brief
|
|
|
|
- if province == acd || pbMap.Name == acd { //用于判断area_city_district是否只有省份信息,flag为true就不在匹配area_city_district中的city和district
|
|
|
|
- flag = true
|
|
|
|
- }
|
|
|
|
- PCDScore(j, "province", province, 5, true)
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- return province, flag
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-func GetPCDByCityDFA(province, city, acd string, e *ExtractTask, j *ju.Job, flag bool) (string, string, bool) {
|
|
|
|
- for pos, GET := range []*ju.DFA{e.CityAllGet, e.CitySimGet} { //取市
|
|
|
|
- if word := GET.CheckSensitiveWord(acd); word != "" {
|
|
|
|
- if pos == 0 { //全称
|
|
|
|
- if cfMap := e.CityFullMap[word]; cfMap != nil {
|
|
|
|
- if province != "" && cfMap.P.Brief == province { //acd有province信息
|
|
|
|
- city = cfMap.Name
|
|
|
|
- if acd == province+city || acd == cfMap.P.Name+city {
|
|
|
|
- flag = true
|
|
|
|
- }
|
|
|
|
- } else if province == "" { //acd有city;city和district信息
|
|
|
|
- city = cfMap.Name
|
|
|
|
- province = cfMap.P.Brief
|
|
|
|
- PCDScore(j, "province", province, 5, true)
|
|
|
|
- if acd == city {
|
|
|
|
- flag = true
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- PCDScore(j, "city", city, 5, true)
|
|
|
|
- break
|
|
|
|
- }
|
|
|
|
- } else { //简称
|
|
|
|
- if cbMap := e.CityBriefMap[word]; cbMap != nil {
|
|
|
|
- if province != "" && cbMap.P.Brief == province {
|
|
|
|
- city = cbMap.Name
|
|
|
|
- if acd == province+city || acd == cbMap.P.Name+city {
|
|
|
|
- flag = true
|
|
|
|
- }
|
|
|
|
- } else if province == "" {
|
|
|
|
- city = cbMap.Name
|
|
|
|
- province = cbMap.P.Brief
|
|
|
|
- PCDScore(j, "province", province, 5, true)
|
|
|
|
- if acd == city {
|
|
|
|
- flag = true
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- PCDScore(j, "city", city, 5, true)
|
|
|
|
- break
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- return province, city, flag
|
|
|
|
-}
|
|
|
|
-func GetPCDByDistrictDFA(province, city, district, acd string, e *ExtractTask, j *ju.Job) (string, string, string) {
|
|
|
|
- //area_city_district字段不会单独存区信息(省市,省,市,省区,省市区)
|
|
|
|
- for pos, GET := range []*ju.DFA{e.DistrictAllGet, e.DistrictSimGet} { //取区
|
|
|
|
- if word := GET.CheckSensitiveWord(acd); word != "" {
|
|
|
|
- if dcMap := e.DistrictCityMap[word]; dcMap != nil {
|
|
|
|
- district = word
|
|
|
|
- if pos == 1 { //简称换为全称
|
|
|
|
- district = e.DistrictSimAndAll[district]
|
|
|
|
- }
|
|
|
|
- if city == "" && dcMap.P.Brief == province { //只有province和district(are_city_district:河南省二七区)
|
|
|
|
- city = dcMap.Name
|
|
|
|
- PCDScore(j, "city", city, 5, true)
|
|
|
|
- } else if province == "" { //province和city都没有(are_city_district:二七区)
|
|
|
|
- city = dcMap.Name
|
|
|
|
- province = dcMap.P.Brief
|
|
|
|
- PCDScore(j, "city", city, 5, true)
|
|
|
|
- PCDScore(j, "province", province, 5, true)
|
|
|
|
- }
|
|
|
|
- PCDScore(j, "district", district, 5, true)
|
|
|
|
- break
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- return province, city, district
|
|
|
|
-}
|
|
|
|
|
|
+var xjbtReg *regexp.Regexp = regexp.MustCompile("^(新疆生产建设兵团|新疆兵团)")
|
|
|
|
+var sensitiveReg = regexp.MustCompile("(上一[条篇]|下一[条篇])[::].*")
|