package extract

import (
	"strings"

	. "jy/pretreated"
	ju "jy/util"
	qu "qfw/util"
)

// ExtractRegionClean runs the final standardization check over the region
// fields of tmp and stores the checked values back into tmp.
//
// Of the keys produced by GetCheckFinallyRegionInfo, "area", "city" and
// "district" are always copied back; "modifycheck" is copied only when its
// value is non-nil; every other key is ignored.
func (e *ExtractTask) ExtractRegionClean(tmp *map[string]interface{}) {
	update_check := make(map[string]interface{})
	e.GetCheckFinallyRegionInfo(*tmp, &update_check)
	for k, v := range update_check {
		switch k {
		case "area", "city", "district":
			(*tmp)[k] = v
		case "modifycheck":
			if v != nil {
				(*tmp)[k] = v
			}
		}
	}
}

// ExtractRegionInfo extracts the region (area / city / district) for one
// record and writes "area", "city", "district" and "regions_log" into tmp.
//
// The steps are tried in order and the function returns as soon as
// ConfirmUniqueRegionInfo reports a unique result:
//  1. the "area_city_district" value of j.Jsondata,
//  2. the "winner" field, when it matches OperatorReg,
//  3. each configured field group (ju.DefaultRegions, or
//     ju.AdjustmentRegions when IsConsecutionRegion accepts the site).
//
// If none of these is conclusive, the best candidate is supplemented from the
// buyer (Xinjiang corps, enterprise lookup), the project name, the detail
// text and finally the site table.
//
// When isLog is true each step appends a record to "regions_log". Panics are
// handed to qu.Catch via defer.
func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{}, isLog bool) {
	defer qu.Catch()

	// Log records collected along the way.
	logRecordInfo := []map[string]interface{}{}
	f_area, f_city, f_district := "", "", ""
	all_regions := map[string]map[string]map[string]string{}

	// finish cleans the current result and performs the final assignment.
	finish := func() {
		CompleteRegionInfo(&f_area, &f_city, &f_district)
		(*tmp)["area"] = f_area
		(*tmp)["city"] = f_city
		(*tmp)["district"] = f_district
		(*tmp)["regions_log"] = logRecordInfo
	}
	// logSupplement records the current result under key when logging is on.
	logSupplement := func(key string) {
		if isLog {
			LogProcessRecordingForSupplement(&logRecordInfo, map[string]interface{}{
				key: f_area + "~" + f_city + "~" + f_district,
			})
		}
	}

	// Precondition 1: jsondata.
	e.GetRegionByTentativeJsonData(j, &all_regions)
	if isLog && len(all_regions) > 0 {
		// all_regions is only filled here when j.Jsondata is non-nil, so the
		// dereference below is safe.
		valueArr := []string{qu.ObjToString((*j.Jsondata)["area_city_district"])}
		LogProcessRecordingForTentative("jsondata", valueArr, all_regions, &logRecordInfo)
	}
	if ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district) {
		finish()
		return
	}

	// Precondition 2: is the winner one of the telecom operators?
	e.GetRegionByTentativeOperator(qu.ObjToString((*tmp)["winner"]), &all_regions)
	if isLog && len(all_regions) > 0 {
		valueArr := []string{qu.ObjToString((*tmp)["winner"])}
		LogProcessRecordingForTentative("运营商", valueArr, all_regions, &logRecordInfo)
	}
	if ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district) {
		finish()
		return
	}

	// The field groups are configurable. The buyer is a special case: for some
	// site types the groups have to be recombined.
	RegionFieldsArr := ju.DefaultRegions
	if e.IsConsecutionRegion(qu.ObjToString((*tmp)["site"])) {
		RegionFieldsArr = ju.AdjustmentRegions
	}
	for _, v := range RegionFieldsArr {
		keyArr := strings.Split(v, ",")
		isExists, textValues, field_regions, old_regions, new_regions := e.GetRegionByGroupInfo(keyArr, *tmp)
		if !isExists {
			// No field of this group carried a value.
			continue
		}
		AnalysisIsUniqueInfo(new_regions, &all_regions)
		if isLog {
			LogProcessRecordingForGroupInfo(strings.Join(keyArr, "_"), textValues, field_regions, old_regions, all_regions, &logRecordInfo)
		}
		if ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district) {
			finish()
			return
		}
	}

	// No early exit: pick the final candidate and clean it up.
	ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
	CompleteRegionInfo(&f_area, &f_city, &f_district)

	// Fields used by the supplement steps.
	projectname := qu.ObjToString((*tmp)["projectname"])
	buyer := qu.ObjToString((*tmp)["buyer"])
	site := qu.ObjToString((*tmp)["site"])

	// Supplement from the Xinjiang Production and Construction Corps table.
	if XjbtReg.MatchString(buyer) && f_city == "" {
		if a, c, d, ok := e.NewVerifyXjCorpsInfo(buyer); ok {
			f_area, f_city, f_district = a, c, d
			logSupplement("sup_xjbt")
		}
	}

	// Special link rules (full step) on the project name.
	if f_city == "" {
		e.LinkSpecialRuleFullStep(projectname, &f_area, &f_city, &f_district)
	}

	// Enterprise-based city supplement, only when no concrete area is known.
	if buyer != "" && f_city == "" && (f_area == "全国" || f_area == "") {
		LinkSpecialQyxyStep(buyer, &f_area, &f_city, &f_district)
		if f_area != "" && f_area != "全国" {
			logSupplement("sup_qy_link")
		}
	}

	// Special link rules (brief step): project name first, then buyer.
	if f_city == "" {
		e.LinkSpecialRuleBriefStep(projectname, &f_area, &f_city, &f_district)
	}
	if f_city == "" {
		e.LinkSpecialRuleBriefStep(buyer, &f_area, &f_city, &f_district)
	}
	logSupplement("sup_link")

	// Supplement from the detail body text.
	if f_area == "全国" || f_area == "" || f_city == "" {
		if e.NewVerifySensitiveInfo(qu.ObjToString((*j.Data)["detail"]), &f_area, &f_city, &f_district) {
			logSupplement("sup_detail")
		}
	}

	// Last resort: take the area from the site table.
	if f_area == "全国" || f_area == "" {
		if sc := e.SiteCityMap[site]; sc != nil && sc.Q != "" {
			f_area = sc.Q
			logSupplement("sup_site")
		}
	}

	// Clean the data once more and perform the final assignment.
	finish()
}

// GetRegionByGroupInfo analyses one group of fields (keyArr) against tmp.
//
// For every key it picks the source text, and, when the text is non-empty,
// extracts candidate regions from it with the extractor matching the key.
//
// Returns, in order:
//   - whether at least one key of the group had a non-empty text,
//   - the text chosen for every key (empty strings included),
//   - the per-key extraction results,
//   - the raw regions accumulated over the whole group,
//   - the regions after ReasonableGroupRegionInfo validated the group.
func (e *ExtractTask) GetRegionByGroupInfo(keyArr []string, tmp map[string]interface{}) (bool, []string, map[string]interface{}, map[string]map[string]map[string]string, map[string]map[string]map[string]string) {
	old_regions := map[string]map[string]map[string]string{}
	isExists := false
	textArr := []string{}
	field_regions := map[string]interface{}{}

	for _, key := range keyArr {
		text := ""
		switch key {
		case "site_area", "site_city":
			text = qu.ObjToString(tmp["site"])
		case "buyer_filiale":
			text = GetFilialeByBuyerInfo(qu.ObjToString(tmp["buyer"]))
		case "projectname":
			text = CleanRegionProjectNameInfo(qu.ObjToString(tmp[key]), qu.ObjToString(tmp["buyer"]))
		default:
			text = qu.ObjToString(tmp[key])
		}
		textArr = append(textArr, text)
		if text == "" {
			// No value, nothing to extract.
			continue
		}
		isExists = true

		valuesArr := []map[string]interface{}{}
		switch key {
		case "buyerzipcode":
			valuesArr = e.GetRegionByPostCode(text, &old_regions)
		case "buyertel":
			valuesArr = e.GetRegionByTelNumber(text, &old_regions)
		case "site_area":
			valuesArr = e.GetRegionBySite(text, &old_regions, 1)
		case "site_city":
			valuesArr = e.GetRegionBySite(text, &old_regions, 2)
		case "buyer_filiale":
			valuesArr = e.GetRegionFromText(text, &old_regions, false, false, 2)
		default:
			// Address-like fields are flagged as such for GetRegionFromText.
			isAddress := key == "projectaddr" || key == "addressing" || key == "bidopenaddress" || key == "buyeraddr"
			valuesArr = e.GetRegionFromText(text, &old_regions, isAddress, false, 2)
		}
		field_regions[key] = valuesArr
	}

	// Validate the reasonableness of the current group.
	new_regions := ReasonableGroupRegionInfo(old_regions)
	return isExists, textArr, field_regions, old_regions, new_regions
}

// GetRegionByPostCode looks text up in e.PostCodeMap (postal code table).
//
// On a hit the region is recorded in regions through UpdateRegionsInfo and
// returned as a single-element slice. The district is only filled in when the
// postal code maps to exactly one district. On a miss an empty slice is
// returned and regions is left untouched.
func (e *ExtractTask) GetRegionByPostCode(text string, regions *map[string]map[string]map[string]string) []map[string]interface{} {
	regionsArr := []map[string]interface{}{}
	pc := e.PostCodeMap[text]
	if pc == nil {
		return regionsArr
	}
	if len(pc.D) == 1 {
		UpdateRegionsInfo(pc.P, pc.C, pc.D[0], regions)
		regionsArr = append(regionsArr, map[string]interface{}{"area": pc.P, "city": pc.C, "district": pc.D[0]})
	} else {
		UpdateRegionsInfo(pc.P, pc.C, "", regions)
		regionsArr = append(regionsArr, map[string]interface{}{"area": pc.P, "city": pc.C, "district": ""})
	}
	return regionsArr
}

// GetRegionByTelNumber derives a region from a landline number.
//
// Only texts of at least 11 bytes that start with "0" are considered. The
// 4-character prefix is looked up in e.AreaCodeMap first, then the
// 3-character prefix; the first hit wins. The city is only filled in when the
// area code maps to exactly one city. The hit is recorded in regions through
// UpdateRegionsInfo and returned as a single-element slice; otherwise the
// result is empty.
func (e *ExtractTask) GetRegionByTelNumber(text string, regions *map[string]map[string]map[string]string) []map[string]interface{} {
	regionsArr := []map[string]interface{}{}
	// All area codes start with 0 except Macau's 853.
	if len(text) < 11 || !strings.HasPrefix(text, "0") {
		return regionsArr
	}
	for n := 4; n >= 3; n-- {
		ac := e.AreaCodeMap[text[:n]]
		if ac == nil {
			continue
		}
		if len(ac.C) == 1 {
			UpdateRegionsInfo(ac.P, ac.C[0], "", regions)
			regionsArr = append(regionsArr, map[string]interface{}{"area": ac.P, "city": ac.C[0], "district": ""})
		} else {
			UpdateRegionsInfo(ac.P, "", "", regions)
			regionsArr = append(regionsArr, map[string]interface{}{"area": ac.P, "city": "", "district": ""})
		}
		break
	}
	return regionsArr
}

// GetRegionByTentativeJsonData makes a tentative region decision from the
// crawled jsondata.
//
// The string value of j.Jsondata["area_city_district"], if present and
// non-empty, is run through GetRegionFromText. When that yields exactly one
// area, (*all_regions)[area] is replaced by a fresh entry holding the city
// (only if the area has exactly one city) and the district (only if that city
// has exactly one district).
func (e *ExtractTask) GetRegionByTentativeJsonData(j *ju.Job, all_regions *map[string]map[string]map[string]string) {
	area, city, district := "", "", ""
	regions := map[string]map[string]map[string]string{}
	if j.Jsondata != nil {
		jsondata := *j.Jsondata
		if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
			e.GetRegionFromText(a_c_d, &regions, false, false, 1)
		}
	}

	// Each level is only accepted when it is unambiguous (exactly one entry).
	if len(regions) == 1 {
		for k, v := range regions {
			area = k
			if len(v) != 1 {
				break
			}
			for k1, v1 := range v {
				city = k1
				if len(v1) != 1 {
					break
				}
				for k2 := range v1 {
					district = k2
				}
			}
		}
	}
	if area == "" {
		return
	}

	// Assemble the nested area -> city -> district structure.
	city_info := map[string]map[string]string{}
	district_info := map[string]string{}
	if city != "" {
		if district != "" {
			district_info[district] = district
		}
		city_info[city] = district_info
	}
	(*all_regions)[area] = city_info
}

// StandardizedegionInfo normalises area, city and district in place, checking
// them against the lookup tables on e.
//
// Rules, in order:
//   - for the areas "北京", "天津", "上海" and "重庆" the city is forced to the
//     matching "…市" name;
//   - an empty area clears city and district;
//   - the area is replaced by e.ProvinceMap[area] when that entry is non-empty;
//   - an empty city clears the district;
//   - a city found in e.CityBriefMap is replaced by that entry's Name when its
//     P.Brief equals the area and Name is non-empty, otherwise city and
//     district are cleared; a city not in e.CityBriefMap must have a non-empty
//     e.CityMap entry or city and district are cleared;
//   - a district with exactly one e.DistrictSimAndAll entry is replaced by a
//     key of that entry, with several entries it is cleared, and with none it
//     is kept only if e.DistrictCityMap has an entry for it.
func (e *ExtractTask) StandardizedegionInfo(area *string, city *string, district *string) {
	// Municipalities: the city follows directly from the area.
	switch *area {
	case "北京":
		*city = "北京市"
	case "天津":
		*city = "天津市"
	case "上海":
		*city = "上海市"
	case "重庆":
		*city = "重庆市"
	}

	if *area == "" {
		*city = ""
		*district = ""
		return
	}
	if province := e.ProvinceMap[*area]; province != "" {
		*area = province
	}
	if *city == "" {
		*district = ""
		return
	}

	if csMap := e.CityBriefMap[*city]; csMap != nil {
		if csMap.P.Brief == *area && csMap.Name != "" {
			*city = csMap.Name
		} else {
			*city = ""
			*district = ""
		}
	} else if e.CityMap[*city] == "" {
		*city = ""
		*district = ""
	}
	if *district == "" {
		return
	}

	citysArr := e.DistrictSimAndAll[*district]
	switch {
	case len(citysArr) == 1:
		for d := range citysArr[0] {
			*district = d
		}
	case len(citysArr) > 1:
		// Ambiguous district name.
		*district = ""
	default:
		if len(e.DistrictCityMap[*district]) == 0 {
			*district = ""
		}
	}
}

// GetRegionBySite derives a region from the site name via e.SiteCityMap.
//
// from selects the granularity: 1 records the province only, 2 records
// province and city (and nothing when no city is known). Values of "" or
// "null", and the province "全国", are treated as missing. The values are
// normalised with StandardizedegionInfo before use. A hit is recorded in
// regions through UpdateRegionsInfo and returned as a single-element slice.
func (e *ExtractTask) GetRegionBySite(site string, regions *map[string]map[string]map[string]string, from int) []map[string]interface{} {
	regionArr := []map[string]interface{}{}
	area, city, district := "", "", ""
	if scMap := e.SiteCityMap[site]; scMap != nil {
		if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
			area = scMap.P
		}
		if scMap.C != "" && scMap.C != "null" && area != "" {
			city = scMap.C
		}
	}
	e.StandardizedegionInfo(&area, &city, &district)

	hasArea := area != "" && area != "全国"
	if from == 1 && hasArea {
		UpdateRegionsInfo(area, "", "", regions)
		regionArr = append(regionArr, map[string]interface{}{"area": area, "city": "", "district": ""})
	}
	if from == 2 && hasArea && city != "" {
		UpdateRegionsInfo(area, city, "", regions)
		regionArr = append(regionArr, map[string]interface{}{"area": area, "city": city, "district": ""})
	}
	return regionArr
}

// NewVerifyXjCorpsInfo resolves the region of a Xinjiang Production and
// Construction Corps buyer from e.XjbtCityArr.
//
// "新疆兵团" in buyer is first expanded to "新疆生产建设兵团". The first entry
// whose "name" or "alias" occurs in buyer supplies area/city/district; if one
// of that entry's "list" items matches as name+item or alias+item, the item's
// region is used instead. ok reports whether any entry matched.
//
// Empty names and aliases are skipped when selecting the entry: every string
// contains the empty string, so without the guard an entry lacking one of the
// two fields would match every buyer.
func (e *ExtractTask) NewVerifyXjCorpsInfo(buyer string) (new_a, new_c, new_d string, ok bool) {
	buyer = strings.ReplaceAll(buyer, "新疆兵团", "新疆生产建设兵团")
	mentions := func(s string) bool {
		return s != "" && strings.Contains(buyer, s)
	}
	for _, info := range e.XjbtCityArr {
		name := qu.ObjToString(info["name"])
		alias := qu.ObjToString(info["alias"])
		if !mentions(name) && !mentions(alias) {
			continue
		}
		new_a = qu.ObjToString(info["area"])
		new_c = qu.ObjToString(info["city"])
		new_d = qu.ObjToString(info["district"])
		ok = true
		// A more specific sub-unit overrides the entry's own region.
		for _, c := range ju.IsMarkInterfaceMap(info["list"]) {
			c_name := qu.ObjToString(c["name"])
			if strings.Contains(buyer, name+c_name) || strings.Contains(buyer, alias+c_name) {
				new_a = qu.ObjToString(c["area"])
				new_c = qu.ObjToString(c["city"])
				new_d = qu.ObjToString(c["district"])
				break
			}
		}
		break
	}
	return new_a, new_c, new_d, ok
}

// NewVerifySensitiveInfo tries to supplement area/city/district from the
// detail text and reports whether it changed anything.
//
// The text is first cleaned with SensitiveReg, TextAfterRemoveTable and
// CleanDetailReg1. Then, in order, the first rule that applies wins:
//  1. exactly one full city name found: accepted when the current area is
//     empty, "全国", or equal to the city's province;
//  2. exactly one full district name found that belongs to exactly one city:
//     accepted under the same area condition;
//  3. exactly one short city name found: with no concrete area the province
//     is taken (and the city too, unless the province name contains the short
//     name); with a concrete area the city is taken when the province matches
//     and the area does not contain the short name;
//  4. with no concrete area, exactly one distinct landline prefix found by
//     FixedTelReg that is present in e.AreaCodeMap: only the area is set.
func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *string, district *string) bool {
	detail = SensitiveReg.ReplaceAllString(detail, "")
	detail = TextAfterRemoveTable(detail)
	detail = CleanDetailReg1.ReplaceAllString(detail, "")

	// Full city names.
	fullCityArr := e.SensitiveFullCity.FindAll(detail)
	if len(fullCityArr) == 1 {
		for _, v := range fullCityArr {
			if cityMap := e.CityFullMap[v]; cityMap != nil {
				if *area == "" || *area == "全国" || cityMap.P.Brief == *area {
					*area = cityMap.P.Brief
					*city = cityMap.Name
					return true
				}
			}
		}
	}

	// Full district names.
	fullDistrictArr := e.SensitiveFullDistrict.FindAll(detail)
	if len(fullDistrictArr) == 1 {
		for _, v := range fullDistrictArr {
			if citys := e.DistrictCityMap[v]; len(citys) == 1 {
				if *area == "" || *area == "全国" || citys[0].P.Brief == *area {
					*area = citys[0].P.Brief
					*city = citys[0].Name
					*district = v
					return true
				}
			}
		}
	}

	// Short city names.
	simCityArr := e.SensitiveSimCity.FindAll(detail)
	if len(simCityArr) == 1 {
		for _, v := range simCityArr {
			cityMap := e.CityBriefMap[v]
			if cityMap == nil {
				continue
			}
			if *area == "" || *area == "全国" {
				*area = cityMap.P.Brief
				// Skip the city when the short name is part of the province name.
				if !strings.Contains(*area, v) {
					*city = cityMap.Name
				}
				return true
			}
			if cityMap.P.Brief == *area && !strings.Contains(*area, v) {
				*area = cityMap.P.Brief
				*city = cityMap.Name
				return true
			}
		}
	}

	// Suspected landline numbers.
	if *area == "" || *area == "全国" {
		fixedTelArr := FixedTelReg.FindAllString(detail, -1)
		if len(fixedTelArr) > 0 {
			codeArr := resetFixedTelInfo(fixedTelArr)
			if len(codeArr) == 1 {
				for _, v := range codeArr {
					if ac := e.AreaCodeMap[v]; ac != nil {
						*area = ac.P
						return true
					}
				}
			}
		}
	}
	return false
}

// resetFixedTelInfo returns the distinct area-code prefixes (the part before
// the first "-") of the given landline numbers, in first-seen order. Empty
// numbers are skipped.
func resetFixedTelInfo(telArr []string) []string {
	codeArr := []string{}
	telsMap := map[string]string{}
	for _, v := range telArr {
		if v == "" {
			continue
		}
		arr := strings.Split(v, "-")
		code := qu.ObjToString(arr[0])
		// NOTE(review): an empty code (number starting with "-") is never
		// remembered by this check and is appended on every occurrence.
		if telsMap[code] == "" {
			telsMap[code] = code
			codeArr = append(codeArr, code)
		}
	}
	return codeArr
}

// GetRegionByTentativeOperator makes a tentative region decision from the
// winner when it matches OperatorReg (telecom operators).
//
// The winner is run through GetRegionFromText; area, city and district are
// only taken from levels that are unambiguous (exactly one entry). The result
// is merged into all_regions, but never contradicts it: it is dropped when
// all_regions already holds other areas, or when the known area already holds
// other cities.
func (e *ExtractTask) GetRegionByTentativeOperator(winner string, all_regions *map[string]map[string]map[string]string) {
	area, city, district := "", "", ""
	regions := map[string]map[string]map[string]string{}
	if OperatorReg.MatchString(winner) {
		e.GetRegionFromText(winner, &regions, false, false, 2)
	}

	if len(regions) == 1 {
		for k, v := range regions {
			area = k
			if len(v) != 1 {
				break
			}
			for k1, v1 := range v {
				city = k1
				if len(v1) != 1 {
					break
				}
				for k2 := range v1 {
					district = k2
				}
			}
		}
	}
	if area == "" {
		return
	}

	// Discard the operator data when its area disagrees with the known ones.
	if (*all_regions)[area] == nil && len(*all_regions) > 0 {
		return
	}
	city_info := map[string]map[string]string{}
	if (*all_regions)[area] != nil {
		city_info = (*all_regions)[area]
	}
	district_info := map[string]string{}
	if city != "" {
		// Discard the operator data when its city disagrees with the known ones.
		if city_info[city] == nil && len(city_info) > 0 {
			return
		}
		if district != "" {
			district_info[district] = district
		}
		city_info[city] = district_info
	}
	(*all_regions)[area] = city_info
}

// Tentative confirmation from the site (disabled, kept for reference).
//func (e *ExtractTask) GetRegionByTentativeSite(j *ju.Job, all_regions *map[string]map[string]map[string]string) {
//	area, city, district := "", "", ""
//	site, _ := (*j.Data)["site"].(string)
//	if scMap := e.SiteCityMap[site]; scMap != nil {
//		if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
//			area = scMap.P
//		}
//		if scMap.C != "" && scMap.C != "null" && area != "" {
//			city = scMap.C
//		}
//		//if scMap.D != "" && scMap.D != "null" && city != "" {
//		//	district = scMap.D
//		//}
//	}
//
//	// Standardize province/city/district (short vs. full names).
//	e.StandardizedegionInfo(&area, &city, &district)
//
//	// Take out the unique existing data.
//	j_area, j_city, j_district := "", "", ""
//	is_adjust := false
//	if len(*all_regions) == 1 { // a value exists: only supplement it
//		for k, v := range *all_regions {
//			j_area = k
//			for k1, v1 := range v {
//				j_city = k1
//				for k2, _ := range v1 {
//					j_district = k2
//				}
//			}
//		}
//		if j_area == area && area != "" {
//			if city != "" {
//				if j_city == "" {
//					is_adjust = true
//				} else if j_city == city {
//					if district != "" && j_district == "" {
//						is_adjust = true
//					}
//				}
//			}
//		}
//	} else {
//		is_adjust = true
//	}
//	if is_adjust && area != "" { // apply the adjustment
//		city_info := map[string]map[string]string{}
//		district_info := map[string]string{}
//		if city != "" {
//			if district != "" {
//				district_info[district] = district
//			}
//			city_info[city] = district_info
//		}
//		(*all_regions)[area] = city_info
//	}
//}