extractcity_new.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. package extract
  2. import (
  3. . "jy/pretreated"
  4. ju "jy/util"
  5. qu "qfw/util"
  6. "strings"
  7. )
  8. //抽取地域信息
  9. func (e *ExtractTask) ExtractRegionInfo(j *ju.Job, tmp *map[string]interface{}, isLog bool) {
  10. defer qu.Catch()
  11. //日志记录
  12. logRecordInfo := []map[string]interface{}{}
  13. f_area, f_city, f_district := "", "", ""
  14. all_regions := map[string]map[string]map[string]string{}
  15. //jsondata ~ 前置条件
  16. e.GetRegionByTentativeJsonData(j, &all_regions)
  17. if isLog && len(all_regions) > 0 {
  18. valueArr := []string{}
  19. valueArr = append(valueArr, qu.ObjToString((*j.Jsondata)["area_city_district"]))
  20. LogProcessRecordingForTentative("jsondata", valueArr, all_regions, &logRecordInfo)
  21. }
  22. b := ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
  23. if b {
  24. CompleteRegionInfo(&f_area, &f_city, &f_district)
  25. //最终赋值
  26. (*tmp)["area"] = f_area
  27. (*tmp)["city"] = f_city
  28. (*tmp)["district"] = f_district
  29. (*tmp)["regions_log"] = logRecordInfo
  30. return
  31. }
  32. //字段可控
  33. RegionFieldsArr := ju.DefaultRegions
  34. //采购单位比较特殊~需要根据站点类型进行重新组合
  35. if e.IsConsecutionRegion(qu.ObjToString((*tmp)["site"])) {
  36. RegionFieldsArr = ju.AdjustmentRegions
  37. }
  38. for _, v := range RegionFieldsArr {
  39. keyArr := strings.Split(v, ",")
  40. isExists, textValues, field_regions, old_regions, new_regions := e.GetRegionByGroupInfo(keyArr, *tmp)
  41. if isExists { //是否存在抽取有效值
  42. AnalysisIsUniqueInfo(new_regions, &all_regions)
  43. if isLog { //日志记录
  44. LogProcessRecordingForGroupInfo(strings.Join(keyArr, "_"), textValues, field_regions, old_regions, all_regions, &logRecordInfo)
  45. }
  46. b = ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
  47. if b {
  48. CompleteRegionInfo(&f_area, &f_city, &f_district)
  49. //最终赋值
  50. (*tmp)["area"] = f_area
  51. (*tmp)["city"] = f_city
  52. (*tmp)["district"] = f_district
  53. (*tmp)["regions_log"] = logRecordInfo
  54. return
  55. }
  56. }
  57. }
  58. //未提前结束~筛选出~最终的
  59. ConfirmUniqueRegionInfo(all_regions, &f_area, &f_city, &f_district)
  60. //給地域做建议的清洗完善
  61. CompleteRegionInfo(&f_area, &f_city, &f_district)
  62. //用到的字段
  63. projectname := qu.ObjToString((*tmp)["projectname"])
  64. buyer := qu.ObjToString((*tmp)["buyer"])
  65. site := qu.ObjToString((*tmp)["site"])
  66. //新疆兵团补充地域~
  67. if xjbtReg.MatchString(buyer) && f_city == "" {
  68. if a, c, d, ok := e.NewVerifyXjCorpsInfo(buyer); ok {
  69. f_area, f_city, f_district = a, c, d
  70. }
  71. }
  72. //此时进行特殊链路新增、补充原则
  73. if f_city == "" {
  74. e.LinkSpecialRuleFullStep(projectname, &f_area, &f_city, &f_district)
  75. }
  76. if f_city == "" {
  77. e.LinkSpecialRuleBriefStep(projectname, &f_area, &f_city, &f_district)
  78. }
  79. if f_city == "" {
  80. e.LinkSpecialRuleBriefStep(buyer, &f_area, &f_city, &f_district)
  81. }
  82. //正文补充地域~
  83. if f_area == "全国" || f_area == "" || f_city == "" {
  84. if b := e.NewVerifySensitiveInfo(qu.ObjToString((*j.Data)["detail"]), &f_area, &f_city, &f_district); b {
  85. (*tmp)["is_sensitive"] = 1
  86. }
  87. }
  88. //最终站点补充
  89. if f_area == "全国" || f_area == "" {
  90. if sc := e.SiteCityMap[site]; sc != nil && sc.Q != "" {
  91. f_area = sc.Q
  92. }
  93. }
  94. //最终在清洗一遍数据
  95. CompleteRegionInfo(&f_area, &f_city, &f_district)
  96. //最终赋值
  97. (*tmp)["area"] = f_area
  98. (*tmp)["city"] = f_city
  99. (*tmp)["district"] = f_district
  100. (*tmp)["regions_log"] = logRecordInfo
  101. }
  102. //对组进行分析处理
  103. func (e *ExtractTask) GetRegionByGroupInfo(keyArr []string, tmp map[string]interface{}) (bool, []string, map[string]interface{}, map[string]map[string]map[string]string, map[string]map[string]map[string]string) {
  104. old_regions := map[string]map[string]map[string]string{}
  105. isExists := false
  106. textArr := []string{}
  107. field_regions := map[string]interface{}{}
  108. for _, key := range keyArr {
  109. text := ""
  110. if key == "site_area" || key == "site_city" {
  111. text = qu.ObjToString(tmp["site"])
  112. } else if key == "buyer_filiale" {
  113. text = GetFilialeByBuyerInfo(qu.ObjToString(tmp["buyer"]))
  114. } else if key == "projectname" {
  115. text = CleanRegionProjectNameInfo(qu.ObjToString(tmp[key]), qu.ObjToString(tmp["buyer"]))
  116. } else {
  117. text = qu.ObjToString(tmp[key])
  118. }
  119. textArr = append(textArr, text)
  120. if text != "" {
  121. isExists = true
  122. } else {
  123. continue //无值不用提取
  124. }
  125. valuesArr := []map[string]interface{}{}
  126. if key == "buyerzipcode" {
  127. valuesArr = e.GetRegionByPostCode(text, &old_regions)
  128. } else if key == "buyertel" {
  129. valuesArr = e.GetRegionByTelNumber(text, &old_regions)
  130. } else if key == "site_area" {
  131. valuesArr = e.GetRegionBySite(text, &old_regions, 1)
  132. } else if key == "site_city" {
  133. valuesArr = e.GetRegionBySite(text, &old_regions, 2)
  134. } else if key == "buyer_filiale" {
  135. valuesArr = e.GetRegionFromText(text, &old_regions, false, false, 2)
  136. } else {
  137. isAddress, isBrief := false, false
  138. if key == "projectaddr" || key == "addressing" || key == "bidopenaddress" || key == "buyeraddr" {
  139. isAddress = true
  140. }
  141. valuesArr = e.GetRegionFromText(text, &old_regions, isAddress, isBrief, 2)
  142. }
  143. field_regions[key] = valuesArr
  144. }
  145. //校验当前组的合理性
  146. new_regions := ReasonableGroupRegionInfo(old_regions)
  147. return isExists, textArr, field_regions, old_regions, new_regions
  148. }
  149. //邮政编号
  150. func (e *ExtractTask) GetRegionByPostCode(text string, regions *map[string]map[string]map[string]string) []map[string]interface{} {
  151. regionsArr := []map[string]interface{}{}
  152. pc := e.PostCodeMap[text]
  153. if pc != nil {
  154. if len(pc.D) == 1 {
  155. UpdateRegionsInfo(pc.P, pc.C, pc.D[0], regions)
  156. regionsArr = append(regionsArr, map[string]interface{}{"area": pc.P, "city": pc.C, "district": pc.D[0]})
  157. } else {
  158. UpdateRegionsInfo(pc.P, pc.C, "", regions)
  159. regionsArr = append(regionsArr, map[string]interface{}{"area": pc.P, "city": pc.C, "district": ""})
  160. }
  161. }
  162. return regionsArr
  163. }
  164. //固话号码
  165. func (e *ExtractTask) GetRegionByTelNumber(text string, regions *map[string]map[string]map[string]string) []map[string]interface{} {
  166. regionsArr := []map[string]interface{}{}
  167. if len(text) >= 11 {
  168. if strings.HasPrefix(text, "0") { //区号除了澳门853其他都是以0开头
  169. n := 4
  170. L:
  171. areacode := text[:n]
  172. ac := e.AreaCodeMap[areacode]
  173. if ac != nil {
  174. if len(ac.C) == 1 {
  175. UpdateRegionsInfo(ac.P, ac.C[0], "", regions)
  176. regionsArr = append(regionsArr, map[string]interface{}{"area": ac.P, "city": ac.C[0], "district": ""})
  177. } else {
  178. UpdateRegionsInfo(ac.P, "", "", regions)
  179. regionsArr = append(regionsArr, map[string]interface{}{"area": ac.P, "city": "", "district": ""})
  180. }
  181. } else {
  182. n = n - 1
  183. if n >= 3 {
  184. goto L
  185. }
  186. }
  187. }
  188. }
  189. return regionsArr
  190. }
  191. //初步确认~采集
  192. func (e *ExtractTask) GetRegionByTentativeJsonData(j *ju.Job, all_regions *map[string]map[string]map[string]string) {
  193. area, city, district := "", "", ""
  194. regions := map[string]map[string]map[string]string{}
  195. if j.Jsondata != nil {
  196. jsondata := *j.Jsondata
  197. if a_c_d, ok := jsondata["area_city_district"].(string); ok && a_c_d != "" {
  198. e.GetRegionFromText(a_c_d, &regions, false, false, 1)
  199. }
  200. }
  201. if len(regions) == 1 {
  202. for k, v := range regions {
  203. area = k
  204. if len(v) == 1 {
  205. for k1, v1 := range v {
  206. city = k1
  207. if len(v1) == 1 {
  208. for k2, _ := range v1 {
  209. district = k2
  210. }
  211. } else {
  212. break
  213. }
  214. }
  215. } else {
  216. break
  217. }
  218. }
  219. }
  220. if area != "" { //组装结构
  221. city_info := map[string]map[string]string{}
  222. district_info := map[string]string{}
  223. if city != "" {
  224. if district != "" {
  225. district_info[district] = district
  226. }
  227. city_info[city] = district_info
  228. }
  229. (*all_regions)[area] = city_info
  230. }
  231. }
  232. //简称全程标准化的校验~
  233. func (e *ExtractTask) StandardizedegionInfo(area *string, city *string, district *string) {
  234. //特殊市补充
  235. if *area == "北京" {
  236. *city = "北京市"
  237. } else if *area == "天津" {
  238. *city = "天津市"
  239. } else if *area == "上海" {
  240. *city = "上海市"
  241. } else if *area == "重庆" {
  242. *city = "重庆市"
  243. }
  244. //非空与空~是否标准校验
  245. if *area == "" {
  246. *city = ""
  247. *district = ""
  248. } else {
  249. if province := e.ProvinceMap[*area]; province != "" {
  250. *area = province
  251. }
  252. if *city == "" {
  253. *district = ""
  254. } else {
  255. if csMap := e.CityBriefMap[*city]; csMap != nil {
  256. if csMap.P.Brief == *area && csMap.Name != "" {
  257. *city = csMap.Name
  258. } else {
  259. *city = ""
  260. *district = ""
  261. }
  262. } else {
  263. if e.CityMap[*city] == "" {
  264. *city = ""
  265. *district = ""
  266. }
  267. }
  268. if *district != "" {
  269. citysArr := e.DistrictSimAndAll[*district]
  270. if len(citysArr) == 1 {
  271. full_city := citysArr[0]
  272. for d, _ := range full_city {
  273. *district = d
  274. }
  275. } else if len(citysArr) > 1 {
  276. *district = ""
  277. } else if len(citysArr) == 0 {
  278. fullArr := e.DistrictCityMap[*district]
  279. if len(fullArr) == 0 {
  280. *district = ""
  281. }
  282. } else {
  283. }
  284. }
  285. }
  286. }
  287. }
  288. //站点取值 from 1-省 2-省市
  289. func (e *ExtractTask) GetRegionBySite(site string, regions *map[string]map[string]map[string]string, from int) []map[string]interface{} {
  290. regionArr := []map[string]interface{}{}
  291. area, city, district := "", "", ""
  292. if scMap := e.SiteCityMap[site]; scMap != nil {
  293. if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
  294. area = scMap.P
  295. }
  296. if scMap.C != "" && scMap.C != "null" && area != "" {
  297. city = scMap.C
  298. }
  299. }
  300. e.StandardizedegionInfo(&area, &city, &district)
  301. if from == 1 && area != "" && area != "全国" {
  302. UpdateRegionsInfo(area, "", "", regions)
  303. regionArr = append(regionArr, map[string]interface{}{"area": area, "city": "", "district": ""})
  304. }
  305. if from == 2 && area != "" && area != "全国" && city != "" {
  306. UpdateRegionsInfo(area, city, "", regions)
  307. regionArr = append(regionArr, map[string]interface{}{"area": area, "city": city, "district": ""})
  308. }
  309. return regionArr
  310. }
  311. //新疆兵团
  312. func (e *ExtractTask) NewVerifyXjCorpsInfo(buyer string) (new_a, new_c, new_d string, ok bool) {
  313. buyer = strings.ReplaceAll(buyer, "新疆兵团", "新疆生产建设兵团")
  314. ok = false
  315. for _, info := range e.XjbtCityArr {
  316. name := qu.ObjToString(info["name"])
  317. alias := qu.ObjToString(info["alias"])
  318. if strings.Contains(buyer, name) || strings.Contains(buyer, alias) {
  319. new_a = qu.ObjToString(info["area"])
  320. new_c = qu.ObjToString(info["city"])
  321. new_d = qu.ObjToString(info["district"])
  322. ok = true
  323. if res, ok := info["list"].([]interface{}); ok {
  324. list := qu.ObjArrToMapArr(res)
  325. for _, c := range list {
  326. c_name := qu.ObjToString(c["name"])
  327. if strings.Contains(buyer, name+c_name) || strings.Contains(buyer, alias+c_name) {
  328. new_a = qu.ObjToString(c["area"])
  329. new_c = qu.ObjToString(c["city"])
  330. new_d = qu.ObjToString(c["district"])
  331. break
  332. }
  333. }
  334. }
  335. break
  336. }
  337. }
  338. return new_a, new_c, new_d, ok
  339. }
  340. //敏感词识别
  341. func (e *ExtractTask) NewVerifySensitiveInfo(detail string, area *string, city *string, district *string) bool {
  342. detail = sensitiveReg.ReplaceAllString(detail, "")
  343. detail = TextAfterRemoveTable(detail)
  344. //全称城市
  345. fullCityArr := e.SensitiveFullCity.FindAll(detail)
  346. if len(fullCityArr) == 1 {
  347. for _, v := range fullCityArr {
  348. if cityMap := e.CityFullMap[v]; cityMap != nil {
  349. if *area == "" || *area == "全国" || cityMap.P.Brief == *area {
  350. *area = cityMap.P.Brief
  351. *city = cityMap.Name
  352. return true
  353. }
  354. }
  355. }
  356. }
  357. //全称区县
  358. fullDistrictArr := e.SensitiveFullDistrict.FindAll(detail)
  359. if len(fullDistrictArr) == 1 {
  360. for _, v := range fullDistrictArr {
  361. if citys := e.DistrictCityMap[v]; len(citys) == 1 {
  362. if *area == "" || *area == "全国" || citys[0].P.Brief == *area {
  363. *area = citys[0].P.Brief
  364. *city = citys[0].Name
  365. *district = v
  366. return true
  367. }
  368. }
  369. }
  370. }
  371. //简称城市
  372. simCityArr := e.SensitiveSimCity.FindAll(detail)
  373. if len(simCityArr) == 1 {
  374. for _, v := range simCityArr {
  375. if cityMap := e.CityBriefMap[v]; cityMap != nil {
  376. if *area == "" || *area == "全国" {
  377. *area = cityMap.P.Brief
  378. if !strings.Contains(*area, v) {
  379. *city = cityMap.Name
  380. }
  381. return true
  382. }
  383. if cityMap.P.Brief == *area && !strings.Contains(*area, v) {
  384. *area = cityMap.P.Brief
  385. *city = cityMap.Name
  386. return true
  387. }
  388. }
  389. }
  390. }
  391. //疑似固话提取~
  392. fixedTelArr := FixedTelReg.FindAllString(detail, -1)
  393. if len(fixedTelArr) > 0 {
  394. codeArr := resetFixedTelInfo(fixedTelArr)
  395. if len(codeArr) == 1 {
  396. for _, v := range codeArr {
  397. if ac := e.AreaCodeMap[v]; ac != nil {
  398. *area = ac.P
  399. return true
  400. }
  401. }
  402. }
  403. }
  404. return false
  405. }
  406. func resetFixedTelInfo(telArr []string) []string {
  407. codeArr := []string{}
  408. telsMap := map[string]string{}
  409. for _, v := range telArr {
  410. if v != "" {
  411. arr := strings.Split(v, "-")
  412. code := qu.ObjToString(arr[0])
  413. if telsMap[code] == "" {
  414. telsMap[code] = code
  415. codeArr = append(codeArr, code)
  416. }
  417. }
  418. }
  419. return codeArr
  420. }
  421. //初步确认~站点
  422. //func (e *ExtractTask) GetRegionByTentativeSite(j *ju.Job, all_regions *map[string]map[string]map[string]string) {
  423. // area, city, district := "", "", ""
  424. // site, _ := (*j.Data)["site"].(string)
  425. // if scMap := e.SiteCityMap[site]; scMap != nil {
  426. // if scMap.P != "" && scMap.P != "全国" && scMap.P != "null" {
  427. // area = scMap.P
  428. // }
  429. // if scMap.C != "" && scMap.C != "null" && area != "" {
  430. // city = scMap.C
  431. // }
  432. // //if scMap.D != "" && scMap.D != "null" && city != "" {
  433. // // district = scMap.D
  434. // //}
  435. // }
  436. //
  437. // //对省市区进行标准化校验~简称全程的问题
  438. // e.StandardizedegionInfo(&area, &city, &district)
  439. //
  440. // //取出唯一数据
  441. // j_area, j_city, j_district := "", "", ""
  442. // is_adjust := false
  443. // if len(*all_regions) == 1 { //有值~只进行补充操作
  444. // for k, v := range *all_regions {
  445. // j_area = k
  446. // for k1, v1 := range v {
  447. // j_city = k1
  448. // for k2, _ := range v1 {
  449. // j_district = k2
  450. // }
  451. // }
  452. // }
  453. // if j_area == area && area != "" {
  454. // if city != "" {
  455. // if j_city == "" {
  456. // is_adjust = true
  457. // } else if j_city == city {
  458. // if district != "" && j_district == "" {
  459. // is_adjust = true
  460. // }
  461. // }
  462. // }
  463. // }
  464. // } else {
  465. // is_adjust = true
  466. // }
  467. // if is_adjust && area != "" { //进行调整
  468. // city_info := map[string]map[string]string{}
  469. // district_info := map[string]string{}
  470. // if city != "" {
  471. // if district != "" {
  472. // district_info[district] = district
  473. // }
  474. // city_info[city] = district_info
  475. // }
  476. // (*all_regions)[area] = city_info
  477. // }
  478. //}