colonkv.go 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785
  1. //识别冒号kv
  2. package pretreated
  3. import (
  4. "jy/clear"
  5. . "jy/util"
  6. qutil "qfw/util"
  7. "regexp"
  8. "sort"
  9. "strings"
  10. )
  11. type ColonkvEntity struct{}
  12. var (
  13. colonkvEntity = &ColonkvEntity{}
  14. regReplKV = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0]+?[))]?)[\\s\u3000\u2003\u00a0,。;;][((]?(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].+)[))]?")
  15. regReplKV2 = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)")
  16. regKV = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)")
  17. filterK = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
  18. filterValue = regexp.MustCompile("^(无)$")
  19. regReplKey = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全名]称|姓名)$")
  20. BlockTagMap = map[string]bool{
  21. "招标范围": true,
  22. "资格要求": true,
  23. }
  24. brackets = map[string]string{
  25. "(": ")",
  26. "(": ")",
  27. "": "",
  28. "[": "]",
  29. "【": "】",
  30. "{": "}",
  31. "{": "}",
  32. "《": "》",
  33. "<": ">",
  34. }
  35. //
  36. PersonReg = regexp.MustCompile("[\u4e00-\u9fa5]{2,5}")
  37. //
  38. TelMustReg = regexp.MustCompile("^" + PhoneReg.String() + "$")
  39. PersonMustReg = regexp.MustCompile("^" + PersonReg.String() + "$")
  40. AddressReg = regexp.MustCompile("[省市县区路号楼]")
  41. BracketsTextReg = regexp.MustCompile("[((]([^((]+)[))]")
  42. ContactBuyerTitleReg = regexp.MustCompile("采购联系事项")
  43. ContactAgencyTitleReg = regexp.MustCompile("招标联系事项")
  44. )
  45. //一行多个冒号kv处理
  46. func (ce *ColonkvEntity) divisionMoreKV(con string) string {
  47. con = regReplKV.ReplaceAllStringFunc(con, func(temp string) string {
  48. //分kv的时候出现括号不成对出现的情况,分错了跳过
  49. matchText := regReplKV.FindStringSubmatch(con)[1]
  50. for k, v := range brackets {
  51. if strings.Count(matchText, k) != strings.Count(matchText, v) {
  52. return temp
  53. }
  54. }
  55. return regReplKV.ReplaceAllString(temp, "$1\n\n$2")
  56. })
  57. con = regReplKV2.ReplaceAllString(con, "$1\n\n$2")
  58. return con
  59. }
  60. //获取冒号kv入口
  61. func (ce *ColonkvEntity) entrance(con, title string, from int) ([]*Kv, map[string]string) {
  62. kvs := ce.GetKvs(con, title, from)
  63. kv := map[string]string{}
  64. for _, v := range kvs {
  65. if strings.TrimSpace(v.Value) == "" {
  66. continue
  67. }
  68. kv[v.Key] = v.Value
  69. }
  70. return kvs, kv
  71. }
  72. //获取有序的kv
  73. func (ce *ColonkvEntity) GetKvs(con, title string, from int) []*Kv {
  74. con = ce.processText(con)
  75. kvs := ce.getColonKv(con, title, from)
  76. return kvs
  77. }
  78. //处理正文
  79. func (ce *ColonkvEntity) processText(con string) string {
  80. con = ce.divisionMoreKV(con)//一行多个冒号kv处理
  81. for {
  82. tmp := con
  83. con = ce.divisionMoreKV(con)
  84. if tmp == con {
  85. break
  86. }
  87. }
  88. return con
  89. }
  90. //分冒号kv
  91. //from 1--全文 2,3--table td
  92. func (ce *ColonkvEntity) getColonKv(con, title string, from int) []*Kv {
  93. if from == 2 || from == 3 {
  94. con = RemoveWarpOfTdVal(con)
  95. }
  96. findkvs := []*Kv{}
  97. lines := SspacekvEntity.getLines(con)
  98. for index, line := range lines {
  99. res := regKV.FindAllStringSubmatch(line, -1)
  100. if len(res) > 0 {
  101. for _, v := range res {
  102. key, val := "", ""
  103. if len(v) == 3 {
  104. key = v[1]
  105. val = v[2]
  106. } else if len(v) == 4 {
  107. key = v[2]
  108. val = v[3]
  109. }
  110. //Debug("KV-key", key, val)
  111. //Debug("KV-key", key, val)
  112. //地址、联系人可能会重复 单位、代理机构的\时间、地点
  113. if strings.TrimSpace(key) != "" {
  114. prevLine, nextLine := "", ""
  115. if index > 0 {
  116. prevLine = lines[index-1]
  117. }
  118. if index < len(lines)-1 {
  119. nextLine = lines[index+1]
  120. }
  121. findkvs = append(findkvs, &Kv{
  122. Key: key,
  123. Value: val,
  124. Line: line,
  125. PrevLine: prevLine,
  126. NextLine: nextLine,
  127. Title: title,
  128. })
  129. splitkeys := strings.Split(key, "/")
  130. splitvalues := strings.Split(val, "/")
  131. if len(splitkeys) > 1 && len(splitkeys) == len(splitvalues) {
  132. for splitindex, splitkey := range splitkeys {
  133. findkvs = append(findkvs, &Kv{
  134. Key: splitkey,
  135. Value: splitvalues[splitindex],
  136. Line: line,
  137. PrevLine: prevLine,
  138. NextLine: nextLine,
  139. Title: title,
  140. })
  141. }
  142. }
  143. }
  144. }
  145. }
  146. }
  147. return findkvs
  148. }
  149. //冒号kv和空格kv结合
  150. func (ce *ColonkvEntity) getColonSpaceKV(con string) []*Kv {
  151. con = colonkvEntity.processText(con)
  152. lines := SspacekvEntity.getLines(con)
  153. kvMaps := []*Kv{}
  154. for _, line := range lines {
  155. kvs := colonkvEntity.getColonKv(line, "", 1)
  156. if len(kvs) == 0 {
  157. kv := SspacekvEntity.divideKV(line)
  158. if kv != nil {
  159. kvMaps = append(kvMaps, kv...)
  160. }
  161. } else {
  162. kvMaps = append(kvMaps, kvs...)
  163. }
  164. }
  165. return kvMaps
  166. }
  167. /*
  168. 五、递交响应文件时间及地点
  169. 1、时间:2016年5月20日14时00分至2016年5月20日14时30分(北京时间)
  170. 2、地点:烟台开发区公共资源交易中心A座5楼会议室(金沙江路83号)
  171. key 时间 处理成 递交响应文件时间
  172. */
  173. func (ce *ColonkvEntity) blockTitleKV(title, key string) string {
  174. needKey := "时间"
  175. if key != needKey {
  176. return key
  177. }
  178. titles := regSplit.Split(title, -1)
  179. for _, v := range titles {
  180. if strings.HasSuffix(v, needKey) {
  181. return v
  182. }
  183. }
  184. return key
  185. }
  186. //根据配置文件中的规则,格式化正文
  187. func formatText(content, key string) string {
  188. for _, v := range FormatTextMap[key] {
  189. reg, _ := v["reg"].(*regexp.Regexp)
  190. separator, isString := v["separator"].(string)
  191. separators, isArray := v["separator"].([]interface{})
  192. if isArray {
  193. content = reg.ReplaceAllStringFunc(content, func(temp string) string {
  194. for _, sv := range separators {
  195. separator, _ := sv.(string)
  196. if array := strings.Split(separator, "__"); separator != "__" && len(array) == 2 {
  197. temp = regexp.MustCompile(array[0]).ReplaceAllString(temp, array[1])
  198. }
  199. }
  200. return temp
  201. })
  202. } else if isString {
  203. if array := strings.Split(separator, "__"); separator != "__" && len(array) == 2 {
  204. content = reg.ReplaceAllStringFunc(content, func(temp string) string {
  205. temp = regexp.MustCompile(array[0]).ReplaceAllString(temp, array[1])
  206. return temp
  207. })
  208. } else {
  209. content = reg.ReplaceAllString(content, separator)
  210. }
  211. }
  212. //Debug(v["reg"], content)
  213. }
  214. return content
  215. }
  216. func IsContactKvHandle(value string, m map[string]bool) bool {
  217. for k, _ := range m {
  218. if k != value && (strings.HasPrefix(k, value) || strings.HasPrefix(value, k)) {
  219. continue
  220. }
  221. if strings.Contains(value, k) || strings.Contains(k, value) {
  222. return true
  223. }
  224. }
  225. return false
  226. }
  227. //kv关于联系人信息的处理
  228. //采购人>集中采购机构
  229. /*
  230. func FormatContactKv(kvs *[]*Kv, title string, buyers []string) {
  231. ////////////////////////////
  232. //处理联系人信息
  233. var indexMap map[int]string
  234. var matchMap map[string]map[string]bool
  235. if contactFormat == nil || title != "" {
  236. indexMap = map[int]string{}
  237. matchMap = map[string]map[string]bool{}
  238. } else {
  239. indexMap = contactFormat.IndexMap
  240. matchMap = contactFormat.MatchMap
  241. }
  242. ////////////////////////////
  243. totalIndexMap := map[string]bool{}
  244. ascFind := true
  245. ascFindFlag := len(indexMap) == 0 && buyers == nil
  246. //采购人在联系人、电话后面的处理
  247. isCanAddToIndexMap := false
  248. for _, kv := range *kvs {
  249. k := FilterContactKey(kv.Key)
  250. k_length := len([]rune(k))
  251. if k_length < 2 || k_length > 15 {
  252. continue
  253. }
  254. isContinue := ContactInfoMustReg.MatchString(k)
  255. if (isContinue || (ContactInfoVagueReg.MatchString(k) && IsMapHasValue(k, ContactType))) && ascFindFlag {
  256. if len(indexMap) > 0 {
  257. ascFind = true
  258. ascFindFlag = false
  259. indexMap = map[int]string{}
  260. }
  261. isCanAddToIndexMap = true
  262. }
  263. for _, ct_k := range HasOrderContactType(k) {
  264. if !ContactType[ct_k].MatchString(k) {
  265. continue
  266. }
  267. totalIndexMap[ct_k] = true
  268. /////////////////////////////
  269. if isContinue || !ascFindFlag {
  270. continue
  271. }
  272. if isCanAddToIndexMap && len(indexMap) == 0 {
  273. indexMap[1] = ct_k
  274. ascFind = false
  275. }
  276. }
  277. }
  278. mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上
  279. titleMatch := false
  280. if titleMatchType := ContactTypeTitleMatch(title); titleMatchType != "" {
  281. titleMatch = true
  282. mustMatchFirst = false
  283. indexMap = map[int]string{1: titleMatchType}
  284. }
  285. // if buyers == nil {
  286. // Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind)
  287. // }
  288. //Debug("buyers-------", buyers)
  289. // if buyers == nil {
  290. // for _, kv := range *kvs {
  291. // Debug("bbbbbbbbbb", kv.Key, kv.Value)
  292. // }
  293. // }
  294. startIndex := 0
  295. prevKey := ""
  296. index, notmatchCount, allMatchCount := 0, 0, 0
  297. weightMap := map[string]map[string]interface{}{} //权重
  298. mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
  299. kvsTemp := make([]*Kv, len(*kvs))
  300. copy(kvsTemp, *kvs)
  301. for kv_index, kv := range *kvs {
  302. isBreak := true
  303. v := strings.TrimSpace(kv.Value)
  304. //根据采购单位分析
  305. isContinue := false
  306. k := FilterContactKey(kv.Key)
  307. k_length := len([]rune(k))
  308. if buyers != nil {
  309. for _, buyer := range buyers {
  310. if buyer == "" {
  311. continue
  312. }
  313. prevLine := kv.PrevLine
  314. prevLine = strings.TrimSpace(prevLine)
  315. prevLine = strings.Split(prevLine, " ")[0]
  316. buyerLenght, prevLineLength := len([]rune(buyer)), len([]rune(prevLine))
  317. prevNotEqual := true
  318. if kv_index > 0 {
  319. prevNotEqual = strings.TrimSpace(((*kvs)[kv_index-1]).Value) != buyer
  320. }
  321. matchBuyerSuccess := false
  322. if strings.HasPrefix(k, buyer) && ContactInfoVagueReg.MatchString(k) && k_length-buyerLenght >= 2 && k_length-buyerLenght <= 5 {
  323. matchBuyerSuccess = true
  324. k = strings.TrimLeft(k, buyer)
  325. k_length = len([]rune(k))
  326. // kvTemp := *kv
  327. // kvTemp.Key = strings.TrimLeft(k, buyer)
  328. // (*kvs)[kv_index] = &kvTemp
  329. } else if k == buyer {
  330. matchBuyerSuccess = true
  331. if PersonMustReg.MatchString(v) {
  332. k = "联系人"
  333. } else if TelMustReg.MatchString(v) {
  334. k = "联系电话"
  335. } else if AddressReg.MatchString(v) {
  336. k = "地址"
  337. } else if PersonReg.MatchString(v) || PhoneReg.MatchString(v) {
  338. k = "联系方式"
  339. }
  340. k_length = len([]rune(k))
  341. } else if strings.HasPrefix(strings.TrimSpace(v), buyer) || (prevNotEqual && buyerLenght >= prevLineLength-5 && buyerLenght <= prevLineLength && strings.Contains(prevLine, buyer)) {
  342. matchBuyerSuccess = true
  343. isContinue = true
  344. }
  345. if matchBuyerSuccess {
  346. isBreak = false
  347. matchMap["采购单位"] = map[string]bool{}
  348. indexMap[1] = "采购单位"
  349. break
  350. }
  351. }
  352. } else if ascFind {
  353. for _, ct_k := range HasOrderContactType(k) {
  354. if k_length < 3 || k_length > 15 {
  355. isBreak = false
  356. continue
  357. }
  358. if !ContactType[ct_k].MatchString(k) {
  359. continue
  360. }
  361. if weightMap[ct_k] == nil {
  362. weightMap[ct_k] = map[string]interface{}{}
  363. }
  364. isAddToMatchMap := false
  365. addToMatchMapKey := ""
  366. if ContactInfoVagueReg.MatchString(k) {
  367. isAddToMatchMap = true
  368. if matchMap[ct_k] == nil {
  369. matchMap[ct_k] = map[string]bool{}
  370. }
  371. if !strings.HasSuffix(k, "方式") {
  372. _, kTag := KvTagsToKV([]*Kv{&Kv{Key: k, Value: v}}, "", BuyerContacts, 1)
  373. if len(kTag) == 1 {
  374. tagVal, weightVal := FirstKeyValueInMap(kTag)
  375. if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) {
  376. isAddToMatchMap = false
  377. }
  378. if mapIndexInKvs[ct_k] == nil {
  379. mapIndexInKvs[ct_k] = map[string]interface{}{}
  380. }
  381. myIndexInKvs := mapIndexInKvs[ct_k][tagVal]
  382. if myIndexInKvs != nil {
  383. if weightMap[ct_k][tagVal] == nil || (weightVal != nil && weightVal.(int) >= weightMap[ct_k][tagVal].(int)) {
  384. weightMap[ct_k][tagVal] = weightVal.(int)
  385. (*kvs)[myIndexInKvs.(int)] = kvsTemp[myIndexInKvs.(int)]
  386. //(*kvs)[kv_index] = &Kv{Key: tagVal, Value: v}
  387. kvTemp := *kv
  388. kvTemp.Key = tagVal
  389. kvTemp.Value = v
  390. (*kvs)[kv_index] = &kvTemp
  391. }
  392. } else {
  393. weightMap[ct_k][tagVal] = weightVal.(int)
  394. }
  395. mapIndexInKvs[ct_k][tagVal] = kv_index
  396. }
  397. }
  398. addToMatchMapKey = k
  399. if ct_k == "采购单位" {
  400. k = ContactType[ct_k].FindString(k)
  401. }
  402. }
  403. if ct_k == "采购单位" { //打标签,权重高的重新覆盖
  404. _, kTag := KvTagsToKV([]*Kv{&Kv{Key: k, Value: v}}, "", []string{"采购单位"}, 1)
  405. tagVal, weightVal := FirstKeyValueInMap(kTag)
  406. if tagVal == ct_k {
  407. if weightMap[ct_k][ct_k] == nil || (weightVal != nil && weightVal.(int) > weightMap[ct_k][ct_k].(int)) {
  408. weightMap[ct_k][ct_k] = weightVal.(int)
  409. matchMap[ct_k] = map[string]bool{}
  410. isBreak = false
  411. prevKey = ""
  412. }
  413. }
  414. }
  415. if isAddToMatchMap && !filterValue.MatchString(v) {
  416. matchMap[ct_k][ContactInfoVagueReg.FindString(addToMatchMapKey)] = true
  417. }
  418. allMatchCount++
  419. if IsMapHasValue(ct_k, indexMap) {
  420. isContinue = true
  421. continue
  422. }
  423. isBreak = false
  424. if index != 0 || notmatchCount != 0 {
  425. startIndex = 0
  426. indexMap = map[int]string{}
  427. }
  428. if startIndex == 0 {
  429. indexMap = map[int]string{}
  430. }
  431. prevKey = ""
  432. startIndex++
  433. indexMap[startIndex] = ct_k
  434. isContinue = true
  435. }
  436. }
  437. if isContinue {
  438. continue
  439. }
  440. // if buyers == nil {
  441. // Debug(ascFind, indexMap, k, v, matchMap, notmatchCount)
  442. // }
  443. if len(indexMap) == 0 {
  444. continue
  445. }
  446. if titleMatch && !ContactInfoMustReg.MatchString(k) {
  447. k = ContactInfoVagueReg.FindString(k)
  448. k_length = len([]rune(k))
  449. }
  450. if k_length < 2 || k_length > 10 {
  451. isBreak = false
  452. continue
  453. }
  454. if !ContactInfoMustReg.MatchString(k) {
  455. if mustMatchFirst {
  456. mustMatchFirst = false
  457. continue
  458. }
  459. if buyers == nil && len(totalIndexMap) != 0 {
  460. isBreak = false
  461. }
  462. //允许有这么多个匹配不上的key
  463. notmatchCount++
  464. if notmatchCount < len(indexMap)*2 {
  465. isBreak = false
  466. } else if contactFormat == nil && ascFind {
  467. startIndex = 0
  468. notmatchCount = 0
  469. indexMap = map[int]string{}
  470. //matchMap = map[string]map[string]bool{}
  471. }
  472. continue
  473. }
  474. isBreak = false
  475. if prevKey != k {
  476. prevKey = k
  477. index = 1
  478. } else if prevKey == k {
  479. index++
  480. }
  481. //过滤值
  482. if filterValue.MatchString(v) {
  483. continue
  484. }
  485. myContactType := indexMap[index]
  486. if myContactType == "" {
  487. continue
  488. }
  489. // if buyers == nil {
  490. // Debug(ascFind, indexMap, k, v, matchMap, notmatchCount)
  491. // }
  492. if strings.HasSuffix(k, "方式") && TelMustReg.MatchString(v) {
  493. k = "联系电话"
  494. }
  495. if matchMap[myContactType] == nil {
  496. matchMap[myContactType] = map[string]bool{}
  497. }
  498. myTagValue := ContactInfoMustReg.FindString(k)
  499. if myTagValue == "" && titleMatch {
  500. myTagValue = ContactInfoVagueReg.FindString(k)
  501. }
  502. if IsContactKvHandle(myTagValue, matchMap[myContactType]) {
  503. continue
  504. }
  505. matchMap[myContactType][myTagValue] = true
  506. if ContactType[myContactType].MatchString(k) {
  507. continue
  508. }
  509. allMatchCount++
  510. delete(totalIndexMap, myContactType)
  511. if !strings.HasSuffix(k, "方式") {
  512. _, kTag := KvTagsToKV([]*Kv{&Kv{Key: myContactType + k, Value: v}}, "", BuyerContacts, 1)
  513. if len(kTag) == 1 {
  514. tagVal, _ := FirstKeyValueInMap(kTag)
  515. if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) {
  516. continue
  517. }
  518. if mapIndexInKvs[myContactType] == nil {
  519. mapIndexInKvs[myContactType] = map[string]interface{}{}
  520. }
  521. myIndexInKvs := mapIndexInKvs[myContactType][tagVal]
  522. if myIndexInKvs != nil {
  523. (*kvs)[myIndexInKvs.(int)] = kvsTemp[myIndexInKvs.(int)]
  524. }
  525. mapIndexInKvs[myContactType][tagVal] = kv_index
  526. if weightMap[myContactType] == nil {
  527. weightMap[myContactType] = map[string]interface{}{}
  528. }
  529. weightMap[myContactType][tagVal] = 1
  530. }
  531. }
  532. //(*kvs)[kv_index] = &Kv{Key: myContactType + k, Value: v}
  533. kvTemp := *kv
  534. kvTemp.Key = myContactType + k
  535. kvTemp.Value = v
  536. (*kvs)[kv_index] = &kvTemp
  537. if ascFind && isBreak && len(indexMap) > 0 {
  538. break
  539. }
  540. }
  541. if allMatchCount == 0 && len(*kvs) > 0 {
  542. indexMap = map[int]string{}
  543. matchMap = map[string]map[string]bool{}
  544. }
  545. if contactFormat != nil {
  546. (*contactFormat).IndexMap = indexMap
  547. (*contactFormat).MatchMap = matchMap
  548. }
  549. // if buyers == nil {
  550. // for _, kv := range *kvs {
  551. // Debug("bbbbbbbbbb", kv.Key, kv.Value)
  552. // }
  553. // }
  554. //Debug("totalIndexMap", len(totalIndexMap))
  555. }
  556. */
  557. func ContactTypeTitleMatch(title string) string {
  558. matchType := ""
  559. if title != "" && len([]rune(title)) < 15 {
  560. if ContactBuyerTitleReg.MatchString(title) {
  561. matchType = "采购单位"
  562. } else if ContactAgencyTitleReg.MatchString(title) {
  563. matchType = "代理机构"
  564. } else {
  565. for _, ct_k := range HasOrderContactType(title) {
  566. if ContactType[ct_k].MatchString(title) {
  567. matchType = ct_k
  568. break
  569. }
  570. }
  571. }
  572. }
  573. return matchType
  574. }
  575. //获取带有排序的联系人类型
  576. func HasOrderContactType(text string) []string {
  577. indexs := []int{}
  578. indexMap := map[int]string{}
  579. temp := []string{}
  580. for k, v := range ContactType {
  581. s := v.FindStringIndex(text)
  582. if len(s) > 1 {
  583. if indexMap[s[0]] != "" {
  584. temp = append(temp, k)
  585. } else {
  586. indexs = append(indexs, s[0])
  587. indexMap[s[0]] = k
  588. }
  589. }
  590. }
  591. sort.Ints(indexs)
  592. result := []string{}
  593. for _, v := range indexs {
  594. result = append(result, indexMap[v])
  595. }
  596. if len(temp) > 0 {
  597. result = append(result, temp...)
  598. }
  599. return result
  600. }
  601. //两种冒号kv结合到一起
  602. //from 1--全文 2--table td 3--table td解析采购单位联系人 4--分包
  603. func GetKVAll(content, title string, from int) *JobKv {
  604. content = formatText(content, "kv")
  605. m1Kvs, _ := colonkvEntity.entrance(content, title, from)
  606. m1, m1Weight := KvTagsToKV(m1Kvs, title, nil, from)
  607. if m1 == nil {
  608. m1 = map[string]string{}
  609. }
  610. m2Kvs, m2, m2Weight := GetKvFromtxt(content, title, from)
  611. for k, v := range m2 {
  612. if m1[k] == "" {
  613. m1[k] = v
  614. m1Weight[k] = m2Weight[k]
  615. }
  616. }
  617. return &JobKv{
  618. Kvs: m1Kvs,
  619. Kvs_2: m2Kvs,
  620. Kv: m1,
  621. KvTag: m1Weight,
  622. }
  623. }
  624. //KVTags转kv
  625. func KvTagsToKV(findkvs []*Kv, title string, tagdbs []string, from int) (map[string]string, map[string]*Tag) {
  626. kvTags := map[string]*Tag{}
  627. if title != "" && BlockTagMap[title] {
  628. kvTags[title] = &Tag{title, 0, nil}
  629. }
  630. for _, findkv := range findkvs {
  631. kvMap := map[string]string{}
  632. k, val := findkv.Key, findkv.Value
  633. //val是空的话,不打标签
  634. if filterValue.MatchString(val) {
  635. continue
  636. }
  637. key := k
  638. key = ClearKey(key, 1)
  639. if key == "" {
  640. continue
  641. }
  642. key = colonkvEntity.blockTitleKV(title, key)
  643. //先用新的key
  644. tags := GetAppointTags(key, tagdbs)
  645. if len(tags) == 0 && len(key) < 10 && len(title) > 0 && len(title) < 15 {
  646. key = title + key
  647. tags = GetAppointTags(key, tagdbs)
  648. }
  649. //再用老的key
  650. if len(tags) == 0 && k != key {
  651. tags = GetAppointTags(k, tagdbs)
  652. if len(tags) == 0 && len(k) < 10 && len(title) > 0 && len(title) < 15 {
  653. k = title + k
  654. tags = GetAppointTags(k, tagdbs)
  655. if len(tags) > 0 {
  656. key = k
  657. }
  658. }
  659. }
  660. if len(tags) == 0 {
  661. //go AddtoNoMatchMap(key)
  662. //Debug(key)
  663. //continue
  664. //由跳过修改为保留
  665. tags = []*Tag{&Tag{k, -100, nil}}
  666. }
  667. for _, tk := range tags {
  668. //分包过来给kv打标签的时候,只取第一个,后面的不覆盖
  669. if kvTags[tk.Value] == nil || (kvTags[tk.Value].Weight < tk.Weight && from != 4) {
  670. // fc := StandardNameMap[tk.Value]
  671. // if (fc != nil && fc.CheckNum) || (moneyreg.MatchString(tk.Value)) {
  672. // val += GetMoneyUnit(k, val)
  673. // }
  674. if moneyreg.MatchString(tk.Value) {
  675. val += GetMoneyUnit(k, val)
  676. }
  677. //Debug("KV-key", tk, val)
  678. kvTags[tk.Value] = &Tag{val, tk.Weight, nil}
  679. kvMap[tk.Value] = val
  680. //Debug("KV-key", tk.Value, val, key, tk.Weight)
  681. }
  682. }
  683. }
  684. //
  685. kv := map[string]string{}
  686. kvWeight := map[string]*Tag{}
  687. if len(kvTags) > 0 {
  688. for k, v := range kvTags {
  689. if kv[k] != "" {
  690. continue
  691. }
  692. kv[k] = v.Value
  693. kvWeight[k] = v
  694. }
  695. }
  696. return kv, kvWeight
  697. }
  698. func FilterContactKey(key string) string {
  699. key1 := ""
  700. for _, v := range BracketsTextReg.FindAllString(key, -1) {
  701. for _, vv := range ContactType {
  702. if vv.MatchString(v) {
  703. if len([]rune(v)) < 3 || len([]rune(v)) > 10 {
  704. continue
  705. }
  706. key1 = v
  707. break
  708. }
  709. }
  710. }
  711. key = filterK.ReplaceAllString(key, "")
  712. key = tablekeyclear.ReplaceAllString(key, "")
  713. return key1 + key
  714. }
  715. //td里的内容,调用这边的方法分kv的时候,有的带有换行,清理掉
  716. func RemoveWarpOfTdVal(text string) string {
  717. //只有一个冒号
  718. if len(regDivision.FindAllString(text, -1)) != 1 {
  719. return text
  720. }
  721. text = strings.TrimSpace(text)
  722. //有一个换行
  723. array := strings.Split(text, "\n")
  724. if len(array) != 2 {
  725. return text
  726. }
  727. //第一行以冒号结尾
  728. if !colonEndReg.MatchString(array[0]) {
  729. if BracketsTextReg.ReplaceAllString(array[1], "") == "" {
  730. text = array[0] + array[1]
  731. }
  732. return text
  733. }
  734. text = array[0] + array[1]
  735. return text
  736. }
  737. //打标签的时候,清理key
  738. //from 1--冒号key 2--table key
  739. func ClearKey(k string, from int) string {
  740. for {
  741. old := k
  742. if from == 1 {
  743. k = filterK.ReplaceAllString(k, "")
  744. }
  745. k = tablekeyclear.ReplaceAllString(k, "")
  746. k = regReplKey.ReplaceAllString(k, "")
  747. if old == k {
  748. break
  749. }
  750. }
  751. return k
  752. }
  753. //获取金额的单位
  754. func GetMoneyUnit(key, val string) string {
  755. if !(strings.Index(val, "元") > 0 || strings.Index(val, "万") > 0 || strings.Index(val, "亿") > 0) {
  756. mv := clear.ObjToMoney([]interface{}{val, val})
  757. if len(mv) > 0 && qutil.IntAll(mv[0]) > 0 {
  758. for _, dw := range []string{"万", "亿"} {
  759. if strings.Index(key, dw) > 0 {
  760. return dw
  761. }
  762. }
  763. }
  764. }
  765. return ""
  766. }