colonkv.go 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927
  1. //识别冒号kv
  2. package pretreated
  3. import (
  4. "jy/clear"
  5. . "jy/util"
  6. "log"
  7. qutil "qfw/util"
  8. "regexp"
  9. "sort"
  10. "strings"
  11. "unicode/utf8"
  12. )
  13. type ColonkvEntity struct{}
  14. var (
  15. colonkvEntity = &ColonkvEntity{}
  16. regReplKV = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0]+?[))]?)[\\s\u3000\u2003\u00a0,。;;][((]?(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].+)[))]?")
  17. regReplKV2 = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)")
  18. regKV = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)")
  19. filterK = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
  20. filterValue = regexp.MustCompile("(^(无)$|.+%.*|^[\r\n\\s\u3000\u2003\u00a0]+$|^<.*>)")
  21. regReplKey = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全]称|姓名)$")
  22. buyerAndAgency = regexp.MustCompile("(代理(机构|人)|采购(人|单位))")
  23. BlockTagMap = map[string]bool{
  24. "招标范围": true,
  25. "资格要求": true,
  26. }
  27. brackets = map[string]string{
  28. "(": ")",
  29. "(": ")",
  30. "": "",
  31. "[": "]",
  32. "【": "】",
  33. "{": "}",
  34. "{": "}",
  35. "《": "》",
  36. "<": ">",
  37. }
  38. //
  39. PersonReg = regexp.MustCompile("[\u4e00-\u9fa5]{2,5}")
  40. //
  41. TelMustReg = regexp.MustCompile("^" + PhoneReg.String() + "$")
  42. PersonMustReg = regexp.MustCompile("^" + PersonReg.String() + "$")
  43. AddressReg = regexp.MustCompile("[省市县区路号楼]")
  44. BracketsTextReg = regexp.MustCompile("[((]([^((]+)[))]")
  45. ContactBuyerTitleReg = regexp.MustCompile("采购联系事项")
  46. ContactAgencyTitleReg = regexp.MustCompile("招标联系事项")
  47. ZipCode = regexp.MustCompile("邮(政)?编(码)?")
  48. )
  49. //一行多个冒号kv处理
  50. func (ce *ColonkvEntity) divisionMoreKV(con string) string {
  51. con = regReplKV.ReplaceAllStringFunc(con, func(temp string) string {
  52. //分kv的时候出现括号不成对出现的情况,分错了跳过
  53. matchText := regReplKV.FindStringSubmatch(con)[1]
  54. for k, v := range brackets {
  55. if strings.Count(matchText, k) != strings.Count(matchText, v) {
  56. return temp
  57. }
  58. }
  59. return regReplKV.ReplaceAllString(temp, "$1\n\n$2")
  60. })
  61. con = regReplKV2.ReplaceAllString(con, "$1\n\n$2")
  62. return con
  63. }
  64. //获取冒号kv入口
  65. func (ce *ColonkvEntity) entrance(con, title string, contactFormat *ContactFormat, from int,isSite bool,codeSite string) ([]*Kv, map[string]string) {
  66. kvs := ce.GetKvs(con, title, from)
  67. if from == 1 {
  68. FormatContactKv(&kvs, title, nil, contactFormat,isSite,codeSite)
  69. }
  70. kv := map[string]string{}
  71. for _, v := range kvs {
  72. if strings.TrimSpace(v.Value) == "" {
  73. continue
  74. }
  75. kv[v.Key] = v.Value
  76. }
  77. return kvs, kv
  78. }
  79. //获取有序的kv
  80. func (ce *ColonkvEntity) GetKvs(con, title string, from int) []*Kv {
  81. con = ce.processText(con)
  82. kvs := ce.getColonKv(con, title, from)
  83. return kvs
  84. }
  85. //处理正文
  86. func (ce *ColonkvEntity) processText(con string) string {
  87. con = ce.divisionMoreKV(con) //一行多个冒号kv处理
  88. for {
  89. tmp := con
  90. con = ce.divisionMoreKV(con)
  91. if tmp == con {
  92. break
  93. }
  94. }
  95. return con
  96. }
  97. //分冒号kv
  98. //from 1--全文 2,3--table td
  99. func (ce *ColonkvEntity) getColonKv(con, title string, from int) []*Kv {
  100. if from == 2 || from == 3 {
  101. con = RemoveWarpOfTdVal(con)
  102. }
  103. findkvs := []*Kv{}
  104. lines := SspacekvEntity.getLines(con)
  105. for index, line := range lines {
  106. res := regKV.FindAllStringSubmatch(line, -1)
  107. if len(res) > 0 {
  108. for _, v := range res {
  109. key, val := "", ""
  110. if len(v) == 3 {
  111. key = v[1]
  112. val = v[2]
  113. } else if len(v) == 4 {
  114. key = v[2]
  115. val = v[3]
  116. }
  117. //Debug("KV-key", key, val)
  118. //Debug("KV-key", key, val)
  119. //地址、联系人可能会重复 单位、代理机构的\时间、地点
  120. if strings.TrimSpace(key) != "" {
  121. prevLine, nextLine := "", ""
  122. if index > 0 {
  123. prevLine = lines[index-1]
  124. }
  125. if index < len(lines)-1 {
  126. nextLine = lines[index+1]
  127. }
  128. findkvs = append(findkvs, &Kv{
  129. Key: key,
  130. Value: val,
  131. Line: line,
  132. PrevLine: prevLine,
  133. NextLine: nextLine,
  134. Title: title,
  135. })
  136. splitkeys := strings.Split(key, "/")
  137. splitvalues := strings.Split(val, "/")
  138. if len(splitkeys) > 1 && len(splitkeys) == len(splitvalues) {
  139. for splitindex, splitkey := range splitkeys {
  140. findkvs = append(findkvs, &Kv{
  141. Key: splitkey,
  142. Value: splitvalues[splitindex],
  143. Line: line,
  144. PrevLine: prevLine,
  145. NextLine: nextLine,
  146. Title: title,
  147. })
  148. }
  149. }
  150. }
  151. }
  152. }
  153. }
  154. return findkvs
  155. }
  156. //冒号kv和空格kv结合
  157. func (ce *ColonkvEntity) getColonSpaceKV(con string,isSite bool,codeSite string) []*Kv {
  158. con = colonkvEntity.processText(con)
  159. lines := SspacekvEntity.getLines(con)
  160. kvMaps := []*Kv{}
  161. for _, line := range lines {
  162. kvs := colonkvEntity.getColonKv(line, "", 1)
  163. if len(kvs) == 0 {
  164. kv := SspacekvEntity.divideKV(line,isSite,codeSite)
  165. if kv != nil {
  166. kvMaps = append(kvMaps, kv...)
  167. }
  168. } else {
  169. kvMaps = append(kvMaps, kvs...)
  170. }
  171. }
  172. return kvMaps
  173. }
  174. /*
  175. 五、递交响应文件时间及地点
  176. 1、时间:2016年5月20日14时00分至2016年5月20日14时30分(北京时间)
  177. 2、地点:烟台开发区公共资源交易中心A座5楼会议室(金沙江路83号)
  178. key 时间 处理成 递交响应文件时间
  179. */
  180. func (ce *ColonkvEntity) blockTitleKV(title, key string) string {
  181. needKey := "时间"
  182. if key != needKey {
  183. return key
  184. }
  185. titles := regSplit.Split(title, -1)
  186. for _, v := range titles {
  187. if strings.HasSuffix(v, needKey) {
  188. return v
  189. }
  190. }
  191. return key
  192. }
  193. //根据配置文件中的规则,格式化正文
  194. func formatText(content, key string) string {
  195. segments := make([]*Segment, 0)
  196. if key == "all" {
  197. segments = DivideSegmentHtml(content)
  198. } else if key == "kv" {
  199. segments = DivideSegment(content)
  200. //log.Println("清理前:\n",content)
  201. }
  202. newCon := ""
  203. for _, v := range segments {
  204. if v.Index > len(segments)-3 {
  205. if regexp.MustCompile("上一篇(.+)下一篇").MatchString(v.Text) {
  206. break
  207. }
  208. }
  209. if key == "kv" && utf8.RuneCountInString(v.Text) >= 1 {
  210. //log.Println("清理前:",v.Text, []rune(v.Text)[len([]rune(v.Text))-1])
  211. v.Text = strings.TrimRightFunc(v.Text, func(r rune) bool {
  212. return r == 19968 || r == 20108 || r == 19977 ||
  213. r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061
  214. })
  215. //log.Println("清理前后",v.Text)
  216. }
  217. newCon += v.Text + "\n"
  218. }
  219. content = regEndWrap.ReplaceAllString(newCon, "")
  220. //if key == "kv"{
  221. // log.Println("清理前后\n",content)
  222. //}
  223. for _, v := range FormatTextMap[key] {
  224. reg, _ := v["reg"].(*regexp.Regexp)
  225. separator, isString := v["separator"].(string)
  226. separators, isArray := v["separator"].([]interface{})
  227. if isArray {
  228. content = reg.ReplaceAllStringFunc(content, func(temp string) string {
  229. for _, sv := range separators {
  230. separator, _ := sv.(string)
  231. if array := strings.Split(separator, "__"); separator != "__" && len(array) == 2 {
  232. temp = regexp.MustCompile(array[0]).ReplaceAllString(temp, array[1])
  233. }
  234. }
  235. return temp
  236. })
  237. } else if isString {
  238. if array := strings.Split(separator, "__"); separator != "__" && len(array) == 2 {
  239. content = reg.ReplaceAllStringFunc(content, func(temp string) string {
  240. temp = regexp.MustCompile(array[0]).ReplaceAllString(temp, array[1])
  241. return temp
  242. })
  243. } else {
  244. content = reg.ReplaceAllString(content, separator)
  245. }
  246. }
  247. //Debug(v["reg"], content)
  248. }
  249. return content
  250. }
  // IsContactKvHandle reports whether value is present as a key of m,
  // regardless of the boolean stored under it. A nil map contains nothing.
  func IsContactKvHandle(value string, m map[string]bool) bool {
  	_, handled := m[value]
  	return handled
  }
  265. //kv关于联系人信息的处理
  266. //采购人>集中采购机构
  267. func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *ContactFormat,isSite bool,codeSite string) {
  268. ////////////////////////////
  269. //处理联系人信息
  270. var indexMap map[int]string
  271. var matchMap map[string]map[string]bool
  272. //hasMatch := make(map[string]bool)
  273. if contactFormat == nil || title != "" {
  274. indexMap = map[int]string{}
  275. matchMap = map[string]map[string]bool{}
  276. } else {
  277. indexMap = contactFormat.IndexMap
  278. matchMap = contactFormat.MatchMap
  279. }
  280. ////////////////////////////
  281. totalIndexMap := map[string]bool{}
  282. ascFind := true
  283. ascFindFlag := len(indexMap) == 0 && buyers == nil
  284. //采购人在联系人、电话后面的处理
  285. isCanAddToIndexMap := false
  286. for _, kv := range *kvs {
  287. k := FilterContactKey(kv.Key)
  288. k_length := len([]rune(k))
  289. if k_length < 2 || k_length > 15 {
  290. continue
  291. }
  292. isContinue := ContactInfoMustReg.MatchString(k)
  293. if (isContinue || (ContactInfoVagueReg.MatchString(k) && IsMapHasValue(k, ContactType))) && ascFindFlag {
  294. if len(indexMap) > 0 {
  295. ascFind = true
  296. ascFindFlag = false
  297. indexMap = map[int]string{}
  298. }
  299. isCanAddToIndexMap = true
  300. }
  301. n := 1
  302. for _, ct_k := range HasOrderContactType(k) {
  303. if !ContactType[ct_k].MatchString(k) {
  304. continue
  305. }
  306. totalIndexMap[ct_k] = true
  307. /////////////////////////////
  308. if isContinue || !ascFindFlag {
  309. continue
  310. }
  311. // if isCanAddToIndexMap && len(indexMap) == 0 {
  312. if isCanAddToIndexMap {
  313. indexMap[n] = ct_k
  314. n++
  315. ascFind = false
  316. }
  317. }
  318. }
  319. mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上
  320. titleMatch := false
  321. if titleMatchType := ContactTypeTitleMatch(title); titleMatchType != "" {
  322. titleMatch = true
  323. mustMatchFirst = false
  324. indexMap = map[int]string{1: titleMatchType}
  325. }
  326. // if titleMatchType := ContactTypeTitleMatch(title); len(titleMatchType) != 0 {
  327. // titleMatch = true
  328. // mustMatchFirst = false
  329. // for i, t := range titleMatchType {
  330. // indexMap[i+1] = t
  331. // }
  332. // }
  333. // if buyers == nil {
  334. // Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind)
  335. // }
  336. //Debug("buyers-------", buyers)
  337. // if buyers == nil {
  338. // for _, kv := range *kvs {
  339. // Debug("bbbbbbbbbb", kv.Key, kv.Value)
  340. // }
  341. // }
  342. startIndex := 0
  343. prevKey := ""
  344. index, tmpindex, notmatchCount, allMatchCount := 0, 0, 0, 0
  345. weightMap := map[string]map[string]interface{}{} //权重
  346. mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
  347. kvsTemp := make([]*Kv, len(*kvs))
  348. copy(kvsTemp, *kvs)
  349. //again := 0
  350. ishad := false
  351. for kv_index, kv := range *kvs {
  352. isBreak := true
  353. v := strings.TrimSpace(kv.Value)
  354. //根据采购单位分析
  355. isContinue := false
  356. k := FilterContactKey(kv.Key)
  357. k_length := len([]rune(k))
  358. if buyers != nil {
  359. for _, buyer := range buyers {
  360. if buyer == "" {
  361. continue
  362. }
  363. prevLine := kv.PrevLine
  364. prevLine = strings.TrimSpace(prevLine)
  365. prevLine = strings.Split(prevLine, " ")[0]
  366. buyerLenght, prevLineLength := len([]rune(buyer)), len([]rune(prevLine))
  367. prevNotEqual := true
  368. if kv_index > 0 {
  369. prevNotEqual = strings.TrimSpace(((*kvs)[kv_index-1]).Value) != buyer
  370. }
  371. matchBuyerSuccess := false
  372. if strings.HasPrefix(k, buyer) && ContactInfoVagueReg.MatchString(k) && k_length-buyerLenght >= 2 && k_length-buyerLenght <= 5 {
  373. matchBuyerSuccess = true
  374. k = strings.TrimLeft(k, buyer)
  375. k_length = len([]rune(k))
  376. // kvTemp := *kv
  377. // kvTemp.Key = strings.TrimLeft(k, buyer)
  378. // (*kvs)[kv_index] = &kvTemp
  379. } else if k == buyer {
  380. matchBuyerSuccess = true
  381. if PersonMustReg.MatchString(v) {
  382. k = "联系人"
  383. } else if TelMustReg.MatchString(v) {
  384. k = "联系电话"
  385. } else if AddressReg.MatchString(v) {
  386. k = "地址"
  387. } else if PersonReg.MatchString(v) || PhoneReg.MatchString(v) {
  388. k = "联系方式"
  389. } else if ZipCode.MatchString(v) {
  390. k = "邮政编码"
  391. }
  392. k_length = len([]rune(k))
  393. } else if strings.HasPrefix(strings.TrimSpace(v), buyer) || (prevNotEqual && buyerLenght >= prevLineLength-5 && buyerLenght <= prevLineLength && strings.Contains(prevLine, buyer)) {
  394. matchBuyerSuccess = true
  395. isContinue = true
  396. }
  397. if matchBuyerSuccess {
  398. isBreak = false
  399. matchMap["采购单位"] = map[string]bool{}
  400. indexMap[1] = "采购单位"
  401. break
  402. }
  403. }
  404. } else if ascFind {
  405. for _, ct_k := range HasOrderContactType(k) {
  406. ishad = false
  407. //again++
  408. if k_length < 3 || k_length > 15 {
  409. isBreak = false
  410. continue
  411. }
  412. if !ContactType[ct_k].MatchString(k) {
  413. continue
  414. }
  415. if weightMap[ct_k] == nil {
  416. weightMap[ct_k] = map[string]interface{}{}
  417. }
  418. isAddToMatchMap := false
  419. addToMatchMapKey := ""
  420. if ContactInfoVagueReg.MatchString(k) { //判断是不是电话、地址。。。
  421. isAddToMatchMap = true
  422. if matchMap[ct_k] == nil {
  423. matchMap[ct_k] = map[string]bool{}
  424. }
  425. if !strings.HasSuffix(k, "方式") {
  426. kvTags := GetKvTags([]*Kv{&Kv{Key: k, Value: v}}, "", BuyerContacts,isSite,codeSite)
  427. if len(kvTags) == 1 {
  428. tagVal, weightVal := FirstKeyValueInMap(kvTags)
  429. if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) {
  430. isAddToMatchMap = false
  431. }
  432. if mapIndexInKvs[ct_k] == nil {
  433. mapIndexInKvs[ct_k] = map[string]interface{}{}
  434. }
  435. myIndexInKvs := mapIndexInKvs[ct_k][tagVal]
  436. if myIndexInKvs != nil {
  437. if weightMap[ct_k][tagVal] == nil || (weightVal != nil && weightVal.(int) >= weightMap[ct_k][tagVal].(int)) {
  438. weightMap[ct_k][tagVal] = weightVal.(int)
  439. (*kvs)[myIndexInKvs.(int)] = kvsTemp[myIndexInKvs.(int)]
  440. //(*kvs)[kv_index] = &Kv{Key: tagVal, Value: v}
  441. kvTemp := *kv
  442. kvTemp.Key = tagVal
  443. kvTemp.Value = v
  444. (*kvs)[kv_index] = &kvTemp
  445. }
  446. } else {
  447. weightMap[ct_k][tagVal] = weightVal.(int)
  448. }
  449. mapIndexInKvs[ct_k][tagVal] = kv_index
  450. }
  451. }
  452. addToMatchMapKey = k
  453. if ct_k == "采购单位" {
  454. k = ContactType[ct_k].FindString(k)
  455. }
  456. }
  457. if ct_k == "采购单位" { //打标签,权重高的重新覆盖
  458. kvTags := GetKvTags([]*Kv{&Kv{Key: k, Value: v}}, "", []string{"采购单位"},isSite,codeSite)
  459. tagVal, weightVal := FirstKeyValueInMap(kvTags)
  460. if tagVal == ct_k {
  461. if weightMap[ct_k][ct_k] == nil || (weightVal != nil && weightVal.(int) > weightMap[ct_k][ct_k].(int)) {
  462. weightMap[ct_k][ct_k] = weightVal.(int)
  463. matchMap[ct_k] = map[string]bool{}
  464. isBreak = false
  465. prevKey = ""
  466. }
  467. }
  468. }
  469. if isAddToMatchMap && !filterValue.MatchString(v) {
  470. matchMap[ct_k][ContactInfoVagueReg.FindString(addToMatchMapKey)] = true
  471. }
  472. allMatchCount++
  473. if IsMapHasValue(ct_k, indexMap) {
  474. ishad = true
  475. tmpindex = GetIndex(ct_k, indexMap)
  476. isContinue = true
  477. continue
  478. }
  479. isBreak = false
  480. if index != 0 || notmatchCount != 0 {
  481. startIndex = 0
  482. indexMap = map[int]string{}
  483. }
  484. if startIndex == 0 {
  485. indexMap = map[int]string{}
  486. }
  487. prevKey = ""
  488. startIndex++
  489. indexMap[startIndex] = ct_k
  490. isContinue = true
  491. }
  492. }
  493. if isContinue {
  494. continue
  495. }
  496. // if buyers == nil {
  497. // Debug(ascFind, indexMap, k, v, matchMap, notmatchCount)
  498. // }
  499. if len(indexMap) == 0 {
  500. continue
  501. }
  502. if titleMatch && !ContactInfoMustReg.MatchString(k) {
  503. k = ContactInfoVagueReg.FindString(k)
  504. k_length = len([]rune(k))
  505. }
  506. if k_length < 2 || k_length > 10 {
  507. isBreak = false
  508. continue
  509. }
  510. if !ContactInfoMustReg.MatchString(k) { //判断是否是电话、邮箱、地址等信息
  511. if mustMatchFirst {
  512. mustMatchFirst = false
  513. continue
  514. }
  515. if buyers == nil && len(totalIndexMap) != 0 {
  516. isBreak = false
  517. }
  518. //允许有这么多个匹配不上的key
  519. notmatchCount++
  520. if notmatchCount < len(indexMap)*2 {
  521. isBreak = false
  522. } else if contactFormat == nil && ascFind {
  523. startIndex = 0
  524. notmatchCount = 0
  525. indexMap = map[int]string{}
  526. //matchMap = map[string]map[string]bool{}
  527. }
  528. continue
  529. }
  530. isBreak = false
  531. // if prevKey != k && !hasMatch[k] {
  532. // prevKey = k
  533. // index = 1
  534. // } else if index < 2 {
  535. // index++
  536. // }
  537. if ishad {
  538. index = tmpindex
  539. } else {
  540. if prevKey != k {
  541. prevKey = k
  542. index = 1
  543. } else if prevKey == k {
  544. index++
  545. }
  546. }
  547. // if startIndex == 0 || startIndex%2 == 1 || index == 0 {
  548. // index = 1
  549. // } else if startIndex%2 == 0 {
  550. // index = 2
  551. // }
  552. //hasMatch[k] = true
  553. //过滤值
  554. if filterValue.MatchString(v) {
  555. continue
  556. }
  557. myContactType := indexMap[index]
  558. if myContactType == "" {
  559. continue
  560. }
  561. // if buyers == nil {
  562. // Debug(ascFind, indexMap, k, v, matchMap, notmatchCount)
  563. // }
  564. if strings.HasSuffix(k, "方式") && TelMustReg.MatchString(v) {
  565. k = "联系电话"
  566. }
  567. if matchMap[myContactType] == nil {
  568. matchMap[myContactType] = map[string]bool{}
  569. }
  570. myTagValue := ContactInfoMustReg.FindString(k)
  571. if myTagValue == "" && titleMatch {
  572. myTagValue = ContactInfoVagueReg.FindString(k)
  573. }
  574. if IsContactKvHandle(myTagValue, matchMap[myContactType]) {
  575. continue
  576. }
  577. matchMap[myContactType][myTagValue] = true
  578. if ContactType[myContactType].MatchString(k) {
  579. continue
  580. }
  581. allMatchCount++
  582. delete(totalIndexMap, myContactType)
  583. if !strings.HasSuffix(k, "方式") {
  584. kvTags := GetKvTags([]*Kv{&Kv{Key: myContactType + k, Value: v}}, "", BuyerContacts,isSite,codeSite)
  585. if len(kvTags) == 1 {
  586. tagVal, _ := FirstKeyValueInMap(kvTags)
  587. if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) {
  588. continue
  589. }
  590. if mapIndexInKvs[myContactType] == nil {
  591. mapIndexInKvs[myContactType] = map[string]interface{}{}
  592. }
  593. myIndexInKvs := mapIndexInKvs[myContactType][tagVal]
  594. if myIndexInKvs != nil {
  595. (*kvs)[myIndexInKvs.(int)] = kvsTemp[myIndexInKvs.(int)]
  596. }
  597. mapIndexInKvs[myContactType][tagVal] = kv_index
  598. if weightMap[myContactType] == nil {
  599. weightMap[myContactType] = map[string]interface{}{}
  600. }
  601. weightMap[myContactType][tagVal] = 1
  602. }
  603. }
  604. //(*kvs)[kv_index] = &Kv{Key: myContactType + k, Value: v}
  605. kvTemp := *kv
  606. kvTemp.Key = myContactType + k
  607. kvTemp.Value = v
  608. (*kvs)[kv_index] = &kvTemp
  609. if ascFind && isBreak && len(indexMap) > 0 {
  610. break
  611. }
  612. }
  613. if allMatchCount == 0 && len(*kvs) > 0 {
  614. indexMap = map[int]string{}
  615. matchMap = map[string]map[string]bool{}
  616. }
  617. if contactFormat != nil {
  618. (*contactFormat).IndexMap = indexMap
  619. (*contactFormat).MatchMap = matchMap
  620. }
  621. // if buyers == nil {
  622. // for _, kv := range *kvs {
  623. // Debug("bbbbbbbbbb", kv.Key, kv.Value)
  624. // }
  625. // }
  626. //Debug("totalIndexMap", len(totalIndexMap))
  627. }
  628. func ContactTypeTitleMatch(title string) string {
  629. // matchType := []string{}
  630. // matchTypeMap := map[string]bool{}
  631. // if title != "" && len([]rune(title)) < 25 {
  632. // if ContactBuyerTitleReg.MatchString(title) {
  633. // matchType = append(matchType, "采购单位")
  634. // matchTypeMap["采购单位"] = true
  635. // }
  636. // if ContactAgencyTitleReg.MatchString(title) {
  637. // matchType = append(matchType, "代理机构")
  638. // matchTypeMap["代理机构"] = true
  639. // }
  640. // if len(matchType) == 2 {
  641. // return matchType
  642. // }
  643. // for _, ct_k := range HasOrderContactType(title) {
  644. // if ContactType[ct_k].MatchString(title) && !matchTypeMap[ct_k] {
  645. // matchType = append(matchType, ct_k)
  646. // }
  647. // }
  648. // }
  649. matchType := ""
  650. if title != "" && len([]rune(title)) < 15 {
  651. if ContactBuyerTitleReg.MatchString(title) {
  652. matchType = "采购单位"
  653. } else if ContactAgencyTitleReg.MatchString(title) {
  654. matchType = "代理机构"
  655. } else {
  656. for _, ct_k := range HasOrderContactType(title) {
  657. if ContactType[ct_k].MatchString(title) {
  658. matchType = ct_k
  659. break
  660. }
  661. }
  662. }
  663. }
  664. return matchType
  665. }
  666. //获取带有排序的联系人类型
  667. func HasOrderContactType(text string) []string {
  668. indexs := []int{}
  669. indexMap := map[int]string{}
  670. temp := []string{}
  671. for k, v := range ContactType {
  672. s := v.FindStringIndex(text)
  673. if len(s) > 1 {
  674. if indexMap[s[0]] != "" {
  675. temp = append(temp, k)
  676. } else {
  677. indexs = append(indexs, s[0])
  678. indexMap[s[0]] = k
  679. }
  680. }
  681. }
  682. sort.Ints(indexs)
  683. result := []string{}
  684. for _, v := range indexs {
  685. result = append(result, indexMap[v])
  686. }
  687. if len(temp) > 0 {
  688. result = append(result, temp...)
  689. }
  690. return result
  691. }
  692. //两种冒号kv结合到一起
  693. //from 1--全文 2--table td 3--table td解析采购单位联系人 4--分包
  694. func GetKVAll(content, title string, contactFormat *ContactFormat, from int,isSite bool,codeSite string) *JobKv {
  695. content = formatText(content, "kv")
  696. m1Kvs, _ := colonkvEntity.entrance(content, title, contactFormat, from,isSite,codeSite)
  697. // for _, kvs := range m1Kvs {
  698. // qutil.Debug(kvs.Key, kvs.Value)
  699. // }
  700. kvTags := GetKvTags(m1Kvs, title, nil,isSite,codeSite)
  701. // for k, kvs := range kvTags {
  702. // qutil.Debug("kkkkk--", k)
  703. // for _, kv := range kvs {
  704. // qutil.Debug(kv.Key, kv.Value)
  705. // }
  706. // }
  707. m2Kvs, m2KvTags := GetKvFromtxt(content, title, from,isSite,codeSite)
  708. // for k, kvs := range m2KvTags {
  709. // qutil.Debug("kkkkk--", k)
  710. // for _, kv := range kvs {
  711. // qutil.Debug(kv.Key, kv.Value)
  712. // }
  713. // }
  714. MergeKvTags(kvTags, m2KvTags)
  715. // for k, kvs := range kvTags {
  716. // qutil.Debug("kkkkk--", k)
  717. // for _, kv := range kvs {
  718. // qutil.Debug(kv.Key, kv.Value)
  719. // }
  720. // }
  721. return &JobKv{
  722. Kvs: m1Kvs,
  723. Kvs_2: m2Kvs,
  724. KvTags: kvTags,
  725. }
  726. }
  727. //合并kv标签,把kvTags_2合并到kvTags_1
  728. func MergeKvTags(kvTags_1, kvTags_2 map[string][]*Tag) {
  729. for k, v := range kvTags_2 {
  730. for _, vv := range v {
  731. value_vv := strings.TrimSpace(vv.Value)
  732. if value_vv == "" || vv.Key == vv.Value {
  733. continue
  734. }
  735. isExists := false
  736. for _, vvv := range kvTags_1[k] {
  737. value_vvv := strings.TrimSpace(vvv.Value)
  738. if (value_vvv == value_vv || TimeHM.ReplaceAllString(value_vvv, ReplTimeHM) == value_vv || value_vvv == TimeHM.ReplaceAllString(value_vv, ReplTimeHM)) && vvv.Weight == vv.Weight {
  739. isExists = true
  740. break
  741. }
  742. }
  743. if !isExists {
  744. kvTags_1[k] = append(kvTags_1[k], vv)
  745. }
  746. }
  747. }
  748. }
  749. //控制台输出kv的值
  750. func PrintKvTags(kvTags map[string][]*Tag) {
  751. for k, v := range kvTags {
  752. for _, vv := range v {
  753. log.Println("kvTags===", k, "---", vv.Key, vv.Value, vv.Weight, vv.IsInvalid)
  754. }
  755. }
  756. }
  757. //KVTags转kv
  758. func GetKvTags(findkvs []*Kv, title string, tagdbs []string,isSite bool,codeSite string) map[string][]*Tag {
  759. kvTags := map[string][]*Tag{}
  760. if title != "" && BlockTagMap[title] {
  761. kvTags[title] = append(kvTags[title], &Tag{title, title, 0, nil, false})
  762. }
  763. for _, findkv := range findkvs {
  764. k, val, nextval := findkv.Key, strings.TrimSpace(findkv.Value), strings.TrimSpace(findkv.NextLine)
  765. //val是空的话,不打标签
  766. if filterValue.MatchString(val) {
  767. continue
  768. }
  769. key := k
  770. key = ClearKey(key, 1)
  771. if key == "" {
  772. continue
  773. }
  774. key = colonkvEntity.blockTitleKV(title, key)
  775. //先用新的key
  776. tags := GetAppointTags(key, tagdbs,isSite,codeSite) //找标签库
  777. if len(tags) == 0 && len(key) < 10 && len(title) > 0 && len(title) < 15 {
  778. key = title + key
  779. tags = GetAppointTags(key, tagdbs,isSite,codeSite)
  780. }
  781. //再用老的key
  782. if len(tags) == 0 && k != key {
  783. tags = GetAppointTags(k, tagdbs,isSite,codeSite)
  784. if len(tags) == 0 && len(k) < 10 && len(title) > 0 && len(title) < 15 {
  785. k = title + k
  786. tags = GetAppointTags(k, tagdbs,isSite,codeSite)
  787. if len(tags) > 0 {
  788. key = k
  789. }
  790. }
  791. }
  792. if len(tags) > 0 {
  793. for _, tk := range tags {
  794. if moneyreg.MatchString(tk.Value) {
  795. val += GetMoneyUnit(k, val)
  796. }
  797. if val != "" {
  798. kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: val, Weight: tk.Weight})
  799. } else if nextval != "" && utf8.RuneCountInString(nextval) < 30 {
  800. if strings.Contains(nextval, ":") || strings.Contains(nextval, ":") {
  801. if len(strings.Split(nextval, ":")) > 1 || len(strings.Split(nextval, ":")) > 1 {
  802. //tmpnextval := ""
  803. nextval = strings.Split(nextval, ":")[0]
  804. nextval = strings.Split(nextval, ":")[0]
  805. if strings.TrimSpace(nextval) == "" {
  806. continue
  807. }
  808. if GetAppointTags(nextval, tagdbs,isSite,codeSite).Len() > 0 || GetAppointTags(k, tagdbs,isSite,codeSite).Len() > 0 {
  809. continue
  810. }
  811. }
  812. }
  813. kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: nextval, Weight: tk.Weight})
  814. }
  815. }
  816. } else {
  817. if val != "" {
  818. kvTags[k] = append(kvTags[k], &Tag{Key: k, Value: val, IsInvalid: true})
  819. }
  820. }
  821. }
  822. return kvTags
  823. }
  824. func FilterContactKey(key string) string {
  825. key1 := ""
  826. for _, v := range BracketsTextReg.FindAllString(key, -1) {
  827. for _, vv := range ContactType {
  828. if vv.MatchString(v) {
  829. if len([]rune(v)) < 3 || len([]rune(v)) > 10 {
  830. continue
  831. }
  832. key1 = v
  833. break
  834. }
  835. }
  836. }
  837. key = filterK.ReplaceAllString(key, "")
  838. key = tablekeyclear.ReplaceAllString(key, "")
  839. return key1 + key
  840. }
  841. //td里的内容,调用这边的方法分kv的时候,有的带有换行,清理掉
  842. func RemoveWarpOfTdVal(text string) string {
  843. //只有一个冒号
  844. if len(regDivision.FindAllString(text, -1)) != 1 {
  845. return text
  846. }
  847. text = strings.TrimSpace(text)
  848. //有一个换行
  849. array := strings.Split(text, "\n")
  850. if len(array) != 2 {
  851. return text
  852. }
  853. //第一行以冒号结尾
  854. if !colonEndReg.MatchString(array[0]) {
  855. if BracketsTextReg.ReplaceAllString(array[1], "") == "" {
  856. text = array[0] + array[1]
  857. }
  858. return text
  859. }
  860. text = array[0] + array[1]
  861. return text
  862. }
  863. //打标签的时候,清理key
  864. //from 1--冒号key 2--table key
  865. func ClearKey(k string, from int) string {
  866. if buyerAndAgency.MatchString(filterK.FindString(k)) { //采购项目联系人(代理机构)5d423d70a5cb26b9b76fa2e7
  867. return k
  868. }
  869. for {
  870. old := k
  871. if from == 1 {
  872. k = filterK.ReplaceAllString(k, "")
  873. }
  874. k = tablekeyclear.ReplaceAllString(k, "")
  875. k = regReplKey.ReplaceAllString(k, "")
  876. if old == k {
  877. break
  878. }
  879. }
  880. return k
  881. }
  882. //获取金额的单位
  883. func GetMoneyUnit(key, val string) string {
  884. if !(strings.Index(val, "元") > 0 || strings.Index(val, "万") > 0 || strings.Index(val, "亿") > 0) {
  885. mv := clear.ObjToMoney([]interface{}{val, val})
  886. if len(mv) > 0 && qutil.IntAll(mv[0]) > 0 {
  887. for _, dw := range []string{"万", "亿"} {
  888. if strings.Index(key, dw) > 0 {
  889. return dw
  890. }
  891. }
  892. }
  893. }
  894. return ""
  895. }
  // GetIndex returns the index under which ct_k is stored in indexMap, or 1
  // when it is not stored at all.
  //
  // If ct_k is stored under several indexes the smallest one is returned, so
  // the result does not depend on map iteration order.
  func GetIndex(ct_k string, indexMap map[int]string) int {
  	best, found := 1, false
  	for idx, name := range indexMap {
  		if name == ct_k && (!found || idx < best) {
  			best, found = idx, true
  		}
  	}
  	return best
  }