analykv.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497
  1. package pretreated
  2. import (
  3. u "jy/util"
  4. //qu "qfw/util"
  5. "regexp"
  6. "strings"
  7. )
  8. var Han = regexp.MustCompile("[\\p{Han}]")
  9. var Han1 = regexp.MustCompile("[^:;;,:,。. \u3000\u2003\u00a0\\s]")
  10. var Han2 = regexp.MustCompile("[^:;;,:,。.]")
  11. var Key = regexp.MustCompile("[:::]")
  12. var Time = regexp.MustCompile("[\\d]")
  13. var dh = regexp.MustCompile("[,,.]")
  14. var space = regexp.MustCompile("[\\s\\n \u3000\u2003\u00a0]+")
  15. var val = regexp.MustCompile("[^\\s\\n \u3000\u2003\u00a0,,。!;;]")
  16. var matchkh = map[string]string{
  17. "(": ")",
  18. "(": ")",
  19. "【": "】",
  20. "[": "]",
  21. "[": "]",
  22. "〖": "〗",
  23. }
  24. func GetKvFromtxt(con, tag string, from int, isSite bool, codeSite string) ([]*u.Kv, map[string][]*u.Tag) {
  25. res := FindKv(TextAfterRemoveTable(con), tag, from)
  26. kvs := []*u.Kv{}
  27. for _, k := range res.Keys {
  28. v, _ := res.Map[k].(string)
  29. if k != "" && v != "" {
  30. kvs = append(kvs, &u.Kv{
  31. Key: k,
  32. Value: v,
  33. })
  34. }
  35. }
  36. kvTags := GetKvTags(kvs, tag, nil, isSite, codeSite)
  37. return kvs, kvTags
  38. }
  39. type Line struct {
  40. PreLine *Line
  41. NextLine *Line
  42. Strs []string
  43. Str string
  44. Pos int
  45. Len int
  46. KV *SortMap
  47. IsKey bool //是否只是key
  48. Kn int //冒号个数
  49. Spacen int //间隔空格个数
  50. DJh int //逗号句号
  51. }
  52. func NewLine() *Line {
  53. return &Line{
  54. Strs: []string{},
  55. KV: NewSortMap(),
  56. }
  57. }
  // Patterns used to classify whole lines.
  var (
  	// LineKey matches a line that is only a key: 2 to 10 characters that are
  	// not a comma or full stop, followed by a colon at the end of the line.
  	LineKey = regexp.MustCompile("^[^,。]{2,10}[::]$")
  	// DJh matches a comma or a full stop.
  	DJh = regexp.MustCompile("[,,。]")
  	// DunH matches an enumeration comma or a dot.
  	DunH = regexp.MustCompile("[、.]")
  )
  61. func GetLines(con string) (res []*Line) {
  62. res = []*Line{}
  63. l1 := NewLine()
  64. strings.IndexFunc(con, func(r rune) bool {
  65. if r == 10 {
  66. if len(l1.Strs) > 0 {
  67. l1.Str = strings.Join(l1.Strs, "")
  68. if !regexp.MustCompile("^[,,.。\\s \u3000\u2003\u00a0]$").MatchString(l1.Str) {
  69. l1.Str = u.TrimLRSpace(l1.Str, "")
  70. l1.Str = TimeHM.ReplaceAllString(l1.Str, ReplTimeHM)
  71. l1.Strs = strings.Split(l1.Str, "")
  72. res = append(res, l1)
  73. }
  74. l1 = NewLine()
  75. }
  76. } else {
  77. s := string(r)
  78. l1.Strs = append(l1.Strs, s)
  79. }
  80. return false
  81. })
  82. if len(l1.Strs) > 0 {
  83. res = append(res, l1)
  84. }
  85. for k, l := range res {
  86. if k == 0 && k < len(res)-1 {
  87. l.NextLine = res[k+1]
  88. } else if k == len(res)-1 {
  89. l.PreLine = res[k-1]
  90. } else {
  91. l.PreLine = res[k-1]
  92. l.NextLine = res[k+1]
  93. }
  94. if LineKey.MatchString(l.Str) {
  95. l.IsKey = true
  96. } else {
  97. l.Kn = len(Key.FindAllString(l.Str, -1))
  98. l.DJh = len(DJh.FindAllString(l.Str, -1))
  99. }
  100. }
  101. return
  102. }
  103. func FindKv_v2(con, tag string) (m *SortMap) {
  104. m = NewSortMap()
  105. resLine := GetLines(con)
  106. for i := 0; i < len(resLine); i++ {
  107. l1 := resLine[i]
  108. if l1.IsKey {
  109. continue
  110. } else {
  111. if l1.Kn > 0 {
  112. u.Debug("=--=", l1.Str)
  113. } else {
  114. if l1.Spacen == 1 && l1.DJh < 2 && l1.Len < 50 {
  115. u.Debug("===", l1.Str)
  116. } else {
  117. u.Debug("???", l1.Str)
  118. }
  119. }
  120. }
  121. }
  122. return
  123. }
  // Clock-time rewriting applied to every line before key/value scanning, so
  // that the colon inside a time is not taken for a key/value separator.
  var (
  	// TimeHM matches optional leading blanks, an hour (0-23, optionally
  	// zero-padded), a colon, a two-digit minute and an optional trailing colon.
  	TimeHM = regexp.MustCompile("[\\s \u3000\u2003\u00a0]*([01]{0,1}[0123456789]|2[0123])[::]([012345][0-9])[::]{0,1}")
  	// ReplTimeHM is the replacement template: D<hour>H<minute>M.
  	ReplTimeHM = "D${1}H${2}M"
  )
  126. //from 1--全文 2--table td
  127. func FindKv(con, tag string, from int) (m *SortMap) {
  128. if from == 2 || from == 3 {
  129. con = RemoveWarpOfTdVal(con)
  130. }
  131. //FindKv_v2(con, tag)
  132. matchMap := map[string]map[string]bool{
  133. "代理机构": map[string]bool{},
  134. "中标单位": map[string]bool{},
  135. "采购单位": map[string]bool{},
  136. }
  137. doubtMap := map[int]bool{}
  138. m = NewSortMap()
  139. strs := [][]string{}
  140. s1 := []string{}
  141. //断行
  142. strings.IndexFunc(con, func(r rune) bool {
  143. if r == 10 || r == 59 {
  144. if len(s1) > 0 {
  145. str := strings.Join(s1, "")
  146. str = u.TrimLRSpace(str, "")
  147. str = TimeHM.ReplaceAllString(str, ReplTimeHM)
  148. s1 = strings.Split(str, "")
  149. if len(s1) > 0 {
  150. strs = append(strs, s1)
  151. }
  152. s1 = []string{}
  153. }
  154. } else {
  155. s := string(r)
  156. s1 = append(s1, s)
  157. }
  158. return false
  159. })
  160. if len(s1) > 0 {
  161. str := strings.Join(s1, "")
  162. str = u.TrimLRSpace(str, "")
  163. str = TimeHM.ReplaceAllString(str, ReplTimeHM)
  164. s1 = strings.Split(str, "")
  165. if len(s1) > 0 {
  166. strs = append(strs, s1)
  167. }
  168. }
  169. //查找
  170. LastStr := ""
  171. for k0 := 0; k0 < len(strs); k0++ {
  172. s1 := strs[k0]
  173. //u.Debug(strings.Join(s1, ""))
  174. str1 := strings.Join(s1, "")
  175. k := ""
  176. v := ""
  177. flag := 0
  178. pos1, pos2 := -1, -1
  179. bkh := false
  180. skh := ""
  181. if from == 1 && DoubtReg.MatchString(str1) {
  182. doubtMap[k0] = true
  183. }
  184. if !Key.MatchString(str1) { //此行没有冒号
  185. if k0 > 0 {
  186. tm1 := strs[k0-1]
  187. if len([]rune(LastStr)) > 2 && len(tm1) < 8 && Key.MatchString(tm1[len(tm1)-1:][0]) && len([]rune(str1)) < 30 {
  188. //u.Debug(LastStr, str1)
  189. k = strings.Join(tm1[:len(tm1)-1], "")
  190. v = str1
  191. if k0 < len(strs)-1 {
  192. s2 := u.TrimLRSpace(strings.Join(strs[k0+1], ""), "")
  193. if len([]rune(s2)) < 10 && !regexp.MustCompile("^[0-9]+[、]+$").MatchString(s2) && !Key.MatchString(s2) {
  194. v += s2
  195. k0++
  196. }
  197. }
  198. keydetail(k, v, m, tag, k0, strs, matchMap, from, doubtMap)
  199. }
  200. }
  201. LastStr = ""
  202. continue
  203. } else {
  204. //u.Debug("---===----", str1)
  205. LastStr = str1
  206. for k1 := 0; k1 < len(s1); k1++ {
  207. s := s1[k1]
  208. if matchkh[s] != "" {
  209. skh = matchkh[s]
  210. bkh = true
  211. }
  212. if bkh {
  213. if skh == s {
  214. bkh = false
  215. }
  216. if flag == 1 {
  217. k += s
  218. } else if flag == 2 {
  219. v += s
  220. }
  221. continue
  222. }
  223. if flag == 0 {
  224. k = ""
  225. v = ""
  226. pos1, pos2 = -1, -1
  227. flag = 1
  228. }
  229. if flag == 1 {
  230. if Han1.MatchString(s) || (k != "" && Han2.MatchString(s)) {
  231. k += s
  232. } else if Key.MatchString(s) && k != "" {
  233. flag = 2
  234. } else {
  235. flag = 0
  236. }
  237. } else if flag == 2 {
  238. if val.MatchString(s) || (dh.MatchString(s) && k1 > 0 && k1 < len(s1)-1 && Time.MatchString(s1[k1-1]) && Time.MatchString(s1[k1+1])) {
  239. if pos1 < 0 {
  240. pos1 = k1
  241. }
  242. continue
  243. } else {
  244. be := false
  245. if space.MatchString(s) {
  246. temp := s1[k1+1:]
  247. //()()[]【】
  248. m1 := k1
  249. bkh1 := false
  250. skh1 := ""
  251. for k2, v2 := range temp {
  252. if k2 == len(temp)-1 {
  253. be = true
  254. }
  255. if matchkh[v2] != "" {
  256. bkh1 = true
  257. skh1 = matchkh[v2]
  258. continue
  259. } else if bkh1 {
  260. if v2 == skh1 {
  261. bkh1 = false
  262. }
  263. continue
  264. } else if space.MatchString(v2) {
  265. continue
  266. } else if !val.MatchString(v2) {
  267. k1 = m1 + k2 + 1
  268. break
  269. } else {
  270. if pos1 < 0 {
  271. //u.Debug("-----", pos1)
  272. pos1 = k1 + k2 + 1
  273. }
  274. }
  275. if Key.MatchString(v2) && k2 > 0 && k2 < len(temp)-1 {
  276. if Time.MatchString(temp[k2-1]) && Time.MatchString(temp[k2+1]) {
  277. //u.Debug(v2, temp[k2-1], temp[k2+1])
  278. k1 = m1 + k2 + 1
  279. } else {
  280. //倒着
  281. for i := k2; i > k1-m1-1; i-- {
  282. if !val.MatchString(temp[i]) {
  283. k1 = m1 + i + 1
  284. break
  285. }
  286. }
  287. break
  288. }
  289. }
  290. }
  291. }
  292. if be {
  293. k1 = len(s1) //直接跳到最后
  294. }
  295. if pos2 < 0 && pos2 < pos1 {
  296. pos2 = k1
  297. }
  298. // u.Debug(pos1, pos2, k1, len(s1))
  299. if pos1 > -1 && pos2 > pos1 {
  300. v = strings.Join(s1[pos1:pos2], "")
  301. flag = 0
  302. keydetail(k, v, m, tag, k0, strs, matchMap, from, doubtMap)
  303. } else {
  304. //u.Debug(k, pos1, pos2)
  305. flag = 0
  306. }
  307. }
  308. }
  309. }
  310. if flag == 2 {
  311. if pos2 > pos1 {
  312. v = strings.Join(s1[pos1:pos2], "")
  313. } else if pos1 > 0 {
  314. v = strings.Join(s1[pos1:], "")
  315. }
  316. if v != "" {
  317. flag = 0
  318. keydetail(k, v, m, tag, k0, strs, matchMap, from, doubtMap)
  319. }
  320. //u.Debug(k, v)
  321. }
  322. }
  323. }
  324. // for _, kk := range m.Keys {
  325. // u.Debug(kk, m.Map[kk])
  326. // }
  327. return
  328. }
  329. func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, matchMap map[string]map[string]bool, from int, doubtMap map[int]bool) {
  330. if regexp.MustCompile("^[0-9]+[、]+$").MatchString(v) {
  331. return
  332. }
  333. k = space.ReplaceAllString(k, "")
  334. if len([]rune(k)) > 1 {
  335. if len([]rune(k)) < 5 && strings.HasPrefix(k, "联系") || ContactInfoMustReg.MatchString(k) {
  336. /*
  337. 5ded053fe9d1f601e4c9e3ee
  338. 中标人:XXXXXXXXXXXXXXXXXX
  339. 相关竞价人对成交结果有异议的,可自本公告发布之日起三日内书面提出。
  340. 联系方式:卢明珠 0871-66136373
  341. */
  342. if doubtMap[pos-1] && len(m.Map) == 1 { //当识别到中标、采购、代理标签后,对其后的联系人、电话等信息判断是否属于该标签
  343. goto L
  344. }
  345. num := 0
  346. bf := false
  347. for i := len(m.Keys) - 1; i > -1; i-- {
  348. //u.Debug("k", k)
  349. num++
  350. if from == 1 && !ContactType["代理机构"].MatchString(k) && ContactType["代理机构"].MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["代理机构"]) {
  351. matchMap["代理机构"][k] = true
  352. k = "代理机构" + k
  353. bf = true
  354. break
  355. }
  356. //if !filter_zbdw_ky.MatchString(k) && filter_zbdw_ky.MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["中标单位"]) {
  357. if from == 1 && !ContactType["中标单位"].MatchString(k) && ContactType["中标单位"].MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["中标单位"]) {
  358. matchMap["中标单位"][k] = true
  359. k = "中标单位" + k
  360. bf = true
  361. break
  362. }
  363. if from == 1 && !ContactType["采购单位"].MatchString(k) && ContactType["采购单位"].MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["采购单位"]) {
  364. matchMap["采购单位"][k] = true
  365. k = "采购单位" + k
  366. bf = true
  367. break
  368. }
  369. //if num > 0 {
  370. break
  371. //}
  372. }
  373. if !bf {
  374. //k = "采购人" + k
  375. //取出上一行
  376. if pos > 0 {
  377. if len(strs[pos-1]) < 20 {
  378. str := space.ReplaceAllString(strings.Join(strs[pos-1], ""), "")
  379. if from == 1 && ContactType["代理机构"].MatchString(str) && !IsContactKvHandle(k, matchMap["代理机构"]) {
  380. matchMap["代理机构"][k] = true
  381. k = "代理机构" + k
  382. } else if from == 1 && ContactType["中标单位"].MatchString(str) && !IsContactKvHandle(k, matchMap["中标单位"]) {
  383. matchMap["中标单位"][k] = true
  384. k = "中标单位" + k
  385. } else if from == 1 && ContactType["采购单位"].MatchString(str) && !IsContactKvHandle(k, matchMap["采购单位"]) {
  386. matchMap["采购单位"][k] = true
  387. k = "采购单位" + k
  388. }
  389. }
  390. }
  391. }
  392. } else if len([]rune(k)) == 2 {
  393. if filter_zbje_jd.MatchString(k) { //钱
  394. if tag != "" && filter_tag_zb.MatchString(tag) {
  395. k = "中标" + k
  396. } else {
  397. num := 0
  398. for i := len(m.Keys) - 1; i > -1; i-- {
  399. num++
  400. if filter_zbdw_ky.MatchString(m.Keys[i]) {
  401. k = "中标" + k
  402. break
  403. }
  404. if num > 2 {
  405. break
  406. }
  407. }
  408. }
  409. }
  410. }
  411. L:
  412. //qu.Debug(k, v)
  413. if m.Map[k] == nil {
  414. m.AddKey(k, v)
  415. } else {
  416. vals := []string{}
  417. if vvv, ok := m.Map[k].([]string); ok {
  418. vals = append(vals, vvv...)
  419. } else {
  420. vals = append(vals, v)
  421. }
  422. vals = append(vals, v)
  423. m.AddKey(k, vals)
  424. }
  425. }
  426. }
  427. //时间处理、换行优先级|
  428. func FindKv_v1(con string) (m map[string]string) {
  429. m = map[string]string{}
  430. k := ""
  431. v := ""
  432. flag := 0
  433. strings.IndexFunc(con, func(r rune) bool {
  434. s := string(r)
  435. if flag == 0 {
  436. k = ""
  437. v = ""
  438. flag = 1
  439. }
  440. if flag == 1 {
  441. if Han.MatchString(s) {
  442. k += s
  443. } else if Key.MatchString(s) && k != "" {
  444. flag = 2
  445. } else {
  446. flag = 0
  447. }
  448. } else if flag == 2 {
  449. if v == "" {
  450. if space.MatchString(s) {
  451. } else if val.MatchString(s) && !Key.MatchString(s) {
  452. v += s
  453. } else {
  454. flag = 0
  455. }
  456. } else {
  457. if val.MatchString(s) {
  458. if Key.MatchString(k) {
  459. if (regexp.MustCompile("(时间|日期)").MatchString(v) || regexp.MustCompile("(时间|日期)").MatchString(k)) && regexp.MustCompile("[^\\d][012]?[0-9]").MatchString(k) {
  460. v += s
  461. } else if regexp.MustCompile("^[\\p{Han}]$").MatchString(v) {
  462. k = v
  463. v = ""
  464. flag = 1
  465. }
  466. } else {
  467. v += s
  468. }
  469. } else if k != "" && v != "" {
  470. u.Debug(k, "=", v)
  471. flag = 0
  472. }
  473. }
  474. }
  475. return false
  476. })
  477. if flag == 2 && k != "" && v != "" {
  478. u.Debug(k, "=", v)
  479. }
  480. return
  481. }