analykv.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. package pretreated
  2. import (
  3. u "jy/util"
  4. "regexp"
  5. "strings"
  6. )
  7. var Han = regexp.MustCompile("[\\p{Han}]")
  8. var Han1 = regexp.MustCompile("[^:;;,:,。. \u3000\u2003\u00a0\\s]")
  9. var Han2 = regexp.MustCompile("[^:;;,:,。.]")
  10. var Key = regexp.MustCompile("[:::]")
  11. var Time = regexp.MustCompile("[\\d]")
  12. var dh = regexp.MustCompile("[,,.]")
  13. var space = regexp.MustCompile("[\\s\\n \u3000\u2003\u00a0]+")
  14. var val = regexp.MustCompile("[^\\s\\n \u3000\u2003\u00a0,,。!;;\\-]")
  15. var matchkh = map[string]string{
  16. "(": ")",
  17. "(": ")",
  18. "【": "】",
  19. "[": "]",
  20. "[": "]",
  21. "〖": "〗",
  22. }
  23. func GetKvFromtxt(con, tag string, from int) ([]*u.Kv, map[string][]*u.Tag) {
  24. res := FindKv(TextAfterRemoveTable(con), tag, from)
  25. kvs := []*u.Kv{}
  26. for _, k := range res.Keys {
  27. v, _ := res.Map[k].(string)
  28. if k != "" && v != "" {
  29. kvs = append(kvs, &u.Kv{
  30. Key: k,
  31. Value: v,
  32. })
  33. }
  34. }
  35. kvTags := GetKvTags(kvs, tag, nil)
  36. return kvs, kvTags
  37. }
  38. type Line struct {
  39. PreLine *Line
  40. NextLine *Line
  41. Strs []string
  42. Str string
  43. Pos int
  44. Len int
  45. KV *SortMap
  46. IsKey bool //是否只是key
  47. Kn int //冒号个数
  48. Spacen int //间隔空格个数
  49. DJh int //逗号句号
  50. }
  51. func NewLine() *Line {
  52. return &Line{
  53. Strs: []string{},
  54. KV: NewSortMap(),
  55. }
  56. }
  57. var LineKey = regexp.MustCompile("^[^,。]{2,10}[::]$")
  58. var DJh = regexp.MustCompile("[,,。]")
  59. var DunH = regexp.MustCompile("[、.]")
  60. func GetLines(con string) (res []*Line) {
  61. res = []*Line{}
  62. l1 := NewLine()
  63. strings.IndexFunc(con, func(r rune) bool {
  64. if r == 10 {
  65. if len(l1.Strs) > 0 {
  66. l1.Str = strings.Join(l1.Strs, "")
  67. if !regexp.MustCompile("^[,,.。\\s \u3000\u2003\u00a0]$").MatchString(l1.Str) {
  68. l1.Str = u.TrimLRSpace(l1.Str, "")
  69. l1.Str = TimeHM.ReplaceAllString(l1.Str, ReplTimeHM)
  70. l1.Strs = strings.Split(l1.Str, "")
  71. res = append(res, l1)
  72. }
  73. l1 = NewLine()
  74. }
  75. } else {
  76. s := string(r)
  77. l1.Strs = append(l1.Strs, s)
  78. }
  79. return false
  80. })
  81. if len(l1.Strs) > 0 {
  82. res = append(res, l1)
  83. }
  84. for k, l := range res {
  85. if k == 0 && k < len(res)-1 {
  86. l.NextLine = res[k+1]
  87. } else if k == len(res)-1 {
  88. l.PreLine = res[k-1]
  89. } else {
  90. l.PreLine = res[k-1]
  91. l.NextLine = res[k+1]
  92. }
  93. if LineKey.MatchString(l.Str) {
  94. l.IsKey = true
  95. } else {
  96. l.Kn = len(Key.FindAllString(l.Str, -1))
  97. l.DJh = len(DJh.FindAllString(l.Str, -1))
  98. }
  99. }
  100. return
  101. }
  102. func FindKv_v2(con, tag string) (m *SortMap) {
  103. m = NewSortMap()
  104. resLine := GetLines(con)
  105. for i := 0; i < len(resLine); i++ {
  106. l1 := resLine[i]
  107. if l1.IsKey {
  108. continue
  109. } else {
  110. if l1.Kn > 0 {
  111. u.Debug("=--=", l1.Str)
  112. } else {
  113. if l1.Spacen == 1 && l1.DJh < 2 && l1.Len < 50 {
  114. u.Debug("===", l1.Str)
  115. } else {
  116. u.Debug("???", l1.Str)
  117. }
  118. }
  119. }
  120. }
  121. return
  122. }
  123. var TimeHM = regexp.MustCompile("[\\s \u3000\u2003\u00a0]*([01]{0,1}[0123456789]|2[0123])[::]([012345][0-9])[::]{0,1}")
  124. var ReplTimeHM = "D${1}H${2}M"
  125. //from 1--全文 2--table td
  126. func FindKv(con, tag string, from int) (m *SortMap) {
  127. if from == 2 || from == 3 {
  128. con = RemoveWarpOfTdVal(con)
  129. }
  130. //FindKv_v2(con, tag)
  131. matchMap := map[string]map[string]bool{
  132. "代理机构": map[string]bool{},
  133. "中标单位": map[string]bool{},
  134. "采购单位": map[string]bool{},
  135. }
  136. m = NewSortMap()
  137. strs := [][]string{}
  138. s1 := []string{}
  139. //断行
  140. strings.IndexFunc(con, func(r rune) bool {
  141. if r == 10 {
  142. if len(s1) > 0 {
  143. str := strings.Join(s1, "")
  144. str = u.TrimLRSpace(str, "")
  145. str = TimeHM.ReplaceAllString(str, ReplTimeHM)
  146. s1 = strings.Split(str, "")
  147. if len(s1) > 0 {
  148. strs = append(strs, s1)
  149. }
  150. s1 = []string{}
  151. }
  152. } else {
  153. s := string(r)
  154. s1 = append(s1, s)
  155. }
  156. return false
  157. })
  158. if len(s1) > 0 {
  159. str := strings.Join(s1, "")
  160. str = u.TrimLRSpace(str, "")
  161. str = TimeHM.ReplaceAllString(str, ReplTimeHM)
  162. s1 = strings.Split(str, "")
  163. if len(s1) > 0 {
  164. strs = append(strs, s1)
  165. }
  166. }
  167. //查找
  168. LastStr := ""
  169. for k0 := 0; k0 < len(strs); k0++ {
  170. s1 := strs[k0]
  171. //u.Debug(strings.Join(s1, ""))
  172. str1 := strings.Join(s1, "")
  173. k := ""
  174. v := ""
  175. flag := 0
  176. pos1, pos2 := -1, -1
  177. bkh := false
  178. skh := ""
  179. if !Key.MatchString(str1) { //此行没有冒号
  180. if k0 > 0 {
  181. tm1 := strs[k0-1]
  182. if len([]rune(LastStr)) > 2 && len(tm1) < 8 && Key.MatchString(tm1[len(tm1)-1:][0]) && len([]rune(str1)) < 30 {
  183. //u.Debug(LastStr, str1)
  184. k = strings.Join(tm1[:len(tm1)-1], "")
  185. v = str1
  186. if k0 < len(strs)-1 {
  187. s2 := u.TrimLRSpace(strings.Join(strs[k0+1], ""), "")
  188. if len([]rune(s2)) < 10 && !regexp.MustCompile("^[0-9]+[、]+$").MatchString(s2) && !Key.MatchString(s2) {
  189. v += s2
  190. k0++
  191. }
  192. }
  193. keydetail(k, v, m, tag, k0, strs, matchMap, from)
  194. }
  195. }
  196. LastStr = ""
  197. continue
  198. } else {
  199. //u.Debug("---===----", str1)
  200. LastStr = str1
  201. for k1 := 0; k1 < len(s1); k1++ {
  202. s := s1[k1]
  203. if matchkh[s] != "" {
  204. skh = matchkh[s]
  205. bkh = true
  206. }
  207. if bkh {
  208. if skh == s {
  209. bkh = false
  210. }
  211. if flag == 1 {
  212. k += s
  213. } else if flag == 2 {
  214. v += s
  215. }
  216. continue
  217. }
  218. if flag == 0 {
  219. k = ""
  220. v = ""
  221. pos1, pos2 = -1, -1
  222. flag = 1
  223. }
  224. if flag == 1 {
  225. if Han1.MatchString(s) || (k != "" && Han2.MatchString(s)) {
  226. k += s
  227. } else if Key.MatchString(s) && k != "" {
  228. flag = 2
  229. } else {
  230. flag = 0
  231. }
  232. } else if flag == 2 {
  233. if val.MatchString(s) || (dh.MatchString(s) && k1 > 0 && k1 < len(s1)-1 && Time.MatchString(s1[k1-1]) && Time.MatchString(s1[k1+1])) {
  234. if pos1 < 0 {
  235. pos1 = k1
  236. }
  237. continue
  238. } else {
  239. be := false
  240. if space.MatchString(s) {
  241. temp := s1[k1+1:]
  242. //()()[]【】
  243. m1 := k1
  244. bkh1 := false
  245. skh1 := ""
  246. for k2, v2 := range temp {
  247. if k2 == len(temp)-1 {
  248. be = true
  249. }
  250. if matchkh[v2] != "" {
  251. bkh1 = true
  252. skh1 = matchkh[v2]
  253. continue
  254. } else if bkh1 {
  255. if v2 == skh1 {
  256. bkh1 = false
  257. }
  258. continue
  259. } else if space.MatchString(v2) {
  260. continue
  261. } else if !val.MatchString(v2) {
  262. k1 = m1 + k2 + 1
  263. break
  264. } else {
  265. if pos1 < 0 {
  266. //u.Debug("-----", pos1)
  267. pos1 = k1 + k2 + 1
  268. }
  269. }
  270. if Key.MatchString(v2) && k2 > 0 && k2 < len(temp)-1 {
  271. if Time.MatchString(temp[k2-1]) && Time.MatchString(temp[k2+1]) {
  272. //u.Debug(v2, temp[k2-1], temp[k2+1])
  273. k1 = m1 + k2 + 1
  274. } else {
  275. //倒着
  276. for i := k2; i > k1-m1-1; i-- {
  277. if !val.MatchString(temp[i]) {
  278. k1 = m1 + i + 1
  279. break
  280. }
  281. }
  282. break
  283. }
  284. }
  285. }
  286. }
  287. if be {
  288. k1 = len(s1) //直接跳到最后
  289. }
  290. if pos2 < 0 && pos2 < pos1 {
  291. pos2 = k1
  292. }
  293. // u.Debug(pos1, pos2, k1, len(s1))
  294. if pos1 > -1 && pos2 > pos1 {
  295. v = strings.Join(s1[pos1:pos2], "")
  296. flag = 0
  297. keydetail(k, v, m, tag, k0, strs, matchMap, from)
  298. } else {
  299. //u.Debug(k, pos1, pos2)
  300. flag = 0
  301. }
  302. }
  303. }
  304. }
  305. if flag == 2 {
  306. if pos2 > pos1 {
  307. v = strings.Join(s1[pos1:pos2], "")
  308. } else if pos1 > 0 {
  309. v = strings.Join(s1[pos1:], "")
  310. }
  311. if v != "" {
  312. flag = 0
  313. keydetail(k, v, m, tag, k0, strs, matchMap, from)
  314. }
  315. //u.Debug(k, v)
  316. }
  317. }
  318. }
  319. // for _, kk := range m.Keys {
  320. // u.Debug(kk, m.Map[kk])
  321. // }
  322. return
  323. }
  324. func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, matchMap map[string]map[string]bool, from int) {
  325. if regexp.MustCompile("^[0-9]+[、]+$").MatchString(v) {
  326. return
  327. }
  328. k = space.ReplaceAllString(k, "")
  329. if len([]rune(k)) > 1 {
  330. if len([]rune(k)) < 5 && strings.HasPrefix(k, "联系") || ContactInfoMustReg.MatchString(k) {
  331. num := 0
  332. bf := false
  333. for i := len(m.Keys) - 1; i > -1; i-- {
  334. num++
  335. if from == 1 && !ContactType["代理机构"].MatchString(k) && ContactType["代理机构"].MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["代理机构"]) {
  336. matchMap["代理机构"][k] = true
  337. k = "代理机构" + k
  338. bf = true
  339. break
  340. }
  341. if !filter_zbdw_ky.MatchString(k) && filter_zbdw_ky.MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["中标单位"]) {
  342. matchMap["中标单位"][k] = true
  343. k = "中标单位" + k
  344. bf = true
  345. break
  346. }
  347. if from == 1 && !ContactType["采购单位"].MatchString(k) && ContactType["采购单位"].MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["采购单位"]) {
  348. matchMap["采购单位"][k] = true
  349. k = "采购单位" + k
  350. bf = true
  351. break
  352. }
  353. //if num > 0 {
  354. break
  355. //}
  356. }
  357. if !bf {
  358. //k = "采购人" + k
  359. //取出上一行
  360. if pos > 0 {
  361. if len(strs[pos-1]) < 20 {
  362. str := space.ReplaceAllString(strings.Join(strs[pos-1], ""), "")
  363. if from == 1 && ContactType["代理机构"].MatchString(str) && !IsContactKvHandle(k, matchMap["代理机构"]) {
  364. matchMap["代理机构"][k] = true
  365. k = "代理机构" + k
  366. } else if filter_zbdw_ky.MatchString(str) && !IsContactKvHandle(k, matchMap["中标单位"]) {
  367. matchMap["中标单位"][k] = true
  368. k = "中标单位" + k
  369. } else if from == 1 && ContactType["采购单位"].MatchString(str) && !IsContactKvHandle(k, matchMap["采购单位"]) {
  370. matchMap["采购单位"][k] = true
  371. k = "采购单位" + k
  372. }
  373. }
  374. }
  375. }
  376. } else if len([]rune(k)) == 2 {
  377. if filter_zbje_jd.MatchString(k) { //钱
  378. if tag != "" && filter_tag_zb.MatchString(tag) {
  379. k = "中标" + k
  380. } else {
  381. num := 0
  382. for i := len(m.Keys) - 1; i > -1; i-- {
  383. num++
  384. if filter_zbdw_ky.MatchString(m.Keys[i]) {
  385. k = "中标" + k
  386. break
  387. }
  388. if num > 2 {
  389. break
  390. }
  391. }
  392. }
  393. }
  394. }
  395. //u.Debug(k, v)
  396. if m.Map[k] == nil {
  397. m.AddKey(k, v)
  398. }
  399. }
  400. }
  401. //时间处理、换行优先级|
  402. func FindKv_v1(con string) (m map[string]string) {
  403. m = map[string]string{}
  404. k := ""
  405. v := ""
  406. flag := 0
  407. strings.IndexFunc(con, func(r rune) bool {
  408. s := string(r)
  409. if flag == 0 {
  410. k = ""
  411. v = ""
  412. flag = 1
  413. }
  414. if flag == 1 {
  415. if Han.MatchString(s) {
  416. k += s
  417. } else if Key.MatchString(s) && k != "" {
  418. flag = 2
  419. } else {
  420. flag = 0
  421. }
  422. } else if flag == 2 {
  423. if v == "" {
  424. if space.MatchString(s) {
  425. } else if val.MatchString(s) && !Key.MatchString(s) {
  426. v += s
  427. } else {
  428. flag = 0
  429. }
  430. } else {
  431. if val.MatchString(s) {
  432. if Key.MatchString(k) {
  433. if (regexp.MustCompile("(时间|日期)").MatchString(v) || regexp.MustCompile("(时间|日期)").MatchString(k)) && regexp.MustCompile("[^\\d][012]?[0-9]").MatchString(k) {
  434. v += s
  435. } else if regexp.MustCompile("^[\\p{Han}]$").MatchString(v) {
  436. k = v
  437. v = ""
  438. flag = 1
  439. }
  440. } else {
  441. v += s
  442. }
  443. } else if k != "" && v != "" {
  444. u.Debug(k, "=", v)
  445. flag = 0
  446. }
  447. }
  448. }
  449. return false
  450. })
  451. if flag == 2 && k != "" && v != "" {
  452. u.Debug(k, "=", v)
  453. }
  454. return
  455. }