|
- package pretreated
- import (
- u "jy/util"
- "regexp"
- "strings"
- )
// Character classes shared by the key/value scanners in this file.
//
// NOTE(review): in the previous text every class that mixes punctuation held
// the same ASCII mark twice (for example "[:::]" and "[,,.]"), and matchkh
// listed "(" and "[" twice, which Go rejects as duplicate map keys. The
// repeated entries are taken to be the full-width forms of those marks that
// were folded to ASCII by a Unicode-normalizing tool. They are restored here
// as \u escapes so that they survive such tools. The third alternative of the
// original Key class could not be recovered.
var (
	// Han matches one Han ideograph.
	Han = regexp.MustCompile("[\\p{Han}]")
	// Han1 matches any character that is neither whitespace nor a colon,
	// semicolon, comma or full stop (ASCII or full-width). FindKv uses it to
	// accept key characters.
	Han1 = regexp.MustCompile("[^\uff1a\uff1b;\uff0c:,。. \u3000\u2003\u00a0\\s]")
	// Han2 is Han1 without the whitespace exclusion. FindKv applies it once
	// the key is non-empty, so whitespace may appear inside a key.
	Han2 = regexp.MustCompile("[^\uff1a\uff1b;\uff0c:,。.]")
	// Key matches the colon that separates a key from its value.
	Key = regexp.MustCompile("[\uff1a:]")
	// Time matches a single digit.
	Time = regexp.MustCompile("[\\d]")
	// dh matches a comma or a dot. FindKv keeps such a mark inside a value
	// only when it sits between two digits.
	dh = regexp.MustCompile("[\uff0c,.]")
	// space matches a run of whitespace, including the ideographic, em and
	// no-break spaces.
	space = regexp.MustCompile("[\\s\\n \u3000\u2003\u00a0]+")
	// val matches one character that may belong to a value: anything except
	// whitespace, comma, ideographic full stop, '!', semicolon and '-'.
	val = regexp.MustCompile("[^\\s\\n \u3000\u2003\u00a0\uff0c,。!\uff1b;\\-]")
)

// matchkh maps every opening bracket the scanners recognise to its closing
// partner. Text between a bracket pair is copied through without being
// interpreted as key/value punctuation.
var matchkh = map[string]string{
	"(": ")",
	// full-width parentheses
	"\uff08": "\uff09",
	"【":      "】",
	"[":      "]",
	// full-width square brackets
	"\uff3b": "\uff3d",
	"〖":      "〗",
}
- func GetKvFromtxt(con, tag string, from int) ([]*u.Kv, map[string][]*u.Tag) {
- res := FindKv(TextAfterRemoveTable(con), tag, from)
- kvs := []*u.Kv{}
- for _, k := range res.Keys {
- v, _ := res.Map[k].(string)
- if k != "" && v != "" {
- kvs = append(kvs, &u.Kv{
- Key: k,
- Value: v,
- })
- }
- }
- kvTags := GetKvTags(kvs, tag, nil)
- return kvs, kvTags
- }
- type Line struct {
- PreLine *Line
- NextLine *Line
- Strs []string
- Str string
- Pos int
- Len int
- KV *SortMap
- IsKey bool //是否只是key
- Kn int //冒号个数
- Spacen int //间隔空格个数
- DJh int //逗号句号
- }
- func NewLine() *Line {
- return &Line{
- Strs: []string{},
- KV: NewSortMap(),
- }
- }
// Line-classification patterns used by GetLines.
//
// NOTE(review): the previous classes "[::]" and "[,,。]" repeated one ASCII
// mark, which looks like a full-width colon/comma folded to ASCII. The
// full-width forms are restored as \u escapes.
var (
	// LineKey matches a whole line that is only a label: 2 to 10 characters
	// containing neither ',' nor '。', followed by a closing colon.
	LineKey = regexp.MustCompile("^[^,。]{2,10}[\uff1a:]$")
	// DJh matches one comma (ASCII or full-width) or ideographic full stop.
	DJh = regexp.MustCompile("[\uff0c,。]")
	// DunH matches the enumeration comma '、' or a dot.
	DunH = regexp.MustCompile("[、.]")
)
- func GetLines(con string) (res []*Line) {
- res = []*Line{}
- l1 := NewLine()
- strings.IndexFunc(con, func(r rune) bool {
- if r == 10 {
- if len(l1.Strs) > 0 {
- l1.Str = strings.Join(l1.Strs, "")
- if !regexp.MustCompile("^[,,.。\\s \u3000\u2003\u00a0]$").MatchString(l1.Str) {
- l1.Str = u.TrimLRSpace(l1.Str, "")
- l1.Str = TimeHM.ReplaceAllString(l1.Str, ReplTimeHM)
- l1.Strs = strings.Split(l1.Str, "")
- res = append(res, l1)
- }
- l1 = NewLine()
- }
- } else {
- s := string(r)
- l1.Strs = append(l1.Strs, s)
- }
- return false
- })
- if len(l1.Strs) > 0 {
- res = append(res, l1)
- }
- for k, l := range res {
- if k == 0 && k < len(res)-1 {
- l.NextLine = res[k+1]
- } else if k == len(res)-1 {
- l.PreLine = res[k-1]
- } else {
- l.PreLine = res[k-1]
- l.NextLine = res[k+1]
- }
- if LineKey.MatchString(l.Str) {
- l.IsKey = true
- } else {
- l.Kn = len(Key.FindAllString(l.Str, -1))
- l.DJh = len(DJh.FindAllString(l.Str, -1))
- }
- }
- return
- }
- func FindKv_v2(con, tag string) (m *SortMap) {
- m = NewSortMap()
- resLine := GetLines(con)
- for i := 0; i < len(resLine); i++ {
- l1 := resLine[i]
- if l1.IsKey {
- continue
- } else {
- if l1.Kn > 0 {
- u.Debug("=--=", l1.Str)
- } else {
- if l1.Spacen == 1 && l1.DJh < 2 && l1.Len < 50 {
- u.Debug("===", l1.Str)
- } else {
- u.Debug("???", l1.Str)
- }
- }
- }
- }
- return
- }
var (
	// TimeHM matches a clock time such as "9:30" or "23:59:", together with
	// any whitespace in front of it. Group 1 is the hour (0-23, optional
	// leading 0 or 1), group 2 the minute (00-59). One colon directly after
	// the minutes is consumed as well.
	//
	// NOTE(review): both colon classes previously read "[::]", which looks
	// like a full-width colon folded to ASCII; the full-width form is
	// restored here as a \u escape.
	TimeHM = regexp.MustCompile("[\\s \u3000\u2003\u00a0]*([01]{0,1}[0123456789]|2[0123])[\uff1a:]([012345][0-9])[\uff1a:]{0,1}")

	// ReplTimeHM is the replacement template used with TimeHM. It rewrites
	// the match as D<hour>H<minute>M, which removes the colons from the text
	// before it is scanned for key/value separators.
	ReplTimeHM = "D${1}H${2}M"
)
- //from 1--全文 2--table td
- func FindKv(con, tag string, from int) (m *SortMap) {
- if from == 2 || from == 3 {
- con = RemoveWarpOfTdVal(con)
- }
- //FindKv_v2(con, tag)
- matchMap := map[string]map[string]bool{
- "代理机构": map[string]bool{},
- "中标单位": map[string]bool{},
- "采购单位": map[string]bool{},
- }
- m = NewSortMap()
- strs := [][]string{}
- s1 := []string{}
- //断行
- strings.IndexFunc(con, func(r rune) bool {
- if r == 10 {
- if len(s1) > 0 {
- str := strings.Join(s1, "")
- str = u.TrimLRSpace(str, "")
- str = TimeHM.ReplaceAllString(str, ReplTimeHM)
- s1 = strings.Split(str, "")
- if len(s1) > 0 {
- strs = append(strs, s1)
- }
- s1 = []string{}
- }
- } else {
- s := string(r)
- s1 = append(s1, s)
- }
- return false
- })
- if len(s1) > 0 {
- str := strings.Join(s1, "")
- str = u.TrimLRSpace(str, "")
- str = TimeHM.ReplaceAllString(str, ReplTimeHM)
- s1 = strings.Split(str, "")
- if len(s1) > 0 {
- strs = append(strs, s1)
- }
- }
- //查找
- LastStr := ""
- for k0 := 0; k0 < len(strs); k0++ {
- s1 := strs[k0]
- //u.Debug(strings.Join(s1, ""))
- str1 := strings.Join(s1, "")
- k := ""
- v := ""
- flag := 0
- pos1, pos2 := -1, -1
- bkh := false
- skh := ""
- if !Key.MatchString(str1) { //此行没有冒号
- if k0 > 0 {
- tm1 := strs[k0-1]
- if len([]rune(LastStr)) > 2 && len(tm1) < 8 && Key.MatchString(tm1[len(tm1)-1:][0]) && len([]rune(str1)) < 30 {
- //u.Debug(LastStr, str1)
- k = strings.Join(tm1[:len(tm1)-1], "")
- v = str1
- if k0 < len(strs)-1 {
- s2 := u.TrimLRSpace(strings.Join(strs[k0+1], ""), "")
- if len([]rune(s2)) < 10 && !regexp.MustCompile("^[0-9]+[、]+$").MatchString(s2) && !Key.MatchString(s2) {
- v += s2
- k0++
- }
- }
- keydetail(k, v, m, tag, k0, strs, matchMap, from)
- }
- }
- LastStr = ""
- continue
- } else {
- //u.Debug("---===----", str1)
- LastStr = str1
- for k1 := 0; k1 < len(s1); k1++ {
- s := s1[k1]
- if matchkh[s] != "" {
- skh = matchkh[s]
- bkh = true
- }
- if bkh {
- if skh == s {
- bkh = false
- }
- if flag == 1 {
- k += s
- } else if flag == 2 {
- v += s
- }
- continue
- }
- if flag == 0 {
- k = ""
- v = ""
- pos1, pos2 = -1, -1
- flag = 1
- }
- if flag == 1 {
- if Han1.MatchString(s) || (k != "" && Han2.MatchString(s)) {
- k += s
- } else if Key.MatchString(s) && k != "" {
- flag = 2
- } else {
- flag = 0
- }
- } else if flag == 2 {
- if val.MatchString(s) || (dh.MatchString(s) && k1 > 0 && k1 < len(s1)-1 && Time.MatchString(s1[k1-1]) && Time.MatchString(s1[k1+1])) {
- if pos1 < 0 {
- pos1 = k1
- }
- continue
- } else {
- be := false
- if space.MatchString(s) {
- temp := s1[k1+1:]
- //()()[]【】
- m1 := k1
- bkh1 := false
- skh1 := ""
- for k2, v2 := range temp {
- if k2 == len(temp)-1 {
- be = true
- }
- if matchkh[v2] != "" {
- bkh1 = true
- skh1 = matchkh[v2]
- continue
- } else if bkh1 {
- if v2 == skh1 {
- bkh1 = false
- }
- continue
- } else if space.MatchString(v2) {
- continue
- } else if !val.MatchString(v2) {
- k1 = m1 + k2 + 1
- break
- } else {
- if pos1 < 0 {
- //u.Debug("-----", pos1)
- pos1 = k1 + k2 + 1
- }
- }
- if Key.MatchString(v2) && k2 > 0 && k2 < len(temp)-1 {
- if Time.MatchString(temp[k2-1]) && Time.MatchString(temp[k2+1]) {
- //u.Debug(v2, temp[k2-1], temp[k2+1])
- k1 = m1 + k2 + 1
- } else {
- //倒着
- for i := k2; i > k1-m1-1; i-- {
- if !val.MatchString(temp[i]) {
- k1 = m1 + i + 1
- break
- }
- }
- break
- }
- }
- }
- }
- if be {
- k1 = len(s1) //直接跳到最后
- }
- if pos2 < 0 && pos2 < pos1 {
- pos2 = k1
- }
- // u.Debug(pos1, pos2, k1, len(s1))
- if pos1 > -1 && pos2 > pos1 {
- v = strings.Join(s1[pos1:pos2], "")
- flag = 0
- keydetail(k, v, m, tag, k0, strs, matchMap, from)
- } else {
- //u.Debug(k, pos1, pos2)
- flag = 0
- }
- }
- }
- }
- if flag == 2 {
- if pos2 > pos1 {
- v = strings.Join(s1[pos1:pos2], "")
- } else if pos1 > 0 {
- v = strings.Join(s1[pos1:], "")
- }
- if v != "" {
- flag = 0
- keydetail(k, v, m, tag, k0, strs, matchMap, from)
- }
- //u.Debug(k, v)
- }
- }
- }
- // for _, kk := range m.Keys {
- // u.Debug(kk, m.Map[kk])
- // }
- return
- }
- func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, matchMap map[string]map[string]bool, from int) {
- if regexp.MustCompile("^[0-9]+[、]+$").MatchString(v) {
- return
- }
- k = space.ReplaceAllString(k, "")
- if len([]rune(k)) > 1 {
- if len([]rune(k)) < 5 && strings.HasPrefix(k, "联系") || ContactInfoMustReg.MatchString(k) {
- num := 0
- bf := false
- for i := len(m.Keys) - 1; i > -1; i-- {
- num++
- if from == 1 && !ContactType["代理机构"].MatchString(k) && ContactType["代理机构"].MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["代理机构"]) {
- matchMap["代理机构"][k] = true
- k = "代理机构" + k
- bf = true
- break
- }
- if !filter_zbdw_ky.MatchString(k) && filter_zbdw_ky.MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["中标单位"]) {
- matchMap["中标单位"][k] = true
- k = "中标单位" + k
- bf = true
- break
- }
- if from == 1 && !ContactType["采购单位"].MatchString(k) && ContactType["采购单位"].MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["采购单位"]) {
- matchMap["采购单位"][k] = true
- k = "采购单位" + k
- bf = true
- break
- }
- //if num > 0 {
- break
- //}
- }
- if !bf {
- //k = "采购人" + k
- //取出上一行
- if pos > 0 {
- if len(strs[pos-1]) < 20 {
- str := space.ReplaceAllString(strings.Join(strs[pos-1], ""), "")
- if from == 1 && ContactType["代理机构"].MatchString(str) && !IsContactKvHandle(k, matchMap["代理机构"]) {
- matchMap["代理机构"][k] = true
- k = "代理机构" + k
- } else if filter_zbdw_ky.MatchString(str) && !IsContactKvHandle(k, matchMap["中标单位"]) {
- matchMap["中标单位"][k] = true
- k = "中标单位" + k
- } else if from == 1 && ContactType["采购单位"].MatchString(str) && !IsContactKvHandle(k, matchMap["采购单位"]) {
- matchMap["采购单位"][k] = true
- k = "采购单位" + k
- }
- }
- }
- }
- } else if len([]rune(k)) == 2 {
- if filter_zbje_jd.MatchString(k) { //钱
- if tag != "" && filter_tag_zb.MatchString(tag) {
- k = "中标" + k
- } else {
- num := 0
- for i := len(m.Keys) - 1; i > -1; i-- {
- num++
- if filter_zbdw_ky.MatchString(m.Keys[i]) {
- k = "中标" + k
- break
- }
- if num > 2 {
- break
- }
- }
- }
- }
- }
- //u.Debug(k, v)
- if m.Map[k] == nil {
- m.AddKey(k, v)
- }
- }
- }
- //时间处理、换行优先级|
- func FindKv_v1(con string) (m map[string]string) {
- m = map[string]string{}
- k := ""
- v := ""
- flag := 0
- strings.IndexFunc(con, func(r rune) bool {
- s := string(r)
- if flag == 0 {
- k = ""
- v = ""
- flag = 1
- }
- if flag == 1 {
- if Han.MatchString(s) {
- k += s
- } else if Key.MatchString(s) && k != "" {
- flag = 2
- } else {
- flag = 0
- }
- } else if flag == 2 {
- if v == "" {
- if space.MatchString(s) {
- } else if val.MatchString(s) && !Key.MatchString(s) {
- v += s
- } else {
- flag = 0
- }
- } else {
- if val.MatchString(s) {
- if Key.MatchString(k) {
- if (regexp.MustCompile("(时间|日期)").MatchString(v) || regexp.MustCompile("(时间|日期)").MatchString(k)) && regexp.MustCompile("[^\\d][012]?[0-9]").MatchString(k) {
- v += s
- } else if regexp.MustCompile("^[\\p{Han}]$").MatchString(v) {
- k = v
- v = ""
- flag = 1
- }
- } else {
- v += s
- }
- } else if k != "" && v != "" {
- u.Debug(k, "=", v)
- flag = 0
- }
- }
- }
- return false
- })
- if flag == 2 && k != "" && v != "" {
- u.Debug(k, "=", v)
- }
- return
- }
|