package pretreated import ( u "jy/util" "regexp" "strings" ) var Han = regexp.MustCompile("[\\p{Han}]") var Han1 = regexp.MustCompile("[^:;;,:,。. \u3000\u2003\u00a0\\s]") var Han2 = regexp.MustCompile("[^:;;,:,。.]") var Key = regexp.MustCompile("[:::]") var Time = regexp.MustCompile("[\\d]") var dh = regexp.MustCompile("[,,.]") var space = regexp.MustCompile("[\\s\\n \u3000\u2003\u00a0]+") var val = regexp.MustCompile("[^\\s\\n \u3000\u2003\u00a0,,。!;;]") var matchkh = map[string]string{ "(": ")", "(": ")", "【": "】", "[": "]", "[": "]", "〖": "〗", } func GetKvFromtxt(con, tag string, from int,isSite bool) ([]*u.Kv, map[string][]*u.Tag) { res := FindKv(TextAfterRemoveTable(con), tag, from) kvs := []*u.Kv{} for _, k := range res.Keys { v, _ := res.Map[k].(string) if k != "" && v != "" { kvs = append(kvs, &u.Kv{ Key: k, Value: v, }) } } kvTags := GetKvTags(kvs, tag, nil,isSite) return kvs, kvTags } type Line struct { PreLine *Line NextLine *Line Strs []string Str string Pos int Len int KV *SortMap IsKey bool //是否只是key Kn int //冒号个数 Spacen int //间隔空格个数 DJh int //逗号句号 } func NewLine() *Line { return &Line{ Strs: []string{}, KV: NewSortMap(), } } var LineKey = regexp.MustCompile("^[^,。]{2,10}[::]$") var DJh = regexp.MustCompile("[,,。]") var DunH = regexp.MustCompile("[、.]") func GetLines(con string) (res []*Line) { res = []*Line{} l1 := NewLine() strings.IndexFunc(con, func(r rune) bool { if r == 10 { if len(l1.Strs) > 0 { l1.Str = strings.Join(l1.Strs, "") if !regexp.MustCompile("^[,,.。\\s \u3000\u2003\u00a0]$").MatchString(l1.Str) { l1.Str = u.TrimLRSpace(l1.Str, "") l1.Str = TimeHM.ReplaceAllString(l1.Str, ReplTimeHM) l1.Strs = strings.Split(l1.Str, "") res = append(res, l1) } l1 = NewLine() } } else { s := string(r) l1.Strs = append(l1.Strs, s) } return false }) if len(l1.Strs) > 0 { res = append(res, l1) } for k, l := range res { if k == 0 && k < len(res)-1 { l.NextLine = res[k+1] } else if k == len(res)-1 { l.PreLine = res[k-1] } else { l.PreLine = res[k-1] l.NextLine = res[k+1] } if LineKey.MatchString(l.Str) { l.IsKey = true } else { l.Kn = len(Key.FindAllString(l.Str, -1)) l.DJh = len(DJh.FindAllString(l.Str, -1)) } } return } func FindKv_v2(con, tag string) (m *SortMap) { m = NewSortMap() resLine := GetLines(con) for i := 0; i < len(resLine); i++ { l1 := resLine[i] if l1.IsKey { continue } else { if l1.Kn > 0 { u.Debug("=--=", l1.Str) } else { if l1.Spacen == 1 && l1.DJh < 2 && l1.Len < 50 { u.Debug("===", l1.Str) } else { u.Debug("???", l1.Str) } } } } return } var TimeHM = regexp.MustCompile("[\\s \u3000\u2003\u00a0]*([01]{0,1}[0123456789]|2[0123])[::]([012345][0-9])[::]{0,1}") var ReplTimeHM = "D${1}H${2}M" //from 1--全文 2--table td func FindKv(con, tag string, from int) (m *SortMap) { if from == 2 || from == 3 { con = RemoveWarpOfTdVal(con) } //FindKv_v2(con, tag) matchMap := map[string]map[string]bool{ "代理机构": map[string]bool{}, "中标单位": map[string]bool{}, "采购单位": map[string]bool{}, } m = NewSortMap() strs := [][]string{} s1 := []string{} //断行 strings.IndexFunc(con, func(r rune) bool { if r == 10 || r == 59 { if len(s1) > 0 { str := strings.Join(s1, "") str = u.TrimLRSpace(str, "") str = TimeHM.ReplaceAllString(str, ReplTimeHM) s1 = strings.Split(str, "") if len(s1) > 0 { strs = append(strs, s1) } s1 = []string{} } } else { s := string(r) s1 = append(s1, s) } return false }) if len(s1) > 0 { str := strings.Join(s1, "") str = u.TrimLRSpace(str, "") str = TimeHM.ReplaceAllString(str, ReplTimeHM) s1 = strings.Split(str, "") if len(s1) > 0 { strs = append(strs, s1) } } //查找 LastStr := "" for k0 := 0; k0 < len(strs); k0++ { s1 := strs[k0] //u.Debug(strings.Join(s1, "")) str1 := strings.Join(s1, "") k := "" v := "" flag := 0 pos1, pos2 := -1, -1 bkh := false skh := "" if !Key.MatchString(str1) { //此行没有冒号 if k0 > 0 { tm1 := strs[k0-1] if len([]rune(LastStr)) > 2 && len(tm1) < 8 && Key.MatchString(tm1[len(tm1)-1:][0]) && len([]rune(str1)) < 30 { //u.Debug(LastStr, str1) k = strings.Join(tm1[:len(tm1)-1], "") v = str1 if k0 < len(strs)-1 { s2 := u.TrimLRSpace(strings.Join(strs[k0+1], ""), "") if len([]rune(s2)) < 10 && !regexp.MustCompile("^[0-9]+[、]+$").MatchString(s2) && !Key.MatchString(s2) { v += s2 k0++ } } keydetail(k, v, m, tag, k0, strs, matchMap, from) } } LastStr = "" continue } else { //u.Debug("---===----", str1) LastStr = str1 for k1 := 0; k1 < len(s1); k1++ { s := s1[k1] if matchkh[s] != "" { skh = matchkh[s] bkh = true } if bkh { if skh == s { bkh = false } if flag == 1 { k += s } else if flag == 2 { v += s } continue } if flag == 0 { k = "" v = "" pos1, pos2 = -1, -1 flag = 1 } if flag == 1 { if Han1.MatchString(s) || (k != "" && Han2.MatchString(s)) { k += s } else if Key.MatchString(s) && k != "" { flag = 2 } else { flag = 0 } } else if flag == 2 { if val.MatchString(s) || (dh.MatchString(s) && k1 > 0 && k1 < len(s1)-1 && Time.MatchString(s1[k1-1]) && Time.MatchString(s1[k1+1])) { if pos1 < 0 { pos1 = k1 } continue } else { be := false if space.MatchString(s) { temp := s1[k1+1:] //()()[]【】 m1 := k1 bkh1 := false skh1 := "" for k2, v2 := range temp { if k2 == len(temp)-1 { be = true } if matchkh[v2] != "" { bkh1 = true skh1 = matchkh[v2] continue } else if bkh1 { if v2 == skh1 { bkh1 = false } continue } else if space.MatchString(v2) { continue } else if !val.MatchString(v2) { k1 = m1 + k2 + 1 break } else { if pos1 < 0 { //u.Debug("-----", pos1) pos1 = k1 + k2 + 1 } } if Key.MatchString(v2) && k2 > 0 && k2 < len(temp)-1 { if Time.MatchString(temp[k2-1]) && Time.MatchString(temp[k2+1]) { //u.Debug(v2, temp[k2-1], temp[k2+1]) k1 = m1 + k2 + 1 } else { //倒着 for i := k2; i > k1-m1-1; i-- { if !val.MatchString(temp[i]) { k1 = m1 + i + 1 break } } break } } } } if be { k1 = len(s1) //直接跳到最后 } if pos2 < 0 && pos2 < pos1 { pos2 = k1 } // u.Debug(pos1, pos2, k1, len(s1)) if pos1 > -1 && pos2 > pos1 { v = strings.Join(s1[pos1:pos2], "") flag = 0 keydetail(k, v, m, tag, k0, strs, matchMap, from) } else { //u.Debug(k, pos1, pos2) flag = 0 } } } } if flag == 2 { if pos2 > pos1 { v = strings.Join(s1[pos1:pos2], "") } else if pos1 > 0 { v = strings.Join(s1[pos1:], "") } if v != "" { flag = 0 keydetail(k, v, m, tag, k0, strs, matchMap, from) } //u.Debug(k, v) } } } // for _, kk := range m.Keys { // u.Debug(kk, m.Map[kk]) // } return } func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, matchMap map[string]map[string]bool, from int) { if regexp.MustCompile("^[0-9]+[、]+$").MatchString(v) { return } k = space.ReplaceAllString(k, "") if len([]rune(k)) > 1 { if len([]rune(k)) < 5 && strings.HasPrefix(k, "联系") || ContactInfoMustReg.MatchString(k) { num := 0 bf := false for i := len(m.Keys) - 1; i > -1; i-- { num++ if from == 1 && !ContactType["代理机构"].MatchString(k) && ContactType["代理机构"].MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["代理机构"]) { matchMap["代理机构"][k] = true k = "代理机构" + k bf = true break } if !filter_zbdw_ky.MatchString(k) && filter_zbdw_ky.MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["中标单位"]) { matchMap["中标单位"][k] = true k = "中标单位" + k bf = true break } if from == 1 && !ContactType["采购单位"].MatchString(k) && ContactType["采购单位"].MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["采购单位"]) { matchMap["采购单位"][k] = true k = "采购单位" + k bf = true break } //if num > 0 { break //} } if !bf { //k = "采购人" + k //取出上一行 if pos > 0 { if len(strs[pos-1]) < 20 { str := space.ReplaceAllString(strings.Join(strs[pos-1], ""), "") if from == 1 && ContactType["代理机构"].MatchString(str) && !IsContactKvHandle(k, matchMap["代理机构"]) { matchMap["代理机构"][k] = true k = "代理机构" + k } else if filter_zbdw_ky.MatchString(str) && !IsContactKvHandle(k, matchMap["中标单位"]) { matchMap["中标单位"][k] = true k = "中标单位" + k } else if from == 1 && ContactType["采购单位"].MatchString(str) && !IsContactKvHandle(k, matchMap["采购单位"]) { matchMap["采购单位"][k] = true k = "采购单位" + k } } } } } else if len([]rune(k)) == 2 { if filter_zbje_jd.MatchString(k) { //钱 if tag != "" && filter_tag_zb.MatchString(tag) { k = "中标" + k } else { num := 0 for i := len(m.Keys) - 1; i > -1; i-- { num++ if filter_zbdw_ky.MatchString(m.Keys[i]) { k = "中标" + k break } if num > 2 { break } } } } } //u.Debug(k, v) if m.Map[k] == nil { m.AddKey(k, v) }else { vals := []string{} if vvv ,ok := m.Map[k].([]string);ok{ vals = append(vals, vvv...) }else { vals = append(vals,v) } vals = append(vals, v) m.AddKey(k,vals) } } } //时间处理、换行优先级| func FindKv_v1(con string) (m map[string]string) { m = map[string]string{} k := "" v := "" flag := 0 strings.IndexFunc(con, func(r rune) bool { s := string(r) if flag == 0 { k = "" v = "" flag = 1 } if flag == 1 { if Han.MatchString(s) { k += s } else if Key.MatchString(s) && k != "" { flag = 2 } else { flag = 0 } } else if flag == 2 { if v == "" { if space.MatchString(s) { } else if val.MatchString(s) && !Key.MatchString(s) { v += s } else { flag = 0 } } else { if val.MatchString(s) { if Key.MatchString(k) { if (regexp.MustCompile("(时间|日期)").MatchString(v) || regexp.MustCompile("(时间|日期)").MatchString(k)) && regexp.MustCompile("[^\\d][012]?[0-9]").MatchString(k) { v += s } else if regexp.MustCompile("^[\\p{Han}]$").MatchString(v) { k = v v = "" flag = 1 } } else { v += s } } else if k != "" && v != "" { u.Debug(k, "=", v) flag = 0 } } } return false }) if flag == 2 && k != "" && v != "" { u.Debug(k, "=", v) } return }