//识别冒号kv package pretreated import ( "jy/clear" . "jy/util" qutil "qfw/util" "regexp" "sort" "strings" ) type ColonkvEntity struct{} var ( colonkvEntity = &ColonkvEntity{} regReplKV = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0]+?[))]?)[\\s\u3000\u2003\u00a0,。;;][((]?(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].+)[))]?") regReplKV2 = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)") regKV = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)") filterK = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+") filterValue = regexp.MustCompile("^(无)$") regReplKey = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全名]称|姓名)$") BlockTagMap = map[string]bool{ "招标范围": true, "资格要求": true, } brackets = map[string]string{ "(": ")", "(": ")", "": "", "[": "]", "【": "】", "{": "}", "{": "}", "《": "》", "<": ">", } // PersonReg = regexp.MustCompile("[\u4e00-\u9fa5]{2,5}") // TelMustReg = regexp.MustCompile("^" + PhoneReg.String() + "$") PersonMustReg = regexp.MustCompile("^" + PersonReg.String() + "$") AddressReg = regexp.MustCompile("[省市县区路号楼]") BracketsTextReg = regexp.MustCompile("[((]([^((]+)[))]") ContactBuyerTitleReg = regexp.MustCompile("采购联系事项") ContactAgencyTitleReg = regexp.MustCompile("招标联系事项") ) //一行多个冒号kv处理 func (ce *ColonkvEntity) divisionMoreKV(con string) string { con = regReplKV.ReplaceAllStringFunc(con, func(temp string) string { //分kv的时候出现括号不成对出现的情况,分错了跳过 matchText := regReplKV.FindStringSubmatch(con)[1] for k, v := range brackets { if strings.Count(matchText, k) != strings.Count(matchText, v) { return temp } } return regReplKV.ReplaceAllString(temp, "$1\n\n$2") }) con = regReplKV2.ReplaceAllString(con, "$1\n\n$2") return con } //获取冒号kv入口 func (ce *ColonkvEntity) entrance(con, title string, from int) ([]*Kv, map[string]string) { kvs := ce.GetKvs(con, title, from) kv := map[string]string{} for _, v := range kvs { if strings.TrimSpace(v.Value) == "" { continue } kv[v.Key] = v.Value } return kvs, kv } //获取有序的kv func (ce *ColonkvEntity) GetKvs(con, title string, from int) []*Kv { con = ce.processText(con) kvs := ce.getColonKv(con, title, from) return kvs } //处理正文 func (ce *ColonkvEntity) processText(con string) string { con = ce.divisionMoreKV(con)//一行多个冒号kv处理 for { tmp := con con = ce.divisionMoreKV(con) if tmp == con { break } } return con } //分冒号kv //from 1--全文 2,3--table td func (ce *ColonkvEntity) getColonKv(con, title string, from int) []*Kv { if from == 2 || from == 3 { con = RemoveWarpOfTdVal(con) } findkvs := []*Kv{} lines := SspacekvEntity.getLines(con) for index, line := range lines { res := regKV.FindAllStringSubmatch(line, -1) if len(res) > 0 { for _, v := range res { key, val := "", "" if len(v) == 3 { key = v[1] val = v[2] } else if len(v) == 4 { key = v[2] val = v[3] } //Debug("KV-key", key, val) //Debug("KV-key", key, val) //地址、联系人可能会重复 单位、代理机构的\时间、地点 if strings.TrimSpace(key) != "" { prevLine, nextLine := "", "" if index > 0 { prevLine = lines[index-1] } if index < len(lines)-1 { nextLine = lines[index+1] } findkvs = append(findkvs, &Kv{ Key: key, Value: val, Line: line, PrevLine: prevLine, NextLine: nextLine, Title: title, }) splitkeys := strings.Split(key, "/") splitvalues := strings.Split(val, "/") if len(splitkeys) > 1 && len(splitkeys) == len(splitvalues) { for splitindex, splitkey := range splitkeys { findkvs = append(findkvs, &Kv{ Key: splitkey, Value: splitvalues[splitindex], Line: line, PrevLine: prevLine, NextLine: nextLine, Title: title, }) } } } } } } return findkvs } //冒号kv和空格kv结合 func (ce *ColonkvEntity) getColonSpaceKV(con string) []*Kv { con = colonkvEntity.processText(con) lines := SspacekvEntity.getLines(con) kvMaps := []*Kv{} for _, line := range lines { kvs := colonkvEntity.getColonKv(line, "", 1) if len(kvs) == 0 { kv := SspacekvEntity.divideKV(line) if kv != nil { kvMaps = append(kvMaps, kv...) } } else { kvMaps = append(kvMaps, kvs...) } } return kvMaps } /* 五、递交响应文件时间及地点 1、时间:2016年5月20日14时00分至2016年5月20日14时30分(北京时间) 2、地点:烟台开发区公共资源交易中心A座5楼会议室(金沙江路83号) key 时间 处理成 递交响应文件时间 */ func (ce *ColonkvEntity) blockTitleKV(title, key string) string { needKey := "时间" if key != needKey { return key } titles := regSplit.Split(title, -1) for _, v := range titles { if strings.HasSuffix(v, needKey) { return v } } return key } //根据配置文件中的规则,格式化正文 func formatText(content, key string) string { for _, v := range FormatTextMap[key] { reg, _ := v["reg"].(*regexp.Regexp) separator, isString := v["separator"].(string) separators, isArray := v["separator"].([]interface{}) if isArray { content = reg.ReplaceAllStringFunc(content, func(temp string) string { for _, sv := range separators { separator, _ := sv.(string) if array := strings.Split(separator, "__"); separator != "__" && len(array) == 2 { temp = regexp.MustCompile(array[0]).ReplaceAllString(temp, array[1]) } } return temp }) } else if isString { if array := strings.Split(separator, "__"); separator != "__" && len(array) == 2 { content = reg.ReplaceAllStringFunc(content, func(temp string) string { temp = regexp.MustCompile(array[0]).ReplaceAllString(temp, array[1]) return temp }) } else { content = reg.ReplaceAllString(content, separator) } } //Debug(v["reg"], content) } return content } func IsContactKvHandle(value string, m map[string]bool) bool { for k, _ := range m { if k != value && (strings.HasPrefix(k, value) || strings.HasPrefix(value, k)) { continue } if strings.Contains(value, k) || strings.Contains(k, value) { return true } } return false } //kv关于联系人信息的处理 //采购人>集中采购机构 /* func FormatContactKv(kvs *[]*Kv, title string, buyers []string) { //////////////////////////// //处理联系人信息 var indexMap map[int]string var matchMap map[string]map[string]bool if contactFormat == nil || title != "" { indexMap = map[int]string{} matchMap = map[string]map[string]bool{} } else { indexMap = contactFormat.IndexMap matchMap = contactFormat.MatchMap } //////////////////////////// totalIndexMap := map[string]bool{} ascFind := true ascFindFlag := len(indexMap) == 0 && buyers == nil //采购人在联系人、电话后面的处理 isCanAddToIndexMap := false for _, kv := range *kvs { k := FilterContactKey(kv.Key) k_length := len([]rune(k)) if k_length < 2 || k_length > 15 { continue } isContinue := ContactInfoMustReg.MatchString(k) if (isContinue || (ContactInfoVagueReg.MatchString(k) && IsMapHasValue(k, ContactType))) && ascFindFlag { if len(indexMap) > 0 { ascFind = true ascFindFlag = false indexMap = map[int]string{} } isCanAddToIndexMap = true } for _, ct_k := range HasOrderContactType(k) { if !ContactType[ct_k].MatchString(k) { continue } totalIndexMap[ct_k] = true ///////////////////////////// if isContinue || !ascFindFlag { continue } if isCanAddToIndexMap && len(indexMap) == 0 { indexMap[1] = ct_k ascFind = false } } } mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上 titleMatch := false if titleMatchType := ContactTypeTitleMatch(title); titleMatchType != "" { titleMatch = true mustMatchFirst = false indexMap = map[int]string{1: titleMatchType} } // if buyers == nil { // Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind) // } //Debug("buyers-------", buyers) // if buyers == nil { // for _, kv := range *kvs { // Debug("bbbbbbbbbb", kv.Key, kv.Value) // } // } startIndex := 0 prevKey := "" index, notmatchCount, allMatchCount := 0, 0, 0 weightMap := map[string]map[string]interface{}{} //权重 mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置 kvsTemp := make([]*Kv, len(*kvs)) copy(kvsTemp, *kvs) for kv_index, kv := range *kvs { isBreak := true v := strings.TrimSpace(kv.Value) //根据采购单位分析 isContinue := false k := FilterContactKey(kv.Key) k_length := len([]rune(k)) if buyers != nil { for _, buyer := range buyers { if buyer == "" { continue } prevLine := kv.PrevLine prevLine = strings.TrimSpace(prevLine) prevLine = strings.Split(prevLine, " ")[0] buyerLenght, prevLineLength := len([]rune(buyer)), len([]rune(prevLine)) prevNotEqual := true if kv_index > 0 { prevNotEqual = strings.TrimSpace(((*kvs)[kv_index-1]).Value) != buyer } matchBuyerSuccess := false if strings.HasPrefix(k, buyer) && ContactInfoVagueReg.MatchString(k) && k_length-buyerLenght >= 2 && k_length-buyerLenght <= 5 { matchBuyerSuccess = true k = strings.TrimLeft(k, buyer) k_length = len([]rune(k)) // kvTemp := *kv // kvTemp.Key = strings.TrimLeft(k, buyer) // (*kvs)[kv_index] = &kvTemp } else if k == buyer { matchBuyerSuccess = true if PersonMustReg.MatchString(v) { k = "联系人" } else if TelMustReg.MatchString(v) { k = "联系电话" } else if AddressReg.MatchString(v) { k = "地址" } else if PersonReg.MatchString(v) || PhoneReg.MatchString(v) { k = "联系方式" } k_length = len([]rune(k)) } else if strings.HasPrefix(strings.TrimSpace(v), buyer) || (prevNotEqual && buyerLenght >= prevLineLength-5 && buyerLenght <= prevLineLength && strings.Contains(prevLine, buyer)) { matchBuyerSuccess = true isContinue = true } if matchBuyerSuccess { isBreak = false matchMap["采购单位"] = map[string]bool{} indexMap[1] = "采购单位" break } } } else if ascFind { for _, ct_k := range HasOrderContactType(k) { if k_length < 3 || k_length > 15 { isBreak = false continue } if !ContactType[ct_k].MatchString(k) { continue } if weightMap[ct_k] == nil { weightMap[ct_k] = map[string]interface{}{} } isAddToMatchMap := false addToMatchMapKey := "" if ContactInfoVagueReg.MatchString(k) { isAddToMatchMap = true if matchMap[ct_k] == nil { matchMap[ct_k] = map[string]bool{} } if !strings.HasSuffix(k, "方式") { _, kTag := KvTagsToKV([]*Kv{&Kv{Key: k, Value: v}}, "", BuyerContacts, 1) if len(kTag) == 1 { tagVal, weightVal := FirstKeyValueInMap(kTag) if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) { isAddToMatchMap = false } if mapIndexInKvs[ct_k] == nil { mapIndexInKvs[ct_k] = map[string]interface{}{} } myIndexInKvs := mapIndexInKvs[ct_k][tagVal] if myIndexInKvs != nil { if weightMap[ct_k][tagVal] == nil || (weightVal != nil && weightVal.(int) >= weightMap[ct_k][tagVal].(int)) { weightMap[ct_k][tagVal] = weightVal.(int) (*kvs)[myIndexInKvs.(int)] = kvsTemp[myIndexInKvs.(int)] //(*kvs)[kv_index] = &Kv{Key: tagVal, Value: v} kvTemp := *kv kvTemp.Key = tagVal kvTemp.Value = v (*kvs)[kv_index] = &kvTemp } } else { weightMap[ct_k][tagVal] = weightVal.(int) } mapIndexInKvs[ct_k][tagVal] = kv_index } } addToMatchMapKey = k if ct_k == "采购单位" { k = ContactType[ct_k].FindString(k) } } if ct_k == "采购单位" { //打标签,权重高的重新覆盖 _, kTag := KvTagsToKV([]*Kv{&Kv{Key: k, Value: v}}, "", []string{"采购单位"}, 1) tagVal, weightVal := FirstKeyValueInMap(kTag) if tagVal == ct_k { if weightMap[ct_k][ct_k] == nil || (weightVal != nil && weightVal.(int) > weightMap[ct_k][ct_k].(int)) { weightMap[ct_k][ct_k] = weightVal.(int) matchMap[ct_k] = map[string]bool{} isBreak = false prevKey = "" } } } if isAddToMatchMap && !filterValue.MatchString(v) { matchMap[ct_k][ContactInfoVagueReg.FindString(addToMatchMapKey)] = true } allMatchCount++ if IsMapHasValue(ct_k, indexMap) { isContinue = true continue } isBreak = false if index != 0 || notmatchCount != 0 { startIndex = 0 indexMap = map[int]string{} } if startIndex == 0 { indexMap = map[int]string{} } prevKey = "" startIndex++ indexMap[startIndex] = ct_k isContinue = true } } if isContinue { continue } // if buyers == nil { // Debug(ascFind, indexMap, k, v, matchMap, notmatchCount) // } if len(indexMap) == 0 { continue } if titleMatch && !ContactInfoMustReg.MatchString(k) { k = ContactInfoVagueReg.FindString(k) k_length = len([]rune(k)) } if k_length < 2 || k_length > 10 { isBreak = false continue } if !ContactInfoMustReg.MatchString(k) { if mustMatchFirst { mustMatchFirst = false continue } if buyers == nil && len(totalIndexMap) != 0 { isBreak = false } //允许有这么多个匹配不上的key notmatchCount++ if notmatchCount < len(indexMap)*2 { isBreak = false } else if contactFormat == nil && ascFind { startIndex = 0 notmatchCount = 0 indexMap = map[int]string{} //matchMap = map[string]map[string]bool{} } continue } isBreak = false if prevKey != k { prevKey = k index = 1 } else if prevKey == k { index++ } //过滤值 if filterValue.MatchString(v) { continue } myContactType := indexMap[index] if myContactType == "" { continue } // if buyers == nil { // Debug(ascFind, indexMap, k, v, matchMap, notmatchCount) // } if strings.HasSuffix(k, "方式") && TelMustReg.MatchString(v) { k = "联系电话" } if matchMap[myContactType] == nil { matchMap[myContactType] = map[string]bool{} } myTagValue := ContactInfoMustReg.FindString(k) if myTagValue == "" && titleMatch { myTagValue = ContactInfoVagueReg.FindString(k) } if IsContactKvHandle(myTagValue, matchMap[myContactType]) { continue } matchMap[myContactType][myTagValue] = true if ContactType[myContactType].MatchString(k) { continue } allMatchCount++ delete(totalIndexMap, myContactType) if !strings.HasSuffix(k, "方式") { _, kTag := KvTagsToKV([]*Kv{&Kv{Key: myContactType + k, Value: v}}, "", BuyerContacts, 1) if len(kTag) == 1 { tagVal, _ := FirstKeyValueInMap(kTag) if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) { continue } if mapIndexInKvs[myContactType] == nil { mapIndexInKvs[myContactType] = map[string]interface{}{} } myIndexInKvs := mapIndexInKvs[myContactType][tagVal] if myIndexInKvs != nil { (*kvs)[myIndexInKvs.(int)] = kvsTemp[myIndexInKvs.(int)] } mapIndexInKvs[myContactType][tagVal] = kv_index if weightMap[myContactType] == nil { weightMap[myContactType] = map[string]interface{}{} } weightMap[myContactType][tagVal] = 1 } } //(*kvs)[kv_index] = &Kv{Key: myContactType + k, Value: v} kvTemp := *kv kvTemp.Key = myContactType + k kvTemp.Value = v (*kvs)[kv_index] = &kvTemp if ascFind && isBreak && len(indexMap) > 0 { break } } if allMatchCount == 0 && len(*kvs) > 0 { indexMap = map[int]string{} matchMap = map[string]map[string]bool{} } if contactFormat != nil { (*contactFormat).IndexMap = indexMap (*contactFormat).MatchMap = matchMap } // if buyers == nil { // for _, kv := range *kvs { // Debug("bbbbbbbbbb", kv.Key, kv.Value) // } // } //Debug("totalIndexMap", len(totalIndexMap)) } */ func ContactTypeTitleMatch(title string) string { matchType := "" if title != "" && len([]rune(title)) < 15 { if ContactBuyerTitleReg.MatchString(title) { matchType = "采购单位" } else if ContactAgencyTitleReg.MatchString(title) { matchType = "代理机构" } else { for _, ct_k := range HasOrderContactType(title) { if ContactType[ct_k].MatchString(title) { matchType = ct_k break } } } } return matchType } //获取带有排序的联系人类型 func HasOrderContactType(text string) []string { indexs := []int{} indexMap := map[int]string{} temp := []string{} for k, v := range ContactType { s := v.FindStringIndex(text) if len(s) > 1 { if indexMap[s[0]] != "" { temp = append(temp, k) } else { indexs = append(indexs, s[0]) indexMap[s[0]] = k } } } sort.Ints(indexs) result := []string{} for _, v := range indexs { result = append(result, indexMap[v]) } if len(temp) > 0 { result = append(result, temp...) } return result } //两种冒号kv结合到一起 //from 1--全文 2--table td 3--table td解析采购单位联系人 4--分包 func GetKVAll(content, title string, from int) *JobKv { content = formatText(content, "kv") m1Kvs, _ := colonkvEntity.entrance(content, title, from) m1, m1Weight := KvTagsToKV(m1Kvs, title, nil, from) if m1 == nil { m1 = map[string]string{} } m2Kvs, m2, m2Weight := GetKvFromtxt(content, title, from) for k, v := range m2 { if m1[k] == "" { m1[k] = v m1Weight[k] = m2Weight[k] } } return &JobKv{ Kvs: m1Kvs, Kvs_2: m2Kvs, Kv: m1, KvTag: m1Weight, } } //KVTags转kv func KvTagsToKV(findkvs []*Kv, title string, tagdbs []string, from int) (map[string]string, map[string]*Tag) { kvTags := map[string]*Tag{} if title != "" && BlockTagMap[title] { kvTags[title] = &Tag{title, 0, nil} } for _, findkv := range findkvs { kvMap := map[string]string{} k, val := findkv.Key, findkv.Value //val是空的话,不打标签 if filterValue.MatchString(val) { continue } key := k key = ClearKey(key, 1) if key == "" { continue } key = colonkvEntity.blockTitleKV(title, key) //先用新的key tags := GetAppointTags(key, tagdbs) if len(tags) == 0 && len(key) < 10 && len(title) > 0 && len(title) < 15 { key = title + key tags = GetAppointTags(key, tagdbs) } //再用老的key if len(tags) == 0 && k != key { tags = GetAppointTags(k, tagdbs) if len(tags) == 0 && len(k) < 10 && len(title) > 0 && len(title) < 15 { k = title + k tags = GetAppointTags(k, tagdbs) if len(tags) > 0 { key = k } } } if len(tags) == 0 { //go AddtoNoMatchMap(key) //Debug(key) //continue //由跳过修改为保留 tags = []*Tag{&Tag{k, -100, nil}} } for _, tk := range tags { //分包过来给kv打标签的时候,只取第一个,后面的不覆盖 if kvTags[tk.Value] == nil || (kvTags[tk.Value].Weight < tk.Weight && from != 4) { // fc := StandardNameMap[tk.Value] // if (fc != nil && fc.CheckNum) || (moneyreg.MatchString(tk.Value)) { // val += GetMoneyUnit(k, val) // } if moneyreg.MatchString(tk.Value) { val += GetMoneyUnit(k, val) } //Debug("KV-key", tk, val) kvTags[tk.Value] = &Tag{val, tk.Weight, nil} kvMap[tk.Value] = val //Debug("KV-key", tk.Value, val, key, tk.Weight) } } } // kv := map[string]string{} kvWeight := map[string]*Tag{} if len(kvTags) > 0 { for k, v := range kvTags { if kv[k] != "" { continue } kv[k] = v.Value kvWeight[k] = v } } return kv, kvWeight } func FilterContactKey(key string) string { key1 := "" for _, v := range BracketsTextReg.FindAllString(key, -1) { for _, vv := range ContactType { if vv.MatchString(v) { if len([]rune(v)) < 3 || len([]rune(v)) > 10 { continue } key1 = v break } } } key = filterK.ReplaceAllString(key, "") key = tablekeyclear.ReplaceAllString(key, "") return key1 + key } //td里的内容,调用这边的方法分kv的时候,有的带有换行,清理掉 func RemoveWarpOfTdVal(text string) string { //只有一个冒号 if len(regDivision.FindAllString(text, -1)) != 1 { return text } text = strings.TrimSpace(text) //有一个换行 array := strings.Split(text, "\n") if len(array) != 2 { return text } //第一行以冒号结尾 if !colonEndReg.MatchString(array[0]) { if BracketsTextReg.ReplaceAllString(array[1], "") == "" { text = array[0] + array[1] } return text } text = array[0] + array[1] return text } //打标签的时候,清理key //from 1--冒号key 2--table key func ClearKey(k string, from int) string { for { old := k if from == 1 { k = filterK.ReplaceAllString(k, "") } k = tablekeyclear.ReplaceAllString(k, "") k = regReplKey.ReplaceAllString(k, "") if old == k { break } } return k } //获取金额的单位 func GetMoneyUnit(key, val string) string { if !(strings.Index(val, "元") > 0 || strings.Index(val, "万") > 0 || strings.Index(val, "亿") > 0) { mv := clear.ObjToMoney([]interface{}{val, val}) if len(mv) > 0 && qutil.IntAll(mv[0]) > 0 { for _, dw := range []string{"万", "亿"} { if strings.Index(key, dw) > 0 { return dw } } } } return "" }