//识别冒号kv package pretreated import ( "jy/clear" . "jy/util" "log" qutil "qfw/util" "regexp" "sort" "strings" "unicode/utf8" ) type ColonkvEntity struct{} var ( colonkvEntity = &ColonkvEntity{} regReplKV = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0]+?[))]?)[\\s\u3000\u2003\u00a0,。;;][((]?(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].+)[))]?") regReplKV2 = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)") regKV = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)") filterK = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+") filterValue = regexp.MustCompile("(^(无)$|.+%.*|^[\r\n\\s\u3000\u2003\u00a0]+$|^<.*>)") regReplKey = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全]称|姓名)$") buyerAndAgency = regexp.MustCompile("(代理(机构|人)|采购(人|单位))") BlockTagMap = map[string]bool{ "招标范围": true, "资格要求": true, } brackets = map[string]string{ "(": ")", "(": ")", "": "", "[": "]", "【": "】", "{": "}", "{": "}", "《": "》", "<": ">", } // PersonReg = regexp.MustCompile("[\u4e00-\u9fa5]{2,5}") // TelMustReg = regexp.MustCompile("^" + PhoneReg.String() + "$") PersonMustReg = regexp.MustCompile("^" + PersonReg.String() + "$") AddressReg = regexp.MustCompile("[省市县区路号楼]") BracketsTextReg = regexp.MustCompile("[((]([^((]+)[))]") ContactBuyerTitleReg = regexp.MustCompile("采购联系事项") ContactAgencyTitleReg = regexp.MustCompile("招标联系事项") ZipCode = regexp.MustCompile("邮(政)?编(码)?") ) //一行多个冒号kv处理 func (ce *ColonkvEntity) divisionMoreKV(con string) string { con = regReplKV.ReplaceAllStringFunc(con, func(temp string) string { //分kv的时候出现括号不成对出现的情况,分错了跳过 matchText := regReplKV.FindStringSubmatch(con)[1] for k, v := range brackets { if strings.Count(matchText, k) != strings.Count(matchText, v) { return temp } } return regReplKV.ReplaceAllString(temp, "$1\n\n$2") }) con = regReplKV2.ReplaceAllString(con, "$1\n\n$2") return con } //获取冒号kv入口 func (ce *ColonkvEntity) entrance(con, title string, contactFormat *ContactFormat, from int,isSite bool,codeSite string) ([]*Kv, map[string]string) { kvs := ce.GetKvs(con, title, from) if from == 1 { FormatContactKv(&kvs, title, nil, contactFormat,isSite,codeSite) } kv := map[string]string{} for _, v := range kvs { if strings.TrimSpace(v.Value) == "" { continue } kv[v.Key] = v.Value } return kvs, kv } //获取有序的kv func (ce *ColonkvEntity) GetKvs(con, title string, from int) []*Kv { con = ce.processText(con) kvs := ce.getColonKv(con, title, from) return kvs } //处理正文 func (ce *ColonkvEntity) processText(con string) string { con = ce.divisionMoreKV(con) //一行多个冒号kv处理 for { tmp := con con = ce.divisionMoreKV(con) if tmp == con { break } } return con } //分冒号kv //from 1--全文 2,3--table td func (ce *ColonkvEntity) getColonKv(con, title string, from int) []*Kv { if from == 2 || from == 3 { con = RemoveWarpOfTdVal(con) } findkvs := []*Kv{} lines := SspacekvEntity.getLines(con) for index, line := range lines { res := regKV.FindAllStringSubmatch(line, -1) if len(res) > 0 { for _, v := range res { key, val := "", "" if len(v) == 3 { key = v[1] val = v[2] } else if len(v) == 4 { key = v[2] val = v[3] } //Debug("KV-key", key, val) //Debug("KV-key", key, val) //地址、联系人可能会重复 单位、代理机构的\时间、地点 if strings.TrimSpace(key) != "" { prevLine, nextLine := "", "" if index > 0 { prevLine = lines[index-1] } if index < len(lines)-1 { nextLine = lines[index+1] } findkvs = append(findkvs, &Kv{ Key: key, Value: val, Line: line, PrevLine: prevLine, NextLine: nextLine, Title: title, }) splitkeys := strings.Split(key, "/") splitvalues := strings.Split(val, "/") if len(splitkeys) > 1 && len(splitkeys) == len(splitvalues) { for splitindex, splitkey := range splitkeys { findkvs = append(findkvs, &Kv{ Key: splitkey, Value: splitvalues[splitindex], Line: line, PrevLine: prevLine, NextLine: nextLine, Title: title, }) } } } } } } return findkvs } //冒号kv和空格kv结合 func (ce *ColonkvEntity) getColonSpaceKV(con string,isSite bool,codeSite string) []*Kv { con = colonkvEntity.processText(con) lines := SspacekvEntity.getLines(con) kvMaps := []*Kv{} for _, line := range lines { kvs := colonkvEntity.getColonKv(line, "", 1) if len(kvs) == 0 { kv := SspacekvEntity.divideKV(line,isSite,codeSite) if kv != nil { kvMaps = append(kvMaps, kv...) } } else { kvMaps = append(kvMaps, kvs...) } } return kvMaps } /* 五、递交响应文件时间及地点 1、时间:2016年5月20日14时00分至2016年5月20日14时30分(北京时间) 2、地点:烟台开发区公共资源交易中心A座5楼会议室(金沙江路83号) key 时间 处理成 递交响应文件时间 */ func (ce *ColonkvEntity) blockTitleKV(title, key string) string { needKey := "时间" if key != needKey { return key } titles := regSplit.Split(title, -1) for _, v := range titles { if strings.HasSuffix(v, needKey) { return v } } return key } //根据配置文件中的规则,格式化正文 func formatText(content, key string) string { segments := make([]*Segment, 0) if key == "all" { segments = DivideSegmentHtml(content) } else if key == "kv" { segments = DivideSegment(content) //log.Println("清理前:\n",content) } newCon := "" for _, v := range segments { if v.Index > len(segments)-3 { if regexp.MustCompile("上一篇(.+)下一篇").MatchString(v.Text) { break } } if key == "kv" && utf8.RuneCountInString(v.Text) >= 1 { //log.Println("清理前:",v.Text, []rune(v.Text)[len([]rune(v.Text))-1]) v.Text = strings.TrimRightFunc(v.Text, func(r rune) bool { return r == 19968 || r == 20108 || r == 19977 || r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 }) //log.Println("清理前后",v.Text) } newCon += v.Text + "\n" } content = regEndWrap.ReplaceAllString(newCon, "") //if key == "kv"{ // log.Println("清理前后\n",content) //} for _, v := range FormatTextMap[key] { reg, _ := v["reg"].(*regexp.Regexp) separator, isString := v["separator"].(string) separators, isArray := v["separator"].([]interface{}) if isArray { content = reg.ReplaceAllStringFunc(content, func(temp string) string { for _, sv := range separators { separator, _ := sv.(string) if array := strings.Split(separator, "__"); separator != "__" && len(array) == 2 { temp = regexp.MustCompile(array[0]).ReplaceAllString(temp, array[1]) } } return temp }) } else if isString { if array := strings.Split(separator, "__"); separator != "__" && len(array) == 2 { content = reg.ReplaceAllStringFunc(content, func(temp string) string { temp = regexp.MustCompile(array[0]).ReplaceAllString(temp, array[1]) return temp }) } else { content = reg.ReplaceAllString(content, separator) } } //Debug(v["reg"], content) } return content } func IsContactKvHandle(value string, m map[string]bool) bool { for k, _ := range m { // if k != value && (strings.HasPrefix(k, value) || strings.HasPrefix(value, k)) { // continue // } // if strings.Contains(value, k) || strings.Contains(k, value) { // return true // } if k == value { return true } } return false } //kv关于联系人信息的处理 //采购人>集中采购机构 func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *ContactFormat,isSite bool,codeSite string) { //////////////////////////// //处理联系人信息 var indexMap map[int]string var matchMap map[string]map[string]bool //hasMatch := make(map[string]bool) if contactFormat == nil || title != "" { indexMap = map[int]string{} matchMap = map[string]map[string]bool{} } else { indexMap = contactFormat.IndexMap matchMap = contactFormat.MatchMap } //////////////////////////// totalIndexMap := map[string]bool{} ascFind := true ascFindFlag := len(indexMap) == 0 && buyers == nil //采购人在联系人、电话后面的处理 isCanAddToIndexMap := false for _, kv := range *kvs { k := FilterContactKey(kv.Key) k_length := len([]rune(k)) if k_length < 2 || k_length > 15 { continue } isContinue := ContactInfoMustReg.MatchString(k) if (isContinue || (ContactInfoVagueReg.MatchString(k) && IsMapHasValue(k, ContactType))) && ascFindFlag { if len(indexMap) > 0 { ascFind = true ascFindFlag = false indexMap = map[int]string{} } isCanAddToIndexMap = true } n := 1 for _, ct_k := range HasOrderContactType(k) { if !ContactType[ct_k].MatchString(k) { continue } totalIndexMap[ct_k] = true ///////////////////////////// if isContinue || !ascFindFlag { continue } // if isCanAddToIndexMap && len(indexMap) == 0 { if isCanAddToIndexMap { indexMap[n] = ct_k n++ ascFind = false } } } mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上 titleMatch := false if titleMatchType := ContactTypeTitleMatch(title); titleMatchType != "" { titleMatch = true mustMatchFirst = false indexMap = map[int]string{1: titleMatchType} } // if titleMatchType := ContactTypeTitleMatch(title); len(titleMatchType) != 0 { // titleMatch = true // mustMatchFirst = false // for i, t := range titleMatchType { // indexMap[i+1] = t // } // } // if buyers == nil { // Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind) // } //Debug("buyers-------", buyers) // if buyers == nil { // for _, kv := range *kvs { // Debug("bbbbbbbbbb", kv.Key, kv.Value) // } // } startIndex := 0 prevKey := "" index, tmpindex, notmatchCount, allMatchCount := 0, 0, 0, 0 weightMap := map[string]map[string]interface{}{} //权重 mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置 kvsTemp := make([]*Kv, len(*kvs)) copy(kvsTemp, *kvs) //again := 0 ishad := false for kv_index, kv := range *kvs { isBreak := true v := strings.TrimSpace(kv.Value) //根据采购单位分析 isContinue := false k := FilterContactKey(kv.Key) k_length := len([]rune(k)) if buyers != nil { for _, buyer := range buyers { if buyer == "" { continue } prevLine := kv.PrevLine prevLine = strings.TrimSpace(prevLine) prevLine = strings.Split(prevLine, " ")[0] buyerLenght, prevLineLength := len([]rune(buyer)), len([]rune(prevLine)) prevNotEqual := true if kv_index > 0 { prevNotEqual = strings.TrimSpace(((*kvs)[kv_index-1]).Value) != buyer } matchBuyerSuccess := false if strings.HasPrefix(k, buyer) && ContactInfoVagueReg.MatchString(k) && k_length-buyerLenght >= 2 && k_length-buyerLenght <= 5 { matchBuyerSuccess = true k = strings.TrimLeft(k, buyer) k_length = len([]rune(k)) // kvTemp := *kv // kvTemp.Key = strings.TrimLeft(k, buyer) // (*kvs)[kv_index] = &kvTemp } else if k == buyer { matchBuyerSuccess = true if PersonMustReg.MatchString(v) { k = "联系人" } else if TelMustReg.MatchString(v) { k = "联系电话" } else if AddressReg.MatchString(v) { k = "地址" } else if PersonReg.MatchString(v) || PhoneReg.MatchString(v) { k = "联系方式" } else if ZipCode.MatchString(v) { k = "邮政编码" } k_length = len([]rune(k)) } else if strings.HasPrefix(strings.TrimSpace(v), buyer) || (prevNotEqual && buyerLenght >= prevLineLength-5 && buyerLenght <= prevLineLength && strings.Contains(prevLine, buyer)) { matchBuyerSuccess = true isContinue = true } if matchBuyerSuccess { isBreak = false matchMap["采购单位"] = map[string]bool{} indexMap[1] = "采购单位" break } } } else if ascFind { for _, ct_k := range HasOrderContactType(k) { ishad = false //again++ if k_length < 3 || k_length > 15 { isBreak = false continue } if !ContactType[ct_k].MatchString(k) { continue } if weightMap[ct_k] == nil { weightMap[ct_k] = map[string]interface{}{} } isAddToMatchMap := false addToMatchMapKey := "" if ContactInfoVagueReg.MatchString(k) { //判断是不是电话、地址。。。 isAddToMatchMap = true if matchMap[ct_k] == nil { matchMap[ct_k] = map[string]bool{} } if !strings.HasSuffix(k, "方式") { kvTags := GetKvTags([]*Kv{&Kv{Key: k, Value: v}}, "", BuyerContacts,isSite,codeSite) if len(kvTags) == 1 { tagVal, weightVal := FirstKeyValueInMap(kvTags) if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) { isAddToMatchMap = false } if mapIndexInKvs[ct_k] == nil { mapIndexInKvs[ct_k] = map[string]interface{}{} } myIndexInKvs := mapIndexInKvs[ct_k][tagVal] if myIndexInKvs != nil { if weightMap[ct_k][tagVal] == nil || (weightVal != nil && weightVal.(int) >= weightMap[ct_k][tagVal].(int)) { weightMap[ct_k][tagVal] = weightVal.(int) (*kvs)[myIndexInKvs.(int)] = kvsTemp[myIndexInKvs.(int)] //(*kvs)[kv_index] = &Kv{Key: tagVal, Value: v} kvTemp := *kv kvTemp.Key = tagVal kvTemp.Value = v (*kvs)[kv_index] = &kvTemp } } else { weightMap[ct_k][tagVal] = weightVal.(int) } mapIndexInKvs[ct_k][tagVal] = kv_index } } addToMatchMapKey = k if ct_k == "采购单位" { k = ContactType[ct_k].FindString(k) } } if ct_k == "采购单位" { //打标签,权重高的重新覆盖 kvTags := GetKvTags([]*Kv{&Kv{Key: k, Value: v}}, "", []string{"采购单位"},isSite,codeSite) tagVal, weightVal := FirstKeyValueInMap(kvTags) if tagVal == ct_k { if weightMap[ct_k][ct_k] == nil || (weightVal != nil && weightVal.(int) > weightMap[ct_k][ct_k].(int)) { weightMap[ct_k][ct_k] = weightVal.(int) matchMap[ct_k] = map[string]bool{} isBreak = false prevKey = "" } } } if isAddToMatchMap && !filterValue.MatchString(v) { matchMap[ct_k][ContactInfoVagueReg.FindString(addToMatchMapKey)] = true } allMatchCount++ if IsMapHasValue(ct_k, indexMap) { ishad = true tmpindex = GetIndex(ct_k, indexMap) isContinue = true continue } isBreak = false if index != 0 || notmatchCount != 0 { startIndex = 0 indexMap = map[int]string{} } if startIndex == 0 { indexMap = map[int]string{} } prevKey = "" startIndex++ indexMap[startIndex] = ct_k isContinue = true } } if isContinue { continue } // if buyers == nil { // Debug(ascFind, indexMap, k, v, matchMap, notmatchCount) // } if len(indexMap) == 0 { continue } if titleMatch && !ContactInfoMustReg.MatchString(k) { k = ContactInfoVagueReg.FindString(k) k_length = len([]rune(k)) } if k_length < 2 || k_length > 10 { isBreak = false continue } if !ContactInfoMustReg.MatchString(k) { //判断是否是电话、邮箱、地址等信息 if mustMatchFirst { mustMatchFirst = false continue } if buyers == nil && len(totalIndexMap) != 0 { isBreak = false } //允许有这么多个匹配不上的key notmatchCount++ if notmatchCount < len(indexMap)*2 { isBreak = false } else if contactFormat == nil && ascFind { startIndex = 0 notmatchCount = 0 indexMap = map[int]string{} //matchMap = map[string]map[string]bool{} } continue } isBreak = false // if prevKey != k && !hasMatch[k] { // prevKey = k // index = 1 // } else if index < 2 { // index++ // } if ishad { index = tmpindex } else { if prevKey != k { prevKey = k index = 1 } else if prevKey == k { index++ } } // if startIndex == 0 || startIndex%2 == 1 || index == 0 { // index = 1 // } else if startIndex%2 == 0 { // index = 2 // } //hasMatch[k] = true //过滤值 if filterValue.MatchString(v) { continue } myContactType := indexMap[index] if myContactType == "" { continue } // if buyers == nil { // Debug(ascFind, indexMap, k, v, matchMap, notmatchCount) // } if strings.HasSuffix(k, "方式") && TelMustReg.MatchString(v) { k = "联系电话" } if matchMap[myContactType] == nil { matchMap[myContactType] = map[string]bool{} } myTagValue := ContactInfoMustReg.FindString(k) if myTagValue == "" && titleMatch { myTagValue = ContactInfoVagueReg.FindString(k) } if IsContactKvHandle(myTagValue, matchMap[myContactType]) { continue } matchMap[myContactType][myTagValue] = true if ContactType[myContactType].MatchString(k) { continue } allMatchCount++ delete(totalIndexMap, myContactType) if !strings.HasSuffix(k, "方式") { kvTags := GetKvTags([]*Kv{&Kv{Key: myContactType + k, Value: v}}, "", BuyerContacts,isSite,codeSite) if len(kvTags) == 1 { tagVal, _ := FirstKeyValueInMap(kvTags) if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) { continue } if mapIndexInKvs[myContactType] == nil { mapIndexInKvs[myContactType] = map[string]interface{}{} } myIndexInKvs := mapIndexInKvs[myContactType][tagVal] if myIndexInKvs != nil { (*kvs)[myIndexInKvs.(int)] = kvsTemp[myIndexInKvs.(int)] } mapIndexInKvs[myContactType][tagVal] = kv_index if weightMap[myContactType] == nil { weightMap[myContactType] = map[string]interface{}{} } weightMap[myContactType][tagVal] = 1 } } //(*kvs)[kv_index] = &Kv{Key: myContactType + k, Value: v} kvTemp := *kv kvTemp.Key = myContactType + k kvTemp.Value = v (*kvs)[kv_index] = &kvTemp if ascFind && isBreak && len(indexMap) > 0 { break } } if allMatchCount == 0 && len(*kvs) > 0 { indexMap = map[int]string{} matchMap = map[string]map[string]bool{} } if contactFormat != nil { (*contactFormat).IndexMap = indexMap (*contactFormat).MatchMap = matchMap } // if buyers == nil { // for _, kv := range *kvs { // Debug("bbbbbbbbbb", kv.Key, kv.Value) // } // } //Debug("totalIndexMap", len(totalIndexMap)) } func ContactTypeTitleMatch(title string) string { // matchType := []string{} // matchTypeMap := map[string]bool{} // if title != "" && len([]rune(title)) < 25 { // if ContactBuyerTitleReg.MatchString(title) { // matchType = append(matchType, "采购单位") // matchTypeMap["采购单位"] = true // } // if ContactAgencyTitleReg.MatchString(title) { // matchType = append(matchType, "代理机构") // matchTypeMap["代理机构"] = true // } // if len(matchType) == 2 { // return matchType // } // for _, ct_k := range HasOrderContactType(title) { // if ContactType[ct_k].MatchString(title) && !matchTypeMap[ct_k] { // matchType = append(matchType, ct_k) // } // } // } matchType := "" if title != "" && len([]rune(title)) < 15 { if ContactBuyerTitleReg.MatchString(title) { matchType = "采购单位" } else if ContactAgencyTitleReg.MatchString(title) { matchType = "代理机构" } else { for _, ct_k := range HasOrderContactType(title) { if ContactType[ct_k].MatchString(title) { matchType = ct_k break } } } } return matchType } //获取带有排序的联系人类型 func HasOrderContactType(text string) []string { indexs := []int{} indexMap := map[int]string{} temp := []string{} for k, v := range ContactType { s := v.FindStringIndex(text) if len(s) > 1 { if indexMap[s[0]] != "" { temp = append(temp, k) } else { indexs = append(indexs, s[0]) indexMap[s[0]] = k } } } sort.Ints(indexs) result := []string{} for _, v := range indexs { result = append(result, indexMap[v]) } if len(temp) > 0 { result = append(result, temp...) } return result } //两种冒号kv结合到一起 //from 1--全文 2--table td 3--table td解析采购单位联系人 4--分包 func GetKVAll(content, title string, contactFormat *ContactFormat, from int,isSite bool,codeSite string) *JobKv { content = formatText(content, "kv") m1Kvs, _ := colonkvEntity.entrance(content, title, contactFormat, from,isSite,codeSite) // for _, kvs := range m1Kvs { // qutil.Debug(kvs.Key, kvs.Value) // } kvTags := GetKvTags(m1Kvs, title, nil,isSite,codeSite) // for k, kvs := range kvTags { // qutil.Debug("kkkkk--", k) // for _, kv := range kvs { // qutil.Debug(kv.Key, kv.Value) // } // } m2Kvs, m2KvTags := GetKvFromtxt(content, title, from,isSite,codeSite) // for k, kvs := range m2KvTags { // qutil.Debug("kkkkk--", k) // for _, kv := range kvs { // qutil.Debug(kv.Key, kv.Value) // } // } MergeKvTags(kvTags, m2KvTags) // for k, kvs := range kvTags { // qutil.Debug("kkkkk--", k) // for _, kv := range kvs { // qutil.Debug(kv.Key, kv.Value) // } // } return &JobKv{ Kvs: m1Kvs, Kvs_2: m2Kvs, KvTags: kvTags, } } //合并kv标签,把kvTags_2合并到kvTags_1 func MergeKvTags(kvTags_1, kvTags_2 map[string][]*Tag) { for k, v := range kvTags_2 { for _, vv := range v { value_vv := strings.TrimSpace(vv.Value) if value_vv == "" || vv.Key == vv.Value { continue } isExists := false for _, vvv := range kvTags_1[k] { value_vvv := strings.TrimSpace(vvv.Value) if (value_vvv == value_vv || TimeHM.ReplaceAllString(value_vvv, ReplTimeHM) == value_vv || value_vvv == TimeHM.ReplaceAllString(value_vv, ReplTimeHM)) && vvv.Weight == vv.Weight { isExists = true break } } if !isExists { kvTags_1[k] = append(kvTags_1[k], vv) } } } } //控制台输出kv的值 func PrintKvTags(kvTags map[string][]*Tag) { for k, v := range kvTags { for _, vv := range v { log.Println("kvTags===", k, "---", vv.Key, vv.Value, vv.Weight, vv.IsInvalid) } } } //KVTags转kv func GetKvTags(findkvs []*Kv, title string, tagdbs []string,isSite bool,codeSite string) map[string][]*Tag { kvTags := map[string][]*Tag{} if title != "" && BlockTagMap[title] { kvTags[title] = append(kvTags[title], &Tag{title, title, 0, nil, false}) } for _, findkv := range findkvs { k, val, nextval := findkv.Key, strings.TrimSpace(findkv.Value), strings.TrimSpace(findkv.NextLine) //val是空的话,不打标签 if filterValue.MatchString(val) { continue } key := k key = ClearKey(key, 1) if key == "" { continue } key = colonkvEntity.blockTitleKV(title, key) //先用新的key tags := GetAppointTags(key, tagdbs,isSite,codeSite) //找标签库 if len(tags) == 0 && len(key) < 10 && len(title) > 0 && len(title) < 15 { key = title + key tags = GetAppointTags(key, tagdbs,isSite,codeSite) } //再用老的key if len(tags) == 0 && k != key { tags = GetAppointTags(k, tagdbs,isSite,codeSite) if len(tags) == 0 && len(k) < 10 && len(title) > 0 && len(title) < 15 { k = title + k tags = GetAppointTags(k, tagdbs,isSite,codeSite) if len(tags) > 0 { key = k } } } if len(tags) > 0 { for _, tk := range tags { if moneyreg.MatchString(tk.Value) { val += GetMoneyUnit(k, val) } if val != "" { kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: val, Weight: tk.Weight}) } else if nextval != "" && utf8.RuneCountInString(nextval) < 30 { if strings.Contains(nextval, ":") || strings.Contains(nextval, ":") { if len(strings.Split(nextval, ":")) > 1 || len(strings.Split(nextval, ":")) > 1 { //tmpnextval := "" nextval = strings.Split(nextval, ":")[0] nextval = strings.Split(nextval, ":")[0] if strings.TrimSpace(nextval) == "" { continue } if GetAppointTags(nextval, tagdbs,isSite,codeSite).Len() > 0 || GetAppointTags(k, tagdbs,isSite,codeSite).Len() > 0 { continue } } } kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: nextval, Weight: tk.Weight}) } } } else { if val != "" { kvTags[k] = append(kvTags[k], &Tag{Key: k, Value: val, IsInvalid: true}) } } } return kvTags } func FilterContactKey(key string) string { key1 := "" for _, v := range BracketsTextReg.FindAllString(key, -1) { for _, vv := range ContactType { if vv.MatchString(v) { if len([]rune(v)) < 3 || len([]rune(v)) > 10 { continue } key1 = v break } } } key = filterK.ReplaceAllString(key, "") key = tablekeyclear.ReplaceAllString(key, "") return key1 + key } //td里的内容,调用这边的方法分kv的时候,有的带有换行,清理掉 func RemoveWarpOfTdVal(text string) string { //只有一个冒号 if len(regDivision.FindAllString(text, -1)) != 1 { return text } text = strings.TrimSpace(text) //有一个换行 array := strings.Split(text, "\n") if len(array) != 2 { return text } //第一行以冒号结尾 if !colonEndReg.MatchString(array[0]) { if BracketsTextReg.ReplaceAllString(array[1], "") == "" { text = array[0] + array[1] } return text } text = array[0] + array[1] return text } //打标签的时候,清理key //from 1--冒号key 2--table key func ClearKey(k string, from int) string { if buyerAndAgency.MatchString(filterK.FindString(k)) { //采购项目联系人(代理机构)5d423d70a5cb26b9b76fa2e7 return k } for { old := k if from == 1 { k = filterK.ReplaceAllString(k, "") } k = tablekeyclear.ReplaceAllString(k, "") k = regReplKey.ReplaceAllString(k, "") if old == k { break } } return k } //获取金额的单位 func GetMoneyUnit(key, val string) string { if !(strings.Index(val, "元") > 0 || strings.Index(val, "万") > 0 || strings.Index(val, "亿") > 0) { mv := clear.ObjToMoney([]interface{}{val, val}) if len(mv) > 0 && qutil.IntAll(mv[0]) > 0 { for _, dw := range []string{"万", "亿"} { if strings.Index(key, dw) > 0 { return dw } } } } return "" } func GetIndex(ct_k string, indexMap map[int]string) int { for k, v := range indexMap { if ct_k == v { return k } } return 1 }