12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785 |
- //识别冒号kv
- package pretreated
- import (
- "jy/clear"
- . "jy/util"
- qutil "qfw/util"
- "regexp"
- "sort"
- "strings"
- )
- type ColonkvEntity struct{}
- var (
- colonkvEntity = &ColonkvEntity{}
- regReplKV = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0]+?[))]?)[\\s\u3000\u2003\u00a0,。;;][((]?(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].+)[))]?")
- regReplKV2 = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)")
- regKV = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)")
- filterK = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
- filterValue = regexp.MustCompile("^(无)$")
- regReplKey = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全名]称|姓名)$")
- BlockTagMap = map[string]bool{
- "招标范围": true,
- "资格要求": true,
- }
- brackets = map[string]string{
- "(": ")",
- "(": ")",
- "": "",
- "[": "]",
- "【": "】",
- "{": "}",
- "{": "}",
- "《": "》",
- "<": ">",
- }
- //
- PersonReg = regexp.MustCompile("[\u4e00-\u9fa5]{2,5}")
- //
- TelMustReg = regexp.MustCompile("^" + PhoneReg.String() + "$")
- PersonMustReg = regexp.MustCompile("^" + PersonReg.String() + "$")
- AddressReg = regexp.MustCompile("[省市县区路号楼]")
- BracketsTextReg = regexp.MustCompile("[((]([^((]+)[))]")
- ContactBuyerTitleReg = regexp.MustCompile("采购联系事项")
- ContactAgencyTitleReg = regexp.MustCompile("招标联系事项")
- )
- //一行多个冒号kv处理
- func (ce *ColonkvEntity) divisionMoreKV(con string) string {
- con = regReplKV.ReplaceAllStringFunc(con, func(temp string) string {
- //分kv的时候出现括号不成对出现的情况,分错了跳过
- matchText := regReplKV.FindStringSubmatch(con)[1]
- for k, v := range brackets {
- if strings.Count(matchText, k) != strings.Count(matchText, v) {
- return temp
- }
- }
- return regReplKV.ReplaceAllString(temp, "$1\n\n$2")
- })
- con = regReplKV2.ReplaceAllString(con, "$1\n\n$2")
- return con
- }
- //获取冒号kv入口
- func (ce *ColonkvEntity) entrance(con, title string, from int) ([]*Kv, map[string]string) {
- kvs := ce.GetKvs(con, title, from)
- kv := map[string]string{}
- for _, v := range kvs {
- if strings.TrimSpace(v.Value) == "" {
- continue
- }
- kv[v.Key] = v.Value
- }
- return kvs, kv
- }
- //获取有序的kv
- func (ce *ColonkvEntity) GetKvs(con, title string, from int) []*Kv {
- con = ce.processText(con)
- kvs := ce.getColonKv(con, title, from)
- return kvs
- }
- //处理正文
- func (ce *ColonkvEntity) processText(con string) string {
- con = ce.divisionMoreKV(con)//一行多个冒号kv处理
- for {
- tmp := con
- con = ce.divisionMoreKV(con)
- if tmp == con {
- break
- }
- }
- return con
- }
- //分冒号kv
- //from 1--全文 2,3--table td
- func (ce *ColonkvEntity) getColonKv(con, title string, from int) []*Kv {
- if from == 2 || from == 3 {
- con = RemoveWarpOfTdVal(con)
- }
- findkvs := []*Kv{}
- lines := SspacekvEntity.getLines(con)
- for index, line := range lines {
- res := regKV.FindAllStringSubmatch(line, -1)
- if len(res) > 0 {
- for _, v := range res {
- key, val := "", ""
- if len(v) == 3 {
- key = v[1]
- val = v[2]
- } else if len(v) == 4 {
- key = v[2]
- val = v[3]
- }
- //Debug("KV-key", key, val)
- //Debug("KV-key", key, val)
- //地址、联系人可能会重复 单位、代理机构的\时间、地点
- if strings.TrimSpace(key) != "" {
- prevLine, nextLine := "", ""
- if index > 0 {
- prevLine = lines[index-1]
- }
- if index < len(lines)-1 {
- nextLine = lines[index+1]
- }
- findkvs = append(findkvs, &Kv{
- Key: key,
- Value: val,
- Line: line,
- PrevLine: prevLine,
- NextLine: nextLine,
- Title: title,
- })
- splitkeys := strings.Split(key, "/")
- splitvalues := strings.Split(val, "/")
- if len(splitkeys) > 1 && len(splitkeys) == len(splitvalues) {
- for splitindex, splitkey := range splitkeys {
- findkvs = append(findkvs, &Kv{
- Key: splitkey,
- Value: splitvalues[splitindex],
- Line: line,
- PrevLine: prevLine,
- NextLine: nextLine,
- Title: title,
- })
- }
- }
- }
- }
- }
- }
- return findkvs
- }
- //冒号kv和空格kv结合
- func (ce *ColonkvEntity) getColonSpaceKV(con string) []*Kv {
- con = colonkvEntity.processText(con)
- lines := SspacekvEntity.getLines(con)
- kvMaps := []*Kv{}
- for _, line := range lines {
- kvs := colonkvEntity.getColonKv(line, "", 1)
- if len(kvs) == 0 {
- kv := SspacekvEntity.divideKV(line)
- if kv != nil {
- kvMaps = append(kvMaps, kv...)
- }
- } else {
- kvMaps = append(kvMaps, kvs...)
- }
- }
- return kvMaps
- }
- /*
- 五、递交响应文件时间及地点
- 1、时间:2016年5月20日14时00分至2016年5月20日14时30分(北京时间)
- 2、地点:烟台开发区公共资源交易中心A座5楼会议室(金沙江路83号)
- key 时间 处理成 递交响应文件时间
- */
- func (ce *ColonkvEntity) blockTitleKV(title, key string) string {
- needKey := "时间"
- if key != needKey {
- return key
- }
- titles := regSplit.Split(title, -1)
- for _, v := range titles {
- if strings.HasSuffix(v, needKey) {
- return v
- }
- }
- return key
- }
- //根据配置文件中的规则,格式化正文
- func formatText(content, key string) string {
- for _, v := range FormatTextMap[key] {
- reg, _ := v["reg"].(*regexp.Regexp)
- separator, isString := v["separator"].(string)
- separators, isArray := v["separator"].([]interface{})
- if isArray {
- content = reg.ReplaceAllStringFunc(content, func(temp string) string {
- for _, sv := range separators {
- separator, _ := sv.(string)
- if array := strings.Split(separator, "__"); separator != "__" && len(array) == 2 {
- temp = regexp.MustCompile(array[0]).ReplaceAllString(temp, array[1])
- }
- }
- return temp
- })
- } else if isString {
- if array := strings.Split(separator, "__"); separator != "__" && len(array) == 2 {
- content = reg.ReplaceAllStringFunc(content, func(temp string) string {
- temp = regexp.MustCompile(array[0]).ReplaceAllString(temp, array[1])
- return temp
- })
- } else {
- content = reg.ReplaceAllString(content, separator)
- }
- }
- //Debug(v["reg"], content)
- }
- return content
- }
// IsContactKvHandle reports whether value counts as already recorded in m.
//
// A key of m counts when value and the key contain one another. A key that
// differs from value but shares a prefix relation with it (one is a prefix of
// the other) is ignored.
func IsContactKvHandle(value string, m map[string]bool) bool {
	for known := range m {
		prefixRelated := strings.HasPrefix(known, value) || strings.HasPrefix(value, known)
		if known != value && prefixRelated {
			continue
		}
		if strings.Contains(value, known) || strings.Contains(known, value) {
			return true
		}
	}
	return false
}
- //kv关于联系人信息的处理
- //采购人>集中采购机构
- /*
- func FormatContactKv(kvs *[]*Kv, title string, buyers []string) {
- ////////////////////////////
- //处理联系人信息
- var indexMap map[int]string
- var matchMap map[string]map[string]bool
- if contactFormat == nil || title != "" {
- indexMap = map[int]string{}
- matchMap = map[string]map[string]bool{}
- } else {
- indexMap = contactFormat.IndexMap
- matchMap = contactFormat.MatchMap
- }
- ////////////////////////////
- totalIndexMap := map[string]bool{}
- ascFind := true
- ascFindFlag := len(indexMap) == 0 && buyers == nil
- //采购人在联系人、电话后面的处理
- isCanAddToIndexMap := false
- for _, kv := range *kvs {
- k := FilterContactKey(kv.Key)
- k_length := len([]rune(k))
- if k_length < 2 || k_length > 15 {
- continue
- }
- isContinue := ContactInfoMustReg.MatchString(k)
- if (isContinue || (ContactInfoVagueReg.MatchString(k) && IsMapHasValue(k, ContactType))) && ascFindFlag {
- if len(indexMap) > 0 {
- ascFind = true
- ascFindFlag = false
- indexMap = map[int]string{}
- }
- isCanAddToIndexMap = true
- }
- for _, ct_k := range HasOrderContactType(k) {
- if !ContactType[ct_k].MatchString(k) {
- continue
- }
- totalIndexMap[ct_k] = true
- /////////////////////////////
- if isContinue || !ascFindFlag {
- continue
- }
- if isCanAddToIndexMap && len(indexMap) == 0 {
- indexMap[1] = ct_k
- ascFind = false
- }
- }
- }
- mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上
- titleMatch := false
- if titleMatchType := ContactTypeTitleMatch(title); titleMatchType != "" {
- titleMatch = true
- mustMatchFirst = false
- indexMap = map[int]string{1: titleMatchType}
- }
- // if buyers == nil {
- // Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind)
- // }
- //Debug("buyers-------", buyers)
- // if buyers == nil {
- // for _, kv := range *kvs {
- // Debug("bbbbbbbbbb", kv.Key, kv.Value)
- // }
- // }
- startIndex := 0
- prevKey := ""
- index, notmatchCount, allMatchCount := 0, 0, 0
- weightMap := map[string]map[string]interface{}{} //权重
- mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
- kvsTemp := make([]*Kv, len(*kvs))
- copy(kvsTemp, *kvs)
- for kv_index, kv := range *kvs {
- isBreak := true
- v := strings.TrimSpace(kv.Value)
- //根据采购单位分析
- isContinue := false
- k := FilterContactKey(kv.Key)
- k_length := len([]rune(k))
- if buyers != nil {
- for _, buyer := range buyers {
- if buyer == "" {
- continue
- }
- prevLine := kv.PrevLine
- prevLine = strings.TrimSpace(prevLine)
- prevLine = strings.Split(prevLine, " ")[0]
- buyerLenght, prevLineLength := len([]rune(buyer)), len([]rune(prevLine))
- prevNotEqual := true
- if kv_index > 0 {
- prevNotEqual = strings.TrimSpace(((*kvs)[kv_index-1]).Value) != buyer
- }
- matchBuyerSuccess := false
- if strings.HasPrefix(k, buyer) && ContactInfoVagueReg.MatchString(k) && k_length-buyerLenght >= 2 && k_length-buyerLenght <= 5 {
- matchBuyerSuccess = true
- k = strings.TrimLeft(k, buyer)
- k_length = len([]rune(k))
- // kvTemp := *kv
- // kvTemp.Key = strings.TrimLeft(k, buyer)
- // (*kvs)[kv_index] = &kvTemp
- } else if k == buyer {
- matchBuyerSuccess = true
- if PersonMustReg.MatchString(v) {
- k = "联系人"
- } else if TelMustReg.MatchString(v) {
- k = "联系电话"
- } else if AddressReg.MatchString(v) {
- k = "地址"
- } else if PersonReg.MatchString(v) || PhoneReg.MatchString(v) {
- k = "联系方式"
- }
- k_length = len([]rune(k))
- } else if strings.HasPrefix(strings.TrimSpace(v), buyer) || (prevNotEqual && buyerLenght >= prevLineLength-5 && buyerLenght <= prevLineLength && strings.Contains(prevLine, buyer)) {
- matchBuyerSuccess = true
- isContinue = true
- }
- if matchBuyerSuccess {
- isBreak = false
- matchMap["采购单位"] = map[string]bool{}
- indexMap[1] = "采购单位"
- break
- }
- }
- } else if ascFind {
- for _, ct_k := range HasOrderContactType(k) {
- if k_length < 3 || k_length > 15 {
- isBreak = false
- continue
- }
- if !ContactType[ct_k].MatchString(k) {
- continue
- }
- if weightMap[ct_k] == nil {
- weightMap[ct_k] = map[string]interface{}{}
- }
- isAddToMatchMap := false
- addToMatchMapKey := ""
- if ContactInfoVagueReg.MatchString(k) {
- isAddToMatchMap = true
- if matchMap[ct_k] == nil {
- matchMap[ct_k] = map[string]bool{}
- }
- if !strings.HasSuffix(k, "方式") {
- _, kTag := KvTagsToKV([]*Kv{&Kv{Key: k, Value: v}}, "", BuyerContacts, 1)
- if len(kTag) == 1 {
- tagVal, weightVal := FirstKeyValueInMap(kTag)
- if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) {
- isAddToMatchMap = false
- }
- if mapIndexInKvs[ct_k] == nil {
- mapIndexInKvs[ct_k] = map[string]interface{}{}
- }
- myIndexInKvs := mapIndexInKvs[ct_k][tagVal]
- if myIndexInKvs != nil {
- if weightMap[ct_k][tagVal] == nil || (weightVal != nil && weightVal.(int) >= weightMap[ct_k][tagVal].(int)) {
- weightMap[ct_k][tagVal] = weightVal.(int)
- (*kvs)[myIndexInKvs.(int)] = kvsTemp[myIndexInKvs.(int)]
- //(*kvs)[kv_index] = &Kv{Key: tagVal, Value: v}
- kvTemp := *kv
- kvTemp.Key = tagVal
- kvTemp.Value = v
- (*kvs)[kv_index] = &kvTemp
- }
- } else {
- weightMap[ct_k][tagVal] = weightVal.(int)
- }
- mapIndexInKvs[ct_k][tagVal] = kv_index
- }
- }
- addToMatchMapKey = k
- if ct_k == "采购单位" {
- k = ContactType[ct_k].FindString(k)
- }
- }
- if ct_k == "采购单位" { //打标签,权重高的重新覆盖
- _, kTag := KvTagsToKV([]*Kv{&Kv{Key: k, Value: v}}, "", []string{"采购单位"}, 1)
- tagVal, weightVal := FirstKeyValueInMap(kTag)
- if tagVal == ct_k {
- if weightMap[ct_k][ct_k] == nil || (weightVal != nil && weightVal.(int) > weightMap[ct_k][ct_k].(int)) {
- weightMap[ct_k][ct_k] = weightVal.(int)
- matchMap[ct_k] = map[string]bool{}
- isBreak = false
- prevKey = ""
- }
- }
- }
- if isAddToMatchMap && !filterValue.MatchString(v) {
- matchMap[ct_k][ContactInfoVagueReg.FindString(addToMatchMapKey)] = true
- }
- allMatchCount++
- if IsMapHasValue(ct_k, indexMap) {
- isContinue = true
- continue
- }
- isBreak = false
- if index != 0 || notmatchCount != 0 {
- startIndex = 0
- indexMap = map[int]string{}
- }
- if startIndex == 0 {
- indexMap = map[int]string{}
- }
- prevKey = ""
- startIndex++
- indexMap[startIndex] = ct_k
- isContinue = true
- }
- }
- if isContinue {
- continue
- }
- // if buyers == nil {
- // Debug(ascFind, indexMap, k, v, matchMap, notmatchCount)
- // }
- if len(indexMap) == 0 {
- continue
- }
- if titleMatch && !ContactInfoMustReg.MatchString(k) {
- k = ContactInfoVagueReg.FindString(k)
- k_length = len([]rune(k))
- }
- if k_length < 2 || k_length > 10 {
- isBreak = false
- continue
- }
- if !ContactInfoMustReg.MatchString(k) {
- if mustMatchFirst {
- mustMatchFirst = false
- continue
- }
- if buyers == nil && len(totalIndexMap) != 0 {
- isBreak = false
- }
- //允许有这么多个匹配不上的key
- notmatchCount++
- if notmatchCount < len(indexMap)*2 {
- isBreak = false
- } else if contactFormat == nil && ascFind {
- startIndex = 0
- notmatchCount = 0
- indexMap = map[int]string{}
- //matchMap = map[string]map[string]bool{}
- }
- continue
- }
- isBreak = false
- if prevKey != k {
- prevKey = k
- index = 1
- } else if prevKey == k {
- index++
- }
- //过滤值
- if filterValue.MatchString(v) {
- continue
- }
- myContactType := indexMap[index]
- if myContactType == "" {
- continue
- }
- // if buyers == nil {
- // Debug(ascFind, indexMap, k, v, matchMap, notmatchCount)
- // }
- if strings.HasSuffix(k, "方式") && TelMustReg.MatchString(v) {
- k = "联系电话"
- }
- if matchMap[myContactType] == nil {
- matchMap[myContactType] = map[string]bool{}
- }
- myTagValue := ContactInfoMustReg.FindString(k)
- if myTagValue == "" && titleMatch {
- myTagValue = ContactInfoVagueReg.FindString(k)
- }
- if IsContactKvHandle(myTagValue, matchMap[myContactType]) {
- continue
- }
- matchMap[myContactType][myTagValue] = true
- if ContactType[myContactType].MatchString(k) {
- continue
- }
- allMatchCount++
- delete(totalIndexMap, myContactType)
- if !strings.HasSuffix(k, "方式") {
- _, kTag := KvTagsToKV([]*Kv{&Kv{Key: myContactType + k, Value: v}}, "", BuyerContacts, 1)
- if len(kTag) == 1 {
- tagVal, _ := FirstKeyValueInMap(kTag)
- if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) {
- continue
- }
- if mapIndexInKvs[myContactType] == nil {
- mapIndexInKvs[myContactType] = map[string]interface{}{}
- }
- myIndexInKvs := mapIndexInKvs[myContactType][tagVal]
- if myIndexInKvs != nil {
- (*kvs)[myIndexInKvs.(int)] = kvsTemp[myIndexInKvs.(int)]
- }
- mapIndexInKvs[myContactType][tagVal] = kv_index
- if weightMap[myContactType] == nil {
- weightMap[myContactType] = map[string]interface{}{}
- }
- weightMap[myContactType][tagVal] = 1
- }
- }
- //(*kvs)[kv_index] = &Kv{Key: myContactType + k, Value: v}
- kvTemp := *kv
- kvTemp.Key = myContactType + k
- kvTemp.Value = v
- (*kvs)[kv_index] = &kvTemp
- if ascFind && isBreak && len(indexMap) > 0 {
- break
- }
- }
- if allMatchCount == 0 && len(*kvs) > 0 {
- indexMap = map[int]string{}
- matchMap = map[string]map[string]bool{}
- }
- if contactFormat != nil {
- (*contactFormat).IndexMap = indexMap
- (*contactFormat).MatchMap = matchMap
- }
- // if buyers == nil {
- // for _, kv := range *kvs {
- // Debug("bbbbbbbbbb", kv.Key, kv.Value)
- // }
- // }
- //Debug("totalIndexMap", len(totalIndexMap))
- }
- */
- func ContactTypeTitleMatch(title string) string {
- matchType := ""
- if title != "" && len([]rune(title)) < 15 {
- if ContactBuyerTitleReg.MatchString(title) {
- matchType = "采购单位"
- } else if ContactAgencyTitleReg.MatchString(title) {
- matchType = "代理机构"
- } else {
- for _, ct_k := range HasOrderContactType(title) {
- if ContactType[ct_k].MatchString(title) {
- matchType = ct_k
- break
- }
- }
- }
- }
- return matchType
- }
- //获取带有排序的联系人类型
- func HasOrderContactType(text string) []string {
- indexs := []int{}
- indexMap := map[int]string{}
- temp := []string{}
- for k, v := range ContactType {
- s := v.FindStringIndex(text)
- if len(s) > 1 {
- if indexMap[s[0]] != "" {
- temp = append(temp, k)
- } else {
- indexs = append(indexs, s[0])
- indexMap[s[0]] = k
- }
- }
- }
- sort.Ints(indexs)
- result := []string{}
- for _, v := range indexs {
- result = append(result, indexMap[v])
- }
- if len(temp) > 0 {
- result = append(result, temp...)
- }
- return result
- }
- //两种冒号kv结合到一起
- //from 1--全文 2--table td 3--table td解析采购单位联系人 4--分包
- func GetKVAll(content, title string, from int) *JobKv {
- content = formatText(content, "kv")
- m1Kvs, _ := colonkvEntity.entrance(content, title, from)
- m1, m1Weight := KvTagsToKV(m1Kvs, title, nil, from)
- if m1 == nil {
- m1 = map[string]string{}
- }
- m2Kvs, m2, m2Weight := GetKvFromtxt(content, title, from)
- for k, v := range m2 {
- if m1[k] == "" {
- m1[k] = v
- m1Weight[k] = m2Weight[k]
- }
- }
- return &JobKv{
- Kvs: m1Kvs,
- Kvs_2: m2Kvs,
- Kv: m1,
- KvTag: m1Weight,
- }
- }
- //KVTags转kv
- func KvTagsToKV(findkvs []*Kv, title string, tagdbs []string, from int) (map[string]string, map[string]*Tag) {
- kvTags := map[string]*Tag{}
- if title != "" && BlockTagMap[title] {
- kvTags[title] = &Tag{title, 0, nil}
- }
- for _, findkv := range findkvs {
- kvMap := map[string]string{}
- k, val := findkv.Key, findkv.Value
- //val是空的话,不打标签
- if filterValue.MatchString(val) {
- continue
- }
- key := k
- key = ClearKey(key, 1)
- if key == "" {
- continue
- }
- key = colonkvEntity.blockTitleKV(title, key)
- //先用新的key
- tags := GetAppointTags(key, tagdbs)
- if len(tags) == 0 && len(key) < 10 && len(title) > 0 && len(title) < 15 {
- key = title + key
- tags = GetAppointTags(key, tagdbs)
- }
- //再用老的key
- if len(tags) == 0 && k != key {
- tags = GetAppointTags(k, tagdbs)
- if len(tags) == 0 && len(k) < 10 && len(title) > 0 && len(title) < 15 {
- k = title + k
- tags = GetAppointTags(k, tagdbs)
- if len(tags) > 0 {
- key = k
- }
- }
- }
- if len(tags) == 0 {
- //go AddtoNoMatchMap(key)
- //Debug(key)
- //continue
- //由跳过修改为保留
- tags = []*Tag{&Tag{k, -100, nil}}
- }
- for _, tk := range tags {
- //分包过来给kv打标签的时候,只取第一个,后面的不覆盖
- if kvTags[tk.Value] == nil || (kvTags[tk.Value].Weight < tk.Weight && from != 4) {
- // fc := StandardNameMap[tk.Value]
- // if (fc != nil && fc.CheckNum) || (moneyreg.MatchString(tk.Value)) {
- // val += GetMoneyUnit(k, val)
- // }
- if moneyreg.MatchString(tk.Value) {
- val += GetMoneyUnit(k, val)
- }
- //Debug("KV-key", tk, val)
- kvTags[tk.Value] = &Tag{val, tk.Weight, nil}
- kvMap[tk.Value] = val
- //Debug("KV-key", tk.Value, val, key, tk.Weight)
- }
- }
- }
- //
- kv := map[string]string{}
- kvWeight := map[string]*Tag{}
- if len(kvTags) > 0 {
- for k, v := range kvTags {
- if kv[k] != "" {
- continue
- }
- kv[k] = v.Value
- kvWeight[k] = v
- }
- }
- return kv, kvWeight
- }
- func FilterContactKey(key string) string {
- key1 := ""
- for _, v := range BracketsTextReg.FindAllString(key, -1) {
- for _, vv := range ContactType {
- if vv.MatchString(v) {
- if len([]rune(v)) < 3 || len([]rune(v)) > 10 {
- continue
- }
- key1 = v
- break
- }
- }
- }
- key = filterK.ReplaceAllString(key, "")
- key = tablekeyclear.ReplaceAllString(key, "")
- return key1 + key
- }
- //td里的内容,调用这边的方法分kv的时候,有的带有换行,清理掉
- func RemoveWarpOfTdVal(text string) string {
- //只有一个冒号
- if len(regDivision.FindAllString(text, -1)) != 1 {
- return text
- }
- text = strings.TrimSpace(text)
- //有一个换行
- array := strings.Split(text, "\n")
- if len(array) != 2 {
- return text
- }
- //第一行以冒号结尾
- if !colonEndReg.MatchString(array[0]) {
- if BracketsTextReg.ReplaceAllString(array[1], "") == "" {
- text = array[0] + array[1]
- }
- return text
- }
- text = array[0] + array[1]
- return text
- }
- //打标签的时候,清理key
- //from 1--冒号key 2--table key
- func ClearKey(k string, from int) string {
- for {
- old := k
- if from == 1 {
- k = filterK.ReplaceAllString(k, "")
- }
- k = tablekeyclear.ReplaceAllString(k, "")
- k = regReplKey.ReplaceAllString(k, "")
- if old == k {
- break
- }
- }
- return k
- }
- //获取金额的单位
- func GetMoneyUnit(key, val string) string {
- if !(strings.Index(val, "元") > 0 || strings.Index(val, "万") > 0 || strings.Index(val, "亿") > 0) {
- mv := clear.ObjToMoney([]interface{}{val, val})
- if len(mv) > 0 && qutil.IntAll(mv[0]) > 0 {
- for _, dw := range []string{"万", "亿"} {
- if strings.Index(key, dw) > 0 {
- return dw
- }
- }
- }
- }
- return ""
- }
|