|
@@ -15,14 +15,15 @@ import (
|
|
type ColonkvEntity struct{}
|
|
type ColonkvEntity struct{}
|
|
|
|
|
|
var (
|
|
var (
|
|
- colonkvEntity = &ColonkvEntity{}
|
|
|
|
- regReplKV = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0]+?[))]?)[\\s\u3000\u2003\u00a0,。;;][((]?(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].+)[))]?")
|
|
|
|
- regReplKV2 = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)")
|
|
|
|
- regKV = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)")
|
|
|
|
- filterK = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
|
|
|
|
- filterValue = regexp.MustCompile("(^(无)$|.+%.*|^[\r\n\\s\u3000\u2003\u00a0]+$|^<.*>)")
|
|
|
|
- regReplKey = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全名]称|姓名)$")
|
|
|
|
- BlockTagMap = map[string]bool{
|
|
|
|
|
|
+ colonkvEntity = &ColonkvEntity{}
|
|
|
|
+ regReplKV = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0]+?[))]?)[\\s\u3000\u2003\u00a0,。;;][((]?(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].+)[))]?")
|
|
|
|
+ regReplKV2 = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)")
|
|
|
|
+ regKV = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)")
|
|
|
|
+ filterK = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
|
|
|
|
+ filterValue = regexp.MustCompile("(^(无)$|.+%.*|^[\r\n\\s\u3000\u2003\u00a0]+$|^<.*>)")
|
|
|
|
+ regReplKey = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全名]称|姓名)$")
|
|
|
|
+ buyerAndAgency = regexp.MustCompile("(代理(机构|人)|采购(人|单位))")
|
|
|
|
+ BlockTagMap = map[string]bool{
|
|
"招标范围": true,
|
|
"招标范围": true,
|
|
"资格要求": true,
|
|
"资格要求": true,
|
|
}
|
|
}
|
|
@@ -202,17 +203,34 @@ func (ce *ColonkvEntity) blockTitleKV(title, key string) string {
|
|
|
|
|
|
//根据配置文件中的规则,格式化正文
|
|
//根据配置文件中的规则,格式化正文
|
|
func formatText(content, key string) string {
|
|
func formatText(content, key string) string {
|
|
- segment := DivideSegment(content)
|
|
|
|
|
|
+ segments := make([]*Segment, 0)
|
|
|
|
+ if key == "all" {
|
|
|
|
+ segments = DivideSegmentHtml(content)
|
|
|
|
+ } else if key == "kv" {
|
|
|
|
+ segments = DivideSegment(content)
|
|
|
|
+ //log.Println("清理前:\n",content)
|
|
|
|
+ }
|
|
newCon := ""
|
|
newCon := ""
|
|
- for _, v := range segment {
|
|
|
|
- if v.Index > len(segment)-3 {
|
|
|
|
|
|
+ for _, v := range segments {
|
|
|
|
+ if v.Index > len(segments)-3 {
|
|
if regexp.MustCompile("上一篇(.+)下一篇").MatchString(v.Text) {
|
|
if regexp.MustCompile("上一篇(.+)下一篇").MatchString(v.Text) {
|
|
break
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+ if key == "kv" && utf8.RuneCountInString(v.Text) >= 1 {
|
|
|
|
+ //log.Println("清理前:",v.Text, []rune(v.Text)[len([]rune(v.Text))-1])
|
|
|
|
+ v.Text = strings.TrimRightFunc(v.Text, func(r rune) bool {
|
|
|
|
+ return r == 19968 || r == 20108 || r == 19977 ||
|
|
|
|
+ r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061
|
|
|
|
+ })
|
|
|
|
+ //log.Println("清理前后",v.Text)
|
|
|
|
+ }
|
|
newCon += v.Text + "\n"
|
|
newCon += v.Text + "\n"
|
|
}
|
|
}
|
|
content = regEndWrap.ReplaceAllString(newCon, "")
|
|
content = regEndWrap.ReplaceAllString(newCon, "")
|
|
|
|
+ //if key == "kv"{
|
|
|
|
+ // log.Println("清理前后\n",content)
|
|
|
|
+ //}
|
|
for _, v := range FormatTextMap[key] {
|
|
for _, v := range FormatTextMap[key] {
|
|
reg, _ := v["reg"].(*regexp.Regexp)
|
|
reg, _ := v["reg"].(*regexp.Regexp)
|
|
separator, isString := v["separator"].(string)
|
|
separator, isString := v["separator"].(string)
|
|
@@ -312,14 +330,18 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
|
|
}
|
|
}
|
|
mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上
|
|
mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上
|
|
titleMatch := false
|
|
titleMatch := false
|
|
- if titleMatchType := ContactTypeTitleMatch(title); len(titleMatchType) != 0 {
|
|
|
|
|
|
+ if titleMatchType := ContactTypeTitleMatch(title); titleMatchType != "" {
|
|
titleMatch = true
|
|
titleMatch = true
|
|
mustMatchFirst = false
|
|
mustMatchFirst = false
|
|
- for i, t := range titleMatchType {
|
|
|
|
- indexMap[i+1] = t
|
|
|
|
- }
|
|
|
|
- //indexMap = map[int]string{1: titleMatchType}
|
|
|
|
|
|
+ indexMap = map[int]string{1: titleMatchType}
|
|
}
|
|
}
|
|
|
|
+ // if titleMatchType := ContactTypeTitleMatch(title); len(titleMatchType) != 0 {
|
|
|
|
+ // titleMatch = true
|
|
|
|
+ // mustMatchFirst = false
|
|
|
|
+ // for i, t := range titleMatchType {
|
|
|
|
+ // indexMap[i+1] = t
|
|
|
|
+ // }
|
|
|
|
+ // }
|
|
// if buyers == nil {
|
|
// if buyers == nil {
|
|
// Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind)
|
|
// Debug("title-------", mustMatchFirst, title, indexMap, matchMap, totalIndexMap, ascFind)
|
|
// }
|
|
// }
|
|
@@ -330,7 +352,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
|
|
// }
|
|
// }
|
|
// }
|
|
// }
|
|
startIndex := 0
|
|
startIndex := 0
|
|
- //prevKey := ""
|
|
|
|
|
|
+ prevKey := ""
|
|
index, notmatchCount, allMatchCount := 0, 0, 0
|
|
index, notmatchCount, allMatchCount := 0, 0, 0
|
|
weightMap := map[string]map[string]interface{}{} //权重
|
|
weightMap := map[string]map[string]interface{}{} //权重
|
|
mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
|
|
mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
|
|
@@ -349,6 +371,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
|
|
if buyer == "" {
|
|
if buyer == "" {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
|
|
+
|
|
prevLine := kv.PrevLine
|
|
prevLine := kv.PrevLine
|
|
prevLine = strings.TrimSpace(prevLine)
|
|
prevLine = strings.TrimSpace(prevLine)
|
|
prevLine = strings.Split(prevLine, " ")[0]
|
|
prevLine = strings.Split(prevLine, " ")[0]
|
|
@@ -450,7 +473,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
|
|
weightMap[ct_k][ct_k] = weightVal.(int)
|
|
weightMap[ct_k][ct_k] = weightVal.(int)
|
|
matchMap[ct_k] = map[string]bool{}
|
|
matchMap[ct_k] = map[string]bool{}
|
|
isBreak = false
|
|
isBreak = false
|
|
- //prevKey = ""
|
|
|
|
|
|
+ prevKey = ""
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -470,7 +493,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
|
|
if startIndex == 0 {
|
|
if startIndex == 0 {
|
|
indexMap = map[int]string{}
|
|
indexMap = map[int]string{}
|
|
}
|
|
}
|
|
- //prevKey = ""
|
|
|
|
|
|
+ prevKey = ""
|
|
startIndex++
|
|
startIndex++
|
|
indexMap[startIndex] = ct_k
|
|
indexMap[startIndex] = ct_k
|
|
isContinue = true
|
|
isContinue = true
|
|
@@ -493,7 +516,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
|
|
isBreak = false
|
|
isBreak = false
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
- if !ContactInfoMustReg.MatchString(k) {
|
|
|
|
|
|
+ if !ContactInfoMustReg.MatchString(k) { //判断是否是电话、邮箱、地址等信息
|
|
if mustMatchFirst {
|
|
if mustMatchFirst {
|
|
mustMatchFirst = false
|
|
mustMatchFirst = false
|
|
continue
|
|
continue
|
|
@@ -520,17 +543,18 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
|
|
// } else if index < 2 {
|
|
// } else if index < 2 {
|
|
// index++
|
|
// index++
|
|
// }
|
|
// }
|
|
- // if prevKey != k {
|
|
|
|
- // prevKey = k
|
|
|
|
- // index = 1
|
|
|
|
- // } else if prevKey == k {
|
|
|
|
- // index++
|
|
|
|
- // }
|
|
|
|
- if startIndex == 0 || startIndex%2 == 1 {
|
|
|
|
|
|
+ if prevKey != k {
|
|
|
|
+ prevKey = k
|
|
index = 1
|
|
index = 1
|
|
- } else if startIndex%2 == 0 {
|
|
|
|
- index = 2
|
|
|
|
|
|
+ } else if prevKey == k {
|
|
|
|
+ index++
|
|
}
|
|
}
|
|
|
|
+ // if startIndex == 0 || startIndex%2 == 1 || index == 0 {
|
|
|
|
+ // index = 1
|
|
|
|
+ // } else if startIndex%2 == 0 {
|
|
|
|
+ // index = 2
|
|
|
|
+ // }
|
|
|
|
+
|
|
//hasMatch[k] = true
|
|
//hasMatch[k] = true
|
|
//过滤值
|
|
//过滤值
|
|
if filterValue.MatchString(v) {
|
|
if filterValue.MatchString(v) {
|
|
@@ -607,42 +631,42 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
|
|
// }
|
|
// }
|
|
//Debug("totalIndexMap", len(totalIndexMap))
|
|
//Debug("totalIndexMap", len(totalIndexMap))
|
|
}
|
|
}
|
|
-func ContactTypeTitleMatch(title string) []string {
|
|
|
|
- matchType := []string{}
|
|
|
|
- matchTypeMap := map[string]bool{}
|
|
|
|
- if title != "" && len([]rune(title)) < 25 {
|
|
|
|
- if ContactBuyerTitleReg.MatchString(title) {
|
|
|
|
- matchType = append(matchType, "采购单位")
|
|
|
|
- matchTypeMap["采购单位"] = true
|
|
|
|
- }
|
|
|
|
- if ContactAgencyTitleReg.MatchString(title) {
|
|
|
|
- matchType = append(matchType, "代理机构")
|
|
|
|
- matchTypeMap["代理机构"] = true
|
|
|
|
- }
|
|
|
|
- if len(matchType) == 2 {
|
|
|
|
- return matchType
|
|
|
|
- }
|
|
|
|
- for _, ct_k := range HasOrderContactType(title) {
|
|
|
|
- if ContactType[ct_k].MatchString(title) && !matchTypeMap[ct_k] {
|
|
|
|
- matchType = append(matchType, ct_k)
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- // matchType := ""
|
|
|
|
- // if title != "" && len([]rune(title)) < 15 {
|
|
|
|
|
|
+func ContactTypeTitleMatch(title string) string {
|
|
|
|
+ // matchType := []string{}
|
|
|
|
+ // matchTypeMap := map[string]bool{}
|
|
|
|
+ // if title != "" && len([]rune(title)) < 25 {
|
|
// if ContactBuyerTitleReg.MatchString(title) {
|
|
// if ContactBuyerTitleReg.MatchString(title) {
|
|
- // matchType = "采购单位"
|
|
|
|
- // } else if ContactAgencyTitleReg.MatchString(title) {
|
|
|
|
- // matchType = "代理机构"
|
|
|
|
- // } else {
|
|
|
|
- // for _, ct_k := range HasOrderContactType(title) {
|
|
|
|
- // if ContactType[ct_k].MatchString(title) {
|
|
|
|
- // matchType = ct_k
|
|
|
|
- // break
|
|
|
|
- // }
|
|
|
|
|
|
+ // matchType = append(matchType, "采购单位")
|
|
|
|
+ // matchTypeMap["采购单位"] = true
|
|
|
|
+ // }
|
|
|
|
+ // if ContactAgencyTitleReg.MatchString(title) {
|
|
|
|
+ // matchType = append(matchType, "代理机构")
|
|
|
|
+ // matchTypeMap["代理机构"] = true
|
|
|
|
+ // }
|
|
|
|
+ // if len(matchType) == 2 {
|
|
|
|
+ // return matchType
|
|
|
|
+ // }
|
|
|
|
+ // for _, ct_k := range HasOrderContactType(title) {
|
|
|
|
+ // if ContactType[ct_k].MatchString(title) && !matchTypeMap[ct_k] {
|
|
|
|
+ // matchType = append(matchType, ct_k)
|
|
// }
|
|
// }
|
|
// }
|
|
// }
|
|
// }
|
|
// }
|
|
|
|
+ matchType := ""
|
|
|
|
+ if title != "" && len([]rune(title)) < 15 {
|
|
|
|
+ if ContactBuyerTitleReg.MatchString(title) {
|
|
|
|
+ matchType = "采购单位"
|
|
|
|
+ } else if ContactAgencyTitleReg.MatchString(title) {
|
|
|
|
+ matchType = "代理机构"
|
|
|
|
+ } else {
|
|
|
|
+ for _, ct_k := range HasOrderContactType(title) {
|
|
|
|
+ if ContactType[ct_k].MatchString(title) {
|
|
|
|
+ matchType = ct_k
|
|
|
|
+ break
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
return matchType
|
|
return matchType
|
|
}
|
|
}
|
|
|
|
|
|
@@ -748,16 +772,6 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
|
|
kvTags[title] = append(kvTags[title], &Tag{title, title, 0, nil, false})
|
|
kvTags[title] = append(kvTags[title], &Tag{title, title, 0, nil, false})
|
|
}
|
|
}
|
|
for _, findkv := range findkvs {
|
|
for _, findkv := range findkvs {
|
|
- // if ContactInfoMustReg.MatchString(findkv.Value) { //名称、地址、联系人、邮编、电话
|
|
|
|
- // preval := findkv.PrevLine
|
|
|
|
- // ctkarr := HasOrderContactType(preval)
|
|
|
|
- // if len(ctkarr) > 0 {
|
|
|
|
- // for i, ct_k := range ctkarr {
|
|
|
|
- // indexMap[i+1] = ct_k
|
|
|
|
- // }
|
|
|
|
- // }
|
|
|
|
- // qutil.Debug("----", indexMap)
|
|
|
|
- // }
|
|
|
|
k, val, nextval := findkv.Key, strings.TrimSpace(findkv.Value), strings.TrimSpace(findkv.NextLine)
|
|
k, val, nextval := findkv.Key, strings.TrimSpace(findkv.Value), strings.TrimSpace(findkv.NextLine)
|
|
//val是空的话,不打标签
|
|
//val是空的话,不打标签
|
|
if filterValue.MatchString(val) {
|
|
if filterValue.MatchString(val) {
|
|
@@ -863,6 +877,9 @@ func RemoveWarpOfTdVal(text string) string {
|
|
//打标签的时候,清理key
|
|
//打标签的时候,清理key
|
|
//from 1--冒号key 2--table key
|
|
//from 1--冒号key 2--table key
|
|
func ClearKey(k string, from int) string {
|
|
func ClearKey(k string, from int) string {
|
|
|
|
+ if buyerAndAgency.MatchString(filterK.FindString(k)) { //采购项目联系人(代理机构)5d423d70a5cb26b9b76fa2e7
|
|
|
|
+ return k
|
|
|
|
+ }
|
|
for {
|
|
for {
|
|
old := k
|
|
old := k
|
|
if from == 1 {
|
|
if from == 1 {
|