wcj 6 ani în urmă
părinte
comite
a35f9f489f

+ 4 - 4
src/jy/pretreated/analykv.go

@@ -73,7 +73,7 @@ func GetLines(con string) (res []*Line) {
 				l1.Str = strings.Join(l1.Strs, "")
 				if !regexp.MustCompile("^[,,.。\\s \u3000\u2003\u00a0]$").MatchString(l1.Str) {
 					l1.Str = u.TrimLRSpace(l1.Str, "")
-					l1.Str = TimeHM.ReplaceAllString(l1.Str, "D$1H$2M")
+					l1.Str = TimeHM.ReplaceAllString(l1.Str, ReplTimeHM)
 					l1.Strs = strings.Split(l1.Str, "")
 					res = append(res, l1)
 				}
@@ -130,6 +130,7 @@ func FindKv_v2(con, tag string) (m *SortMap) {
 }
 
 // TimeHM matches a clock time written as hour, colon, minute: optional leading
 // whitespace (ASCII whitespace, U+3000, U+2003, U+00A0), a one- or two-digit
 // hour in the range 0-23 (capture group 1), a colon, a two-digit minute 00-59
 // (capture group 2), and an optional trailing colon.
 //
 // ReplTimeHM is the replacement template used with TimeHM.ReplaceAllString: the
 // whole match becomes "D<hour>H<minute>M", so the leading whitespace and the
 // colons that are part of the match are dropped. The braces in ${1} and ${2}
 // are required: in Go's regexp template syntax "$1H" is read as a reference to
 // a group named "1H", which does not exist and expands to the empty string.
 //
 // NOTE(review): presumably the rewrite exists so that colons inside times are
 // not mistaken for key/value separators by the KV parsers; confirm with callers.
 var TimeHM = regexp.MustCompile("[\\s \u3000\u2003\u00a0]*([01]{0,1}[0123456789]|2[0123])[::]([012345][0-9])[::]{0,1}")
+var ReplTimeHM = "D${1}H${2}M"
 
 //from 1--全文 2--table td
 func FindKv(con, tag string, from int) (m *SortMap) {
@@ -151,7 +152,7 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 			if len(s1) > 0 {
 				str := strings.Join(s1, "")
 				str = u.TrimLRSpace(str, "")
-				str = TimeHM.ReplaceAllString(str, "D${1}H${2}M")
+				str = TimeHM.ReplaceAllString(str, ReplTimeHM)
 				s1 = strings.Split(str, "")
 				if len(s1) > 0 {
 					strs = append(strs, s1)
@@ -168,8 +169,7 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 	if len(s1) > 0 {
 		str := strings.Join(s1, "")
 		str = u.TrimLRSpace(str, "")
-		//u.Debug(str, TimeHM.ReplaceAllString(str, "D${1}H${2}M"))
-		str = TimeHM.ReplaceAllString(str, "D${1}H${2}M")
+		str = TimeHM.ReplaceAllString(str, ReplTimeHM)
 		s1 = strings.Split(str, "")
 		if len(s1) > 0 {
 			strs = append(strs, s1)

+ 1 - 0
src/jy/pretreated/analystep.go

@@ -70,6 +70,7 @@ func AnalyStart(job *util.Job) {
 		bl.Text = HtmlToText(con)
 		//调用kv解析
 		bl.ColonKV = GetKVAll(bl.Text, "", nil, 1)
+		PrintKvTags(bl.ColonKV.KvTags)
 		bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil)
 		//新加 未分块table中未能解析到中标候选人,从正文中解析
 		if job.Winnerorder == nil || len(job.Winnerorder) == 0 {

+ 8 - 5
src/jy/pretreated/colonkv.go

@@ -20,7 +20,7 @@ var (
 	regReplKV2    = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)")
 	regKV         = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)")
 	filterK       = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
-	filterValue   = regexp.MustCompile("(^(无)$|.+%.*|[\r\n\\s\u3000\u2003\u00a0]+|^<.*>)")
+	filterValue   = regexp.MustCompile("(^(无)$|.+%.*|^[\r\n\\s\u3000\u2003\u00a0]+$|^<.*>)")
 	regReplKey    = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全名]称|姓名)$")
 	BlockTagMap   = map[string]bool{
 		"招标范围": true,
@@ -648,12 +648,14 @@ func GetKVAll(content, title string, contactFormat *ContactFormat, from int) *Jo
 func MergeKvTags(kvTags_1, kvTags_2 map[string][]*Tag) {
 	for k, v := range kvTags_2 {
 		for _, vv := range v {
-			if strings.TrimSpace(vv.Value) == "" {
+			value_vv := strings.TrimSpace(vv.Value)
+			if value_vv == "" {
 				continue
 			}
 			isExists := false
 			for _, vvv := range kvTags_1[k] {
-				if vvv.Value == vv.Value && vvv.Weight == vv.Weight {
+				value_vvv := strings.TrimSpace(vvv.Value)
+				if (value_vvv == value_vv || TimeHM.ReplaceAllString(value_vvv, ReplTimeHM) == value_vv || value_vvv == TimeHM.ReplaceAllString(value_vv, ReplTimeHM)) && vvv.Weight == vv.Weight {
 					isExists = true
 					break
 				}
@@ -681,9 +683,10 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 		kvTags[title] = append(kvTags[title], &Tag{title, title, 0, nil, false})
 	}
 	for _, findkv := range findkvs {
-		k, val, nextval := findkv.Key, findkv.Value, strings.TrimSpace(findkv.NextLine)
+		k, val, nextval := findkv.Key, strings.TrimSpace(findkv.Value), strings.TrimSpace(findkv.NextLine)
 		//val是空的话,不打标签
 		if filterValue.MatchString(val) {
+			log.Println(k, val)
 			continue
 		}
 		key := k
@@ -724,7 +727,7 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 							if strings.TrimSpace(nextval) == "" {
 								continue
 							}
-							if GetAppointTags(nextval, tagdbs).Len() > 0 || GetAppointTags(k, tagdbs).Len() > 0{
+							if GetAppointTags(nextval, tagdbs).Len() > 0 || GetAppointTags(k, tagdbs).Len() > 0 {
 								continue
 							}
 						}