Selaa lähdekoodia

过滤信息调整

zhengkun 9 kuukautta sitten
vanhempi
commit
72e14fbbf0
2 muutettua tiedostoa jossa 13 lisäystä ja 2 poistoa
  1. 11 1
      extract/extension.go
  2. 2 1
      extract/extract.go

+ 11 - 1
extract/extension.go

@@ -6,10 +6,19 @@ import (
 	"data_ai/ul"
 	log "github.com/donnie4w/go-logger/logger"
 	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
+	"regexp"
 	"strings"
 	"unicode/utf8"
 )
 
+func FilterDetail(con string) string { // FilterDetail returns con reduced to bare text: only ASCII digits, ASCII letters and characters in U+4E00..U+9FA5 are kept.
+	return Reg.ReplaceAllString(Filter.ReplaceAllString(con, ""), "") // Filter is applied first, while '<' and '>' are still present, so whole <...> spans are dropped; Reg then deletes every remaining character outside the kept set.
+}
+
+var Reg = regexp.MustCompile("[^0-9A-Za-z\u4e00-\u9fa5]+") // Matches any run of characters that are not 0-9, A-Z, a-z or in U+4E00..U+9FA5.
+var Filter = regexp.MustCompile("<[^>]*?>|[\\s\u3000\u2003\u00a0]") // Matches a <...> tag-like span, or one whitespace character (\s, U+3000, U+2003, U+00A0).
+var SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)") // Matches "see the original page / see or download the attachment" style phrases. NOTE(review): the visible call in NotInProgressInfo uses ul.SpecialTextReg, not this variable — confirm this copy is needed.
+
 // 确认抽取范围
 func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
 	dict := map[string]interface{}{}
@@ -94,10 +103,11 @@ func getPurList(v map[string]interface{}, detail string, f_info map[string]inter
 
 // 过滤信息规则···
 func NotInProgressInfo(title string, detail string, v map[string]interface{}) bool {
-	dl := utf8.RuneCountInString(detail) //文本长度
 	if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil {
 		return true
 	}
+	detail = FilterDetail(detail)        //只保留文本内容
+	dl := utf8.RuneCountInString(detail) //文本长度
 	if dl <= 20 || (dl <= 50 && ul.SpecialTextReg.MatchString(detail)) {
 		return true
 	}

+ 2 - 1
extract/extract.go

@@ -92,7 +92,8 @@ func ResolveInfo(v map[string]interface{}) map[string]interface{} {
 	f_info_1 := prompt.AcquireExtractFieldInfoFirst(detail)
 	f_info_2 := prompt.AcquireExtractFieldInfoFirst(detail)
 	f_info := MergeInfo([]map[string]interface{}{f_info_1, f_info_2})
-
+	
+	//非短文本以下识别
 	if !shorText {
 		//获取分包信息
 		if pkg := prompt.AcquireNewMultiplePackageInfo(detail); len(pkg) > 0 {