Explorar o código

过滤文本规则···

zhengkun hai 9 meses
pai
achega
48f37c7726
Modificáronse 2 ficheiros con 158 adicións e 133 borrados
  1. 153 0
      extract/extension.go
  2. 5 133
      extract/extract.go

+ 153 - 0
extract/extension.go

@@ -0,0 +1,153 @@
+package extract
+
+import (
+	"data_ai/clean"
+	"data_ai/prompt"
+	"data_ai/ul"
+	log "github.com/donnie4w/go-logger/logger"
+	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
+	"strings"
+	"unicode/utf8"
+)
+
+// ConfrimExtractInfo confirms the extraction scope: it runs query q against collection ul.Ext_Name of database ul.SourceMgo.DbName, selecting only _id and ai_zhipu, and returns a map whose keys and values are both the string id of every document whose ai_zhipu field is nil.
+func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
+	dict := map[string]interface{}{}
+	sess := ul.SourceMgo.GetMgoConn()
+	defer ul.SourceMgo.DestoryMongoConn(sess)
+	total := 0
+	it := sess.DB(ul.SourceMgo.DbName).C(ul.Ext_Name).Find(&q).Select(map[string]interface{}{"_id": 1, "ai_zhipu": 1}).Iter()
+	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
+		if total%1000 == 0 { // progress log on every 1000th document (including the first)
+			log.Debug("cur index ", total)
+		}
+		if tmp["ai_zhipu"] == nil { // only documents without ai_zhipu are kept; already-recognized data is not recognized again
+			tmpid := ul.BsonTOStringId(tmp["_id"])
+			dict[tmpid] = tmpid
+		}
+		tmp = make(map[string]interface{}) // fresh map for the next it.Next call; presumably so keys of the previous document are not carried over
+	} // NOTE(review): no it.Close()/it.Err() call follows the loop; confirm whether iteration errors should be surfaced
+	return dict
+}
+
+// getpnsinfo collects attachment names: it returns the non-empty "filename" of every entry found under tmp["projectinfo"]["attachments"], or an empty (non-nil) slice when any level is missing.
+func getpnsinfo(tmp map[string]interface{}) []string {
+	arr := []string{}
+	if projectinfo := qu.ObjToMap(tmp["projectinfo"]); projectinfo != nil {
+		if attachments := qu.ObjToMap((*projectinfo)["attachments"]); attachments != nil {
+			for _, v := range *attachments { // keys are ignored; only each entry's value is inspected
+				if info := qu.ObjToMap(v); info != nil {
+					if filename := qu.ObjToString((*info)["filename"]); filename != "" {
+						arr = append(arr, filename)
+					}
+				}
+			}
+		}
+	}
+	return arr
+}
+
+// getDetailText returns the body text of record v: v["detail"] by default; when ul.IsTool is set it reads v["details"] instead and falls back to v["filetext"] if that text has fewer than 100 runes and filetext is non-empty. tmpid is only referenced by the commented-out code below.
+func getDetailText(v map[string]interface{}, tmpid string) string {
+	detail := qu.ObjToString(v["detail"])
+	if ul.IsTool {
+		detail = qu.ObjToString(v["details"])
+		filetext := qu.ObjToString(v["filetext"])
+		if utf8.RuneCountInString(detail) < 100 && filetext != "" {
+			detail = filetext
+		}
+	} else { // the ul.OssGetObject lookup below is commented out, so v["detail"] is returned unchanged
+		//if bs := ul.OssGetObject(tmpid); bs != "" {
+		//	detail = bs
+		//}
+	}
+	return detail
+}
+
+// getPurList fetches the purchasing list (subject matter) for record v via ul.PostPurchasingList; records whose toptype is "拟建" or "产权" are filtered out, and an empty slice is returned for them and whenever the response is empty or its status is not 200.
+func getPurList(v map[string]interface{}, detail string, f_info map[string]interface{}) []map[string]interface{} {
+	if qu.ObjToString(v["toptype"]) == "拟建" || qu.ObjToString(v["toptype"]) == "产权" {
+		return []map[string]interface{}{}
+	}
+	p_data := map[string]interface{}{}
+	p_data["detail"] = qu.ObjToString(v["title"]) + "\n" + detail // title and body joined by a newline
+	p_data["site"] = v["site"]
+	p_data["attach_text"] = v["attach_text"]
+	p_data["toptype"] = v["toptype"]
+	if f_info["s_toptype"] != nil { // a non-nil s_toptype in f_info overrides the record's own toptype
+		p_data["toptype"] = f_info["s_toptype"]
+	}
+	if p_info := ul.PostPurchasingList(p_data); len(p_info) > 0 {
+		if qu.IntAll(p_info["status"]) == 200 {
+			p_list := ul.IsMarkInterfaceMap(p_info["purchasinglist"])
+			return p_list
+		}
+	}
+	return []map[string]interface{}{}
+}
+
+/*
+****************************************
+****************************************
+****************************************
+ */
+
+// NotInProgressInfo applies the record filter rules and reports true when the record should be skipped: the title contains "开标记录", v["jyfb_data"] is non-nil, detail has at most 20 runes, or detail has at most 50 runes and matches ul.SpecialTextReg.
+func NotInProgressInfo(title string, detail string, v map[string]interface{}) bool {
+	dl := utf8.RuneCountInString(detail) // text length in runes
+	if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil {
+		return true
+	}
+	if dl <= 20 || (dl <= 50 && ul.SpecialTextReg.MatchString(detail)) {
+		return true
+	}
+	return false
+}
+
+// CheckOutBuyerInfo performs a second check of the buyer: when f_data["s_buyer"] is non-empty it is passed to prompt.AcquireBuyerInfo, and if the result's "实体单位" value survives clean.CleanBuyer as a non-empty string, f_data["s_buyer"] is overwritten with it in place; otherwise f_data is left untouched.
+func CheckOutBuyerInfo(f_data map[string]interface{}) {
+	if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" {
+		if zp_buyer := prompt.AcquireBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil {
+			if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
+				f_data["s_buyer"] = ns_buyer
+			}
+		}
+	}
+}
+
+// MergeInfo merges fields: it copies every key/value of each map in infos, in slice order, into one new map and returns it; on duplicate keys the value from the later map wins.
+func MergeInfo(infos []map[string]interface{}) map[string]interface{} {
+	info := map[string]interface{}{}
+	for _, v := range infos {
+		for k1, v1 := range v {
+			info[k1] = v1
+		}
+	}
+	return info
+}
+
+// ForcedLogicDecideInfo applies forced consistency rules to f_data in place: buyer (s_buyer) and winner (s_winner) must not be the same unit, and the agency (s_agency) must not equal either of them; the conflicting field is set to "".
+func ForcedLogicDecideInfo(f_data map[string]interface{}) {
+	// Principle: large model (original note is terse; presumably the large-model output is trusted by default - confirm)
+	// Multiple unit fields must not hold the same value
+	s_buyer := qu.ObjToString(f_data["s_buyer"])
+	s_winner := qu.ObjToString(f_data["s_winner"])
+	if s_buyer == s_winner && s_buyer != "" {
+		/*
+			1. If the unit name does not contain "公司", keep it as the buyer
+			2. If the unit name contains "公司", keep it as the winner
+		*/
+		if strings.Contains(s_buyer, "公司") {
+			f_data["s_buyer"] = ""
+		} else {
+			f_data["s_winner"] = ""
+		}
+	}
+
+	// Agency: compared against the s_buyer/s_winner values read above, i.e. before either field was cleared
+	if s_agency := qu.ObjToString(f_data["s_agency"]); s_agency != "" {
+		if s_agency == s_buyer || s_agency == s_winner {
+			f_data["s_agency"] = ""
+		}
+	}
+}

+ 5 - 133
extract/extract.go

@@ -6,7 +6,6 @@ import (
 	"data_ai/ul"
 	log "github.com/donnie4w/go-logger/logger"
 	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
-	"strings"
 	"sync"
 	"unicode/utf8"
 )
@@ -71,19 +70,14 @@ func ExtractFieldInfo(sid string, eid string) {
 // 获取处理数据...
 func ResolveInfo(v map[string]interface{}) map[string]interface{} {
 	tmpid := ul.BsonTOStringId(v["_id"])
-	detail := getDetailText(v, tmpid) //获取正文文本
 	title := qu.ObjToString(v["title"])
-	dl := utf8.RuneCountInString(detail) //文本长度
-	//过滤数据···
-	if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil {
-		return map[string]interface{}{}
-	}
-	if dl < 20 {
+	detail := getDetailText(v, tmpid)        //获取正文文本
+	if NotInProgressInfo(title, detail, v) { //过滤信息
 		return map[string]interface{}{}
 	}
 	//识别结构,短文本结构
 	f_data, shorText := map[string]interface{}{}, false
-	if dl < 100 {
+	if utf8.RuneCountInString(detail) < 100 {
 		shorText = true
 	}
 	//文本格式转换
@@ -124,131 +118,9 @@ func ResolveInfo(v map[string]interface{}) map[string]interface{} {
 
 	//强制逻辑判断-
 	ForcedLogicDecideInfo(f_data)
-	return f_data
-}
-
-// 确认抽取范围
-func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
-	dict := map[string]interface{}{}
-	sess := ul.SourceMgo.GetMgoConn()
-	defer ul.SourceMgo.DestoryMongoConn(sess)
-	total := 0
-	it := sess.DB(ul.SourceMgo.DbName).C(ul.Ext_Name).Find(&q).Select(map[string]interface{}{"_id": 1, "ai_zhipu": 1}).Iter()
-	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
-		if total%1000 == 0 {
-			log.Debug("cur index ", total)
-		}
-		if tmp["ai_zhipu"] == nil { //已经识别的数据-不再识别
-			tmpid := ul.BsonTOStringId(tmp["_id"])
-			dict[tmpid] = tmpid
-		}
-		tmp = make(map[string]interface{})
-	}
-	return dict
-}
-
-// 获取附件名字信息
-func getpnsinfo(tmp map[string]interface{}) []string {
-	arr := []string{}
-	if projectinfo := qu.ObjToMap(tmp["projectinfo"]); projectinfo != nil {
-		if attachments := qu.ObjToMap((*projectinfo)["attachments"]); attachments != nil {
-			for _, v := range *attachments {
-				if info := qu.ObjToMap(v); info != nil {
-					if filename := qu.ObjToString((*info)["filename"]); filename != "" {
-						arr = append(arr, filename)
-					}
-				}
-			}
-		}
-	}
-	return arr
-}
-
-// 获取正文数据
-func getDetailText(v map[string]interface{}, tmpid string) string {
-	detail := qu.ObjToString(v["detail"])
-	if ul.IsTool {
-		detail = qu.ObjToString(v["details"])
-		filetext := qu.ObjToString(v["filetext"])
-		if utf8.RuneCountInString(detail) < 100 && filetext != "" {
-			detail = filetext
-		}
-	} else {
-		//if bs := ul.OssGetObject(tmpid); bs != "" {
-		//	detail = bs
-		//}
-	}
-	return detail
-}
-
-// 获取标的物-过滤产权-拟建
-func getPurList(v map[string]interface{}, detail string, f_info map[string]interface{}) []map[string]interface{} {
-	if qu.ObjToString(v["toptype"]) == "拟建" || qu.ObjToString(v["toptype"]) == "产权" {
-		return []map[string]interface{}{}
-	}
-	p_data := map[string]interface{}{}
-	p_data["detail"] = qu.ObjToString(v["title"]) + "\n" + detail
-	p_data["site"] = v["site"]
-	p_data["attach_text"] = v["attach_text"]
-	p_data["toptype"] = v["toptype"]
-	if f_info["s_toptype"] != nil {
-		p_data["toptype"] = f_info["s_toptype"]
-	}
-	if p_info := ul.PostPurchasingList(p_data); len(p_info) > 0 {
-		if qu.IntAll(p_info["status"]) == 200 {
-			p_list := ul.IsMarkInterfaceMap(p_info["purchasinglist"])
-			return p_list
-		}
-	}
-	return []map[string]interface{}{}
-}
-
-// 二次校验采购单位
-func CheckOutBuyerInfo(f_data map[string]interface{}) {
-	if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" {
-		if zp_buyer := prompt.AcquireBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil {
-			if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
-				f_data["s_buyer"] = ns_buyer
-			}
-		}
-	}
-}
-
-// 合并字段
-func MergeInfo(infos []map[string]interface{}) map[string]interface{} {
-	info := map[string]interface{}{}
-	for _, v := range infos {
-		for k1, v1 := range v {
-			info[k1] = v1
-		}
-	}
-	return info
-}
 
-// 强制逻辑判断数据
-func ForcedLogicDecideInfo(f_data map[string]interface{}) {
-	//原则大模型
-	//多单位不能一致
-	s_buyer := qu.ObjToString(f_data["s_buyer"])
-	s_winner := qu.ObjToString(f_data["s_winner"])
-	if s_buyer == s_winner && s_buyer != "" {
-		/*
-			1、若单位名称-不含公司保留采购单位
-			2、若单位名称-含公司保留中标单位
-		*/
-		if strings.Contains(s_buyer, "公司") {
-			f_data["s_buyer"] = ""
-		} else {
-			f_data["s_winner"] = ""
-		}
-	}
-
-	//代理机构
-	if s_agency := qu.ObjToString(f_data["s_agency"]); s_agency != "" {
-		if s_agency == s_buyer || s_agency == s_winner {
-			f_data["s_agency"] = ""
-		}
-	}
+	//返回数据
+	return f_data
 }
 
 // 暂时不启用...无限重试