Browse Source

添加华汇新字段抽取逻辑

wcc 1 year ago
parent
commit
db186e72bc

+ 29 - 0
src/jy/admin/rulecheck.go

@@ -313,6 +313,35 @@ func checkCoreReg(field, content, ruleText string) map[string]string {
 					}
 					}
 				}
 				}
 			}
 			}
+		} else {
+			//添加 "<<"符号,返回所有匹配结果
+			regs := strings.Split(ptmp[0], "<<")
+			if len(regs) == 2 {
+				parses := regs[1] //定义的排除关键词,含有这些关键词,则视为作废
+				pattern := regs[0]
+				reg := regexp.MustCompile(pattern)
+				results := reg.FindAllString(content, -1)
+				if len(results) > 0 {
+					results = ju.RemoveDuplicates(results)
+					result := ""
+					//有排除词时需要过滤
+					if len(parses) > 0 {
+						var res []string
+						for _, v := range results {
+							reg2 := regexp.MustCompile(parses)
+							res2 := reg2.FindAllString(v, -1)
+							if len(res2) == 0 {
+								res = append(res, v)
+							}
+						}
+						result = strings.Join(res, "\n")
+					} else {
+						result = strings.Join(results, "\n")
+					}
+
+					rep[field] = result
+				}
+			}
 		}
 		}
 	}, func(err interface{}) {
 	}, func(err interface{}) {
 		rep["err"] = fmt.Sprint(err)
 		rep["err"] = fmt.Sprint(err)

+ 20 - 0
src/jy/extract/extractcheck.go

@@ -256,5 +256,25 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 		}
 		}
 	}
 	}
 
 
+	//企业资质检验,不含有资质时删除
+	if enterprise_qualification, ok := tmp["enterprise_qualification"]; ok {
+		special := `(甲级|乙级|丙级|丁级|一级|二级|三级|叁级|壹级|贰级|四级|五级|工程设计|市政公用工程|铁路工程|建筑工程|公路工程|人防工程|工程勘察|岩土工程|水文地质勘察|工程测量|工程钻探|电力工程|大地测量|消防设施工程|特种工程|房屋建筑工程|信息技术服务|信息系统安全|机电工程|建筑机电安装工程|消防设施工程|水利水电工程|水利工程|地基基础工程)`
+		reg := regexp.MustCompile(special)
+		var res = make([]string, 0)
+		datas := strings.Split(qu.ObjToString(enterprise_qualification), "\n")
+		for _, data := range datas {
+			results := reg.FindAllString(data, -1)
+			if len(results) > 0 {
+				res = append(res, data)
+			}
+		}
+
+		if len(res) == 0 {
+			delete(tmp, "enterprise_qualification")
+		} else {
+			tmp["enterprise_qualification"] = strings.Join(res, "\n")
+		}
+	}
+
 	return tmp
 	return tmp
 }
 }

+ 60 - 7
src/jy/extract/extractrule.go

@@ -172,6 +172,30 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
 			if len(extinfo) > 0 {
 			if len(extinfo) > 0 {
 				AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
 				AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
 			}
 			}
+		} else if in.Field == "enterprise_qualification" || in.Field == "personnel_qualification" || in.Field == "performance_qualification" || in.Field == "enterprise_credit" {
+			hasResult := false
+			//1.通过文中的资质要求抽取四个资质
+			qualifications := ju.GetQualifications(pretreated.HtmlToText(qu.ObjToString(doc[extfrom])))
+			if qualifications != "" {
+				extinfo := extRegCoreToResult(extfrom, qualifications, &map[string]string{}, j, in, isSite, "")
+				if len(extinfo) > 0 {
+					hasResult = true
+					AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+				}
+			}
+			//2.通过抽取资质要求段落匹配不到时,匹配全文或者附件内容,
+			if !hasResult {
+				content := ""
+				if j.IsFile {
+					content = j.Content
+				} else {
+					content = doc["detail"].(string)
+				}
+				extinfo := extRegCoreToResult(extfrom, content, &map[string]string{}, j, in, isSite, "")
+				if len(extinfo) > 0 {
+					AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
+				}
+			}
 		} else {
 		} else {
 			for _, v := range j.Block {
 			for _, v := range j.Block {
 				btag := make(map[string]string)
 				btag := make(map[string]string)
@@ -651,14 +675,43 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job,
 			}
 			}
 		}
 		}
 	} else {
 	} else {
-		pos := vre.RegCore.Reg.FindStringIndex(text)
 		val := ""
 		val := ""
-		if len(pos) == 2 {
-			text = text[pos[1]:]
-			rs := regexp.MustCompile("[^\r\n\t]+")
-			tmp := rs.FindAllString(text, -1)
-			if len(tmp) > 0 {
-				val = tmp[0]
+		//正则表达式含有"<<",表示需要所有匹配数据
+		regs := strings.Split(vre.RuleText, "<<")
+		if len(regs) == 2 {
+			pattern := regs[0]
+			parses := regs[1] //定义的排除关键词,含有这些关键词,则视为作废
+			reg := regexp.MustCompile(pattern)
+			results := reg.FindAllString(text, -1)
+			if len(results) > 0 {
+				results = ju.RemoveDuplicates(results)
+				result := ""
+				//有排除词时需要过滤
+				if len(parses) > 0 {
+					var res []string
+					for _, v := range results {
+						reg2 := regexp.MustCompile(parses)
+						res2 := reg2.FindAllString(v, -1)
+						if len(res2) == 0 {
+							res = append(res, v)
+						}
+					}
+					result = strings.Join(res, "\n")
+				} else {
+					result = strings.Join(results, "\n")
+				}
+
+				val = result
+			}
+		} else {
+			pos := vre.RegCore.Reg.FindStringIndex(text)
+			if len(pos) == 2 {
+				text = text[pos[1]:]
+				rs := regexp.MustCompile("[^\r\n\t]+")
+				tmp := rs.FindAllString(text, -1)
+				if len(tmp) > 0 {
+					val = tmp[0]
+				}
 			}
 			}
 		}
 		}
 		if val != "" {
 		if val != "" {

+ 2 - 0
src/jy/extract/extractsave.go

@@ -399,6 +399,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		e.inscribeRecognize(&tmp, *j.Data)
 		e.inscribeRecognize(&tmp, *j.Data)
 		//落款识别指定特殊采购单位
 		//落款识别指定特殊采购单位
 		e.AimAtRecognizeBuyer(&tmp, *j.Data)
 		e.AimAtRecognizeBuyer(&tmp, *j.Data)
+		//根据正文获取资质要求
+		e.getQualifications(&tmp, *j.Data)
 		//城市抽取
 		//城市抽取
 		if e.IsExtractCity {
 		if e.IsExtractCity {
 			//e.NewExtractCity(j, &tmp) //旧版
 			//e.NewExtractCity(j, &tmp) //旧版

+ 15 - 1
src/jy/extract/extraxtmethod.go

@@ -216,7 +216,21 @@ func isUsedMultiPackage(pkg map[string]map[string]interface{}) bool {
 	return false
 	return false
 }
 }
 
 
-// 落款识别~采购单位
+// getQualifications adds the "qualifications" (qualification requirements)
+// field to the extraction result.
+//
+// It reads j_data["detail"], converts it to a string with qu.ObjToString,
+// passes it through pretreated.HtmlToText and hands the result to
+// ju.GetQualifications. A non-empty result is stored in (*tmp)["qualifications"];
+// an empty result leaves tmp untouched. The receiver is not used.
+func (e *ExtractTask) getQualifications(tmp *map[string]interface{}, j_data map[string]interface{}) {
+	plain := pretreated.HtmlToText(qu.ObjToString(j_data["detail"]))
+	if section := ju.GetQualifications(plain); section != "" {
+		(*tmp)["qualifications"] = section
+	}
+}
+
+//落款识别~采购单位
 func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}) {
 func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}) {
 	//落款实体
 	//落款实体
 	if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
 	if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&

+ 1 - 1
src/jy/pretreated/analycore.go

@@ -62,7 +62,7 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}, isSite bool,
 	if len(res) > 0 {
 	if len(res) > 0 {
 		for _, t1 := range res {
 		for _, t1 := range res {
 			//降低冒号值的权重-不适合日期格式的数据
 			//降低冒号值的权重-不适合日期格式的数据
-			if MhSpilt.MatchString(v1) && !(UnTimeSpiltKey.MatchString(t1.Value) && UnTimeSpiltValue.MatchString(v1)) {
+			if MhSpilt.MatchString(v1) && !(UnTimeSpiltKey.MatchString(t1.Value) && UnTimeSpiltValue.MatchString(v1)) && t1.Value != "企业信用" {
 				t1.Weight -= 50
 				t1.Weight -= 50
 			}
 			}
 			if winnerOrderAndBidResult.MatchString(tabletag) && t1.Value == "采购单位联系人" { //处理table中项目负责人
 			if winnerOrderAndBidResult.MatchString(tabletag) && t1.Value == "采购单位联系人" { //处理table中项目负责人

+ 84 - 0
src/jy/util/util.go

@@ -8,6 +8,7 @@ import (
 	qu "qfw/util"
 	qu "qfw/util"
 	"regexp"
 	"regexp"
 	"strconv"
 	"strconv"
+	"strings"
 
 
 	. "gopkg.in/mgo.v2/bson"
 	. "gopkg.in/mgo.v2/bson"
 )
 )
@@ -274,3 +275,86 @@ func IsMarkInterfaceMap(t interface{}) []map[string]interface{} {
 	}
 	}
 	return p_list
 	return p_list
 }
 }
+
+// Patterns and keywords used by GetQualifications. The regular expressions are
+// compiled once at package initialisation instead of on every call.
+var (
+	// qualArabicHeadingRe matches a heading that follows a newline and starts
+	// with a single digit: optional whitespace, the digit, a separator,
+	// optional whitespace, Han characters and an optional colon.
+	qualArabicHeadingRe = regexp.MustCompile(`(\n(\s)*(\d)[、..](\s*)[\p{Han}]+[::]?)`)
+	// qualChineseHeadingRe matches a whole line that starts with an optional
+	// "★", a Chinese numeral from 一 to 十一 and a separator.
+	qualChineseHeadingRe = regexp.MustCompile(`(?m)^(★)?(一|二|三|四|五|六|七|八|九|十|十一)[、.](.+?)[\p{Han}]?[::]?$`)
+	// qualTitleRe matches a heading that names a bidder-like party followed,
+	// anywhere later in the heading, by a qualification/requirement word.
+	qualTitleRe = regexp.MustCompile(`(投标人|参选单位|投标商|供应商|申请人|参选人|应答人|承包人|应答方|报名人|报价人|投标单位|服务商|竞价方).*(资质|资格|要求|准入条件|条件).*`)
+	// qualTitleKeywords are substrings that also mark a qualification heading.
+	// "入围审核条价" is kept for backward compatibility; it looks like a typo
+	// of "入围审核条件", which is accepted as well.
+	qualTitleKeywords = []string{"合格的投标人", "资格要求", "入围审核条价", "入围审核条件", "报名要求"}
+)
+
+// GetQualifications returns the qualification-requirement section of text.
+//
+// The text is split into sections with ExtractSections, first by
+// Chinese-numeral headings and, only when that yields no section at all, by
+// Arabic-numeral headings. The content of the first section whose title
+// matches qualTitleRe or contains one of qualTitleKeywords is returned.
+// The result is "" when no section title qualifies.
+func GetQualifications(text string) (qualifications string) {
+	sections := ExtractSections(text, qualChineseHeadingRe)
+	if len(sections) == 0 {
+		sections = ExtractSections(text, qualArabicHeadingRe)
+	}
+
+	for _, section := range sections {
+		title, _ := section["title"].(string)
+		matched := qualTitleRe.MatchString(title)
+		// The keyword list is consulted only when the pattern did not match.
+		for i := 0; !matched && i < len(qualTitleKeywords); i++ {
+			matched = strings.Contains(title, qualTitleKeywords[i])
+		}
+		if matched {
+			content, _ := section["content"].(string)
+			return content
+		}
+	}
+	return ""
+}
+
+// ExtractSections splits text into titled sections, treating every match of
+// re as a section heading.
+//
+// Each returned map has two string entries: "title" is the matched heading
+// and "content" is the text between the end of that heading and the start of
+// the next one (or the end of text for the last heading). Both values have
+// surrounding whitespace trimmed. Text that precedes the first heading is not
+// returned. When re matches nothing the result is an empty, non-nil slice.
+func ExtractSections(text string, re *regexp.Regexp) []map[string]interface{} {
+	headings := re.FindAllStringIndex(text, -1)
+	out := make([]map[string]interface{}, 0, len(headings))
+	for n, loc := range headings {
+		// A section body ends where the next heading begins; the last body
+		// runs to the end of the text.
+		bodyEnd := len(text)
+		if n+1 < len(headings) {
+			bodyEnd = headings[n+1][0]
+		}
+		out = append(out, map[string]interface{}{
+			"title":   strings.TrimSpace(text[loc[0]:loc[1]]),
+			"content": strings.TrimSpace(text[loc[1]:bodyEnd]),
+		})
+	}
+	return out
+}
+
+// ContainSpecialWord reports whether key contains at least one entry of
+// words as a substring. It returns false when words is empty.
+func ContainSpecialWord(key string, words []string) bool {
+	for i := range words {
+		// One hit is enough, so stop at the first word found in key.
+		if strings.Index(key, words[i]) >= 0 {
+			return true
+		}
+	}
+	return false
+}
+
+// RemoveDuplicates returns the strings of input with repeated values dropped.
+// The first occurrence of each value is kept and the original order is
+// preserved. The result is always a non-nil slice, even for empty input.
+func RemoveDuplicates(input []string) []string {
+	unique := []string{}
+	visited := map[string]bool{}
+	for i := 0; i < len(input); i++ {
+		item := input[i]
+		if visited[item] {
+			continue
+		}
+		visited[item] = true
+		unique = append(unique, item)
+	}
+	return unique
+}