1 year ago · db186e72bc
--- a/src/jy/admin/rulecheck.go
+++ b/src/jy/admin/rulecheck.go
@@ -313,6 +313,35 @@ func checkCoreReg(field, content, ruleText string) map[string]string {
 
															 					}
														
 
															 				}
														
 
															 			}
														
 
															+		} else {
														
 
															+			//添加 "<<"符号，返回所有匹配结果
														
 
															+			regs := strings.Split(ptmp[0], "<<")
														
 
															+			if len(regs) == 2 {
														
 
															+				parses := regs[1] //定义的排除关键词，含有这些关键词，则视为作废
														
 
															+				pattern := regs[0]
														
 
															+				reg := regexp.MustCompile(pattern)
														
 
															+				results := reg.FindAllString(content, -1)
														
 
															+				if len(results) > 0 {
														
 
															+					results = ju.RemoveDuplicates(results)
														
 
															+					result := ""
														
 
															+					//有排除词时需要过滤
														
 
															+					if len(parses) > 0 {
														
 
															+						var res []string
														
 
															+						for _, v := range results {
														
 
															+							reg2 := regexp.MustCompile(parses)
														
 
															+							res2 := reg2.FindAllString(v, -1)
														
 
															+							if len(res2) == 0 {
														
 
															+								res = append(res, v)
														
 
															+							}
														
 
															+						}
														
 
															+						result = strings.Join(res, "\n")
														
 
															+					} else {
														
 
															+						result = strings.Join(results, "\n")
														
 
															+					}
														
 
															+
														
 
															+					rep[field] = result
														
 
															+				}
														
 
															+			}
														
 
															 		}
														
 
															 	}, func(err interface{}) {
														
 
															 		rep["err"] = fmt.Sprint(err)
														
--- a/src/jy/extract/extractcheck.go
+++ b/src/jy/extract/extractcheck.go
@@ -256,5 +256,25 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
 
															 		}
														
 
															 	}
														
 
															+	//企业资质检验,不含有资质时删除
														
 
															+	if enterprise_qualification, ok := tmp["enterprise_qualification"]; ok {
														
 
															+		special := `(甲级|乙级|丙级|丁级|一级|二级|三级|叁级|壹级|贰级|四级|五级|工程设计|市政公用工程|铁路工程|建筑工程|公路工程|人防工程|工程勘察|岩土工程|水文地质勘察|工程测量|工程钻探|电力工程|大地测量|消防设施工程|特种工程|房屋建筑工程|信息技术服务|信息系统安全|机电工程|建筑机电安装工程|消防设施工程|水利水电工程|水利工程|地基基础工程)`
														
 
															+		reg := regexp.MustCompile(special)
														
 
															+		var res = make([]string, 0)
														
 
															+		datas := strings.Split(qu.ObjToString(enterprise_qualification), "\n")
														
 
															+		for _, data := range datas {
														
 
															+			results := reg.FindAllString(data, -1)
														
 
															+			if len(results) > 0 {
														
 
															+				res = append(res, data)
														
 
															+			}
														
 
															+		}
														
 
															+
														
 
															+		if len(res) == 0 {
														
 
															+			delete(tmp, "enterprise_qualification")
														
 
															+		} else {
														
 
															+			tmp["enterprise_qualification"] = strings.Join(res, "\n")
														
 
															+		}
														
 
															+	}
														
 
															+
														
 
															 	return tmp
														
 
															 }
														
--- a/src/jy/extract/extractrule.go
+++ b/src/jy/extract/extractrule.go
@@ -172,6 +172,30 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
 
															 			if len(extinfo) > 0 {
														
 
															 				AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
														
 
															 			}
														
 
															+		} else if in.Field == "enterprise_qualification" || in.Field == "personnel_qualification" || in.Field == "performance_qualification" || in.Field == "enterprise_credit" {
														
 
															+			hasResult := false
														
 
															+			//1.通过文中的资质要求抽取四个资质
														
 
															+			qualifications := ju.GetQualifications(pretreated.HtmlToText(qu.ObjToString(doc[extfrom])))
														
 
															+			if qualifications != "" {
														
 
															+				extinfo := extRegCoreToResult(extfrom, qualifications, &map[string]string{}, j, in, isSite, "")
														
 
															+				if len(extinfo) > 0 {
														
 
															+					hasResult = true
														
 
															+					AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
														
 
															+				}
														
 
															+			}
														
 
															+			//2.通过抽取资质要求段落匹配不到时，匹配全文或者附件内容，
														
 
															+			if !hasResult {
														
 
															+				content := ""
														
 
															+				if j.IsFile {
														
 
															+					content = j.Content
														
 
															+				} else {
														
 
															+					content = doc["detail"].(string)
														
 
															+				}
														
 
															+				extinfo := extRegCoreToResult(extfrom, content, &map[string]string{}, j, in, isSite, "")
														
 
															+				if len(extinfo) > 0 {
														
 
															+					AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
														
 
															+				}
														
 
															+			}
														
 
															 		} else {
														
 
															 			for _, v := range j.Block {
														
 
															 				btag := make(map[string]string)
														
@@ -651,14 +675,43 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job,
 
															 			}
														
 
															 		}
														
 
															 	} else {
														
 
															-		pos := vre.RegCore.Reg.FindStringIndex(text)
														
 
															 		val := ""
														
 
															-		if len(pos) == 2 {
														
 
															-			text = text[pos[1]:]
														
 
															-			rs := regexp.MustCompile("[^\r\n\t]+")
														
 
															-			tmp := rs.FindAllString(text, -1)
														
 
															-			if len(tmp) > 0 {
														
 
															-				val = tmp[0]
														
 
															+		//正则表达式含有"<<",表示需要所有匹配数据
														
 
															+		regs := strings.Split(vre.RuleText, "<<")
														
 
															+		if len(regs) == 2 {
														
 
															+			pattern := regs[0]
														
 
															+			parses := regs[1] //定义的排除关键词，含有这些关键词，则视为作废
														
 
															+			reg := regexp.MustCompile(pattern)
														
 
															+			results := reg.FindAllString(text, -1)
														
 
															+			if len(results) > 0 {
														
 
															+				results = ju.RemoveDuplicates(results)
														
 
															+				result := ""
														
 
															+				//有排除词时需要过滤
														
 
															+				if len(parses) > 0 {
														
 
															+					var res []string
														
 
															+					for _, v := range results {
														
 
															+						reg2 := regexp.MustCompile(parses)
														
 
															+						res2 := reg2.FindAllString(v, -1)
														
 
															+						if len(res2) == 0 {
														
 
															+							res = append(res, v)
														
 
															+						}
														
 
															+					}
														
 
															+					result = strings.Join(res, "\n")
														
 
															+				} else {
														
 
															+					result = strings.Join(results, "\n")
														
 
															+				}
														
 
															+
														
 
															+				val = result
														
 
															+			}
														
 
															+		} else {
														
 
															+			pos := vre.RegCore.Reg.FindStringIndex(text)
														
 
															+			if len(pos) == 2 {
														
 
															+				text = text[pos[1]:]
														
 
															+				rs := regexp.MustCompile("[^\r\n\t]+")
														
 
															+				tmp := rs.FindAllString(text, -1)
														
 
															+				if len(tmp) > 0 {
														
 
															+					val = tmp[0]
														
 
															+				}
														
 
															 			}
														
 
															 		}
														
 
															 		if val != "" {
														
--- a/src/jy/extract/extractsave.go
+++ b/src/jy/extract/extractsave.go
@@ -399,6 +399,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 
															 		e.inscribeRecognize(&tmp, *j.Data)
														
 
															 		//落款识别指定特殊采购单位
														
 
															 		e.AimAtRecognizeBuyer(&tmp, *j.Data)
														
 
															+		//根据正文获取资质要求
														
 
															+		e.getQualifications(&tmp, *j.Data)
														
 
															 		//城市抽取
														
 
															 		if e.IsExtractCity {
														
 
															 			//e.NewExtractCity(j, &tmp) //旧版
														
--- a/src/jy/extract/extraxtmethod.go
+++ b/src/jy/extract/extraxtmethod.go
@@ -216,7 +216,21 @@ func isUsedMultiPackage(pkg map[string]map[string]interface{}) bool {
 
															 	return false
														
 
															 }
														
 
															-// 落款识别~采购单位
														
 
															+//getQualifications 添加所有资质新字段
														
 
															+func (e *ExtractTask) getQualifications(tmp *map[string]interface{}, j_data map[string]interface{}) {
														
 
															+	/**
														
 
															+	qualifications 资质要求
														
 
															+	*/
														
 
															+	detail := qu.ObjToString(j_data["detail"])
														
 
															+	new_detail := pretreated.HtmlToText(detail)
														
 
															+
														
 
															+	qualifications := ju.GetQualifications(new_detail)
														
 
															+	if qualifications != "" {
														
 
															+		(*tmp)["qualifications"] = qualifications
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+//落款识别~采购单位
														
 
															 func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}) {
														
 
															 	//落款实体
														
 
															 	if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
														
--- a/src/jy/pretreated/analycore.go
+++ b/src/jy/pretreated/analycore.go
@@ -62,7 +62,7 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}, isSite bool,
 
															 	if len(res) > 0 {
														
 
															 		for _, t1 := range res {
														
 
															 			//降低冒号值的权重-不适合日期格式的数据
														
 
															-			if MhSpilt.MatchString(v1) && !(UnTimeSpiltKey.MatchString(t1.Value) && UnTimeSpiltValue.MatchString(v1)) {
														
 
															+			if MhSpilt.MatchString(v1) && !(UnTimeSpiltKey.MatchString(t1.Value) && UnTimeSpiltValue.MatchString(v1)) && t1.Value != "企业信用" {
														
 
															 				t1.Weight -= 50
														
 
															 			}
														
 
															 			if winnerOrderAndBidResult.MatchString(tabletag) && t1.Value == "采购单位联系人" { //处理table中项目负责人
														
--- a/src/jy/util/util.go
+++ b/src/jy/util/util.go
@@ -8,6 +8,7 @@ import (
 
															 	qu "qfw/util"
														
 
															 	"regexp"
														
 
															 	"strconv"
														
 
															+	"strings"
														
 
															 	. "gopkg.in/mgo.v2/bson"
														
 
															 )
														
@@ -274,3 +275,86 @@ func IsMarkInterfaceMap(t interface{}) []map[string]interface{} {
 
															 	}
														
 
															 	return p_list
														
 
															 }
														
 
															+
														
 
															+//GetQualifications 从正文中获取气质要求
														
 
															+func GetQualifications(text string) (qualifications string) {
														
 
															+	re1 := regexp.MustCompile(`(\n(\s)*(\d)[、.．](\s*)[\p{Han}]+[:：]?)`)
														
 
															+	re2 := regexp.MustCompile(`(?m)^(★)?(一|二|三|四|五|六|七|八|九|十|十一)[、.](.+?)[\p{Han}]?[:：]?$`)
														
 
															+	match := regexp.MustCompile(`(投标人|参选单位|投标商|供应商|申请人|参选人|应答人|承包人|应答方|报名人|报价人|投标单位|服务商|竞价方).*(资质|资格|要求|准入条件|条件).*`)
														
 
															+	matchWords := []string{"合格的投标人", "资格要求", "入围审核条价", "报名要求"}
														
 
															+
														
 
															+	var sections = make([]map[string]interface{}, 0)
														
 
															+	sections = ExtractSections(text, re2)
														
 
															+	if len(sections) > 0 {
														
 
															+
														
 
															+	} else {
														
 
															+		sections = ExtractSections(text, re1)
														
 
															+	}
														
 
															+
														
 
															+	for _, section := range sections {
														
 
															+		matches := match.FindStringSubmatch(section["title"].(string))
														
 
															+		if len(matches) > 1 {
														
 
															+			return section["content"].(string)
														
 
															+		} else {
														
 
															+			for _, mword := range matchWords {
														
 
															+				if strings.Contains(section["title"].(string), mword) {
														
 
															+					return section["content"].(string)
														
 
															+				}
														
 
															+			}
														
 
															+		}
														
 
															+	}
														
 
															+	return
														
 
															+}
														
 
															+
														
 
															// ExtractSections splits text into titled sections using the matches of re
															// as headings.
															//
															// Every match of re becomes one section: the matched text is its title and
															// the text between the end of that match and the start of the next match
															// (or the end of text for the last match) is its content. Both values are
															// passed through strings.TrimSpace. Text before the first match is not
															// returned. The result holds one map per match, in match order, with the
															// keys "title" and "content"; it is empty but non-nil when re matches
															// nothing.
															func ExtractSections(text string, re *regexp.Regexp) []map[string]interface{} {
																headings := re.FindAllStringIndex(text, -1)
																out := make([]map[string]interface{}, 0, len(headings))

																for i, heading := range headings {
																	// A section ends where the next heading begins, or at end of text.
																	next := len(text)
																	if i+1 < len(headings) {
																		next = headings[i+1][0]
																	}
																	out = append(out, map[string]interface{}{
																		"title":   strings.TrimSpace(text[heading[0]:heading[1]]),
																		"content": strings.TrimSpace(text[heading[1]:next]),
																	})
																}
																return out
															}
														
 
															+
														
 
															// ContainSpecialWord reports whether key contains at least one entry of
															// words as a substring. It returns false when words is empty; an empty
															// string inside words matches any key.
															func ContainSpecialWord(key string, words []string) bool {
																for i := range words {
																	if strings.Index(key, words[i]) >= 0 {
																		return true
																	}
																}
																return false
															}
														
 
															+
														
 
															// RemoveDuplicates returns the distinct strings of input, keeping the
															// position of each string's first occurrence. The result is always a
															// non-nil slice, including when input is nil or empty.
															func RemoveDuplicates(input []string) []string {
																unique := []string{}
																visited := map[string]struct{}{}

																for i := 0; i < len(input); i++ {
																	item := input[i]
																	if _, dup := visited[item]; dup {
																		continue
																	}
																	visited[item] = struct{}{}
																	unique = append(unique, item)
																}
																return unique
															}