Răsfoiți Sursa

备份--0322

zhengkun 3 ani în urmă
părinte
comite
22c3b774e4

+ 1 - 1
src/config.json

@@ -32,7 +32,7 @@
     "iscltlog": false,
     "brandgoods": false,
     "pricenumber":true,
-    "udptaskid": "60b493c2e138234cb4adb640",
+    "udptaskid": "612c80fcff8a32117625b289",
     "nextNode": [],
     "udpport": "6601",
     "esconfig": {

+ 13 - 1
src/jy/clear/tonumber.go

@@ -39,7 +39,7 @@ var kxjsReg *regexp.Regexp
 
 
 var unpkvBidamountReg =  regexp.MustCompile("^([Xx]\\+[1-9\\.]+元/每)")
-
+var regUnitMoneyClean =  regexp.MustCompile("^(.*单价[0-9.]+元[/][袋|块])[,,](含税总价[0-9.]+[万元]+)[.。]$")
 
 func init() {
 	regOperator, _ = regexp.Compile(`[*|+|)*)]`)
@@ -140,6 +140,11 @@ func ObjToMoney(data []interface{}, spidercode ...string) []interface{} {
 		(data)[0] = totmpstr
 	}
 
+	if regUnitMoneyClean.MatchString(totmpstr) {
+		totmpstr = regUnitMoneyClean.ReplaceAllString(totmpstr,"$2")
+		(data)[0] = totmpstr
+	}
+
 	//未含税总价1454400.00元,税率6%,含税总价1541664.00元
 	Percent:= regPercentMoney.FindAllString(totmpstr,-1)
 	for _,v:=range Percent{
@@ -309,7 +314,14 @@ func capitalMoney(data []interface{}) []interface{} {
 	end := false
 	str := fmt.Sprint(data[0])
 	//提取第一个大写信息
+
+	if strings.Contains(str,"壹") {
+		str = strings.ReplaceAll(str,"一","壹")
+	}
+
 	strmatch := numCapitals.FindAllStringSubmatch(str, -1)
+
+
 	if len(strmatch) > 0 {
 		str = strmatch[0][0]
 	}

+ 9 - 3
src/jy/extract/extract.go

@@ -369,6 +369,12 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	if isextFile {
 		file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
 	}
+
+
+
+
+
+
 	//正文小于200个字,有附件把附件内容加到正文
 	//tmpDeatil := detail
 	//tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
@@ -2118,7 +2124,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				}
 			}
 			//包预算,中标金额合并大于抽取就覆盖
-			if len(j.PackageInfo) >= 1 {
+			if len(j.PackageInfo) > 1 {
 				//包数大于1累加
 				for _, v := range j.PackageInfo {
 					if v["budget"] != nil {
@@ -2294,8 +2300,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			}
 		}
 
-		//添加字段来源
-		tmp["field_source"] = fieldSource
+		//添加字段来源 ~~ 临时注释
+		//tmp["field_source"] = fieldSource
 		//是否为不规则表格字段
 		if j.IsUnRulesTab {
 			tmp["is_UnRules_Tab"]= j.IsUnRulesTab

+ 9 - 0
src/jy/pretreated/analystep.go

@@ -27,6 +27,9 @@ var formattext3 = regexp.MustCompile("(同类项目业绩、|[1-9].[0-9]包段
 //过滤多余字符
 var formattext4 = regexp.MustCompile("(中标金额[::])设计费用[::][0-9.万元,,]+施工费用[::][0-9.万元,,]+合计[::]([¥〇0-9\\.人民币零点壹贰叁肆伍陆柒捌玖拾佰仟万亿元圆角分整]+)")
 
+//特殊影响分包候选人抽取
+var formattext5 = regexp.MustCompile("投标报价[::]包件1[::][0-9.万元]+[,,]包件2[::][0-9.万元]+[,,]投标总价([::]+)([0-9.万元]+)")
+var formattext6  = regexp.MustCompile("(投标报价[::][0-9.]+)\n([万元]+)")
 
 
 
@@ -47,6 +50,12 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 	con = formattext3.ReplaceAllString(con,"")
 	con = formattext4.ReplaceAllString(con,"\n${1}:${2}\n")
 
+
+	//特殊格式-影响分包候选人抽取-替换
+	con = formattext5.ReplaceAllString(con,"中标金额:${2}\n")
+	con = formattext6.ReplaceAllString(con,"$1$2")
+
+
 	con = formatText(con, "all")
 	job.Content = con
 	//计算表格占比,返回表格数组、占比

+ 33 - 2
src/jy/pretreated/analytable.go

@@ -240,6 +240,17 @@ var glRex *regexp.Regexp = regexp.MustCompile("(成交|中标|候选|排名|名
 var djReg *regexp.Regexp = regexp.MustCompile("^单价")
 var hxrRex *regexp.Regexp = regexp.MustCompile("((成交|中标|中选)?候选人[弟|第][1-5一二三四五]名|[弟|第][1-5一二三四五][名]?(成交|中标|中选)?候选人)")
 
// isRepeatArrString reports whether arr1 and arr2 contain the same strings
// in the same order.
//
// Slices of different lengths are never considered equal; checking the
// lengths first also guarantees that arr2 is never indexed out of range
// when it is shorter than arr1. Two empty (or nil) slices compare equal.
func isRepeatArrString(arr1, arr2 []string) bool {
	if len(arr1) != len(arr2) {
		return false
	}
	for i, s := range arr1 {
		// The first differing position settles the answer.
		if s != arr2[i] {
			return false
		}
	}
	return true
}
 
 //对解析后的表格的kv进行过滤
 func (table *Table) KVFilter(isSite bool, codeSite string) {
@@ -253,6 +264,7 @@ func (table *Table) KVFilter(isSite bool, codeSite string) {
 	as := NewSortMap()
 
 	//遍历table.sortkv,进行过滤处理,并放入标准化KV中,如果值是数组跳到下一步处理
+	pre_k := ""
 	for _, k := range table.SortKV.Keys {
 		//遍历所有key sort.kv
 		if glRex.MatchString(k) {
@@ -273,8 +285,23 @@ func (table *Table) KVFilter(isSite bool, codeSite string) {
 			}
 			MergeKvTags(table.StandKV, kvTags)
 		} else {
-			as.AddKey(k, v)
+			//同一行表格-相同值的标签 可能含义重复~影响分包抽取~需要过滤
+			if k=="最高限价(元)" && pre_k == "项目预算(元)"{
+				isE := false
+				if v_arr, ok := v.([]string); ok && len(v_arr)>1{
+					if pre_v_arr, ok := as.Map[pre_k].([]string); ok && len(pre_v_arr)==len(v_arr) && isRepeatArrString(v_arr,pre_v_arr) {
+						isE = true
+					}
+				}
+				if !isE {
+					as.AddKey(k, v)
+				}
+			}else {
+				as.AddKey(k, v)
+			}
 		}
+
+		pre_k = k //记录上一个k
 	}
 	//处理值是数组的kv   放入标准化kv中 standKV //处理table.SortKV.value为数组的情况
 	table.sortKVArr(as, isSite, codeSite)
@@ -2655,11 +2682,15 @@ var thbf = regexp.MustCompile("(?i)</?t(head|body|foot)>")
 
 //需要保留thead
 var saveThead = regexp.MustCompile("(?is)<thead>(.+?)</thead>")
-
 var clearpkg = regexp.MustCompile("(标示|标识)")
+var clearMoneyReg1 = regexp.MustCompile("(总成交金额:[0-9.]+)[\\s ]+([((]?万元[))]?)")
+
 
 func RepairCon(con string) string {
 	con = clearpkg.ReplaceAllString(con, "")
+
+
+	con = clearMoneyReg1.ReplaceAllString(con,"$1$2")
 	res := saveThead.FindAllStringSubmatch(con, 1)
 	th := ""
 	if len(res) == 1 && len(res[0]) == 2 {

+ 5 - 0
src/jy/pretreated/division.go

@@ -129,6 +129,9 @@ var (
 	//敏感词-影响分包-替换-分割
 	replaceSenstiveReg1 = regexp.MustCompile("([一二三四五六七八九十1-9][、]项目名称[::].*采购项目)([一二三四五六七八九十1-9][、]采购结果)")
 
+	//价格~单位换行  替换
+	packageReg50  =  regexp.MustCompile("(投标报价[::][0-9.]+)\n(万元)")
+
 )
 
 //分块
@@ -793,6 +796,8 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 	content = packageReg5.ReplaceAllString(content,"\n${1}\n中标单位:${4}\n")
 	content  = packageReg6.ReplaceAllString(content,"\n$2\n中标单位:$1\n$4\n中标单位:$3")
 
+	//替换换行金额
+	content  = packageReg50.ReplaceAllString(content,"$1$2")
 
 
 

+ 17 - 6
src/jy/pretreated/winnerorder.go

@@ -33,7 +33,7 @@ var (
 	winnerReg1        = regexp.MustCompile("(^|[^为])(【?(推荐)?第[一二三四五六七八九十1-9]+(合格|名|包|标段)?】?([候|侯]选)?(入围|备选|成交|中[标|选])人?([((]成交[))])?([候|侯]选|排序)?(人(单位)?|供[应货]商|单位|机构)(名称)?为?)($|[^,;;。,])")
 	winnerReg2        = regexp.MustCompile("(排名第[一二三四五六七八九十1-9]+|[第|弟][一二三四五六七八九十1-9]+(中标|中选)?[候|侯]选人|中标候选人排名[::]\\d)")
 
-	winnerReg3     = regexp.MustCompile("((中标候选人)?第[一二三四五六七八九十1-9]+名|(中标候选人)[1-9])")
+	winnerReg3     = regexp.MustCompile("((中标候选人)?第[一二三四五六七八九十1-9]+[\\s]?名|(中标候选人)[1-9])")
 	winnerReg4     = regexp.MustCompile("((确认|推荐|评审|排[名|序])[为::]+|(由高到低排序前.名|公示下列内容|(确定|推荐)的?中[标|选]候选人|\n中[标|选]候选.{1,3}\\s*\n|\n(中[标|选]候选.{1,3}[::\u3000\u2003\u00a0\\s]|成交候选供应商)|(排[名|序]|公[示|告]|具体|推荐|结果(公示)?|中[标|选]候选人.{0,2})如下|[一二三四五六七八九十\\d]+、(中[标|选]候选[^\n::]{1,8}|.{0,8}(成交|结果)信息|成交[^\n::]{2,8}))[为::]?)")
 	winnerReg5     = regexp.MustCompile("([^,;;。,、\n投标人]+?)(为?)(第[一二三四五六七八九十1-9]+(成交|中标)?([候|侯]选(人|供应商|单位|机构)|名)|排名第[一二三四五六七八九十1-9]+)([,;;。,、]|\\s+\n)")
 	winnerReg6     = regexp.MustCompile("(^(排名)?(第[一二三四五六七八九十1-9]+[名中标成交备选候人单位供应商]*|^[中标成交备选候人单位供应商]*[第|弟][一二三四五六七八九十1-9]名))")
@@ -55,8 +55,15 @@ var (
     winnerReg19 = regexp.MustCompile("([弟|第][1-9一二三四五]名(中标候选人)?)[::]?([\u4E00-\u9FA5]{4,20}公司)[((]?[,,]?(报价|投标报价)[::]?([0-9\\.\\s万元]+)")
 
 
+	//特殊格式转化
+	winnerReg50 = regexp.MustCompile("(第[一二三]名)\n单位名称\n(.{4,20}公司)[\n\\s]+投标报价\n大写\n([\u4E00-\u9FA5]+)\n")
+	winnerReg51 = regexp.MustCompile("(中标候选人第[1-9一二三四五])[\\s](名)")
+	winnerReg52 = regexp.MustCompile("(中标金额[::][0-9.]+)\n([万元]+)")
 
-	//格式化中标金额换行
+
+
+
+//格式化中标金额换行
     winnerReg100    = regexp.MustCompile("中标金额:[\\s]+([0-9\\.万元]+)")
 
 
@@ -113,10 +120,6 @@ func (wo *WinnerOrderEntity) Find(text string, flag bool, from int, isSite bool,
 	text = winnerReg24.ReplaceAllString(text,"\n${4}:${1}\n")
 
 
-
-
-
-
 	text = winnerReg8.ReplaceAllString(text,"\n${1}:${3}\n中标金额:${5}\n")
 	text = winnerReg9.ReplaceAllString(text,"\n${1}:${3}\n中标金额:${5}\n")
 	text = winnerReg10.ReplaceAllString(text,"\n${1}:${3}\n中标金额:${2}\n")
@@ -139,6 +142,14 @@ func (wo *WinnerOrderEntity) Find(text string, flag bool, from int, isSite bool,
 	}
 
 
+	if winnerReg50.MatchString(text) && strings.Contains(text,"中标候选人公示") {
+		text = winnerReg50.ReplaceAllString(text,"\n中标候选人${1}\n中标单位:${2}\n中标金额:${3}\n")
+	}
+
+	text = winnerReg51.ReplaceAllString(text,"${1}${2}\n中标金额:${3}\n")
+	text = winnerReg52.ReplaceAllString(text,"$1$2")
+
+
 
 	text = clearSpace1.ReplaceAllString(text, "") //清理(1)	单位名称:成都维诺信科技有限公司-->单位名称:成都维诺信科技有限公司
 	if strings.TrimSpace(text) == "" {

+ 12 - 5
src/main.go

@@ -68,7 +68,7 @@ func main() {
 	}()
 
 	//临时调试
-	//testMain()
+	testMain()
 
 	lock := make(chan bool)
 	<-lock
@@ -76,11 +76,18 @@ func main() {
 
 //验证规则
 func testMain()  {
-	text :=`第一中标候选人:中国铁路设计集团有限公司,上海市城市建设设计研究总院(集团)有限公司`
 
-	var winnerNoSplitReg   = regexp.MustCompile("^(第[一二三四五六七八九十]中[选|标]?候选人)[::]([\u4E00-\u9FA5]{4,20}([((]集团[))])?(有限公司|公司))[,,]([\u4E00-\u9FA5]{4,20}([((]集团[))])?(有限公司|公司))$")
-	if winnerNoSplitReg.MatchString(text) {
-		text = winnerNoSplitReg.ReplaceAllString(text,"\n${4}:${1}\n")
+
+
+
+	text :=`中标候选人第1名:商丘大虞城市发展工程有限公司,投标报价:22000.417072
+万元,质量:/,工期/交货期/服务期:120天;
+      
+中标候选人第2名:河南国埔建筑工程有限公司,投标报价:22007.217056万元
+,`
+	var winnerReg12 = regexp.MustCompile("(中[标|选]候选人[弟|第][一二三四五六七八九十0-9]名|[弟|第][一二三四五六七八九十0-9](中标)?候选人)[::\\s ]+?(.*)[ \\s,,]+?(投标报价|投标总报价|金额)[::]?([0-9\\.\\s万元]+)")
+	if winnerReg12.MatchString(text) {
+		text = winnerReg12.ReplaceAllString(text,"\n${1}:${3}\n中标金额:${5}\n")
 		log.Debug(text)
 	}else {
 		log.Debug("不匹配")