maxiaoshan 5 年前
コミット
86c8ebb613
3 ファイル変更、30 行追加、25 行削除
  1. 2 2
      src/config.json
  2. 21 21
      src/jy/pretreated/division.go
  3. 7 2
      src/res/fieldscore.json

+ 2 - 2
src/config.json

@@ -2,7 +2,7 @@
     "port": "9090",
     "mgodb": "192.168.3.207:27092",
     "dbsize": 3,
-    "dbname": "extract_kf",
+    "dbname": "extract_v3",
     "redis": "qyk_redis=127.0.0.1:6379",
     "elasticsearch": "http://127.0.0.1:9800",
     "elasticsearch_index": "winner_enterprise_tmp",
@@ -30,7 +30,7 @@
     "brandgoods": false,
     "pricenumber":true,
     "udptaskid": "5cdd3025698414032c8322b1",
-    "udpport": "1484",
+    "udpport": "1483",
     "nextNode": [
         {
             "addr": "127.0.0.1",

+ 21 - 21
src/jy/pretreated/division.go

@@ -47,26 +47,26 @@ var (
 	regTrimSpace       = regexp.MustCompile("^[\u3000\u2003\u00a0\\s]+|[\u3000\u2003\u00a0\\s]+$")
 	regReplWrapSpace   = regexp.MustCompile("^[\r\n][\u3000\u2003\u00a0\\s]*|[\r\n][\u3000\u2003\u00a0\\s]*$")
 	regReplAllSymbol   = regexp.MustCompile("[(\\(<《【\\[{{〔)\\)>》】\\]}}〕,,;;::'\"“”。.\\??/+=\\-_——*&……\\^%$¥@#!!`~·]")
-	regFilterTitle   = regexp.MustCompile("[(\\(<《【\\[{{〔].+?[)\\)>》】\\]}}〕]")
-	regDivision      = regexp.MustCompile("[::]")
-	regSpliteSegment = regexp.MustCompile("[\r\n]")
-	regFilterNumber  = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
-	regSplit         = regexp.MustCompile("或|和|以?及|与|、|或")
-	regStartWrap     = regexp.MustCompile("^[\r\n]")
-	regEndWrap       = regexp.MustCompile("[\r\n]$")
-	regMoreWrap      = regexp.MustCompile("[\r\n]{2,}")
-	regStrWrap       = regexp.MustCompile("分包名称[::]")
-	regBZJWarap      = regexp.MustCompile("(保证金.*|每包[0-9]*元|标志|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]标室)")
-	regFJWarap       = regexp.MustCompile("[a-zA-Z0-9](包|标段)[公告附件]*.(pdf|PDF|docx|doc|DOCX|DOC)")
-	regAZWarap       = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包)")
-	replSerial       = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
-	moreColonReg     = regexp.MustCompile("[::]+")
-	regFilter        = regexp.MustCompile("等$")
-	pkgFilter        = regexp.MustCompile("第[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](子|合同|分|施工|监理)?(标段?|包|合同段|标包)|(子|合同|分|施工|监理)?(标|包)(段|号)?")
-	indexTile        = regexp.MustCompile("[0-9.]{2,3}[\\s\u4e00-\u9fa5]{2,8}[::]+") //小标题
-	indexTile2       = regexp.MustCompile("[\\s\u4e00-\u9fa5]{2,8}")
-	regReplAllSpace2 = regexp.MustCompile("[\u3000\u2003\u00a0\\s0-9.::、\\(\\)]+")
-	confusion        = map[string]string{
+	regFilterTitle     = regexp.MustCompile("[(\\(<《【\\[{{〔].+?[)\\)>》】\\]}}〕]")
+	regDivision        = regexp.MustCompile("[::]")
+	regSpliteSegment   = regexp.MustCompile("[\r\n]")
+	regFilterNumber    = regexp.MustCompile("^[\\d一二三四五六七八九十]+")
+	regSplit           = regexp.MustCompile("或|和|以?及|与|、|或")
+	regStartWrap       = regexp.MustCompile("^[\r\n]")
+	regEndWrap         = regexp.MustCompile("[\r\n]$")
+	regMoreWrap        = regexp.MustCompile("[\r\n]{2,}")
+	regStrWrap         = regexp.MustCompile("分包名称[::]")
+	regBZJWarap        = regexp.MustCompile("(保证金.*|每包[0-9]*元|标志|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]标室)")
+	regFJWarap         = regexp.MustCompile("[a-zA-Z0-9](包|标段)[公告附件]*.(pdf|PDF|docx|doc|DOCX|DOC)")
+	regAZWarap         = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包)")
+	replSerial         = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
+	moreColonReg       = regexp.MustCompile("[::]+")
+	regFilter          = regexp.MustCompile("等$")
+	pkgFilter          = regexp.MustCompile("第[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](子|合同|分|施工|监理)?(标段?|包|合同段|标包)|(子|合同|分|施工|监理)?(标|包)(段|号)+")
+	indexTile          = regexp.MustCompile("[0-9.]{2,3}[\\s\u4e00-\u9fa5]{2,8}[::]+") //小标题
+	indexTile2         = regexp.MustCompile("[\\s\u4e00-\u9fa5]{2,8}")
+	regReplAllSpace2   = regexp.MustCompile("[\u3000\u2003\u00a0\\s0-9.::、\\(\\)]+")
+	confusion          = map[string]string{
 		"参与": "canyu",
 	}
 	//查找分包之前,先对内容进行预处理
@@ -792,7 +792,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 				tagtitle = regReplAllSpace2.ReplaceAllString(tagtitle, "")
 				if tagtitle == "" {
 					tagtitle = title
-				} else  if strings.Contains(tagtitle, bv[0]) && title != "" {
+				} else if strings.Contains(tagtitle, bv[0]) && title != "" {
 					tagtitle = title
 				}
 				text = tagtitle + ":" + text

+ 7 - 2
src/res/fieldscore.json

@@ -162,8 +162,13 @@
         "negativewords": [
             {
                 "describe": "包含负分",
-                "regstr": "(标人|附件|委托|认证|代理|咨询|顾问|管理有限公司|管理顾问|招标失败|交易中心|不足|公告|变更|招标|废标|废止|流标|中标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
-                "score": -20
+                "regstr": "(标人|附件|委托|认证|代理|咨询|顾问|管理顾问|招标失败|交易中心|不足|公告|变更|招标|废标|废止|流标|中标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
+                "score": -15
+            },
+            {
+                "describe": "包含负分",
+                "regstr": "(代表)",
+                "score": -10
             },
             {
                 "describe": "包含负分不再展示",