فهرست منبع

Merge branch 'dev3.4' of http://39.105.157.10:10080/qmx/jy-data-extract into dev3.4

* 'dev3.4' of http://39.105.157.10:10080/qmx/jy-data-extract:
  优化
Jianghan 5 سال پیش
والد
کامیت
793f49c6c2
5 فایلهای تغییر یافته به همراه 36 افزوده شده و 5 حذف شده
  1. 1 1
      src/jy/clear/cutspace.go
  2. 19 0
      src/jy/pretreated/analytable.go
  3. 9 2
      src/jy/pretreated/colonkv.go
  4. 2 2
      src/res/fieldscore.json
  5. 5 0
      src/res/formattext.json

+ 1 - 1
src/jy/clear/cutspace.go

@@ -23,7 +23,7 @@ func init() {
 	cutAllSpace, _ = regexp.Compile(`\s*`)
 	catSymbol, _ = regexp.Compile(`[]+`)
 	separateSymbol, _ = regexp.Compile("[\\s\u3000\u2003\u00a0\\n,,、/。|]")
-	placeReg, _ = regexp.Compile("^.*(公司|学(校)?|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室)$")
+	placeReg, _ = regexp.Compile("^.*(公司|学(校)?|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|科|部|队|联合(会|体)|工作室)$")
 	clearNum, _ = regexp.Compile("[\\d-]+")
 	endSymblo = regexp.MustCompile(`[/\\?+]$`)
 }

+ 19 - 0
src/jy/pretreated/analytable.go

@@ -2686,6 +2686,25 @@ func (tn *Table) TdContactFormat(contactFormat *u.ContactFormat, isSite bool, co
 				//					qutil.Debug(v.Key, v.Value)
 				//				}
 				if len(thisTdKvs) != 1 {
+					preTdIndex := td_index - 1
+					if preTdIndex >= 0 {
+						preTdVal := tr.TDs[td_index-1].Val
+						tdType := "" //前一个td中是否是采购、代理、中标
+						for k, v := range ContactType {
+							if v.MatchString(preTdVal) {
+								tdType = k
+								break
+							}
+						}
+						if tdType != "" {
+							for _, this := range thisTdKvs {
+								if str := ContactInfoVagueReg.FindString(this.Key); str != "" {
+									td.SortKV.AddKey(tdType+str, this.Value)
+								}
+							}
+						}
+					}
+
 					continue
 				}
 				//1.处理带括号的()[]【】采购单位,代理机构;2.识别采购单位联系人、联系电话、代理机构联系人、联系电话

+ 9 - 2
src/jy/pretreated/colonkv.go

@@ -19,7 +19,7 @@ var (
 	regReplKV      = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0]+?[))]?)[\\s\u3000\u2003\u00a0,。;;][((]?(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].+)[))]?")
 	regReplKV2     = regexp.MustCompile("(.+?[\u4e00-\u9fa5))][\\s\u3000\u2003\u00a0]*[::].*[((]?[^\r\n\\s\u3000\u2003\u00a0标段包]+?[))]?)([一二三四五六七八九十]+[、..][^一二三四五六七八九十]+?)")
 	regKV          = regexp.MustCompile("([\\p{Han}][^,,。、.;;\r\n]{1,30}?)[::](.*)")
-	filterK        = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0]+|^[一二三四五六七八九十0-91234567890]+")
+	filterK        = regexp.MustCompile("[((\\[【].*?[))\\]】]|<[^>].+?>|[①②③¥·;;‘“'’”,*<>((\\[【、))/\\]】??,。.\".\\s\u3000\u2003\u00a0\u001c]+|^[一二三四五六七八九十0-91234567890]+")
 	filterValue    = regexp.MustCompile("(^(无)$|.+%.*|^[\r\n\\s\u3000\u2003\u00a0]+$|^<.*>)")
 	regReplKey     = regexp.MustCompile("^(包(.+[A-Za-z\\d])?|本项目|推荐|的|本次)|([约为元万亿]+|[大小]写|人民币|[全]称|姓名)$")
 	buyerAndAgency = regexp.MustCompile("(代理(机构|人)|采购(人|单位))")
@@ -569,7 +569,14 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			continue
 		}
 		//qutil.Debug("ContactInfoMustReg.MatchString(k)+++", !ContactInfoMustReg.MatchString(k))
-		if !ContactInfoMustReg.MatchString(k) { //判断是否是电话、邮箱、地址等信息
+		matchMust := ContactInfoMustReg.MatchString(k)
+		matchHas := ContactInfoVagueReg.MatchString(k)
+		qutil.Debug("matchMust:", matchMust, "matchHas:", matchHas)
+		if !matchMust && matchHas {
+			k = ContactInfoVagueReg.FindString(k)
+		}
+		//qutil.Debug("kkkkkkkkkkkk", k)
+		if !matchMust && !matchHas { //判断是否是电话、邮箱、地址等信息
 			if DoubtReg.MatchString(k) { //匹配到投诉和监督 5bc9683ea5cb26b9b72b2302 5c35f3e8a5cb26b9b72dcdbd
 				startIndex = 0
 				//notmatchCount = 0

+ 2 - 2
src/res/fieldscore.json

@@ -329,7 +329,7 @@
                 "score": -10
             },  {
                 "describe": "包含负分",
-                "regstr": "(详(见|情)|公告|名称)",
+                "regstr": "(详(见|情)|公告|名称|制作)",
                 "score": -10
             }
         ],
@@ -421,7 +421,7 @@
                 "score": -10
             },  {
                 "describe": "包含负分",
-                "regstr": "(详(见|情)|公告)",
+                "regstr": "(详(见|情)|公告|管理员)",
                 "score": -10
             }
         ],

+ 5 - 0
src/res/formattext.json

@@ -135,6 +135,11 @@
 			"reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{0,8})(联系.{1,4})(和|及)(.{2,4})[::][\u3000\u2003\u00a0\\s]*([\u4e00-\u9fa5]{2,5})[::\\s\u3000\u2003\u00a0]*((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
             "separator": "${1}${2}:${5}\n${1}${4}:${6}",
             "desc": "采购人联系人和联系方式:雷蒙:13299985556 or 联系人及电话:  朱云鹏    13993240931"
+		},
+		{
+			"reg": "((招标|代理).{2,4})联系方式[::](.*)联系人[::](.+?)[\\s\u3000\u2003\u00a0]+联系方式[::]([\\d-转()()/、]+)",
+            "separator": "${1}联系人:${4} ${1}电话:${5}",
+            "desc": "采购项目联系方式:</td></tr><tr><td>        联系人:朱志强        联系方式:67897307"
 		},
 		{
 			"reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{0,8}?)(联系(方式|电话|人)和?)+[::]([^\\d::]{2,8}?)[((]?[\\s\u3000\u2003\u00a0]*((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",