Selaa lähdekoodia

中标单位联系人、电话优化

maxiaoshan 5 vuotta sitten
vanhempi
commit
2bc20ffae7

+ 3 - 3
src/config.json

@@ -3,20 +3,20 @@
     "mgodb": "192.168.3.207:27092",
     "dbsize": 10,
     "dbname": "extract_kf",
-    "redis": "buyer=127.0.0.1:6379,winner=127.0.0.1:6379,agency=127.0.0.1:6379",
+    "redis": "buyer=192.168.3.207:1679,winner=192.168.3.207:1679,agency=192.168.3.207:1679",
     "elasticsearch": "http://127.0.0.1:9200",
     "elasticsearch_index": "extract_kf",
     "elasticsearch_type": "enterprise_qyxy",
     "elasticPoolSize": 30,
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
-    "saveresult": false,
+    "saveresult": true,
     "qualityaudit": false,
     "saveblock": false,
     "filelength": 100000,
     "iscltlog": false,
     "brandgoods": false,
-    "udptaskid": "5e103206234ddc34b406c5d1",
+    "udptaskid": "5cdd3025698414032c8322b1",
     "udpport": "1484",
     "nextNode": [
         {

+ 1 - 1
src/jy/clear/cutspace.go

@@ -23,7 +23,7 @@ func init() {
 	catSymbol, _ = regexp.Compile(`[]+`)
 	separateSymbol, _ = regexp.Compile("[\\s\u3000\u2003\u00a0\\n,,、/。|]")
 	placeReg, _ = regexp.Compile("^.*(公司|学(校)?|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室)$")
-	clearNum, _ = regexp.Compile("\\d+")
+	clearNum, _ = regexp.Compile("[\\d-]+")
 }
 
 var LableStr = "&?(amp|nbsp|#8266);?|(<).*?(>?)"

+ 1 - 0
src/jy/extract/extract.go

@@ -2021,4 +2021,5 @@ func resetWinnerorder(j *ju.Job) {
 		}
 	}
 	j.Result["bidamount"] = bidamounts
+
 }

+ 21 - 6
src/jy/pretreated/analykv.go

@@ -2,6 +2,7 @@ package pretreated
 
 import (
 	u "jy/util"
+	//qu "qfw/util"
 	"regexp"
 	"strings"
 )
@@ -143,6 +144,7 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 		"中标单位": map[string]bool{},
 		"采购单位": map[string]bool{},
 	}
+	doubtMap := map[int]bool{}
 	m = NewSortMap()
 	strs := [][]string{}
 	s1 := []string{}
@@ -186,6 +188,9 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 		pos1, pos2 := -1, -1
 		bkh := false
 		skh := ""
+		if from == 1 && DoubtReg.MatchString(str1) {
+			doubtMap[k0] = true
+		}
 		if !Key.MatchString(str1) { //此行没有冒号
 			if k0 > 0 {
 				tm1 := strs[k0-1]
@@ -200,7 +205,7 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 							k0++
 						}
 					}
-					keydetail(k, v, m, tag, k0, strs, matchMap, from)
+					keydetail(k, v, m, tag, k0, strs, matchMap, from, doubtMap)
 				}
 			}
 			LastStr = ""
@@ -304,7 +309,7 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 						if pos1 > -1 && pos2 > pos1 {
 							v = strings.Join(s1[pos1:pos2], "")
 							flag = 0
-							keydetail(k, v, m, tag, k0, strs, matchMap, from)
+							keydetail(k, v, m, tag, k0, strs, matchMap, from, doubtMap)
 						} else {
 							//u.Debug(k, pos1, pos2)
 							flag = 0
@@ -320,7 +325,7 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 				}
 				if v != "" {
 					flag = 0
-					keydetail(k, v, m, tag, k0, strs, matchMap, from)
+					keydetail(k, v, m, tag, k0, strs, matchMap, from, doubtMap)
 				}
 				//u.Debug(k, v)
 			}
@@ -332,13 +337,22 @@ func FindKv(con, tag string, from int) (m *SortMap) {
 	return
 }
 
-func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, matchMap map[string]map[string]bool, from int) {
+func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, matchMap map[string]map[string]bool, from int, doubtMap map[int]bool) {
 	if regexp.MustCompile("^[0-9]+[、]+$").MatchString(v) {
 		return
 	}
 	k = space.ReplaceAllString(k, "")
 	if len([]rune(k)) > 1 {
 		if len([]rune(k)) < 5 && strings.HasPrefix(k, "联系") || ContactInfoMustReg.MatchString(k) {
+			/*
+				5ded053fe9d1f601e4c9e3ee
+				中标人:XXXXXXXXXXXXXXXXXX
+				相关竞价人对成交结果有异议的,可自本公告发布之日起三日内书面提出。
+				联系方式:卢明珠 0871-66136373
+			*/
+			if doubtMap[pos-1] { //当识别到中标、采购、代理标签后,对其后的联系人、电话等信息判断是否属于该标签
+				goto L
+			}
 			num := 0
 			bf := false
 			for i := len(m.Keys) - 1; i > -1; i-- {
@@ -375,7 +389,7 @@ func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, ma
 						if from == 1 && ContactType["代理机构"].MatchString(str) && !IsContactKvHandle(k, matchMap["代理机构"]) {
 							matchMap["代理机构"][k] = true
 							k = "代理机构" + k
-						} else if filter_zbdw_ky.MatchString(str) && !IsContactKvHandle(k, matchMap["中标单位"]) {
+						} else if from == 1 && ContactType["中标单位"].MatchString(str) && !IsContactKvHandle(k, matchMap["中标单位"]) {
 							matchMap["中标单位"][k] = true
 							k = "中标单位" + k
 						} else if from == 1 && ContactType["采购单位"].MatchString(str) && !IsContactKvHandle(k, matchMap["采购单位"]) {
@@ -407,7 +421,8 @@ func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, ma
 			}
 
 		}
-		//u.Debug(k, v)
+	L:
+		//qu.Debug(k, v)
 		if m.Map[k] == nil {
 			m.AddKey(k, v)
 		} else {

+ 6 - 4
src/jy/pretreated/analytable.go

@@ -78,6 +78,8 @@ var (
 	//中标单位的处理
 	//包含以下字眼的Key标准化
 	filter_zbdw_ky = regexp.MustCompile("(中标|成交|拦标|合同|选中|投标|拟|预|最终)[\\p{Han}、]{0,6}(供[应货]商|企业|单位|人|机构)(名称)?.{0,4}$")
+	//识别中标单位相关信息
+	filter_zbdw_info = regexp.MustCompile("(中标|成交|中选|供(货|应))[^候选]{0,}")
 	//简单判断
 	filter_zbdw_jd = regexp.MustCompile("(投标|成交|中标|合同)(供应商|单位|人|名称).{0,4}$")
 	//且不包含以下字眼
@@ -101,7 +103,6 @@ var (
 		"代理机构": regexp.MustCompile("(代理|受托|集中采购).{0,2}(人|方|单位|公司|机构)|招标机构|采购代理"),
 		"中标单位": regexp.MustCompile("^((拟(定)?|预|最终|唯一)?(中标|成交|中选|供(货|应)))[^候选]{0,2}(人|方|单位|公司|(服务|供应)?商|企业)"),
 	}
-	OtherContactType            = regexp.MustCompile("(投诉|质疑|监督|交易中心|公示期(限)?).{0,4}(联系方式)?")
 	ContactBuyerPersonFilterReg = regexp.MustCompile("(管理局)$")
 	MultipleValueSplitReg       = regexp.MustCompile("[,,、\\s\u3000\u2003\u00a0]")
 	BuyerContacts               = []string{"采购单位联系人", "采购单位联系电话", "采购单位联系地址"}
@@ -113,6 +114,7 @@ var (
 	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
 	winnerOrderAndBidResult     = regexp.MustCompile("((中标)?候选人|(中标|评标)结果)")
 	WinnerOrderStr              = regexp.MustCompile(`(集团|公司|学校|中心|家具城|门诊|\[大中小\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行)$`)
+	DoubtReg                    = regexp.MustCompile("((我|交易)中心|有(疑问|质疑|异议)|(书面)?提出|不再受理|投诉|质疑|监督|交易中心|公示期(限)?)")
 )
 
 //在解析时,判断表格元素是否隐藏
@@ -1568,7 +1570,7 @@ func (table *Table) FindKV(isSite bool, codeSite string) {
 			}
 		}
 	}
-	//qutil.Debug("FindKV", table.SortKV.Map)
+	//qutil.Debug("Table-FindKV", table.SortKV.Map)
 }
 
 //初始化组装纵向数据
@@ -2117,10 +2119,10 @@ func (tn *Table) manyPackageProcessByIndex(index []string, standIndex_pos []int,
 					continue
 				}
 				if !(len(kvTags) > 0 && regexp.MustCompile("^(项目|开标|采购单位|招标机构)").MatchString(kvTag_k)) {
-					if tn.SortKV.Map[k1] != nil{
+					if tn.SortKV.Map[k1] != nil {
 						tn.SortKV.RemoveKey(k1)
 						tn.assemblePackage(k1, val, index[0], isSite, codeSite)
-					} 
+					}
 					//log.Println("remove", k1, val)
 				}
 			}

+ 22 - 10
src/jy/pretreated/colonkv.go

@@ -123,7 +123,6 @@ func (ce *ColonkvEntity) getColonKv(con, title string, from int) []*Kv {
 					val = v[3]
 				}
 				//Debug("KV-key", key, val)
-				//Debug("KV-key", key, val)
 				//地址、联系人可能会重复 单位、代理机构的\时间、地点
 				if strings.TrimSpace(key) != "" {
 					prevLine, nextLine := "", ""
@@ -382,21 +381,34 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 		//根据采购单位分析
 		isContinue := false
 		k := FilterContactKey(kv.Key)
-		//qutil.Debug(kv.Key, k)
+		//qutil.Debug(k, "_________", v, kv.PrevLine)
 		k_length := len([]rune(k))
 		//3.4新增winnerperson和winnertel抽取
 		if len(indexMap) == 1 && indexMap[1] == "中标单位" {
-			if !ContactInfoVagueReg.MatchString(k) { // 5c7f61dea5cb26b9b7d7bbee
+			//qutil.Debug(k, kv.PrevLine, kv.NextLine, ContactInfoVagueReg.MatchString(k))
+			if ContactInfoVagueReg.MatchString(k) { //匹配到联系人、电话、地址
+				/*
+					5ded053fe9d1f601e4c9e3ee
+
+					相关竞价人对成交结果有异议的,可自本公告发布之日起三日内书面提出。
+					联系方式:卢明珠 0871-66136373
+				*/
+				//qutil.Debug(DoubtReg.MatchString(kv.PrevLine))
+				if DoubtReg.MatchString(kv.PrevLine) {
+					keyNotMatchContactInfoCount++
+				}
+			} else if !filter_zbdw_info.MatchString(k) { //没有匹配到联系人、电话、地址和中标单位的其他信息 //例如:成交价格
 				keyNotMatchContactInfoCount++
 			}
-			//qutil.Debug(k, "==========keyNotMatchContactInfoCount+++", keyNotMatchContactInfoCount)
-			if keyNotMatchContactInfoCount >= 1 {
+			//qutil.Debug(k, "==========keyNotMatchContactInfoCount+++", keyNotMatchContactInfoCount, indexMap)
+			if keyNotMatchContactInfoCount >= 1 { //匹配到中标单位标签后,紧跟地址、电话、联系人
 				indexMap = map[int]string{}
 				startIndex = 0
+				notmatchCount = 0
 				keyNotMatchContactInfoCount = 0
 			}
 		}
-		//qutil.Debug(kv.Key, "++++++++++", kv.Value, buyers != nil, ascFind, isContinue)
+		//qutil.Debug(kv.Key, "++++++++++", kv.Value, buyers != nil, ascFind, isContinue, indexMap)
 		if buyers != nil {
 			for _, buyer := range buyers {
 				if buyer == "" {
@@ -558,7 +570,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 		}
 		//qutil.Debug("ContactInfoMustReg.MatchString(k)+++", !ContactInfoMustReg.MatchString(k))
 		if !ContactInfoMustReg.MatchString(k) { //判断是否是电话、邮箱、地址等信息
-			if OtherContactType.MatchString(k) { //匹配到投诉和监督 5bc9683ea5cb26b9b72b2302 5c35f3e8a5cb26b9b72dcdbd
+			if DoubtReg.MatchString(k) { //匹配到投诉和监督 5bc9683ea5cb26b9b72b2302 5c35f3e8a5cb26b9b72dcdbd
 				startIndex = 0
 				//notmatchCount = 0
 				indexMap = map[int]string{}
@@ -683,7 +695,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	}
 	//	if buyers == nil {
 	//	for _, kv := range *kvs {
-	//		Debug("bbbbbbbbbb", kv.Key, kv.Value)
+	//		qutil.Debug("bbbbbbbbbb", kv.Key, kv.Value)
 	//	}
 	//	}
 	//Debug("totalIndexMap", len(totalIndexMap))
@@ -758,7 +770,7 @@ func HasOrderContactType(text string) []string {
 //from 1--全文 2--table td 3--table td解析采购单位联系人 4--分包
 func GetKVAll(content, title string, contactFormat *ContactFormat, from int, isSite bool, codeSite string) *JobKv {
 	content = formatText(content, "kv")
-	m1Kvs, _ := colonkvEntity.entrance(content, title, contactFormat, from,isSite,codeSite)
+	m1Kvs, _ := colonkvEntity.entrance(content, title, contactFormat, from, isSite, codeSite)
 	//	for _, kvs := range m1Kvs {
 	//		qutil.Debug(kvs.Key, kvs.Value)
 	//	}
@@ -882,7 +894,7 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string, isSite bool, codeSi
 						nextval += GetMoneyUnit(k, nextval)
 					}
 					kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: nextval, Weight: tk.Weight})
-				}else if tk.Value == "项目名称" && nextval != ""{
+				} else if tk.Value == "项目名称" && nextval != "" {
 					kvTags[tk.Value] = append(kvTags[tk.Value], &Tag{Key: k, Value: nextval, Weight: tk.Weight})
 				}
 			}

+ 1 - 0
src/jy/util/article.go

@@ -12,6 +12,7 @@ type Job struct {
 	Content           string                            //正文
 	Title             string                            //标题
 	SpiderCode        string                            //爬虫代码
+	Site              string                            //站点
 	Domain            string                            //网站域名
 	Href              string                            //原文链接
 	City              string                            //城市

+ 9 - 9
src/main_blocktest.go

@@ -50,8 +50,8 @@ func all() {
 	time.Sleep(time.Hour)
 }
 func one() {
-	m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27092", "extract_kf")
-	d, _ := m.FindById("bidding_winner", "5da1489ca5cb26b9b7bd6d8e", extract.Fields)
+	m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw")
+	d, _ := m.FindById("bidding", "5dedb8d7e9d1f601e4b9cf7b", extract.Fields)
 	com(*d)
 }
 func com(doc map[string]interface{}) {
@@ -68,7 +68,7 @@ func com(doc map[string]interface{}) {
 	e := &extract.ExtractTask{
 		TaskInfo: &extract.TaskInfo{
 			Version:     "v3.6",
-			VersionId:   "5cdd3025698414032c8322b1",
+			VersionId:   "5e02be5869841446c0005a93",
 			ProcessPool: make(chan bool, 1),
 		},
 	}
@@ -79,15 +79,15 @@ func com(doc map[string]interface{}) {
 	e.InitSite()
 	e.InitRulePres()
 	e.InitRuleBacks(false)
-	e.InitRuleBacks(true)
+	//e.InitRuleBacks(true)
 	e.InitRuleCore(false)
-	e.InitRuleCore(true)
+	//e.InitRuleCore(true)
 	e.InitBlockRule()
 	e.InitPkgCore()
 	e.InitTag(false)
-	e.InitTag(true)
+	//e.InitTag(true)
 	e.InitClearFn(false)
-	e.InitClearFn(true)
+	//e.InitClearFn(true)
 	if e.IsExtractCity { //版本上控制是否开始城市抽取
 		//初始化城市DFA信息
 		e.InitCityDFA()
@@ -109,6 +109,7 @@ func com(doc map[string]interface{}) {
 		CategorySecond: subtype,
 		Content:        qu.ObjToString(doc["detail"]),
 		SpiderCode:     qu.ObjToString(doc["spidercode"]),
+		Site:           qu.ObjToString(doc["site"]),
 		//Domain:     qu.ObjToString(doc["domain"]),
 		//Href:       qu.ObjToString(doc["href"]),
 		Title:     qu.ObjToString(doc["title"]),
@@ -150,8 +151,7 @@ func com(doc map[string]interface{}) {
 		//log.Println("Tag", v.Tag)
 	}
 	log.Println("=============抽取结果================")
-
-	log.Println(e.ResultArr)
+	//	log.Println(e.ResultArr)
 	set := (e.ResultArr[0][1]["$set"]).(map[string]interface{})
 	for k, v := range set {
 		//if k == "budget" || k == "bidamount" || k == "winner" || k == "amount" || k == "projectname" || k == "projectcode" || k == "buyer" || k == "buyerperson" || k == "buyertel" || k == "agency" {

+ 1 - 1
src/main_test.go

@@ -28,7 +28,7 @@ func Test_han(t *testing.T) {
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27092", "extract_kf")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")5c528686698414055c47b115
-	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5da43587a5cb26b9b729a7ff", "1", "result_mxs", "result_mxs")
+	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5ded053fe9d1f601e4c9e3ee", "1", "result_mxs", "result_mxs")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)
 }

+ 1 - 1
src/res/fieldscore.json

@@ -398,7 +398,7 @@
 			{
                 "describe": "[gt,∞,score]",
                 "range": [
-                    3,
+                    10,
                     -1,
                     -10
                 ]