Browse Source

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

zhangjinkun 5 years ago
parent
commit
0b83c0b204

+ 8 - 1
src/jy/clear/tonumber.go

@@ -98,10 +98,17 @@ func ObjToMoney(data []interface{}) []interface{} {
 			f = f * 10000
 		}
 	}
+	if f == 0 && !moneyUnitRegBool.MatchString(fmt.Sprint(data[0])) {
+		data = append(data, false)
+		return data
+	}
+	data = append(data, true)
 	data[0] = f
 	return data
 }
-
+//["中标金额","成交金额","合同金额","中标价","成交价","成交价格","中标(成交)金额","投标报价","中标标价","成交结果"]
+//["0元","零元","0.0万元","¥0元"]
+var moneyUnitRegBool = regexp.MustCompile(`(中标金额|成交金额|合同金额|中标价|成交价|成交价格|中标\(成交\)金额|投标报价|中标标价|成交结果)?[::\s]?(0元|零元|0.0万元|¥0元|0)+`)
 //数字金额转换
 func numMoney(data []interface{}) ([]interface{}, bool) {
 	tmp := fmt.Sprintf("%f",data[0])

+ 17 - 2
src/jy/extract/extpackage.go

@@ -7,7 +7,7 @@ import (
 	"log"
 	qu "qfw/util"
 	"reflect"
-	"regexp"
+	regexp "regexp"
 	"sort"
 )
 
@@ -60,6 +60,13 @@ func kvparse(p *ju.JobKv, e *ExtractTask, sonJobResult *map[string]interface{},
 					cfn := e.ClearFn["budget"]
 					lock.Unlock()
 					data := clear.DoClearFn(cfn, []interface{}{pv[0].Value, ""})
+					if data[0] ==0{
+						if istrue,ok:= data[len(data)-1].(bool);istrue&&ok{
+							(*sonJobResult)["budget"] = data[0]
+						}else {
+							continue
+						}
+					}
 					(*sonJobResult)["budget"] = data[0]
 					continue
 				}
@@ -68,10 +75,18 @@ func kvparse(p *ju.JobKv, e *ExtractTask, sonJobResult *map[string]interface{},
 					cfn := e.ClearFn["budget"]
 					lock.Unlock()
 					data := clear.DoClearFn(cfn, []interface{}{pv[0].Value, ""})
+					if istrue,ok:= data[len(data)-1].(bool);istrue&&ok{
+						(*sonJobResult)["budget"] = data[0]
+					}else {
+						continue
+					}
 					(*sonJobResult)["bidamount"] = data[0]
 					continue
 				}
 				if ((*sonJobResult)["winner"] == nil || (*sonJobResult)["winner"] == "") && tags[0].Key == "中标单位" {
+					if winnerorderNotReg.MatchString(pv[0].Value){
+						continue
+					}
 					(*sonJobResult)["winner"] = pv[0].Value
 				}
 
@@ -82,7 +97,7 @@ func kvparse(p *ju.JobKv, e *ExtractTask, sonJobResult *map[string]interface{},
 		}
 	}
 }
-
+var winnerorderNotReg =regexp.MustCompile(`(附件|否决原因|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\d[\s]{0,10}(\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})`)
 //处理分包信息
 func PackageDetail(j *ju.Job, e *ExtractTask, isSite bool, codeSite string) {
 	qu.Try(func() {

+ 38 - 22
src/jy/extract/extract.go

@@ -26,12 +26,12 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 100                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask          //任务列表
+	ClearTaskList map[string]*ClearTask            //清理任务列表
+	saveLimit     = 100                            //抽取日志批量保存
+	PageSize      = 5000                           //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -568,9 +568,8 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 			}
 		}
 		//函数清理
-
 		for key, val := range j.Result {
-			for _, v := range val {
+			for i, v := range val {
 				//qu.Debug(key, v.Value)
 				lockclear.Lock()
 				var cfn = []string{}
@@ -584,6 +583,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 					continue
 				}
 				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
+				if key == "budget" || key == "bidamount" {
+						if istrue, ok := data[len(data)-1].(bool); istrue && ok {
+							j.Result[key][i].IsTrue = true
+						} else {
+							continue
+						}
+				}
 				before, _ := v.Value.(string)
 				v.Value = data[0]
 				BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
@@ -1576,6 +1582,11 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				} else if v.Field == "projectname" {
 					tmp[v.Field] = v.Value
 					break
+				} else if v.Field == "bidamount"||v.Field =="budget"{
+					if v.IsTrue{
+						tmp[v.Field] =v.Value
+						break
+					}
 				}
 			}
 		}
@@ -1691,19 +1702,19 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//tmp["extract_content"] = j.Content
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
-			/*	if len(e.SiteFields) <= 0 {
-					//for field, _ := range e.Fields {
-					//	if tmp[field] == nil &&  {
-					//		tmp[field] = "" //覆盖之前版本数据
-					//	}
-					//}
-				} else {
-					//for field, _ := range e.SiteFields {
-					//	if tmp[field] == nil &&{
-					//		tmp[field] = "" //覆盖之前版本数据
-					//	}
-					//}
-				}*/
+				/*	if len(e.SiteFields) <= 0 {
+						//for field, _ := range e.Fields {
+						//	if tmp[field] == nil &&  {
+						//		tmp[field] = "" //覆盖之前版本数据
+						//	}
+						//}
+					} else {
+						//for field, _ := range e.SiteFields {
+						//	if tmp[field] == nil &&{
+						//		tmp[field] = "" //覆盖之前版本数据
+						//	}
+						//}
+					}*/
 				tmp["repeat"] = 0
 				tmparr := []map[string]interface{}{
 					map[string]interface{}{
@@ -1877,6 +1888,11 @@ func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
 					standardized = true
 				}
 			}
+			if field == "budget"||field == "bidamount"{
+				if !v.IsTrue{
+					continue
+				}
+			}
 			sfield := map[string]interface{}{
 				"val":          v.Value,
 				"type":         v.Type,
@@ -1929,7 +1945,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 {                            //reids未找到,执行规则匹配
+	if i == 0 { //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 14 - 4
src/jy/extract/score.go

@@ -114,9 +114,19 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 		for tmpsindex, tmpsvalue := range tmps {
 			//没有抽取到值,不打分
 			if string_value := fmt.Sprint(tmpsvalue.Value); string_value == "" || string_value == "0" || string_value == "<nil>" {
-				tmps[tmpsindex].Score = -10
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
-				continue
+				if field == "budget" || field == "bidamount" {
+					if tmpsvalue.IsTrue {
+						//continue
+					}else {
+						tmps[tmpsindex].Score = -10
+						tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
+						continue
+					}
+				}else {
+					tmps[tmpsindex].Score = -10
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
+					continue
+				}
 			}
 			lockscore.Lock()
 			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
@@ -270,7 +280,7 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 				max := qu.IntAll(scoreRule["max"])
 				val := qu.IntAll(tmpsvalue.Value)
 				scores, _ := scoreRule["score"].([]interface{})
-				if len(scores) < 3 || val == 0 {
+				if len(scores) < 3 || val < 0 {
 					continue
 				}
 				if val < min && 0 < val {

+ 2 - 1
src/jy/pretreated/analykv.go

@@ -350,12 +350,13 @@ func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, ma
 				相关竞价人对成交结果有异议的,可自本公告发布之日起三日内书面提出。
 				联系方式:卢明珠 0871-66136373
 			*/
-			if doubtMap[pos-1] { //当识别到中标、采购、代理标签后,对其后的联系人、电话等信息判断是否属于该标签
+			if doubtMap[pos-1] && len(m.Map) == 1 { //当识别到中标、采购、代理标签后,对其后的联系人、电话等信息判断是否属于该标签
 				goto L
 			}
 			num := 0
 			bf := false
 			for i := len(m.Keys) - 1; i > -1; i-- {
+				//u.Debug("k", k)
 				num++
 				if from == 1 && !ContactType["代理机构"].MatchString(k) && ContactType["代理机构"].MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["代理机构"]) {
 					matchMap["代理机构"][k] = true

+ 11 - 11
src/jy/pretreated/analytable.go

@@ -114,7 +114,7 @@ var (
 	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
 	winnerOrderAndBidResult     = regexp.MustCompile("((中标)?候选人|(中标|评标)结果)")
 	WinnerOrderStr              = regexp.MustCompile(`(集团|公司|学校|中心|家具城|门诊|\[大中小\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行)$`)
-	DoubtReg                    = regexp.MustCompile("((|交易)中心|有(疑问|质疑|异议|意见)|(书面)?提出|不再受理|投诉|质疑|书面形式|监督|交易中心|公示期(限)?|招标|采购)")
+	DoubtReg                    = regexp.MustCompile("(我中心|有(疑问|质疑|异议|意见)|(书面)?提出|不再受理|投诉|质疑|书面形式|监督|公示期(限)?)")
 )
 
 //在解析时,判断表格元素是否隐藏
@@ -244,10 +244,10 @@ func (table *Table) KVFilter(isSite bool, codeSite string) {
 			MergeKvTags(table.StandKV, kvTags)
 		} else {
 			//u.Debug(k, v, "---------")
-			if strings.Contains(k,"总价"){
-				if vvvv,ok := v.([]string);ok && len(vvvv)>0{
+			if strings.Contains(k, "总价") {
+				if vvvv, ok := v.([]string); ok && len(vvvv) > 0 {
 					as.RemoveKey("报价")
-					as.AddKey(k,vvvv[len(vvvv)-1])
+					as.AddKey(k, vvvv[len(vvvv)-1])
 					continue
 				}
 			}
@@ -477,15 +477,15 @@ func (table *Table) sortKVArr(as *SortMap, isSite bool, codeSite string) {
 							tmp.Weight = vv[0].Weight
 							tmp.Key = vv[0].Key
 							tmp.IsInvalid = vv[0].IsInvalid
-							if kk == "单品报价"||kk == "中标金额"||kk == "预算"{
-								if strings.Contains(k,"万"){
-									tmp.Value = vvvvvv+"万"
-								}else if strings.Contains(k,"亿"){
-									tmp.Value = vvvvvv+"亿"
-								}else {
+							if kk == "单品报价" || kk == "中标金额" || kk == "预算" {
+								if strings.Contains(k, "万") {
+									tmp.Value = vvvvvv + "万"
+								} else if strings.Contains(k, "亿") {
+									tmp.Value = vvvvvv + "亿"
+								} else {
 									tmp.Value = vvvvvv
 								}
-							}else {
+							} else {
 								tmp.Value = vvvvvv
 							}
 							table.StandKV[kk] = append(table.StandKV[kk], &tmp)

+ 1 - 0
src/jy/util/article.go

@@ -55,6 +55,7 @@ type ExtField struct {
 	ScoreItem   []*ScoreItem      //打分项
 	Weight      int               //权重值
 	ValRepeat   int               //结果值重复次数,打分参考
+	IsTrue      bool              //针对金额0是否有效的值,其他字段不参考
 }
 
 //打分项

+ 2 - 2
src/main.go

@@ -13,10 +13,10 @@ import (
 	_ "net/http/pprof"
 	qu "qfw/util"
 
-	log "github.com/donnie4w/go-logger/logger"
 	"qfw/util/elastic"
 	"qfw/util/redis"
 
+	log "github.com/donnie4w/go-logger/logger"
 )
 
 func init() {
@@ -42,7 +42,7 @@ func init() {
 	//	log.Fatal("ElasticClient err:", err)
 	//} else {
 	//	util.ElasticClient = eClient
-		util.ElasticClientIndex = qu.ObjToString(util.Config["elasticsearch_index"])
+	util.ElasticClientIndex = qu.ObjToString(util.Config["elasticsearch_index"])
 	util.ElasticClientType = qu.ObjToString(util.Config["elasticsearch_type"])
 	util.ElasticClientDB = qu.ObjToString(util.Config["winner_enterprise"])
 	//}

+ 1 - 1
src/main_blocktest.go

@@ -51,7 +51,7 @@ func all() {
 }
 func one() {
 	m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw")
-	d, _ := m.FindById("bidding", "59e47b5a40d2d9bbe82296bf", extract.Fields)
+	d, _ := m.FindById("bidding", "5e17deee50b5ea296ec939fb", extract.Fields)
 	com(*d)
 }
 func com(doc map[string]interface{}) {

+ 3 - 3
src/main_test.go

@@ -26,10 +26,10 @@ func Test_han(t *testing.T) {
 	os.Exit(0)
 }
 func Test_task(t *testing.T) {
-	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27092", "extract_kf")
+	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27092", "extract_dev32")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")5c528686698414055c47b115
-	//extract.StartExtractTestTask("5e103206234ddc34b406c5d1", "59e47b5a40d2d9bbe82296bf", "1", "result_mxs", "result_mxs")
-	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5e17e00e85a9271abf0860a6", "1", "result_mxs", "result_mxs")
+	extract.StartExtractTestTask("5e103206234ddc34b406c5d1", "5e17deee50b5ea296ec939fb", "1", "mxs_v1", "mxs_v1")
+	//extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5e17e00e85a9271abf0860a6", "1", "mxs_v1", "mxs_v1")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)
 }

+ 46 - 3
src/res/fieldscore.json

@@ -200,7 +200,7 @@
         "negativewords": [
             {
                 "describe": "包含负分",
-                "regstr": "(附件|否决原因|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
+                "regstr": "(附件|否决原因|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
                 "score": -10
             },
 			{
@@ -361,7 +361,50 @@
             }
         ]
     },
-    "winnerperson": {
+    "buyeraddr": {
+        "type": "string",
+        "negativewords": [
+            {
+                "describe": "出现符号",
+                "regstr": "[*]",
+                "score": -10
+            },
+			{
+                "describe": "是数字",
+                "regstr": "^\\d*[×―—-\\-]*[\u3000\u2003\u00a0\\s]*\\d*$",
+                "score": -10
+            },
+			{
+                "describe": "出现日期",
+                "regstr": "(\\d)+(年|月|日)+",
+                "score": -10
+            },
+			{
+                "describe": "包含负分",
+                "regstr": "(详见公告)",
+                "score": -10
+            }
+        ],
+        "length": [
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    6,
+                    -10
+                ]
+            },
+			 {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    90,
+                    -1,
+                    -10
+                ]
+            }
+        ]
+    },
+	 "winnerperson": {
         "type": "string",
         "positivewords": [
             {
@@ -557,7 +600,7 @@
     "bidamount": {
         "type": "float",
         "describe": "min>val:1,min<=val<=max:3,max<val:1",
-        "min": 1000,
+        "min": -0.999,
         "max": 1000000000,
         "score": [
             -3,