maxiaoshan 5 жил өмнө
parent
commit
423bb9688e

+ 8 - 1
src/jy/clear/tonumber.go

@@ -98,10 +98,17 @@ func ObjToMoney(data []interface{}) []interface{} {
 			f = f * 10000
 		}
 	}
+	if f == 0 && !moneyUnitRegBool.MatchString(fmt.Sprint(data[0])) {
+		data = append(data, false)
+		return data
+	}
+	data = append(data, true)
 	data[0] = f
 	return data
 }
-
+// moneyUnitRegBool is consulted by ObjToMoney when the parsed amount is 0: a match keeps the 0 as a valid amount (true is appended), no match appends false. Optional leading field labels in the pattern: ["中标金额","成交金额","合同金额","中标价","成交价","成交价格","中标(成交)金额","投标报价","中标标价","成交结果"]
+// Zero-amount spellings in the pattern: ["0元","零元","0.0万元","¥0元"], plus a bare "0". NOTE(review): the pattern is unanchored, both leading groups are optional and the "." in "0.0万元" is unescaped, so any text containing a "0" matches - confirm this breadth is intended.
+var moneyUnitRegBool = regexp.MustCompile(`(中标金额|成交金额|合同金额|中标价|成交价|成交价格|中标\(成交\)金额|投标报价|中标标价|成交结果)?[::\s]?(0元|零元|0.0万元|¥0元|0)+`)
 //数字金额转换
 func numMoney(data []interface{}) ([]interface{}, bool) {
 	tmp := fmt.Sprintf("%f",data[0])

+ 17 - 2
src/jy/extract/extpackage.go

@@ -7,7 +7,7 @@ import (
 	"log"
 	qu "qfw/util"
 	"reflect"
-	"regexp"
+	regexp "regexp"
 	"sort"
 )
 
@@ -60,6 +60,13 @@ func kvparse(p *ju.JobKv, e *ExtractTask, sonJobResult *map[string]interface{},
 					cfn := e.ClearFn["budget"]
 					lock.Unlock()
 					data := clear.DoClearFn(cfn, []interface{}{pv[0].Value, ""})
+					if data[0] ==0{
+						if istrue,ok:= data[len(data)-1].(bool);istrue&&ok{
+							(*sonJobResult)["budget"] = data[0]
+						}else {
+							continue
+						}
+					}
 					(*sonJobResult)["budget"] = data[0]
 					continue
 				}
@@ -68,10 +75,18 @@ func kvparse(p *ju.JobKv, e *ExtractTask, sonJobResult *map[string]interface{},
 					cfn := e.ClearFn["budget"]
 					lock.Unlock()
 					data := clear.DoClearFn(cfn, []interface{}{pv[0].Value, ""})
+					if istrue,ok:= data[len(data)-1].(bool);istrue&&ok{
+						(*sonJobResult)["budget"] = data[0]
+					}else {
+						continue
+					}
 					(*sonJobResult)["bidamount"] = data[0]
 					continue
 				}
 				if ((*sonJobResult)["winner"] == nil || (*sonJobResult)["winner"] == "") && tags[0].Key == "中标单位" {
+					if winnerorderNotReg.MatchString(pv[0].Value){
+						continue
+					}
 					(*sonJobResult)["winner"] = pv[0].Value
 				}
 
@@ -82,7 +97,7 @@ func kvparse(p *ju.JobKv, e *ExtractTask, sonJobResult *map[string]interface{},
 		}
 	}
 }
-
+var winnerorderNotReg =regexp.MustCompile(`(附件|否决原因|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\d[\s]{0,10}(\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})`) // winnerorderNotReg: kvparse skips a "中标单位" value as winner when this pattern matches it. The alternative "候选" is listed twice (harmless). NOTE(review): as rendered here the punctuation alternation contains a bare "?" with nothing to repeat, which Go's regexp rejects, so MustCompile would panic at init - presumably the real file uses full-width punctuation there; verify the file encoding.
 //处理分包信息
 func PackageDetail(j *ju.Job, e *ExtractTask, isSite bool, codeSite string) {
 	qu.Try(func() {

+ 38 - 22
src/jy/extract/extract.go

@@ -26,12 +26,12 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 100                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask          //任务列表
+	ClearTaskList map[string]*ClearTask            //清理任务列表
+	saveLimit     = 100                            //抽取日志批量保存
+	PageSize      = 5000                           //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -568,9 +568,8 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 			}
 		}
 		//函数清理
-
 		for key, val := range j.Result {
-			for _, v := range val {
+			for i, v := range val {
 				//qu.Debug(key, v.Value)
 				lockclear.Lock()
 				var cfn = []string{}
@@ -584,6 +583,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 					continue
 				}
 				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
+				if key == "budget" || key == "bidamount" {
+						if istrue, ok := data[len(data)-1].(bool); istrue && ok {
+							j.Result[key][i].IsTrue = true
+						} else {
+							continue
+						}
+				}
 				before, _ := v.Value.(string)
 				v.Value = data[0]
 				BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
@@ -1576,6 +1582,11 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				} else if v.Field == "projectname" {
 					tmp[v.Field] = v.Value
 					break
+				} else if v.Field == "bidamount"||v.Field =="budget"{
+					if v.IsTrue{
+						tmp[v.Field] =v.Value
+						break
+					}
 				}
 			}
 		}
@@ -1691,19 +1702,19 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//tmp["extract_content"] = j.Content
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
-			/*	if len(e.SiteFields) <= 0 {
-					//for field, _ := range e.Fields {
-					//	if tmp[field] == nil &&  {
-					//		tmp[field] = "" //覆盖之前版本数据
-					//	}
-					//}
-				} else {
-					//for field, _ := range e.SiteFields {
-					//	if tmp[field] == nil &&{
-					//		tmp[field] = "" //覆盖之前版本数据
-					//	}
-					//}
-				}*/
+				/*	if len(e.SiteFields) <= 0 {
+						//for field, _ := range e.Fields {
+						//	if tmp[field] == nil &&  {
+						//		tmp[field] = "" //覆盖之前版本数据
+						//	}
+						//}
+					} else {
+						//for field, _ := range e.SiteFields {
+						//	if tmp[field] == nil &&{
+						//		tmp[field] = "" //覆盖之前版本数据
+						//	}
+						//}
+					}*/
 				tmp["repeat"] = 0
 				tmparr := []map[string]interface{}{
 					map[string]interface{}{
@@ -1877,6 +1888,11 @@ func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
 					standardized = true
 				}
 			}
+			if field == "budget"||field == "bidamount"{
+				if !v.IsTrue{
+					continue
+				}
+			}
 			sfield := map[string]interface{}{
 				"val":          v.Value,
 				"type":         v.Type,
@@ -1929,7 +1945,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 {                            //reids未找到,执行规则匹配
+	if i == 0 { //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 14 - 4
src/jy/extract/score.go

@@ -114,9 +114,19 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 		for tmpsindex, tmpsvalue := range tmps {
 			//没有抽取到值,不打分
 			if string_value := fmt.Sprint(tmpsvalue.Value); string_value == "" || string_value == "0" || string_value == "<nil>" {
-				tmps[tmpsindex].Score = -10
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
-				continue
+				if field == "budget" || field == "bidamount" {
+					if tmpsvalue.IsTrue {
+						//continue
+					}else {
+						tmps[tmpsindex].Score = -10
+						tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
+						continue
+					}
+				}else {
+					tmps[tmpsindex].Score = -10
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
+					continue
+				}
 			}
 			lockscore.Lock()
 			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
@@ -270,7 +280,7 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 				max := qu.IntAll(scoreRule["max"])
 				val := qu.IntAll(tmpsvalue.Value)
 				scores, _ := scoreRule["score"].([]interface{})
-				if len(scores) < 3 || val == 0 {
+				if len(scores) < 3 || val < 0 {
 					continue
 				}
 				if val < min && 0 < val {

+ 1 - 0
src/jy/util/article.go

@@ -55,6 +55,7 @@ type ExtField struct {
 	ScoreItem   []*ScoreItem      //打分项
 	Weight      int               //权重值
 	ValRepeat   int               //结果值重复次数,打分参考
+	IsTrue      bool              //针对金额0是否有效的值,其他字段不参考
 }
 
 //打分项

+ 2 - 2
src/res/fieldscore.json

@@ -200,7 +200,7 @@
         "negativewords": [
             {
                 "describe": "包含负分",
-                "regstr": "(附件|否决原因|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
+                "regstr": "(附件|否决原因|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
                 "score": -10
             },
 			{
@@ -600,7 +600,7 @@
     "bidamount": {
         "type": "float",
         "describe": "min>val:1,min<=val<=max:3,max<val:1",
-        "min": 1000,
+        "min": -0.999,
         "max": 1000000000,
         "score": [
             -3,

+ 25 - 12
udpfilterdup/src/datamap.go

@@ -214,7 +214,9 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	return info
 }
 //判重方法
-func (d *datamap) check(info *Info) (b bool, source *Info, reason string) {
+func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
+
+	reason:=""
 	keys := []string{}
 	d.lock.Lock()
 	for k, _ := range d.keys { //不同时间段
@@ -258,6 +260,7 @@ L:
 							reason = "href相同"
 							b = true
 							source = v
+							reasons = reason
 							break L
 						}
 						if info.href != "" && info.href != v.href {
@@ -283,6 +286,7 @@ L:
 							if !againRepeat(v,info){
 								b = true
 								source = v
+								reasons = reason
 								break
 							}
 						}
@@ -295,6 +299,7 @@ L:
 						if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
 							b = true
 							source = v
+							reasons = reason
 							break
 						}
 					} else {
@@ -305,6 +310,7 @@ L:
 							if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
 								b = true
 								source = v
+								reasons = reason
 								break
 							}
 						} else {
@@ -313,6 +319,7 @@ L:
 							if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
 								b = true
 								source = v
+								reasons = reason
 								break
 							}
 						}
@@ -346,7 +353,8 @@ L:
 	return
 }
 
-func (h *historymap) checkHistory(info *Info) (b bool, source *Info, reason string) {
+func (h *historymap) checkHistory(info *Info) (b bool, source *Info, reasons string) {
+	reason:=""
 	keys := []string{}
 	h.lock.Lock()
 	for k, _ := range h.keys { //不同时间段
@@ -391,6 +399,7 @@ L:
 							reason = "href相同"
 							b = true
 							source = v
+							reasons = reason
 							break L
 						}
 						if info.href != "" && info.href != v.href {
@@ -416,6 +425,7 @@ L:
 							if !againRepeat(v,info){
 								b = true
 								source = v
+								reasons = reason
 								break
 							}
 
@@ -429,6 +439,7 @@ L:
 						if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
 							b = true
 							source = v
+							reasons = reason
 							break
 						}
 					} else {
@@ -439,6 +450,7 @@ L:
 							if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
 								b = true
 								source = v
+								reasons = reason
 								break
 							}
 						} else {
@@ -447,6 +459,7 @@ L:
 							if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
 								b = true
 								source = v
+								reasons = reason
 								break
 							}
 						}
@@ -466,6 +479,7 @@ L:
 			if source.repeatid != "" {//未判重-有变化--记录
 				b = true
 				reason = "未判重记录"
+				reasons = reason
 			}
 		}
 	}
@@ -589,7 +603,6 @@ func (d *datamap) GetLatelyFiveDay(t int64) []string {
 ******* 以下为判重 ********
 **************************
 */
-
 //判重方法1
 func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
 
@@ -611,9 +624,6 @@ func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
 
 	} else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
 		//中标结果
-		if isMeet, reason = tenderRepeat_A(v, info, reason); isMeet {
-
-		}
 		if isMeet, reason = winningRepeat_A(v, info, reason);isMeet {
 			if winningRepeat_C(v, info) {
 				return false, reason
@@ -938,11 +948,13 @@ func winningRepeat_C(v *Info, info *Info) bool {
 //合同_A
 func contractRepeat_A(v *Info, info *Info, reason string) (bool,string) {
 
-	isMeet := false
-	if isMeet, reason = tenderRepeat_A(v, info, reason);isMeet {
+	isMeet_1 := false
+	if isMeet_1, reason = tenderRepeat_A(v, info, reason);isMeet_1 {
 		return true,reason
 	}
-	if isMeet, reason = winningRepeat_A(v, info, reason);isMeet {
+
+	isMeet_2 := false
+	if isMeet_2, reason = winningRepeat_A(v, info, reason);isMeet_2 {
 		return true,reason
 	}
 	return false,reason
@@ -951,11 +963,12 @@ func contractRepeat_A(v *Info, info *Info, reason string) (bool,string) {
 //合同_B
 func contractRepeat_B(v *Info, info *Info, reason string) (bool,string) {
 
-	isMeet := false
-	if isMeet, reason = tenderRepeat_B(v, info, reason);isMeet {
+	isMeet_1 := false
+	if isMeet_1, reason = tenderRepeat_B(v, info, reason);isMeet_1 {
 		return true,reason
 	}
-	if isMeet, reason = winningRepeat_B(v, info, reason);isMeet {
+	isMeet_2 := false
+	if isMeet_2, reason = winningRepeat_B(v, info, reason);isMeet_2 {
 		return true,reason
 	}
 	return false,reason

+ 17 - 10
udpfilterdup/src/main.go

@@ -63,7 +63,6 @@ func init() {
 	extract = mconf["extract"].(string)
 	mgo.InitPool()
 
-	//测试可以临时注释
 	dupdays = util.IntAllDef(Sysconfig["dupdays"], 3)
 	//加载数据
 	DM = NewDatamap(dupdays, lastid)
@@ -104,8 +103,13 @@ func main() {
 
 //测试组人员使用
 func mainT() {
+	/*
+	ObjectId("5da3f31aa5cb26b9b798d3aa")
+	ObjectId("5da418c4a5cb26b9b7e3e9a6")
+	*/
 	//sid = "5da3f31aa5cb26b9b798d3aa"
-	//eid = "5da422fba5cb26b9b706984b"
+	//eid = "5da418c4a5cb26b9b7e3e9a6"
+
 	mapinfo := map[string]interface{}{}
 	if sid == "" || eid == "" {
 		log.Println("sid,eid参数不能为空")
@@ -236,9 +240,9 @@ func task(data []byte, mapInfo map[string]interface{}) {
 							newData, mergeArr = mergeDataFields(source, info)
 							DM.replaceSourceData(newData, source.id) //替换
 							if idtype == "1" {
-								id_map["_id"] = source.id
+								id_map["_id"] = info.id
 							} else {
-								id_map["_id"] = util.StringTOBsonId(source.id)
+								id_map["_id"] = util.StringTOBsonId(info.id)
 							}
 
 							repeat_id = source.id
@@ -247,9 +251,9 @@ func task(data []byte, mapInfo map[string]interface{}) {
 							newData, mergeArr = mergeDataFields(info, source)
 							DM.replaceSourceData(newData, source.id) //替换
 							if idtype == "1" {
-								id_map["_id"] = info.id
+								id_map["_id"] = source.id
 							} else {
-								id_map["_id"] = util.StringTOBsonId(info.id)
+								id_map["_id"] = util.StringTOBsonId(source.id)
 							}
 
 							repeat_id = info.id
@@ -483,23 +487,26 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 								newData, mergeArr = mergeDataFields(source, info)
 								DM.replaceSourceData(newData, source.id) //替换
 								if idtype == "1" {
-									id_map["_id"] = source.id
+									id_map["_id"] = info.id
 								} else {
-									id_map["_id"] = util.StringTOBsonId(source.id)
+									id_map["_id"] = util.StringTOBsonId(info.id)
 								}
+
 								repeat_id = source.id
 							} else {
 								//已对比数据为标准 ,数据池的数据打判重标签
 								newData, mergeArr = mergeDataFields(info, source)
 								DM.replaceSourceData(newData, source.id) //替换
 								if idtype == "1" {
-									id_map["_id"] = info.id
+									id_map["_id"] = source.id
 								} else {
-									id_map["_id"] = util.StringTOBsonId(info.id)
+									id_map["_id"] = util.StringTOBsonId(source.id)
 								}
+
 								repeat_id = info.id
 							}
 						}
+
 						var update_map = map[string]interface{}{
 							"$set": map[string]interface{}{
 								"repeat_reason": reason,

+ 4 - 0
udps/main.go

@@ -22,6 +22,10 @@ func main() {
 	//2017-04-01,2017-06-01
 	//2017-06-01,2018-06-01
 	//2018-06-01,2019-02-20
+	/*
+ObjectId("5da3f31aa5cb26b9b798d3aa")
+ObjectId("5da422fba5cb26b9b706984b")
+*/
 
 	flag.StringVar(&sid, "sid", "", "开始id")
 	flag.StringVar(&eid, "eid", "", "结束id")