Forráskód Böngészése

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

maxiaoshan 5 éve
szülő
commit
111a3b6edc

+ 5 - 5
fullproject/src_v1/config.json

@@ -2,12 +2,12 @@
     "loadStart": 0,
 	"validdays":150,
     "statusdays": 7,
-	"mongodbServers": "192.168.3.207:27092",
+	"mongodbServers": "192.168.3.166:27082",
     "mongodbPoolSize": 10,
-    "mongodbName": "extract_kf",
+    "mongodbName": "zhaolongyue",
 	"hints":"publishtime_1",
-    "extractColl": "ceshi_info",
-    "projectColl": "jh_project",
+    "extractColl": "huawei_bidding_all_0110_v2",
+    "projectColl": "huawei_project_0113_v2",
     "backupFlag": false,
     "siteColl": "site",
     "thread": 1,
@@ -15,7 +15,7 @@
         "to": "wangjianghan@topnet.net.cn",
         "api": "http://10.171.112.160:19281/_send/_mail"
     },
-    "udpport": ":1182",
+    "udpport": "1182",
     "nextNode": [
     ]
 }

+ 1 - 2
fullproject/src_v1/init.go

@@ -64,7 +64,6 @@ func init() {
 	ExtractColl = Sysconfig["extractColl"].(string)
 	ProjectColl = Sysconfig["projectColl"].(string)
 	BackupColl = Sysconfig["projectColl"].(string)+"_back"
-	log.Println(BackupColl)
 	SiteColl = Sysconfig["siteColl"].(string)
 	Thread = util.IntAll(Sysconfig["thread"])
 	//NextNode = Sysconfig["nextNode"].([]interface{})
@@ -275,7 +274,7 @@ type ProjectInfo struct {
 	Ids           []string           `json:"ids,omitempty"`
 	Topscopeclass []string           `json:"topscopeclass,omitempty"`
 	Subscopeclass []string           `json:"subscopeclass,omitempty"` //子行业分类
-	Winners       []string           `json:"winners,omitempty"`       //中标人
+	Winners       []string           `json:"s_winner,omitempty"`       //中标人
 	ProjectName   string             `json:"projectname,omitempty"`   //项目名称
 	ProjectCode   string             `json:"projectcode,omitempty"`   //项目代码唯一(纯数字的权重低)
 	ContractCode  string			 `json:"contractcode,omitempty"`  //项目编号

+ 11 - 52
fullproject/src_v1/main.go

@@ -9,7 +9,6 @@ import (
 	"os"
 	"os/signal"
 	"qfw/util"
-	qu "qfw/util"
 	"syscall"
 	"time"
 )
@@ -20,7 +19,7 @@ var (
 	toaddr       = []*net.UDPAddr{} //下节点对象
 	ChSign       = make(chan os.Signal)
 
-	sid, eid string //测试使用
+	sid, eid string 	//测试使用
 )
 
 func init() {
@@ -59,7 +58,7 @@ func DealSign() {
 	}
 }
 
-func main() {
+func mainT() {
 	//udp跑增量  id段   project
 	//udp跑全量			ql
 	//udp跑历史数据  信息id1,id2/或id段  ls
@@ -78,9 +77,9 @@ func main() {
 }
 
 //测试组人员使用
-func mainT() {
-	//sid = "5dfbd43ce9d1f601e43fa402"
-	//eid = "5e0954b30cf41612e061d0c8"
+func main() {
+	//sid = "5649a0fcaf5374672e005704"
+	//eid = "5e169e5250b5ea296ec896f0"
 
 	flag.StringVar(&sid, "sid", "", "开始id")
 	flag.StringVar(&eid, "eid", "", "结束id")
@@ -91,8 +90,8 @@ func mainT() {
 		log.Println("sid, eid参数不能为空")
 		os.Exit(0)
 	}
-	mapinfo["gtid"] = qu.StringTOBsonId(sid)
-	mapinfo["lteid"] = qu.StringTOBsonId(eid)
+	mapinfo["gtid"] = sid
+	mapinfo["lteid"] = eid
 	mapinfo["stype"] = "ql"
 	mapinfo["ip"] = "127.0.0.1"
 	mapinfo["port"] = Sysconfig["udpport"]
@@ -103,52 +102,12 @@ func mainT() {
 		}
 	}
 	P_QL.loadSite()
-	task(mapinfo)
-	SingleThread <- true
+	P_QL.currentType = mapinfo["stype"].(string)
+	P_QL.pici = time.Now().Unix()
+	P_QL.taskQl(mapinfo)
 	time.Sleep(20 * time.Second)
 }
 
-func task(mapInfo map[string]interface{}) {
-	SingleThread <- true
-	tasktype, _ := mapInfo["stype"].(string)
-	log.Println("tasktype:", tasktype)
-	switch tasktype {
-	case "ql": //全量合并
-		go func() {
-			defer func() {
-				<-SingleThread
-			}()
-			P_QL.currentType = tasktype
-			P_QL.pici = time.Now().Unix()
-			P_QL.taskQl(mapInfo)
-		}()
-	case "project": //增量合并,未抽取到项目名称或项目编号的不合并  bidding中mergestatus 1已合并 2字段问题不合并 3历史待合并
-		//合同、验收公告在6个月内查询不到可扩展到两年
-		go func() {
-			defer func() {
-				<-SingleThread
-			}()
-			P_QL.currentType = tasktype
-			P_QL.pici = time.Now().Unix()
-			P_QL.taskZl(mapInfo)
-		}()
-	case "updateInfo": //招标字段变更
-		go func() {
-			defer func() {
-				<-SingleThread
-			}()
-			P_QL.currentType = tasktype
-			P_QL.pici = time.Now().Unix()
-			P_QL.taskUpdateInfo(mapInfo)
-		}()
-	case "history": //历史数据合并,暂时不写
-		go func() {
-			defer func() {
-				<-SingleThread
-			}()
-		}()
-	}
-}
 
 //udp调用信号
 func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
@@ -188,7 +147,7 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 					P_QL.pici = time.Now().Unix()
 					P_QL.taskZl(mapInfo)
 				}()
-			case "updateInfo": //招标字段变更
+			case "updateInfo":		//招标字段变更
 				go func() {
 					defer func() {
 						<-SingleThread

+ 29 - 25
fullproject/src_v1/project.go

@@ -176,8 +176,8 @@ func (p *ProjectTask) startProjectMerge(info *Info, tmp map[string]interface{})
 
 			ex := 0
 			resArr := []*ProjectInfo{}
-			for _, res := range resN {
-				choose, e := p.CompareStatus(resN[0], info)
+			for i, res := range resN {
+				choose, e := p.CompareStatus(resN[i], info)
 				if !choose {
 					ex = e
 					resArr = append(resArr, res)
@@ -186,6 +186,7 @@ func (p *ProjectTask) startProjectMerge(info *Info, tmp map[string]interface{})
 			if len(resArr) > 0 {
 				bFindProject = true
 				findPid = resArr[0].Id.Hex()
+				p.UpdateProject(tmp, info, resArr[0], kv+1, resArr[0].comStr, ex)
 				for k2, bv := range []int{bpn, bpc, bptc, bpb} {
 					if bv > -1 {
 						pids[bv].Arr = append(pids[bv].Arr, findPid)
@@ -210,7 +211,6 @@ func (p *ProjectTask) startProjectMerge(info *Info, tmp map[string]interface{})
 									resArr[0].MPC = append(resArr[0].MPC, qu.If(k2 == 1, info.ProjectCode, info.PTC).(string))
 								}
 							}
-
 						} else {
 							if resArr[0].Buyer == "" {
 								resArr[0].Buyer = info.Buyer
@@ -218,7 +218,6 @@ func (p *ProjectTask) startProjectMerge(info *Info, tmp map[string]interface{})
 						}
 					}
 				}
-				p.UpdateProject(tmp, info, resArr[0], kv+1, resArr[0].comStr, ex)
 			} else {
 				bFindProject = false
 				findPid = ""
@@ -506,12 +505,11 @@ func (p *ProjectTask) NewProject(tmp map[string]interface{}, thisinfo *Info) (st
 	bt := qu.ObjToString(tmp["toptype"])
 	bs := qu.ObjToString(tmp["subtype"])
 	p.mapBidLock.Lock()
+	set["bidtype"] = bidtype[bs]
 	if bt == "招标" {
 		set["projectscope"] = qu.ObjToString(tmp["projectscope"])
-		set["bidtype"] = bidtype[bs]
 		set["bidstatus"] = bs
 	}else {
-		set["bidtype"] = bt
 		if bidstatus[bs] != "" {
 			set["bidstatus"] = thisinfo.SubType
 		} else if tmp["infoformat"] == 2 {
@@ -522,7 +520,8 @@ func (p *ProjectTask) NewProject(tmp map[string]interface{}, thisinfo *Info) (st
 	}
 	p.mapBidLock.Unlock()
 
-	p1, pkg := p.NewCachePinfo(pId, thisinfo, bt)
+	pkg := PackageFormat(thisinfo, nil)
+	p1 := p.NewCachePinfo(pId, thisinfo, bt, pkg)
 	if len(thisinfo.Subscopeclass) > 0 {
 		s_subscopeclass := strings.Join(thisinfo.Subscopeclass, ",")
 		set["s_subscopeclass"] = s_subscopeclass
@@ -567,6 +566,7 @@ func (p *ProjectTask) NewProject(tmp map[string]interface{}, thisinfo *Info) (st
 			"$set": set,
 		},
 	}
+	//log.Println(set)
 	return pId.Hex(), &p1
 }
 
@@ -614,11 +614,7 @@ func (p *ProjectTask) PushListInfo(tmp map[string]interface{}, infoid string) bs
 }
 
 //生成存放在内存中的对象
-func (p *ProjectTask) NewCachePinfo(id primitive.ObjectID, thisinfo *Info, bidtype string) (ProjectInfo, map[string]interface{}) {
-	pkg := map[string]interface{}{}
-	if thisinfo.HasPackage {
-		pkg = PackageFormat(thisinfo, nil)
-	}
+func (p *ProjectTask) NewCachePinfo(id primitive.ObjectID, thisinfo *Info, bidtype string, pkg map[string]interface{}) ProjectInfo {
 	p1 := ProjectInfo{
 		Id:            id,
 		Ids:           []string{thisinfo.Id},
@@ -649,7 +645,7 @@ func (p *ProjectTask) NewCachePinfo(id primitive.ObjectID, thisinfo *Info, bidty
 	if thisinfo.LenPTC > 5 {
 		p1.MPC = append(p1.MPC, thisinfo.PTC)
 	}
-	return p1, pkg
+	return p1
 }
 
 //更新项目
@@ -683,15 +679,16 @@ func (p *ProjectTask) UpdateProject(tmp map[string]interface{}, thisinfo *Info,
 	bt := qu.ObjToString(tmp["toptype"])
 	bs := qu.ObjToString(tmp["subtype"])
 	p.mapBidLock.Lock()
+	if bidtype[bs] != "" {
+		set["bidtype"] = bidtype[bs]
+	}
 	if bt == "招标" {
 		//招标状态,更新projectscope
 		if tmp["projectscope"] != nil {
 			set["projectscope"] = qu.ObjToString(tmp["projectscope"])
 		}
-		set["bidtype"] = bidtype[bs]
 		set["bidstatus"] = bs
 	}else {
-		set["bidtype"] = bt
 		if bidstatus[bs] != "" {
 			set["bidstatus"] = thisinfo.SubType
 		} else if tmp["infoformat"] == 2 {
@@ -807,6 +804,10 @@ func (p *ProjectTask) UpdateProject(tmp map[string]interface{}, thisinfo *Info,
 	}
 	//winner
 	if len(thisinfo.Winners) > 0 {
+		if len(pInfo.Winners) <= 0 {
+			set["winner"] = tmp["winner"].(string)
+		}
+
 		sort.Strings(pInfo.Winners)
 		for _, k := range thisinfo.Winners {
 			if BinarySearch(pInfo.Winners, k) == -1 {
@@ -944,7 +945,7 @@ func ComparePlace(project *ProjectInfo, info *Info) bool {
 var PackageEle = []string{
 	"origin",
 	"name",
-	"text",
+	//"text",
 	"budget",
 	"winner",
 	"bidamount",
@@ -969,11 +970,8 @@ func packageEle(map1 map[string]interface{}, id string) map[string]interface{} {
 
 func PackageFormat(info *Info, project *ProjectInfo) map[string]interface{} {
 	p1 := map[string]interface{}{}
-	packageCopy := make(map[string]interface{})
-	if project != nil && project.Package != nil {
-		packageCopy = Copy(project.Package).(map[string]interface{})
-	}
-	if packageCopy != nil && len(packageCopy) > 0 {
+	if project != nil && project.Package != nil && len(project.Package) > 0 {
+		packageCopy := Copy(project.Package).(map[string]interface{})
 		p1 = packageCopy
 		for k, v := range info.Package {
 			if v1, ok := v.(map[string]interface{}); ok {
@@ -985,7 +983,8 @@ func PackageFormat(info *Info, project *ProjectInfo) map[string]interface{} {
 				addFlag := false
 				for k1, v3 := range p1 {
 					if v4, ok := v3.([]map[string]interface{}); ok {
-						if qu.ObjToString(v4[0]["origin"]) == qu.ObjToString(v2["origin"]) && qu.ObjToString(v4[0]["name"]) == qu.ObjToString(v2["name"]) {
+						//if qu.ObjToString(v4[0]["origin"]) == qu.ObjToString(v2["origin"]) && qu.ObjToString(v4[0]["name"]) == qu.ObjToString(v2["name"]) {
+						if k1 == k {
 							v4 = append(v4, v2)
 							p1[k1] = v4
 							addFlag = true
@@ -998,8 +997,9 @@ func PackageFormat(info *Info, project *ProjectInfo) map[string]interface{} {
 				}
 			}
 		}
+		p1 = packageCopy
 	} else {
-		for k, v := range packageCopy {
+		for k, v := range info.Package {
 			v1, _ := v.(map[string]interface{})
 			p2 := map[string]interface{}{}
 			p2 = packageEle(v1, info.Id)
@@ -1009,7 +1009,6 @@ func PackageFormat(info *Info, project *ProjectInfo) map[string]interface{} {
 			p1[k] = []map[string]interface{}{p2}
 		}
 	}
-	p1 = packageCopy
 	return p1
 }
 
@@ -1027,7 +1026,12 @@ func CountAmount(project *ProjectInfo, info *Info) {
 				}
 			}
 		}
-		project.Budget = budget
+		if budget == 0 && info.Budget > 0 {
+			budget = info.Budget
+		}
+		if budget > 0 {
+			project.Budget = budget
+		}
 	}else {
 		//招标没有多包
 		k := KeyPackage.FindStringSubmatch(project.ProjectName)

+ 1 - 2
fullproject/src_v1/task.go

@@ -83,7 +83,7 @@ func NewPT() *ProjectTask {
 		mapPc:     make(map[string]*Key, 5000000),
 		mapHref:   make(map[string]string, 1500000),
 		mapSite:   make(map[string]*Site, 1000000),
-		saveSize:  400,
+		saveSize:  100,
 
 		//saveSign:   make(chan bool, 1),
 		//updateSign: make(chan bool, 1),
@@ -377,7 +377,6 @@ func (p *ProjectTask) enter(db, coll string, q map[string]interface{}) {
 	log.Println("start project", q)
 	sess := MongoTool.GetMgoConn()
 	defer MongoTool.DestoryMongoConn(sess)
-
 	infoPool := make(chan map[string]interface{}, 2000)
 	over := make(chan bool)
 	go func() {

+ 2 - 1
src/jy/clear/tonumber.go

@@ -79,6 +79,7 @@ func ObjToFloat(data []interface{}) []interface{} {
 //金额转换
 func ObjToMoney(data []interface{}) []interface{} {
 	isfindUnit := true
+	tmpstr :=(data)[0]
 	ret := capitalMoney(data)[0]
 	if ret.(float64) < float64(10000) || ret.(float64) > float64(50000000000) {
 		ret2, b := numMoney(data)
@@ -98,7 +99,7 @@ func ObjToMoney(data []interface{}) []interface{} {
 			f = f * 10000
 		}
 	}
-	if f == 0 && !moneyUnitRegBool.MatchString(fmt.Sprint(data[0])) {
+	if f == 0 && !moneyUnitRegBool.MatchString(fmt.Sprint(tmpstr)) {
 		data = append(data, false)
 		return data
 	}

+ 53 - 30
src/jy/extract/extract.go

@@ -795,7 +795,10 @@ func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju
 				j.Result[k] = [](*ju.ExtField){}
 			}
 			for _, tmp := range v {
-				field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
+				field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]),ExtFrom:qu.ObjToString(tmp["extfrom"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
+				if k == "bidamount" && field.ExtFrom=="第一候选人"{
+					field.Score = 1
+				}
 				if isSite {
 					field.Score = 1
 				}
@@ -1036,12 +1039,15 @@ func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]ma
 	if len(j.Winnerorder) > 1 {
 		if vc.Field == "bidamount" {
 			for _, v := range j.Winnerorder {
+				if v["price"] == nil{
+					continue
+				}
 				kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
 					"code":        "winnerorder",
 					"field":       vc.Field,
 					"ruletext":    "中标候选人",
-					"extfrom":     vc.ExtFrom,
-					"sourcevalue": "中标候选人",
+					"extfrom":     v["sortstr"],
+					"sourcevalue": v["price"],
 					"value":       v["price"],
 					"type":        "winnerorder",
 					"matchtype":   "winnerorder",
@@ -1053,8 +1059,8 @@ func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]ma
 					"code":        "CL_中标候选人",
 					"field":       vc.Field,
 					"ruletext":    "中标候选人",
-					"extfrom":     vc.ExtFrom,
-					"sourcevalue": "中标候选人",
+					"extfrom":      j.Winnerorder[0]["sortstr"],
+					"sourcevalue": price,
 					"value":       price,
 					"type":        "winnerorder",
 					"matchtype":   "winnerorder",
@@ -2023,34 +2029,51 @@ func resetWinnerorder(j *ju.Job) {
 	}
 	maxlen := len(j.Winnerorder) - 1
 	//中标单位
-	i := 0
+	//i := 0
 	winners := []*ju.ExtField{}
-	for _, v := range j.Result["winner"] {
-		if v.Code == "winnerorder" {
-			if maxlen < i {
-				continue
-			}
-			j.Winnerorder[i]["entname"] = v.Value
-			i++
-		} else {
-			winners = append(winners, v)
-		}
-	}
-	j.Result["winner"] = winners
-	//中标金额
-	i = 0
 	bidamounts := []*ju.ExtField{}
-	for _, v := range j.Result["bidamount"] {
-		if v.Code == "winnerorder" {
-			if maxlen < i {
-				continue
-			}
-			j.Winnerorder[i]["price"] = v.Value
-			i++
-		} else {
-			bidamounts = append(bidamounts, v)
+	//for _, v := range j.Result["winner"] {
+	//	if v.Code == "winnerorder" {
+	//		if maxlen < i {
+	//			continue
+	//		}
+	//		j.Winnerorder[i]["entname"] = v.Value
+	//		i++
+	//	} else {
+	//		winners = append(winners, v)
+	//	}
+	//}
+	if maxlen > 0 {
+		winners = append(winners,&ju.ExtField{Code:"winnerorder",Field:"winner",ExtFrom:"j.Winnerorder",Value:j.Winnerorder[0]["entname"],Score:0.5} )
+		if j.Winnerorder[0]["price"]!= nil{
+			bidamounts = append(bidamounts,&ju.ExtField{Code:"winnerorder",Field:"bidamount",ExtFrom:"j.Winnerorder",Value:j.Winnerorder[0]["price"],Score:0.5} )
 		}
 	}
-	j.Result["bidamount"] = bidamounts
+	if j.Result["winner"] == nil{
+		j.Result["winner"] = winners
+	}else {
+		j.Result["winner"] = append(j.Result["winner"],winners... )
+	}
+	if j.Result["bidamount"]==nil{
+		j.Result["bidamount"] = bidamounts
+	}else {
+		j.Result["bidamount"] =  append(j.Result["bidamount"],bidamounts... )
+	}
+	//j.Result["winner"] = winners
+	//中标金额
+	//i = 0
+	//bidamounts := []*ju.ExtField{}
+	//for _, v := range j.Result["bidamount"] {
+	//	if v.Code == "winnerorder" {
+	//		if maxlen < i {
+	//			continue
+	//		}
+	//		j.Winnerorder[i]["price"] = v.Value
+	//		i++
+	//	} else {
+	//		bidamounts = append(bidamounts, v)
+	//	}
+	//}
+	//j.Result["bidamount"] = bidamounts
 
 }

+ 8 - 0
src/jy/extract/score.go

@@ -104,6 +104,7 @@ func init() {
 	}
 }
 
+var CNreg = regexp.MustCompile("[\u4e00-\u9fa5]")
 //结果打分
 func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 	qu.Catch()
@@ -112,6 +113,13 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 		if field == "projectcode" {
 			tmps = projectWeightClear(tmps)
 		}
+		if field == "budget" || field == "bidamount" {
+			for tmpsindex, tmpsvalue := range tmps {
+				tmps[tmpsindex].Score = -10
+				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: field+`value结果含中文直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
+				continue
+			}
+		}
 		locktag.Lock()
 		taglength := len(ftag[field])
 		locktag.Unlock()

+ 3 - 0
src/jy/extract/score_jsondata.go

@@ -176,6 +176,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 									score := util2.Float64All(ranges[2])
 									if valueLen > gt && valueLen <= lte {
 										v[i].Score += score
+										v[i].ScoreItem = append(v[i].ScoreItem, &util.ScoreItem{Des: "JsonData长度打分", Code: fmt.Sprint(gt, "<", valueLen, "<=", lte), ScoreFrom: "fieldscore.json.length", Value: tmpsvalue.Value, Score: score})
 										break
 									}
 								}
@@ -191,6 +192,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 										reg := p["regexp"].(*regexp.Regexp)
 										if reg.MatchString(util2.ObjToString(tmpsvalue.Value)) {
 											v[i].Score += util2.Float64All(p["score"])
+											v[i].ScoreItem = append(v[i].ScoreItem, &util.ScoreItem{Des: "JsonData负面词打分" + fmt.Sprint(p["describe"]), Code: "negativewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.negativewords", Value: tmpsvalue.Value, Score: util2.Float64All(p["score"])})
 										}
 									}
 								}, func(err interface{}) {
@@ -208,6 +210,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 										reg := p["regexp"].(*regexp.Regexp)
 										if reg.MatchString(util2.ObjToString(tmpsvalue.Value)) {
 											v[i].Score += util2.Float64All(p["score"])
+											v[i].ScoreItem = append(v[i].ScoreItem, &util.ScoreItem{Des: "Jsondata正面词打分" + fmt.Sprint(p["describe"]), Code: "positivewords", RuleText: reg.String(), ScoreFrom: "fieldscore.json.positivewords", Value: tmpsvalue.Value, Score: util2.Float64All(p["score"])})
 										}
 									}
 								}, func(err interface{}) {

+ 14 - 6
src/jy/pretreated/spacekv.go

@@ -16,18 +16,18 @@ var (
 	excludeSpaceKey = regexp.MustCompile("[.、�\\[【{{〔<《\\]】}}〕>》]")
 )
 
-func (se *SpacekvEntity) Entrance(text, title string, contactFormat *util.ContactFormat,isSite bool,codeSite string) *util.JobKv {
+func (se *SpacekvEntity) Entrance(text, title string, contactFormat *util.ContactFormat, isSite bool, codeSite string) *util.JobKv {
 	lines := se.getLines(text)
 	kvMaps := []*util.Kv{}
 	for _, line := range lines {
-		kvMap := se.divideKV(line,isSite,codeSite)
+		kvMap := se.divideKV(line, isSite, codeSite)
 		if kvMap == nil {
 			continue
 		}
 		kvMaps = append(kvMaps, kvMap...)
 	}
-	FormatContactKv(&kvMaps, title, nil, contactFormat,isSite,codeSite)
-	kvTags := GetKvTags(kvMaps, title, nil,isSite,codeSite)
+	FormatContactKv(&kvMaps, title, nil, contactFormat, isSite, codeSite)
+	kvTags := GetKvTags(kvMaps, title, nil, isSite, codeSite)
 	return &util.JobKv{
 		Kvs:    kvMaps,
 		KvTags: kvTags,
@@ -35,7 +35,7 @@ func (se *SpacekvEntity) Entrance(text, title string, contactFormat *util.Contac
 }
 
 //空格分kv
-func (se *SpacekvEntity) divideKV(line string,isSite bool,codeSite string) []*util.Kv {
+func (se *SpacekvEntity) divideKV(line string, isSite bool, codeSite string) []*util.Kv {
 	line = strings.TrimSpace(line)
 	line = regReplAllSpace.ReplaceAllString(line, " ")
 	line = TimeHM.ReplaceAllString(line, "D$1H$2M")
@@ -55,8 +55,16 @@ func (se *SpacekvEntity) divideKV(line string,isSite bool,codeSite string) []*ut
 		if excludeSpaceKey.MatchString(k) {
 			continue
 		}
+		ktags := util.GetTags(k, isSite, codeSite)
 		//value为key值跳过
-		if util.GetTags(v,isSite,codeSite).Len() > 0 && util.GetTags(k,isSite,codeSite).Len() > 0{
+		if util.GetTags(v, isSite, codeSite).Len() > 0 && ktags.Len() > 0 {
+			continue
+		}
+		if ktags.Len() > 0 && (ktags[0].Value == "中标金额" || ktags[0].Value == "预算") && strings.Contains(kv[i], "万") {
+			kvs = append(kvs, &util.Kv{Key: k, Value: v + "万"})
+			continue
+		} else if ktags.Len() > 0 && (ktags[0].Value == "中标金额" || ktags[0].Value == "预算") && strings.Contains(kv[i], "亿") {
+			kvs = append(kvs, &util.Kv{Key: k, Value: v + "亿"})
 			continue
 		}
 		kvs = append(kvs, &util.Kv{Key: k, Value: v})

+ 7 - 0
src/res/fieldscore.json

@@ -40,6 +40,13 @@
                 "space": 3,
                 "regexp": 2,
                 "kvweight": 5
+            },
+            "bidamount": {
+                "table": 3,
+                "colon": 3,
+                "space": 2,
+                "regexp": 2,
+                "kvweight": 1
             }
         }
     },

+ 193 - 203
udpfilterdup/src/main.go

@@ -89,7 +89,7 @@ func init() {
 		}
 		SiteMap[util.ObjToString(site_dict["site"])] = data_map
 	}
-	fmt.Printf("站点加载用时:%d秒,%d个\n", int(time.Now().Unix())-start, len(SiteMap))
+	log.Printf("站点加载用时:%d秒,%d个\n", int(time.Now().Unix())-start, len(SiteMap))
 }
 
 func main() {
@@ -223,84 +223,110 @@ func task(data []byte, mapInfo map[string]interface{}) {
 				b, source, reason := DM.check(info)
 				if b { //有重复,生成更新语句,更新抽取和更新招标
 					repeateN++
+					var is_replace  = false
 					var mergeArr = []int64{} //更改合并数组记录
 					var newData = &Info{}    //更换新的数据池数据
-					var id_map = map[string]interface{}{}
-					repeat_id := source.id
-					if idtype == "1" {
-						id_map["_id"] = info.id
+					var repeat_idMap = map[string]interface{}{} //记录判重的
+					var merge_idMap = map[string]interface{}{} //记录合并的
+					if idtype == "1" { //先临时决定一个id
+						repeat_idMap["_id"] = info.id
+						merge_idMap["_id"] = source.id
 					} else {
-						id_map["_id"] = util.StringTOBsonId(info.id)
+						repeat_idMap["_id"] = util.StringTOBsonId(info.id)
+						merge_idMap["_id"] = util.StringTOBsonId(source.id)
 					}
+					repeat_id:=source.id
+					//以下合并相关
 					if isMerger {
-						//需要合并相关操作-合并操作--评功权重打分-合并完替换原始数据池
 						basic_bool := basicDataScore(source, info)
 						if basic_bool {
-							//已原始数据为标准-对比数据打判重标签
-							newData, mergeArr = mergeDataFields(source, info)
+							//已原始数据为标准 - 对比数据打判重标签-
+							newData, mergeArr,is_replace = mergeDataFields(source, info)
 							DM.replaceSourceData(newData, source.id) //替换
+							//对比数据打重复标签的id,原始数据id的记录
 							if idtype == "1" {
-								id_map["_id"] = info.id
+								repeat_idMap["_id"] = info.id
+								merge_idMap["_id"] = source.id
 							} else {
-								id_map["_id"] = util.StringTOBsonId(info.id)
+								repeat_idMap["_id"] = util.StringTOBsonId(info.id)
+								merge_idMap["_id"] = util.StringTOBsonId(source.id)
 							}
-
 							repeat_id = source.id
 						} else {
 							//已对比数据为标准 ,数据池的数据打判重标签
-							newData, mergeArr = mergeDataFields(info, source)
+							newData, mergeArr,is_replace = mergeDataFields(info, source)
 							DM.replaceSourceData(newData, source.id) //替换
+
+							//原始数据打重复标签的id,   对比数据id的记录
 							if idtype == "1" {
-								id_map["_id"] = source.id
+								repeat_idMap["_id"] = source.id
+								merge_idMap["_id"] = info.id
 							} else {
-								id_map["_id"] = util.StringTOBsonId(source.id)
+								repeat_idMap["_id"] = util.StringTOBsonId(source.id)
+								merge_idMap["_id"] = util.StringTOBsonId(info.id)
 							}
-
 							repeat_id = info.id
 						}
-					}
 
-					var update_map = map[string]interface{}{
-						"$set": map[string]interface{}{
-							"repeat_reason": reason,
-							"repeat":        1,
-							"repeatid":      repeat_id,
-						},
-					}
-					if isMerger {
-						if len(newData.mergemap) > 0 {
-							update_map["$set"].(map[string]interface{})["merge"] = newData.mergemap
-						}
-						//更新合并后的数据
-						for _, value := range mergeArr {
-							if value == 1 {
-								update_map["$set"].(map[string]interface{})["area"] = newData.area
-								update_map["$set"].(map[string]interface{})["city"] = newData.city
-							} else if value == 2 {
-								update_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
-							} else if value == 3 {
-								update_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
-							} else if value == 4 {
-								update_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
-							} else if value == 5 {
-								update_map["$set"].(map[string]interface{})["budget"] = newData.budget
-							} else if value == 6 {
-								update_map["$set"].(map[string]interface{})["winner"] = newData.winner
-							} else if value == 7 {
-								update_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
-							} else if value == 8 {
-								update_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-							} else if value == 9 {
-								update_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
-							}else {
+
+						merge_map := make(map[string]interface{},0)
+						if is_replace {//有过合并-更新数据
+
+							merge_map = map[string]interface{}{
+								"$set": map[string]interface{}{
+									"merge":newData.mergemap,
+								},
+							}
+
+							//更新合并后的数据
+							for _, value := range mergeArr {
+								if value == 1 {
+									merge_map["$set"].(map[string]interface{})["area"] = newData.area
+									merge_map["$set"].(map[string]interface{})["city"] = newData.city
+								} else if value == 2 {
+									merge_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
+								} else if value == 3 {
+									merge_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
+								} else if value == 4 {
+									merge_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
+								} else if value == 5 {
+									merge_map["$set"].(map[string]interface{})["budget"] = newData.budget
+								} else if value == 6 {
+									merge_map["$set"].(map[string]interface{})["winner"] = newData.winner
+								} else if value == 7 {
+									merge_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
+								} else if value == 8 {
+									merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
+								} else if value == 9 {
+									merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
+								}else {
+								}
+
+								if value==0 {
+
+								}
 							}
+							//模板数据更新
+							updateExtract = append(updateExtract, []map[string]interface{}{
+								merge_idMap,
+								merge_map,
+							})
 						}
 					}
-					//构建数据库更新用到的
+
+
+					//重复数据打标签
 					updateExtract = append(updateExtract, []map[string]interface{}{
-						id_map,
-						update_map,
+						repeat_idMap,
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"repeat": 1,
+								"repeat_reason": reason,
+								"repeat_id":repeat_id,
+							},
+						},
 					})
+
 				}
 			}
 		}(tmp)
@@ -434,7 +460,6 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 			}()
 			info := NewInfo(tmp)
 			if invalidData(info.buyer, info.projectname, info.projectcode,info.contractnumber) {
-				//mapLock.Lock()
 				updateExtract = append(updateExtract, []map[string]interface{}{
 					map[string]interface{}{
 						"_id": tmp["_id"],
@@ -449,7 +474,6 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 					mgo.UpdateBulk(extract, updateExtract...)
 					updateExtract = [][]map[string]interface{}{}
 				}
-				//mapLock.Unlock()
 			} else {
 				b, source, reason := HM.checkHistory(info)
 				if b { //有重复,生成更新语句,更新抽取和更新招标
@@ -470,86 +494,110 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 						})
 					} else {
 						repeateN++
+						var is_replace  = false
 						var mergeArr = []int64{} //更改合并数组记录
 						var newData = &Info{}    //更换新的数据池数据
-						var id_map = map[string]interface{}{}
-						repeat_id := source.id
-						if idtype == "1" {
-							id_map["_id"] = info.id
+						var repeat_idMap = map[string]interface{}{} //记录判重的
+						var merge_idMap = map[string]interface{}{} //记录合并的
+						if idtype == "1" { //先临时决定一个id
+							repeat_idMap["_id"] = info.id
+							merge_idMap["_id"] = source.id
 						} else {
-							id_map["_id"] = util.StringTOBsonId(info.id)
+							repeat_idMap["_id"] = util.StringTOBsonId(info.id)
+							merge_idMap["_id"] = util.StringTOBsonId(source.id)
 						}
+						repeat_id:=source.id
+						//以下合并相关
 						if isMerger {
-							//需要合并相关操作-合并操作--评功权重打分-合并完替换原始数据池
 							basic_bool := basicDataScore(source, info)
 							if basic_bool {
-								//已原始数据为标准-对比数据打判重标签
-								newData, mergeArr = mergeDataFields(source, info)
+								//已原始数据为标准 - 对比数据打判重标签-
+								newData, mergeArr,is_replace = mergeDataFields(source, info)
 								DM.replaceSourceData(newData, source.id) //替换
+								//对比数据打重复标签的id,原始数据id的记录
 								if idtype == "1" {
-									id_map["_id"] = info.id
+									repeat_idMap["_id"] = info.id
+									merge_idMap["_id"] = source.id
 								} else {
-									id_map["_id"] = util.StringTOBsonId(info.id)
+									repeat_idMap["_id"] = util.StringTOBsonId(info.id)
+									merge_idMap["_id"] = util.StringTOBsonId(source.id)
 								}
-
 								repeat_id = source.id
 							} else {
 								//已对比数据为标准 ,数据池的数据打判重标签
-								newData, mergeArr = mergeDataFields(info, source)
+								newData, mergeArr,is_replace = mergeDataFields(info, source)
 								DM.replaceSourceData(newData, source.id) //替换
+
+								//原始数据打重复标签的id,   对比数据id的记录
 								if idtype == "1" {
-									id_map["_id"] = source.id
+									repeat_idMap["_id"] = source.id
+									merge_idMap["_id"] = info.id
 								} else {
-									id_map["_id"] = util.StringTOBsonId(source.id)
+									repeat_idMap["_id"] = util.StringTOBsonId(source.id)
+									merge_idMap["_id"] = util.StringTOBsonId(info.id)
 								}
-
 								repeat_id = info.id
 							}
-						}
 
-						var update_map = map[string]interface{}{
-							"$set": map[string]interface{}{
-								"repeat_reason": reason,
-								"repeat":        1,
-								"repeatid":      repeat_id,
-							},
-						}
-						if isMerger {
-							//合并记录
-							if len(newData.mergemap) > 0 {
-								update_map["$set"].(map[string]interface{})["merge"] = newData.mergemap
-							}
-							//更新合并后的数据
-							for _, value := range mergeArr {
-								if value == 1 {
-									update_map["$set"].(map[string]interface{})["area"] = newData.area
-									update_map["$set"].(map[string]interface{})["city"] = newData.city
-								} else if value == 2 {
-									update_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
-								} else if value == 3 {
-									update_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
-								} else if value == 4 {
-									update_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
-								} else if value == 5 {
-									update_map["$set"].(map[string]interface{})["budget"] = newData.budget
-								} else if value == 6 {
-									update_map["$set"].(map[string]interface{})["winner"] = newData.winner
-								} else if value == 7 {
-									update_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
-								} else if value == 8 {
-									update_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-								} else if value == 9 {
-									update_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
-								}else {
 
+							merge_map := make(map[string]interface{},0)
+							if is_replace {//有过合并-更新数据
+
+								merge_map = map[string]interface{}{
+									"$set": map[string]interface{}{
+										"merge":newData.mergemap,
+									},
+								}
+
+								//更新合并后的数据
+								for _, value := range mergeArr {
+									if value == 1 {
+										merge_map["$set"].(map[string]interface{})["area"] = newData.area
+										merge_map["$set"].(map[string]interface{})["city"] = newData.city
+									} else if value == 2 {
+										merge_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
+									} else if value == 3 {
+										merge_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
+									} else if value == 4 {
+										merge_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
+									} else if value == 5 {
+										merge_map["$set"].(map[string]interface{})["budget"] = newData.budget
+									} else if value == 6 {
+										merge_map["$set"].(map[string]interface{})["winner"] = newData.winner
+									} else if value == 7 {
+										merge_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
+									} else if value == 8 {
+										merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
+									} else if value == 9 {
+										merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
+									}else {
+									}
+
+									if value==0 {
+
+									}
 								}
+								//模板数据更新
+								updateExtract = append(updateExtract, []map[string]interface{}{
+									merge_idMap,
+									merge_map,
+								})
 							}
 						}
-						//构建数据库更新用到的
+
+
+						//重复数据打标签
 						updateExtract = append(updateExtract, []map[string]interface{}{
-							id_map,
-							update_map,
+							repeat_idMap,
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"repeat": 1,
+									"repeat_reason": reason,
+									"repeat_id":repeat_id,
+								},
+							},
 						})
+
 					}
 				}
 			}
@@ -590,151 +638,93 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	}
 }
 
-//合并字段
-func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
+//mergeDataFields fills the unset fields of source (area/city when area is "" or "全国", projectname, projectcode, buyer, budget, winner, bidamount, bidopentime, contractnumber) from info, records the copied values in source.mergemap under the key info.id, and returns source, the numeric codes (1-9) of the fields it filled, and whether anything was filled.
+func mergeDataFields(source *Info, info *Info) (*Info, []int64,bool) {
 
-	var mergeArr []int64
-	mergeArr = make([]int64, 0)
+	//values copied from info during this call, keyed by field name
+	merge_recordMap := make(map[string]interface{},0)
+	mergeArr := make([]int64, 0)
+	//set to true as soon as any field of source has been filled from info
+	is_replace :=false
 	//1、城市
 	if (source.area == "" || source.area == "全国") && info.area != "全国" && info.area != "" {
-		var arrA []string
-		if source.mergemap["area"] == nil {
-			arrA = make([]string, 0)
-		} else {
-			arrA = source.mergemap["area"].([]string)
-		}
-		arrA = append(arrA, source.area)
-		source.mergemap["area"] = arrA
-
-		var arrC []string
-		if source.mergemap["city"] == nil {
-			arrC = make([]string, 0)
-		} else {
-			arrC = source.mergemap["city"].([]string)
-		}
-		arrC = append(arrC, source.city)
-		source.mergemap["city"] = arrC
-
+		merge_recordMap["area"] = info.area
+		merge_recordMap["city"] = info.city
 		source.area = info.area
 		source.city = info.city
 		mergeArr = append(mergeArr, 1)
+		is_replace = true
 	}
 	//2、项目名称
 	if source.projectname == "" && info.projectname != "" {
-		var arr []string
-		if source.mergemap["projectname"] == nil {
-			arr = make([]string, 0)
-		} else {
-			arr = source.mergemap["projectname"].([]string)
-		}
-		arr = append(arr, source.projectname)
-		source.mergemap["projectname"] = arr
-
+		merge_recordMap["projectname"] = info.projectname
 		source.projectname = info.projectname
 		mergeArr = append(mergeArr, 2)
+		is_replace = true
 	}
 	//3、项目编号
 	if source.projectcode == "" && info.projectcode != "" {
-		var arr []string
-		if source.mergemap["projectcode"] == nil {
-			arr = make([]string, 0)
-		} else {
-			arr = source.mergemap["projectcode"].([]string)
-		}
-		arr = append(arr, source.projectcode)
-		source.mergemap["projectcode"] = arr
-
+		merge_recordMap["projectcode"] = info.projectcode
 		source.projectcode = info.projectcode
 		mergeArr = append(mergeArr, 3)
+		is_replace = true
 	}
 	//4、采购单位
 	if source.buyer == "" && info.buyer != "" {
-		var arr []string
-		if source.mergemap["buyer"] == nil {
-			arr = make([]string, 0)
-		} else {
-			arr = source.mergemap["buyer"].([]string)
-		}
-		arr = append(arr, source.buyer)
-		source.mergemap["buyer"] = arr
-
+		merge_recordMap["buyer"] = info.buyer
 		source.buyer = info.buyer
 		mergeArr = append(mergeArr, 4)
+		is_replace = true
 	}
 	//5、预算
 	if source.budget == 0 && info.budget != 0 {
-		var arr []float64
-		if source.mergemap["budget"] == nil {
-			arr = make([]float64, 0)
-		} else {
-			arr = source.mergemap["budget"].([]float64)
-		}
-		arr = append(arr, source.budget)
-		source.mergemap["budget"] = arr
-
+		merge_recordMap["budget"] = info.budget
 		source.budget = info.budget
 		mergeArr = append(mergeArr, 5)
+		is_replace = true
 	}
 	//6、中标单位
 	if source.winner == "" && info.winner != "" {
-		var arr []string
-		if source.mergemap["winner"] == nil {
-			arr = make([]string, 0)
-		} else {
-			arr = source.mergemap["winner"].([]string)
-		}
-		arr = append(arr, source.winner)
-		source.mergemap["winner"] = arr
-
+		merge_recordMap["winner"] = info.winner
 		source.winner = info.winner
 		mergeArr = append(mergeArr, 6)
+		is_replace = true
 	}
 	//7、中标金额
 	if source.bidamount == 0 && info.bidamount != 0 {
-		var arr []float64
-		if source.mergemap["bidamount"] == nil {
-			arr = make([]float64, 0)
-		} else {
-			arr = source.mergemap["bidamount"].([]float64)
-		}
-		arr = append(arr, source.bidamount)
-		source.mergemap["bidamount"] = arr
-
+		merge_recordMap["bidamount"] = info.bidamount
 		source.bidamount = info.bidamount
 		mergeArr = append(mergeArr, 7)
+		is_replace = true
 	}
 	//8、开标时间-地点
 	if source.bidopentime == 0 && info.bidopentime != 0 {
-		var arr []int64
-		if source.mergemap["bidopentime"] == nil {
-			arr = make([]int64, 0)
-		} else {
-			arr = source.mergemap["bidopentime"].([]int64)
-		}
-		arr = append(arr, source.bidopentime)
-		source.mergemap["bidopentime"] = arr
-
+		merge_recordMap["bidopentime"] = info.bidopentime
 		source.bidopentime = info.bidopentime
 		mergeArr = append(mergeArr, 8)
+		is_replace = true
 	}
 
 	//9、合同编号
 	if source.contractnumber == "" && info.contractnumber != "" {
-		var arr []string
-		if source.mergemap["contractnumber"] == nil {
-			arr = make([]string, 0)
-		} else {
-			arr = source.mergemap["contractnumber"].([]string)
-		}
-		arr = append(arr, source.contractnumber)
-		source.mergemap["contractnumber"] = arr
-
+		merge_recordMap["contractnumber"] = info.contractnumber
 		source.contractnumber = info.contractnumber
 		mergeArr = append(mergeArr, 9)
+		is_replace = true
 	}
 
+	if is_replace {//at least one field was filled from info
+		//bump the running merge count and stamp it on this record as "num" (NOTE(review): assumes source.mergemap is already non-nil, since writing to a nil map panics — confirm at the call site)
+		source.mergemap["total_num"] = util.Int64All(source.mergemap["total_num"])+1
+		merge_recordMap["num"] = util.Int64All(source.mergemap["total_num"])
+		//store this call's record under the id of the document the values were taken from
+		key:=info.id
+		source.mergemap[key] = merge_recordMap
+	}
+
+
 	//以上合并过于简单,待进一步优化
-	return source, mergeArr
+	return source, mergeArr,is_replace
 }
 
 //权重评估

+ 1 - 1
udps/main.go

@@ -24,7 +24,7 @@ func main() {
 	//2018-06-01,2019-02-20
 	/*
 ObjectId("5da3f31aa5cb26b9b798d3aa")
-ObjectId("5da422fba5cb26b9b706984b")
+ObjectId("5da418c4a5cb26b9b7e3e9a6")
 */
 
 	flag.StringVar(&sid, "sid", "", "开始id")