Browse Source

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

* 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract:
  中标预测抽取
  中标预测抽取
  目前不自动删除qyxy_tmp表
  xg
  新增winnerorder字段
Jianghan 4 năm trước cách đây
mục cha
commit
80baa0b8a7

+ 10 - 9
qyxy/src/task.go

@@ -65,15 +65,16 @@ func StartTask() {
 	// 		"$gt": Updatetime,
 	// 	},
 	// }
-	run := QyxyStandard()
-	if run {
-		time.Sleep(5 * time.Minute)
-		if Mgo.DelColl(Dbcoll) {
-			log.Println("Delete Coll ", Dbcoll, "Success")
-		} else {
-			log.Println("Delete Coll ", Dbcoll, "Fail")
-		}
-	}
+	QyxyStandard()
+	// run := QyxyStandard()
+	// if run {
+	// 	time.Sleep(5 * time.Minute)
+	// 	if Mgo.DelColl(Dbcoll) {
+	// 		log.Println("Delete Coll ", Dbcoll, "Success")
+	// 	} else {
+	// 		log.Println("Delete Coll ", Dbcoll, "Fail")
+	// 	}
+	// }
 }
 
 //标准化数据,生索引

+ 3 - 2
src/config.json

@@ -21,15 +21,16 @@
     "elasticPoolSize": 1,
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
+ 	"ffield": true,
     "saveresult": false,
     "fieldsfind": false,
     "qualityaudit": false,
     "saveblock": false,
-    "filelength": 100000,
+    "filelength": 500000,
     "iscltlog": false,
     "brandgoods": false,
     "pricenumber":true,
-    "udptaskid": "5cdd3025698414032c8322b1",
+    "udptaskid": "5eda01b0c566ca08409370bb",
     "udpport": "1484",
     "nextNode": [
         {

+ 7 - 6
src/jy/extract/extract.go

@@ -214,9 +214,9 @@ func RunExtractTask(taskId string) {
 			//if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 			//	continue
 			//}
-			if qu.ObjToString(v["spidercode"]) == "a_gjggzyjypt_gcjs_kbjl"{ //临时
+			if qu.ObjToString(v["spidercode"]) == "a_gjggzyjypt_gcjs_kbjl" { //临时
 				continue
-			}			//根据标题判断是否抽取
+			} //根据标题判断是否抽取
 			b := IsExtract("title", qu.ObjToString(v["title"]), "")
 			if !b {
 				continue
@@ -1817,7 +1817,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if len(j.Winnerorder) > 0 { //候选人信息
 			for i, v := range j.Winnerorder {
 				if v["price"] != nil {
-					j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""},j.SpiderCode)[0]
+					j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode)[0]
 				}
 			}
 			tmp["winnerorder"] = j.Winnerorder
@@ -1959,7 +1959,9 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			tmp["projectname"] = j.Title
 		}
 		tmp["repeat"] = 0
-
+		if ju.Ffield {
+			tmp["ffield"] = ffield
+		}
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				/*	if len(e.SiteFields) <= 0 {
@@ -2040,7 +2042,7 @@ func checkFields(tmp map[string]interface{}) map[string]interface{} {
 	delete(tmp, "detail")
 	if _, ok := tmp["bidamount"].(string); ok {
 		delete(tmp, "bidamount")
-	} else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && fb/100 > qu.Float64All(tmp["budget"]){
+	} else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && fb/100 > qu.Float64All(tmp["budget"]) {
 		delete(tmp, "bidamount")
 	}
 	if _, ok := tmp["budget"].(string); ok {
@@ -2404,4 +2406,3 @@ func RemoveReplicaSliceString(slc []string) []string {
 	}
 	return result
 }
-

+ 3 - 2
src/jy/extract/extractInit.go

@@ -20,8 +20,8 @@ import (
 
 type RegLuaInfo struct {
 	//正则或脚本信息
-	Code, Name, Field string  //
-	Score 			  float64
+	Code, Name, Field string //
+	Score             float64
 	RuleText          string  //
 	IsLua             bool    //
 	RegPreBac         *ExtReg //
@@ -144,6 +144,7 @@ type ExtractTask struct {
 	Seg_SV              *gse.Segmenter //分词
 	Luacodes            *sync.Map      //站点规则
 	SiteMerge           *sync.Map      //抽取合并
+
 }
 
 type SiteCity struct {

+ 74 - 12
src/jy/extract/extractudp.go

@@ -160,13 +160,13 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 				}
 				var j, jf *ju.Job
 				var isSite bool
-				if ext.IsFileField && (v["projectinfo"] != nil ||v["attach_text"] != nil ){
+				if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
 					v["isextFile"] = true
-					j, jf,isSite = ext.PreInfo(v)
+					j, jf, isSite = ext.PreInfo(v)
 				} else {
-					j, _,isSite = ext.PreInfo(v)
+					j, _, isSite = ext.PreInfo(v)
 				}
-				go ext.ExtractProcess(j, jf,isSite)
+				go ext.ExtractProcess(j, jf, isSite)
 				index++
 				ext.TaskInfo.ProcessPool <- true
 			}
@@ -177,13 +177,13 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 				}
 				var j, jf *ju.Job
 				var isSite bool
-				if ext.IsFileField && (v["projectinfo"] != nil ||v["attach_text"] != nil ){
+				if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
 					v["isextFile"] = true
-					j, jf,isSite = ext.PreInfo(v)
+					j, jf, isSite = ext.PreInfo(v)
 				} else {
-					j, _,isSite = ext.PreInfo(v)
+					j, _, isSite = ext.PreInfo(v)
 				}
-				go ext.ExtractProcess(j, jf,isSite)
+				go ext.ExtractProcess(j, jf, isSite)
 				index++
 				ext.TaskInfo.ProcessPool <- true
 			}
@@ -230,18 +230,18 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 				_id := qu.BsonIdToSId(v["_id"])
 				var j, jf *ju.Job
 				var isSite bool
-				if ext.IsFileField && (v["projectinfo"] != nil ||v["attach_text"] != nil ){
+				if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
 					v["isextFile"] = true
-					j, jf,isSite = ext.PreInfo(v)
+					j, jf, isSite = ext.PreInfo(v)
 				} else {
-					j, _,isSite = ext.PreInfo(v)
+					j, _, isSite = ext.PreInfo(v)
 				}
 				ext.TaskInfo.ProcessPool <- true
 				wg.Add(1)
 				go func(wg *sync.WaitGroup, j, jf *ju.Job) {
 					defer wg.Done()
 					//log.Debug(index,j.SourceMid,)
-					ext.ExtractProcess(j, jf,isSite)
+					ext.ExtractProcess(j, jf, isSite)
 				}(&wg, j, jf)
 				index++
 				if index%1000 == 0 {
@@ -258,3 +258,65 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 		log.Debug("抽取完成,", "count:", count, ",index:", index, ",bidtotal:", ext.BidTotal, ",eid:", eid)
 	}
 }
+
+// exF is the package-level extraction task shared by every call to
+// ExtractByBidForecast. It is created lazily on the first call and reused
+// afterwards.
+// NOTE(review): the lazy initialisation is not guarded by a lock or sync.Once;
+// confirm that ExtractByBidForecast is never invoked concurrently.
+var exF *ExtractTask
+
+// ExtractByBidForecast runs the extraction pipeline for a single record
+// (bid-winner forecast extraction).
+//
+// infoid is the id looked up in the task's source collection
+// (exF.TaskInfo.FromColl). ossid was described by the original author as the
+// id produced by attachment recognition; it is accepted but not read anywhere
+// in this function.
+//
+// The function always returns nil. When no record can be loaded for infoid it
+// returns early without running the extraction.
+func ExtractByBidForecast(infoid string, ossid ...string) map[string]interface{} {
+	defer qu.Catch()
+	if exF == nil {
+		// First call: build the shared task from the configured "udptaskid".
+		exF = &ExtractTask{}
+		exF.Id = qu.ObjToString(ju.Config["udptaskid"])
+		exF.InitTaskInfo()
+		exF.TaskInfo.FDB = db.MgoFactory(1, 2, 600, exF.TaskInfo.FromDbAddr, exF.TaskInfo.FromDB)
+		exF.TaskInfo.TDB = db.MgoFactory(1, 2, 600, exF.TaskInfo.ToDbAddr, exF.TaskInfo.ToDB)
+		exF.InitSite()
+		exF.InitRulePres()
+		exF.InitRuleBacks(false)
+		exF.InitRuleBacks(true)
+		exF.InitRuleCore(false)
+		exF.InitRuleCore(true)
+		exF.InitBlockRule()
+		exF.InitPkgCore()
+		exF.InitTag(false)
+		exF.InitTag(true)
+		exF.InitClearFn(false)
+		exF.InitClearFn(true)
+		if exF.IsExtractCity { // city extraction is switched on per version
+			// initialise city DFA information
+			//exF.InitCityDFA()
+			exF.InitCityInfo()
+			exF.InitAreaCode()
+			exF.InitPostCode()
+		}
+		// quality audit
+		exF.InitAuditFields()
+		exF.InitAuditRule()
+		exF.InitAuditClass()
+		exF.InitAuditRecogField()
+
+		// whether brand extraction is enabled
+		ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
+
+		exF.ResultSave(true)
+		exF.BidSave(true)
+		exF.IsRun = true
+		exF.InitFile()
+	}
+	tmp, _ := exF.TaskInfo.FDB.FindById(exF.TaskInfo.FromColl, infoid, nil)
+	if tmp == nil {
+		// Nothing was loaded: dereferencing tmp below would panic, and the
+		// pool slot must not be taken for work that never runs.
+		log.Debug("ExtractByBidForecast: no record found,", "infoid:", infoid)
+		return nil
+	}
+	if exF.IsFileField && ((*tmp)["projectinfo"] != nil || (*tmp)["attach_text"] != nil) {
+		// Flag the record so the attachment/file fields are extracted as well.
+		(*tmp)["isextFile"] = true
+	}
+	// Same ordering as the other callers in this file: a slot is pushed onto
+	// ProcessPool for each ExtractProcess invocation.
+	exF.TaskInfo.ProcessPool <- true
+	j, jf, _ := exF.PreInfo(*tmp)
+	wg := sync.WaitGroup{}
+	wg.Add(1)
+	go func(wg *sync.WaitGroup, j, jf *ju.Job) {
+		defer wg.Done()
+		exF.ExtractProcess(j, jf, false)
+	}(&wg, j, jf)
+	// Block until the single extraction goroutine has finished.
+	wg.Wait()
+	exF.BidSave(false)
+
+	return nil
+}

+ 2 - 2
src/jy/util/util.go

@@ -37,7 +37,7 @@ var GoodsGet *DFA     //商品
 var BrandGet *DFA     //品牌
 var IsBrandGoods bool //是否开启品牌抽取
 
-var SaveResult, FieldsFind, IsSaveTag, SaveBlock, QualityAudit bool
+var SaveResult, FieldsFind, IsSaveTag, SaveBlock, QualityAudit, Ffield bool
 
 func init() {
 	syncint = make(chan bool, 1)
@@ -54,7 +54,7 @@ func UtilInit() {
 	IsSaveTag, _ = Config["iscltlog"].(bool)
 	SaveBlock, _ = Config["saveblock"].(bool)
 	QualityAudit, _ = Config["qualityaudit"].(bool)
-
+	Ffield, _ = Config["ffield"].(bool)
 	PriceNumberReg = make(map[string]*regexp.Regexp)
 	for k, v := range PriceNumberConfig {
 		PriceNumberReg[k] = regexp.MustCompile(v)

+ 22 - 3
udpcreateindex/src/biddingall.go

@@ -77,7 +77,7 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 	arr := [][]map[string]interface{}{}
 	arrEs := []map[string]interface{}{}
 	//对比两张表数据,减少查询次数
-	var compare bson.M
+	var compare map[string]interface{}
 	bnil := false
 	for tmp := make(map[string]interface{}); query.Next(tmp); n++ {
 		// if qutil.IntAll(tmp["dataging"]) == 1 { //dataging=1不生索引
@@ -94,7 +94,7 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 		//对比方法----------------
 		for {
 			if compare == nil {
-				compare = make(bson.M)
+				compare = make(map[string]interface{})
 				if !extractquery.Next(compare) {
 					break
 				}
@@ -304,7 +304,25 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 							if len(purchasinglist_new) > 0 {
 								newTmp[v] = purchasinglist_new
 							}
-
+							/*} else if v == "winnerorder" { //中标候选
+							winnerorder_new := []map[string]interface{}{}
+							if winnerorder, _ := tmp[v].([]interface{}); len(winnerorder) > 0 {
+								for _, win := range winnerorder {
+									winMap_new := make(map[string]interface{})
+									winMap := win.(map[string]interface{})
+									for _, wf := range winnerorderlistFields {
+										if winMap[wf] != nil {
+											winMap_new[wf] = winMap[wf]
+										}
+									}
+									if winMap_new != nil && len(winMap_new) > 0 {
+										winnerorder_new = append(winnerorder_new, winMap_new)
+									}
+								}
+							}
+							if len(winnerorder_new) > 0 {
+								newTmp[v] = winnerorder_new
+							}*/
 						} else {
 							if v == "detail" {
 								detail, _ := tmp[v].(string)
@@ -323,6 +341,7 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 				arrEs = append(arrEs, newTmp)
 			}
 			if len(update) > 0 {
+				//delete(update, "winnerorder") //winnerorder不需要更新到bidding表,删除
 				queryId := map[string]interface{}{"_id": tmp["_id"]}
 				set := map[string]interface{}{"$set": update}
 				if len(del) > 0 { //删除的数据

+ 21 - 0
udpcreateindex/src/biddingindex.go

@@ -353,6 +353,26 @@ func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interfac
 						if len(purchasinglist_new) > 0 {
 							newTmp[v] = purchasinglist_new
 						}
+						/*} else if v == "winnerorder" { //中标候选
+						winnerorder_new := []map[string]interface{}{}
+						if winnerorder, _ := tmp[v].([]interface{}); len(winnerorder) > 0 {
+							for _, win := range winnerorder {
+								winMap_new := make(map[string]interface{})
+								winMap := win.(map[string]interface{})
+								for _, wf := range winnerorderlistFields {
+									if winMap[wf] != nil {
+										winMap_new[wf] = winMap[wf]
+									}
+								}
+								if winMap_new != nil && len(winMap_new) > 0 {
+									winnerorder_new = append(winnerorder_new, winMap_new)
+								}
+							}
+						}
+						if len(winnerorder_new) > 0 {
+							newTmp[v] = winnerorder_new
+						}
+						*/
 					} else {
 						if v == "detail" {
 							detail, _ := tmp[v].(string)
@@ -369,6 +389,7 @@ func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interfac
 			arrEs = append(arrEs, newTmp)
 		}
 		if len(update) > 0 {
+			//delete(update, "winnerorder") //winnerorder不需要更新到bidding表,删除
 			arr = append(arr, []map[string]interface{}{
 				map[string]interface{}{
 					"_id": tmp["_id"],

+ 11 - 10
udpcreateindex/src/config.json

@@ -1,6 +1,11 @@
 {
     "udpport": ":1483",
     "msg_server": "10.171.112.160:7070",
+     "mongodb": {
+        "addr": "192.168.3.207:27092",
+        "pool": 10,
+        "db": "mxs"
+    },
 	"savedb": {
         "addr": "192.168.3.207:27092",
         "size": 10,
@@ -30,17 +35,18 @@
     },
     "bidding": {
         "db": "mxs",
-        "collect": "test",
+        "collect": "bidding",
         "index": "bidding_v1",
         "type": "bidding",
         "extractdb": "mxs",
         "extractcollect": "extract",
         "indexfields":[ 
-        "buyerzipcode","winnertel","winnerperson","contractcode","winneraddr","agencyaddr","buyeraddr","signaturedate","projectperiod","projectaddr","agencytel","agencyperson","buyerperson","agency","projectscope","projectcode","bidopentime","supervisorrate","buyertel","bidamount","winner","buyer","budget","projectname","bidstatus","buyerclass","topscopeclass","s_topscopeclass","s_subscopeclass","area","city","district","s_winner","_id","title","detail","site","comeintime","href","infoformat","publishtime","s_sha","spidercode","subtype","toptype","projectinfo","purchasing","purchasinglist","filetext","channel"
+        "buyerzipcode","winnertel","winnerperson","contractcode","winneraddr","agencyaddr","buyeraddr","signaturedate","projectperiod","projectaddr","agencytel","agencyperson","buyerperson","agency","projectscope","projectcode","bidopentime","supervisorrate","buyertel","bidamount","winner","buyer","budget","projectname","bidstatus","buyerclass","topscopeclass","s_topscopeclass","s_subscopeclass","area","city","district","s_winner","_id","title","detail","site","comeintime","href","infoformat","publishtime","s_sha","spidercode","subtype","toptype","projectinfo","purchasing","purchasinglist","filetext","channel","winnerorder"
         ],
-        "fields": "buyerzipcode,winnertel,winnerperson,contractcode,winneraddr,agencyaddr,buyeraddr,signaturedate,projectperiod,projectaddr,agencytel,agencyperson,buyerperson,agency,projectscope,projectcode,bidopentime,supervisorrate,buyertel,bidamount,winner,buyer,budget,projectname,buyerclass,topscopeclass,s_topscopeclass,area,city,district,s_winner,toptype,subtype,subscopeclass,s_subscopeclass,dataging",
+        "fields": "buyerzipcode,winnertel,winnerperson,contractcode,winneraddr,agencyaddr,buyeraddr,signaturedate,projectperiod,projectaddr,agencytel,agencyperson,buyerperson,agency,projectscope,projectcode,bidopentime,supervisorrate,buyertel,bidamount,winner,buyer,budget,projectname,buyerclass,topscopeclass,s_topscopeclass,area,city,district,s_winner,toptype,subtype,subscopeclass,s_subscopeclass,dataging,winnerorder",
         "projectinfo": "approvecode,approvecontent,approvestatus,approvetime,approvedept,approvenumber,projecttype,approvecity",
-        "purchasinglist":"itemname,model,unitname,number",
+        "purchasinglist": "itemname,model,unitname,number",
+        "winnerorder": "sort,sortstr,entname",
         "multiIndex": ""
     },
     "filelength": 50000,
@@ -88,13 +94,8 @@
        		"type": "agencyent"
 		}
     },
-    "mongodb": {
-        "addr": "192.168.3.207:27092",
-        "pool": 10,
-        "db": "mxs"
-    },
     "elastic": {
-        "addr": "http://192.168.3.11:9800",
+        "addr": "http://192.168.3.128:9800",
         "pool": 12
     }
 }

+ 23 - 16
udpcreateindex/src/main.go

@@ -14,22 +14,23 @@ import (
 )
 
 var (
-	Sysconfig            map[string]interface{} //配置文件
-	mgo                  *mongodb.MongodbSim    //mongodb操作对象
-	extractmgo           *mongodb.MongodbSim    //mongodb操作对象
-	project2db           *mongodb.MongodbSim    //mongodb操作对象
-	mgostandard          *mongodb.MongodbSim    //mongodb操作对象
-	qyxydb               *mongodb.MongodbSim    //mongodb操作对象
-	udpclient            mu.UdpClient           //udp对象
-	updport              string
-	savesizei            = 500
-	biddingIndexFields   = []string{"_id", "buyerclass", "s_winner", "title", "detail", "detail_bak", "area", "areaval", "site", "type", "amount", "bidopendate", "bidopentime", "buyer", "channel", "city", "comeintime", "contenthtml", "descript", "description", "extracttype", "href", "infoformat", "keywords", "projectcode", "projectname", "publishtime", "s_sha", "spidercode", "subtype", "summary", "toptype", "urltop", "winner", "agency", "budget", "bidamount", "s_subscopeclass", "projectscope", "bidstatus"}
-	projectinfoFields    []string
-	multiIndex           []string
-	purchasinglistFields []string
-	BulkSize             = 400
-	detailLength         = 50000
-	fileLength           = 50000
+	Sysconfig             map[string]interface{} //配置文件
+	mgo                   *mongodb.MongodbSim    //mongodb操作对象
+	extractmgo            *mongodb.MongodbSim    //mongodb操作对象
+	project2db            *mongodb.MongodbSim    //mongodb操作对象
+	mgostandard           *mongodb.MongodbSim    //mongodb操作对象
+	qyxydb                *mongodb.MongodbSim    //mongodb操作对象
+	udpclient             mu.UdpClient           //udp对象
+	updport               string
+	savesizei             = 500
+	biddingIndexFields    = []string{"_id", "buyerclass", "s_winner", "title", "detail", "detail_bak", "area", "areaval", "site", "type", "amount", "bidopendate", "bidopentime", "buyer", "channel", "city", "comeintime", "contenthtml", "descript", "description", "extracttype", "href", "infoformat", "keywords", "projectcode", "projectname", "publishtime", "s_sha", "spidercode", "subtype", "summary", "toptype", "urltop", "winner", "agency", "budget", "bidamount", "s_subscopeclass", "projectscope", "bidstatus"}
+	projectinfoFields     []string
+	multiIndex            []string
+	purchasinglistFields  []string
+	winnerorderlistFields []string
+	BulkSize              = 400
+	detailLength          = 50000
+	fileLength            = 50000
 	//bidding_other连接信息
 	bidding_other_es *elastic.Elastic
 	other_index      string
@@ -130,6 +131,12 @@ func init() {
 			purchasinglistFields = strings.Split(pcl, ",")
 		}
 	}
+	// if bidding["winnerorder"] != nil {
+	// 	winnerorder := util.ObjToString(bidding["winnerorder"])
+	// 	if winnerorder != "" {
+	// 		winnerorderlistFields = strings.Split(winnerorder, ",")
+	// 	}
+	// }
 	if bidding["multiIndex"] != nil {
 		mi := util.ObjToString(bidding["multiIndex"])
 		if mi != "" {