瀏覽代碼

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

fengweiqiang 5 年之前
父節點
當前提交
f17a7763f7

+ 35 - 12
fullproject/src_v1/project.go

@@ -515,7 +515,7 @@ func (p *ProjectTask) NewProject(tmp map[string]interface{}, thisinfo *Info) (st
 	if thisinfo.TopType == "招标" {
 		if thisinfo.SubType != "变更" && thisinfo.SubType != "其它" {
 			set["zbtime"] = tmp["publishtime"]
-			p1.Zbtime = tmp["publishtime"].(int64)
+			p1.Zbtime = qu.Int64All(tmp["publishtime"])
 		}
 	} else if thisinfo.TopType == "结果" || thisinfo.SubType == "合同" {
 			set["jgtime"] = tmp["publishtime"]
@@ -542,6 +542,11 @@ func (p *ProjectTask) NewProject(tmp map[string]interface{}, thisinfo *Info) (st
 		p1.Bidamounttag = 1
 		set["bidamounttag"] = 1
 	}
+	if p1.Bidamount > 0 {
+		set["sortprice"] = p1.Bidamount
+	}else if p1.Budget > 0 {
+		set["sortprice"] = p1.Budget
+	}
 
 	if len(thisinfo.Winners) > 0 {
 		set["s_winner"] = strings.Join(thisinfo.Winners, ",")
@@ -698,18 +703,33 @@ func (p *ProjectTask) UpdateProject(tmp map[string]interface{}, thisinfo *Info,
 		}
 	} else if thisinfo.TopType == "结果" {
 		if thisinfo.SubType == "中标" || thisinfo.SubType == "成交" || thisinfo.SubType == "流标" || thisinfo.SubType == "废标" {
-			jg1 := int64(math.Abs(float64(pInfo.Jgtime - thisinfo.Publishtime)))
-			if pInfo.Jgtime <= 0 {
-				set["jgtime"] = tmp["publishtime"]
-				pInfo.Jgtime = thisinfo.Publishtime
-			}else if jg1 > p.jgTime {
+			if pInfo.Jgtime > 0 {
+				jg1 := int64(math.Abs(float64(pInfo.Jgtime - thisinfo.Publishtime)))
+				//公告状态和项目状态同样都是中标或者成交,
+				if (thisinfo.SubType == "中标" || thisinfo.SubType == "成交") && (pInfo.Bidstatus == "中标" || pInfo.Bidstatus == "成交") {
+					if jg1 > p.jgTime {
+						set["jgtime"] = tmp["publishtime"]
+						pInfo.Jgtime = thisinfo.Publishtime
+					}
+				//公告状态和项目状态同样是流标或者废标
+				}else if (thisinfo.SubType == "流标" || thisinfo.SubType == "废标") && (pInfo.Bidstatus == "流标" || pInfo.Bidstatus == "废标") {
+					if jg1 > p.jgTime {
+						set["jgtime"] = tmp["publishtime"]
+						pInfo.Jgtime = thisinfo.Publishtime
+					}
+				}
+			}else {
 				set["jgtime"] = tmp["publishtime"]
 				pInfo.Jgtime = thisinfo.Publishtime
 			}
-		}else if thisinfo.SubType == "合同" {
-			set["jgtime"] = tmp["publishtime"]
-			pInfo.Jgtime = thisinfo.Publishtime
 		}
+	} else if thisinfo.SubType == "合同" {
+		if pInfo.Bidstatus == "中标" || pInfo.Bidstatus == "成交" {
+			//中标、成交不更新jgtime
+			return
+		}
+		set["jgtime"] = tmp["publishtime"]
+		pInfo.Jgtime = thisinfo.Publishtime
 	}
 	if thisinfo.Bidopentime > pInfo.Bidopentime {
 		pInfo.Bidopentime = thisinfo.Bidopentime
@@ -842,9 +862,7 @@ func (p *ProjectTask) UpdateProject(tmp map[string]interface{}, thisinfo *Info,
 		for _, k := range thisinfo.Winners {
 			if thisinfo.SubType == "流标" || thisinfo.SubType == "废标" {
 				if BinarySearch(pInfo.Winners, k) != -1 {
-					arr := strings.Split(pInfo.Winners, ",")
-					deleteSlice(arr, k, "")
-					pInfo.Winners = strings.Join(pInfo.Winners, ",")
+					deleteSlice(pInfo.Winners, k, "")
 					sort.Strings(pInfo.Winners)
 				}
 			}else {
@@ -877,6 +895,11 @@ func (p *ProjectTask) UpdateProject(tmp map[string]interface{}, thisinfo *Info,
 	} else {
 		set["bidamounttag"] = 1
 	}
+	if pInfo.Bidamount > 0 {
+		set["sortprice"] = pInfo.Bidamount
+	}else if pInfo.Budget > 0 {
+		set["sortprice"] = pInfo.Budget
+	}
 
 	infofiled := InfoField{
 		Budget:       thisinfo.Budget,

+ 6 - 2
fullproject/src_v1/task.go

@@ -91,8 +91,8 @@ func NewPT() *ProjectTask {
 		//updateSign: make(chan bool, 1),
 		coll:       ProjectColl,
 		validTime:  int64(util.IntAllDef(Sysconfig["validdays"], 150) * 86400),
-		statusTime: int64(util.IntAllDef(Sysconfig["statusdays"], 7) * 86400),
-		jgTime:		int64(util.IntAllDef("", 2) * 86400),
+		statusTime: int64(util.IntAllDef(Sysconfig["statusdays"], 15) * 86400),
+		jgTime:		int64(util.IntAllDef(3, 3) * 86400),
 	}
 	return p
 }
@@ -398,6 +398,10 @@ func (p *ProjectTask) enter(db, coll string, q map[string]interface{}) {
 						<-pool
 					}()
 					if util.IntAll(tmp["repeat"]) == 0 {
+						if P_QL.currentType == "project" && util.IntAll(tmp["dataging"]) == 1 {
+							//增量	dataging为1不参与合并
+							return
+						}
 						p.fillInPlace(tmp)
 						info := ParseInfo(tmp)
 						p.currentTime = info.Publishtime

+ 3 - 0
udp_winner/main.go

@@ -208,6 +208,9 @@ func initMongo() {
 	FClient.InitPool()
 	FClientmgoConn := FClient.GetMgoConn()
 	defer FClient.DestoryMongoConn(FClientmgoConn)
+
+
+
 	//加载省市县代码
 	cursor2 := FClientmgoConn.DB(Config["mgodb_extract_kf"]).C("address").Find(bson.M{}).Select(bson.M{"province": 1, "code": 1, "city": 1, "district": 1}).Iter()
 	//defer FClient.Connect(cc)

+ 39 - 20
udpcreateindex/src/biddingall.go

@@ -1,6 +1,7 @@
 package main
 
 import (
+	"fmt"
 	"log"
 	qutil "qfw/util"
 	elastic "qfw/util/elastic"
@@ -12,7 +13,11 @@ import (
 
 //对字段处理 bidamount  budget
 //招标数据表和抽取表一一对应开始更新
-
+/*
+	注意:
+	1、biddingall任务跑历史数据生成索引并更新bidding表
+	2、调用biddingall任务时config.json中indexfields配置要有purchasing、purchasinglist、filetext
+*/
 func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 	defer qutil.Catch()
 	thread := 40
@@ -126,6 +131,7 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 		//下面可以多线程跑的--->
 		//处理分类
 		mpool <- true
+		_id := tmp["_id"]
 		go func(tmp, update, compare, del map[string]interface{}, bnil bool) {
 			defer func() {
 				<-mpool
@@ -191,7 +197,16 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 			if len(ps) > ESLEN {
 				tmp["projectscope"] = string(([]rune(ps))[:4000])
 			}
-
+			//对标的物为空处理
+			if filetext := getFileText(tmp); len(filetext) > 0 { //attach_text
+				tmp["filetext"] = filetext
+			}
+			if purchasing, ok := tmp["purchasing"].(string); ok && purchasing == "" {
+				delete(tmp, "purchasing")
+			}
+			if purchasinglist, ok := tmp["purchasinglist"].([]interface{}); ok && len(purchasinglist) == 0 {
+				delete(tmp, "purchasinglist")
+			}
 			//预算和中标金额
 			//			if s_budget := fmt.Sprint(tmp["budget"]); s_budget == "" || s_budget == "<nil>" || s_budget == "null" {
 			//				tmp["budget"] = nil
@@ -216,29 +231,33 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 								newmap := map[string]interface{}{}
 								for _, v1 := range projectinfoFields {
 									if mp[v1] != nil {
-										newmap[v1] = mp[v1]
+										newmap[v1] = fmt.Sprint(mp[v1])
 									}
 								}
-								newTmp[v] = newmap
-								attachments := mp["attachments"]
-								con := ""
-								if attachments != nil {
-									am, _ := attachments.(map[string]interface{})
-									if am != nil {
-										for _, v1 := range am {
-											vm, _ := v1.(map[string]interface{})
-											if vm != nil {
-												c, _ := vm["content"].(string)
-												con += c
-											}
+								if len(newmap) > 0 {
+									newTmp[v] = newmap
+								}
+							}
+						} else if v == "purchasinglist" { //标的物处理
+							purchasinglist_new := []map[string]interface{}{}
+							if pcl, _ := tmp[v].([]interface{}); len(pcl) > 0 {
+								for _, ls := range pcl {
+									lsm_new := make(map[string]interface{})
+									lsm := ls.(map[string]interface{})
+									for _, pf := range purchasinglistFields {
+										if lsm[pf] != nil {
+											lsm_new[pf] = lsm[pf]
 										}
 									}
-								}
-								con = FilterDetailSpace(con)
-								if con != "" {
-									newTmp["attachments"] = con
+									if lsm_new != nil && len(lsm_new) > 0 {
+										purchasinglist_new = append(purchasinglist_new, lsm_new)
+									}
 								}
 							}
+							if len(purchasinglist_new) > 0 {
+								newTmp[v] = purchasinglist_new
+							}
+
 						} else {
 							if v == "detail" {
 								detail, _ := tmp[v].(string)
@@ -276,7 +295,7 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 			UpdatesLock.Unlock()
 		}(tmp, update, compare, del, bnil)
 		if n%1000 == 0 {
-			log.Println("current:", n, tmp["_id"])
+			log.Println("current:", n, _id)
 		}
 		tmp = make(map[string]interface{})
 	}

+ 59 - 32
udpcreateindex/src/biddingdata.go

@@ -1,7 +1,7 @@
 package main
 
 import (
-	//	"fmt"
+	"fmt"
 	"log"
 	qutil "qfw/util"
 	elastic "qfw/util/elastic"
@@ -122,12 +122,11 @@ func biddingDataTask(data []byte, mapInfo map[string]interface{}) {
 					bnil = false
 					//更新bidding表,生成索引
 					for _, k := range fields {
-						v1 := compare[k]
-						v2 := tmp[k]
+						v1 := compare[k] //extract
+						v2 := tmp[k]     //bidding
 						if v2 == nil && v1 != nil {
 							update[k] = v1
 						} else if v2 != nil && v1 != nil {
-							//update[k+"_b"] = v2
 							update[k] = v1
 						} else if v2 != nil && v1 == nil {
 							update[k] = v2
@@ -165,14 +164,12 @@ func biddingDataTask(data []byte, mapInfo map[string]interface{}) {
 			if !bnil && compare != nil {
 				subscopeclass, _ := compare["subscopeclass"].([]interface{})
 				if subscopeclass != nil {
-					//str := ","
 					m1 := map[string]bool{}
 					newclass := []string{}
 					for _, sc := range subscopeclass {
 						sclass, _ := sc.(string)
 						if !m1[sclass] {
 							m1[sclass] = true
-							//str += sclass + ","
 							newclass = append(newclass, sclass)
 						}
 					}
@@ -213,14 +210,26 @@ func biddingDataTask(data []byte, mapInfo map[string]interface{}) {
 			for tk, tv := range update {
 				tmp[tk] = tv
 			}
+			if tmp["supervisorrate"] != nil { //临时处理supervisorrate抽取类型为string不生索引
+				if _, ok := tmp["supervisorrate"].(string); ok { //supervisorrate数据为string类型
+					delete(tmp, "supervisorrate")
+				}
+			}
 			//对projectscope字段的索引处理
 			ps, _ := tmp["projectscope"].(string)
-			if ps == "" {
-				tmp["projectscope"] = "" //= tmp["detail"]
-			}
 			if len(ps) > ESLEN {
 				tmp["projectscope"] = string(([]rune(ps))[:4000])
 			}
+			//对标的物为空处理
+			if filetext := getFileText(tmp); len(filetext) > 0 { //attach_text
+				tmp["filetext"] = filetext
+			}
+			if purchasing, ok := tmp["purchasing"].(string); ok && purchasing == "" {
+				delete(tmp, "purchasing")
+			}
+			if purchasinglist, ok := tmp["purchasinglist"].([]interface{}); ok && len(purchasinglist) == 0 {
+				delete(tmp, "purchasinglist")
+			}
 			//			if s_budget := fmt.Sprint(tmp["budget"]); s_budget == "" || s_budget == "<nil>" || s_budget == "null" {
 			//				tmp["budget"] = nil
 			//			} else if sbd, ok := tmp["budget"].(string); ok {
@@ -234,7 +243,7 @@ func biddingDataTask(data []byte, mapInfo map[string]interface{}) {
 
 			if qutil.IntAll(update["extracttype"]) != -1 {
 				newTmp := map[string]interface{}{}
-				for _, v := range indexfield {
+				for _, v := range biddingIndexFields { // indexfield
 					if tmp[v] != nil {
 						if "projectinfo" == v {
 							mp, _ := tmp[v].(map[string]interface{})
@@ -242,28 +251,49 @@ func biddingDataTask(data []byte, mapInfo map[string]interface{}) {
 								newmap := map[string]interface{}{}
 								for _, v1 := range projectinfoFields {
 									if mp[v1] != nil {
-										newmap[v1] = mp[v1]
+										newmap[v1] = fmt.Sprint(mp[v1])
 									}
 								}
-								newTmp[v] = newmap
-								attachments := mp["attachments"]
-								con := ""
-								if attachments != nil {
-									am, _ := attachments.(map[string]interface{})
-									if am != nil {
-										for _, v1 := range am {
-											vm, _ := v1.(map[string]interface{})
-											if vm != nil {
-												c, _ := vm["content"].(string)
-												con += c
-											}
+								if len(newmap) > 0 {
+									newTmp[v] = newmap
+								}
+								// attachments := mp["attachments"]
+								// con := ""
+								// if attachments != nil {
+								// 	am, _ := attachments.(map[string]interface{})
+								// 	if am != nil {
+								// 		for _, v1 := range am {
+								// 			vm, _ := v1.(map[string]interface{})
+								// 			if vm != nil {
+								// 				c, _ := vm["content"].(string)
+								// 				con += c
+								// 			}
+								// 		}
+								// 	}
+								// }
+								// if con != "" {
+								// 	con = FilterDetailSpace(con)
+								// 	newTmp["attachments"] = con
+								// }
+							}
+						} else if v == "purchasinglist" { //标的物处理
+							purchasinglist_new := []map[string]interface{}{}
+							if pcl, _ := tmp[v].([]interface{}); len(pcl) > 0 {
+								for _, ls := range pcl {
+									lsm_new := make(map[string]interface{})
+									lsm := ls.(map[string]interface{})
+									for _, pf := range purchasinglistFields {
+										if lsm[pf] != nil {
+											lsm_new[pf] = lsm[pf]
 										}
 									}
+									if lsm_new != nil && len(lsm_new) > 0 {
+										purchasinglist_new = append(purchasinglist_new, lsm_new)
+									}
 								}
-								if con != "" {
-									con = FilterDetailSpace(con)
-									newTmp["attachments"] = con
-								}
+							}
+							if len(purchasinglist_new) > 0 {
+								newTmp[v] = purchasinglist_new
 							}
 						} else {
 							if v == "detail" {
@@ -273,8 +303,6 @@ func biddingDataTask(data []byte, mapInfo map[string]interface{}) {
 								newTmp[v] = tmp[v]
 							}
 						}
-					} else if v == "budget" || v == "bidamount" {
-						newTmp[v] = nil
 					}
 				}
 				UpdatesLock.Lock()
@@ -284,7 +312,7 @@ func biddingDataTask(data []byte, mapInfo map[string]interface{}) {
 			UpdatesLock.Lock()
 			if len(arrEs) >= BulkSize-1 {
 				tmps := arrEs
-				elastic.BulkSave(index, itype, &tmps, false)
+				elastic.BulkSave(index, itype, &tmps, true)
 				arrEs = []map[string]interface{}{}
 			}
 			UpdatesLock.Unlock()
@@ -300,8 +328,7 @@ func biddingDataTask(data []byte, mapInfo map[string]interface{}) {
 	UpdatesLock.Lock()
 	if len(arrEs) > 0 {
 		tmps := arrEs
-		log.Println(tmps[0])
-		elastic.BulkSave(index, itype, &tmps, false)
+		elastic.BulkSave(index, itype, &tmps, true)
 	}
 	UpdatesLock.Unlock()
 	log.Println(mapInfo, "create bidding index...over", n)

+ 322 - 37
udpcreateindex/src/biddingindex.go

@@ -2,6 +2,7 @@ package main
 
 import (
 	"encoding/json"
+	"fmt"
 	"log"
 	mu "mfw/util"
 	"net"
@@ -82,6 +83,258 @@ func biddingTask(data []byte, mapInfo map[string]interface{}) {
 	log.Println(mapInfo, "create bidding index...over", "all:", count, "n1:", n1, "n2:", n2)
 }
 
+// func doIndex1(infos []map[string]interface{}, eMap map[string]map[string]interface{}, index, itype, db, c, bkey string) (int, int) {
+// 	n1, n2 := 0, 0
+// 	//线程池
+// 	UpdatesLock := sync.Mutex{}
+// 	fields := strings.Split(bidding["fields"].(string), ",")
+// 	//更新数组
+// 	arr := [][]map[string]interface{}{}
+// 	arrEs := []map[string]interface{}{}
+// 	//对比两张表数据,减少查询次数
+// 	var compare bson.M
+// 	log.Println("开始迭代..")
+// 	for n, tmp := range infos {
+// 		n1++
+// 		// if qutil.IntAll(tmp["dataging"]) == 1 { //dataging=1不生索引
+// 		// 	tmp = make(map[string]interface{})
+// 		// 	continue
+// 		// }
+// 		update := map[string]interface{}{} //要更新的mongo数据
+// 		//对比方法----------------
+// 		tid := qutil.BsonIdToSId(tmp["_id"])
+// 		if eMap[tid] != nil {
+// 			compare = eMap[tid]
+// 			if qutil.IntAll(compare["dataging"]) == 1 { //extract中dataging=1不生索引
+// 				tmp = make(map[string]interface{})
+// 				compare = nil
+// 				continue
+// 			}
+// 			delete(eMap, tid)
+// 			//更新bidding表,生成索引
+// 			for _, k := range fields {
+// 				v1 := compare[k] //extract
+// 				v2 := tmp[k]     //bidding
+// 				if v2 == nil && v1 != nil {
+// 					update[k] = v1
+// 				} else if v2 != nil && v1 != nil {
+// 					//update[k+"_b"] = v2
+// 					update[k] = v1
+// 				} else if v2 != nil && v1 == nil {
+// 					//update[k+"_b"] = v2
+// 				}
+// 			}
+// 			if qutil.IntAll(compare["repeat"]) == 1 {
+// 				update["extracttype"] = -1
+// 			} else {
+// 				update["extracttype"] = 1
+// 			}
+// 		} else {
+// 			compare = nil
+// 		}
+// 		//下面可以多线程跑的--->
+// 		//处理分类
+// 		if compare != nil { //extract
+// 			subscopeclass, _ := compare["subscopeclass"].([]interface{})
+// 			if subscopeclass != nil {
+// 				//str := ","
+// 				m1 := map[string]bool{}
+// 				newclass := []string{}
+// 				for _, sc := range subscopeclass {
+// 					sclass, _ := sc.(string)
+// 					if !m1[sclass] {
+// 						m1[sclass] = true
+// 						//str += sclass + ","
+// 						newclass = append(newclass, sclass)
+// 					}
+// 				}
+// 				update["s_subscopeclass"] = strings.Join(newclass, ",")
+// 				update["subscopeclass"] = newclass
+// 			}
+// 			//处理中标企业
+// 			//			winner, _ := compare["winner"].(string)
+// 			//			m1 := map[string]bool{}
+// 			//			if winner != "" {
+// 			//				m1[winner] = true
+// 			//			}
+// 			//			package1 := compare["package"]
+// 			//			if package1 != nil {
+// 			//				packageM, _ := package1.(map[string]interface{})
+// 			//				for _, p := range packageM {
+// 			//					pm, _ := p.(map[string]interface{})
+// 			//					pw, _ := pm["winner"].(string)
+// 			//					if pw != "" {
+// 			//						m1[pw] = true
+// 			//					}
+// 			//				}
+// 			//			}
+// 			compare = nil
+// 			//			if len(m1) > 0 {
+// 			//				//str := ","
+// 			//				winnerarr := []string{}
+// 			//				for k, _ := range m1 {
+// 			//					//str += k + ","
+// 			//					winnerarr = append(winnerarr, k)
+// 			//				}
+// 			//				update["s_winner"] = strings.Join(winnerarr, ",")
+// 			//			}
+// 		}
+// 		//------------------对比结束
+
+// 		//处理key descript
+// 		if bkey == "" {
+// 			DealInfo(&tmp, &update)
+// 		}
+// 		//同时保存到elastic
+// 		for tk, tv := range update {
+// 			tmp[tk] = tv
+// 		}
+// 		if tmp["supervisorrate"] != nil { //临时处理supervisorrate抽取类型为string不生索引
+// 			if _, ok := tmp["supervisorrate"].(string); ok { //supervisorrate数据为string类型
+// 				delete(tmp, "supervisorrate")
+// 			}
+// 		}
+// 		//对projectscope字段的索引处理
+// 		ps, _ := tmp["projectscope"].(string)
+// 		//		if ps == "" {
+// 		//			tmp["projectscope"] = "" //= tmp["detail"]
+// 		//		}
+// 		if len(ps) > ESLEN {
+// 			tmp["projectscope"] = string(([]rune(ps))[:4000])
+// 		}
+// 		//		if s_budget := fmt.Sprint(tmp["budget"]); s_budget == "" || s_budget == "<nil>" || s_budget == "null" {
+// 		//			tmp["budget"] = nil
+// 		//		} else if sbd, ok := tmp["budget"].(string); ok {
+// 		//			tmp["budget"] = ObjToMoney([]interface{}{sbd, sbd})[0]
+// 		//		}
+// 		//		if s_bidamount := fmt.Sprint(tmp["bidamount"]); s_bidamount == "" || s_bidamount == "<nil>" || s_bidamount == "null" {
+// 		//			tmp["bidamount"] = nil
+// 		//		} else if sbd, ok := tmp["bidamount"].(string); ok {
+// 		//			tmp["bidamount"] = ObjToMoney([]interface{}{sbd, sbd})[0]
+// 		//		}
+// 		UpdatesLock.Lock()
+// 		//		for k1, _ := range tmp {
+// 		//			if strings.HasSuffix(k1, "_b") || k1 == "contenthtml" {
+// 		//				delete(tmp, k1)
+// 		//			}
+// 		//		}
+// 		go IS.Add("bidding")
+// 		if qutil.IntAll(update["extracttype"]) != -1 {
+// 			newTmp := map[string]interface{}{}     //最终生索引的数据
+// 			for _, v := range biddingIndexFields { //索引字段
+// 				if tmp[v] != nil {
+// 					if "projectinfo" == v {
+// 						mp, _ := tmp[v].(map[string]interface{})
+// 						if mp != nil {
+// 							newmap := map[string]interface{}{}
+// 							for _, v1 := range projectinfoFields {
+// 								if mp[v1] != nil {
+// 									newmap[v1] = fmt.Sprint(mp[v1])
+// 								}
+// 							}
+// 							if len(newmap) > 0 {
+// 								newTmp[v] = newmap
+// 							}
+// 							// attachments := mp["attachments"]
+// 							// con := ""
+// 							// if attachments != nil {
+// 							// 	am, _ := attachments.(map[string]interface{})
+// 							// 	if am != nil {
+// 							// 		for _, v1 := range am {
+// 							// 			vm, _ := v1.(map[string]interface{})
+// 							// 			if vm != nil {
+// 							// 				c, _ := vm["content"].(string)
+// 							// 				con += c
+// 							// 			}
+// 							// 		}
+// 							// 	}
+// 							// }
+// 							// con = FilterDetailSpace(con)
+// 							// if con != "" {
+// 							// 	newTmp["attachments"] = con
+// 							// }
+// 						}
+// 					} else if v == "purchasinglist" { //标的物处理
+// 						purchasinglist_new := []map[string]interface{}{}
+// 						if pcl, _ := tmp[v].([]interface{}); len(pcl) > 0 {
+// 							for _, ls := range pcl {
+// 								lsm_new := make(map[string]interface{})
+// 								lsm := ls.(map[string]interface{})
+// 								for _, pf := range purchasinglistFields {
+// 									if lsm[pf] != nil {
+// 										lsm_new[pf] = lsm[pf]
+// 									}
+// 								}
+// 								if lsm_new != nil && len(lsm_new) > 0 {
+// 									purchasinglist_new = append(purchasinglist_new, lsm_new)
+// 								}
+// 							}
+// 						}
+// 						if len(purchasinglist_new) > 0 {
+// 							newTmp[v] = purchasinglist_new
+// 						}
+
+// 					} else {
+// 						if v == "detail" {
+// 							detail, _ := tmp[v].(string)
+// 							newTmp[v] = FilterDetail(detail)
+// 						} else {
+// 							newTmp[v] = tmp[v]
+// 						}
+// 					}
+// 				}
+// 			}
+// 			arrEs = append(arrEs, newTmp)
+// 		}
+// 		if len(update) > 0 {
+// 			arr = append(arr, []map[string]interface{}{
+// 				map[string]interface{}{
+// 					"_id": tmp["_id"],
+// 				},
+// 				map[string]interface{}{
+// 					"$set": update,
+// 				},
+// 			})
+// 		}
+// 		if len(arr) >= BulkSize-1 {
+// 			mgo.UpdateBulkAll(db, c, arr...)
+// 			arr = [][]map[string]interface{}{}
+// 		}
+// 		if len(arrEs) >= BulkSize-1 {
+// 			tmps := arrEs
+// 			elastic.BulkSave(index, itype, &tmps, true)
+// 			if other_index != "" && other_itype != "" {
+// 				bidding_other_es.BulkSave(other_index, other_itype, &tmps, true)
+// 			}
+// 			if len(multiIndex) == 2 {
+// 				elastic.BulkSave(multiIndex[0], multiIndex[1], &tmps, true)
+// 			}
+// 			arrEs = []map[string]interface{}{}
+// 		}
+// 		UpdatesLock.Unlock()
+// 		if n%100 == 0 {
+// 			log.Println("current:", n)
+// 		}
+// 		tmp = make(map[string]interface{})
+// 	}
+// 	UpdatesLock.Lock()
+// 	if len(arr) > 0 {
+// 		mgo.UpdateBulkAll(db, c, arr...)
+// 	}
+// 	if len(arrEs) > 0 {
+// 		tmps := arrEs
+// 		elastic.BulkSave(index, itype, &tmps, true)
+// 		if other_index != "" && other_itype != "" {
+// 			bidding_other_es.BulkSave(other_index, other_itype, &tmps, true)
+// 		}
+// 		if len(multiIndex) == 2 {
+// 			elastic.BulkSave(multiIndex[0], multiIndex[1], &tmps, true)
+// 		}
+// 	}
+// 	UpdatesLock.Unlock()
+// 	return n1, n2
+// }
+
 func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interface{}, index, itype, db, c, bkey string) (int, int) {
 	n1, n2 := 0, 0
 	//线程池
@@ -95,15 +348,20 @@ func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interfac
 	log.Println("开始迭代..")
 	for n, tmp := range infos {
 		n1++
-		if qutil.IntAll(tmp["dataging"]) == 1 { //dataging=1不生索引
-			tmp = make(map[string]interface{})
-			continue
-		}
+		// if qutil.IntAll(tmp["dataging"]) == 1 { //dataging=1不生索引
+		// 	tmp = make(map[string]interface{})
+		// 	continue
+		// }
 		update := map[string]interface{}{} //要更新的mongo数据
 		//对比方法----------------
 		tid := qutil.BsonIdToSId(tmp["_id"])
 		if eMap[tid] != nil {
 			compare = eMap[tid]
+			if qutil.IntAll(compare["dataging"]) == 1 { //extract中dataging=1不生索引
+				tmp = make(map[string]interface{})
+				compare = nil
+				continue
+			}
 			delete(eMap, tid)
 			//更新bidding表,生成索引
 			for _, k := range fields {
@@ -196,16 +454,21 @@ func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interfac
 		if len(ps) > ESLEN {
 			tmp["projectscope"] = string(([]rune(ps))[:4000])
 		}
-		//		if s_budget := fmt.Sprint(tmp["budget"]); s_budget == "" || s_budget == "<nil>" || s_budget == "null" {
-		//			tmp["budget"] = nil
-		//		} else if sbd, ok := tmp["budget"].(string); ok {
-		//			tmp["budget"] = ObjToMoney([]interface{}{sbd, sbd})[0]
-		//		}
-		//		if s_bidamount := fmt.Sprint(tmp["bidamount"]); s_bidamount == "" || s_bidamount == "<nil>" || s_bidamount == "null" {
-		//			tmp["bidamount"] = nil
-		//		} else if sbd, ok := tmp["bidamount"].(string); ok {
-		//			tmp["bidamount"] = ObjToMoney([]interface{}{sbd, sbd})[0]
-		//		}
+		//对标的物为空处理
+		if filetext := getFileText(tmp); len(filetext) > 10 { //attach_text
+			// if site, _ := tmp["site"].(string); site == "中国招标投标公共服务平台" { //site:中国招标投标公共服务平台 detail替换成filetext 并加入标记filedetail=1
+			// 	tmp["detail"] = filetext    //更新es中detail
+			// 	update["detail"] = filetext //更新mongo中detail
+			// 	update["filedetail"] = 1    //mongo中打标记
+			// }
+			tmp["filetext"] = filetext
+		}
+		if purchasing, ok := tmp["purchasing"].(string); ok && purchasing == "" {
+			delete(tmp, "purchasing")
+		}
+		if purchasinglist, ok := tmp["purchasinglist"].([]interface{}); ok && len(purchasinglist) == 0 {
+			delete(tmp, "purchasinglist")
+		}
 		UpdatesLock.Lock()
 		//		for k1, _ := range tmp {
 		//			if strings.HasSuffix(k1, "_b") || k1 == "contenthtml" {
@@ -216,9 +479,6 @@ func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interfac
 		if qutil.IntAll(update["extracttype"]) != -1 {
 			newTmp := map[string]interface{}{}     //最终生索引的数据
 			for _, v := range biddingIndexFields { //索引字段
-				//			if tmp[v] != nil {
-				//				newTmp[v] = tmp[v]
-				//			}
 				if tmp[v] != nil {
 					if "projectinfo" == v {
 						mp, _ := tmp[v].(map[string]interface{})
@@ -226,29 +486,50 @@ func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interfac
 							newmap := map[string]interface{}{}
 							for _, v1 := range projectinfoFields {
 								if mp[v1] != nil {
-									newmap[v1] = mp[v1]
+									newmap[v1] = fmt.Sprint(mp[v1])
 								}
 							}
-							newTmp[v] = newmap
-							attachments := mp["attachments"]
-							con := ""
-							if attachments != nil {
-								am, _ := attachments.(map[string]interface{})
-								if am != nil {
-									for _, v1 := range am {
-										vm, _ := v1.(map[string]interface{})
-										if vm != nil {
-											c, _ := vm["content"].(string)
-											con += c
-										}
+							if len(newmap) > 0 {
+								newTmp[v] = newmap
+							}
+							// attachments := mp["attachments"]
+							// con := ""
+							// if attachments != nil {
+							// 	am, _ := attachments.(map[string]interface{})
+							// 	if am != nil {
+							// 		for _, v1 := range am {
+							// 			vm, _ := v1.(map[string]interface{})
+							// 			if vm != nil {
+							// 				c, _ := vm["content"].(string)
+							// 				con += c
+							// 			}
+							// 		}
+							// 	}
+							// }
+							// con = FilterDetailSpace(con)
+							// if con != "" {
+							// 	newTmp["attachments"] = con
+							// }
+						}
+					} else if v == "purchasinglist" { //标的物处理
+						purchasinglist_new := []map[string]interface{}{}
+						if pcl, _ := tmp[v].([]interface{}); len(pcl) > 0 {
+							for _, ls := range pcl {
+								lsm_new := make(map[string]interface{})
+								lsm := ls.(map[string]interface{})
+								for _, pf := range purchasinglistFields {
+									if lsm[pf] != nil {
+										lsm_new[pf] = lsm[pf]
 									}
 								}
-							}
-							con = FilterDetailSpace(con)
-							if con != "" {
-								newTmp["attachments"] = con
+								if lsm_new != nil && len(lsm_new) > 0 {
+									purchasinglist_new = append(purchasinglist_new, lsm_new)
+								}
 							}
 						}
+						if len(purchasinglist_new) > 0 {
+							newTmp[v] = purchasinglist_new
+						}
 					} else {
 						if v == "detail" {
 							detail, _ := tmp[v].(string)
@@ -257,9 +538,7 @@ func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interfac
 							newTmp[v] = tmp[v]
 						}
 					}
-				} /*else if v == "budget" || v == "bidamount" {
-					newTmp[v] = nil
-				}*/
+				}
 			}
 			arrEs = append(arrEs, newTmp)
 		}
@@ -280,6 +559,9 @@ func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interfac
 		if len(arrEs) >= BulkSize-1 {
 			tmps := arrEs
 			elastic.BulkSave(index, itype, &tmps, true)
+			if other_index != "" && other_itype != "" {
+				bidding_other_es.BulkSave(other_index, other_itype, &tmps, true)
+			}
 			if len(multiIndex) == 2 {
 				elastic.BulkSave(multiIndex[0], multiIndex[1], &tmps, true)
 			}
@@ -298,6 +580,9 @@ func doIndex(infos []map[string]interface{}, eMap map[string]map[string]interfac
 	if len(arrEs) > 0 {
 		tmps := arrEs
 		elastic.BulkSave(index, itype, &tmps, true)
+		if other_index != "" && other_itype != "" {
+			bidding_other_es.BulkSave(other_index, other_itype, &tmps, true)
+		}
 		if len(multiIndex) == 2 {
 			elastic.BulkSave(multiIndex[0], multiIndex[1], &tmps, true)
 		}

+ 19 - 18
udpcreateindex/src/biddingindexback.go

@@ -4,6 +4,7 @@ import (
 	"log"
 	qutil "qfw/util"
 	elastic "qfw/util/elastic"
+
 	//elastic "qfw/util/elastic_v5"
 	"regexp"
 	//	"strings"
@@ -115,24 +116,24 @@ func biddingBackTask(data []byte, mapInfo map[string]interface{}) {
 							}
 						}
 						newTmp[v] = newmap
-						attachments := mp["attachments"]
-						con := ""
-						if attachments != nil {
-							am, _ := attachments.(map[string]interface{})
-							if am != nil {
-								for _, v1 := range am {
-									vm, _ := v1.(map[string]interface{})
-									if vm != nil {
-										c, _ := vm["content"].(string)
-										con += c
-									}
-								}
-							}
-						}
-						con = FilterDetailSpace(con)
-						if con != "" {
-							newTmp["attachments"] = con
-						}
+						// attachments := mp["attachments"]
+						// con := ""
+						// if attachments != nil {
+						// 	am, _ := attachments.(map[string]interface{})
+						// 	if am != nil {
+						// 		for _, v1 := range am {
+						// 			vm, _ := v1.(map[string]interface{})
+						// 			if vm != nil {
+						// 				c, _ := vm["content"].(string)
+						// 				con += c
+						// 			}
+						// 		}
+						// 	}
+						// }
+						// con = FilterDetailSpace(con)
+						// if con != "" {
+						// 	newTmp["attachments"] = con
+						// }
 					}
 				} else {
 					if v == "detail" {

+ 19 - 18
udpcreateindex/src/biddingindexback2.go

@@ -6,6 +6,7 @@ import (
 	qutil "qfw/util"
 	elastic "qfw/util/elastic"
 	"strings"
+
 	//elastic "qfw/util/elastic_v5"
 	//	"strings"
 	"sync"
@@ -108,24 +109,24 @@ func biddingBackTask2(data []byte, mapInfo map[string]interface{}) {
 							if len(newmap) > 0 {
 								newTmp[v] = newmap
 							}
-							attachments := mp["attachments"]
-							con := ""
-							if attachments != nil {
-								am, _ := attachments.(map[string]interface{})
-								if am != nil {
-									for _, v1 := range am {
-										vm, _ := v1.(map[string]interface{})
-										if vm != nil {
-											c, _ := vm["content"].(string)
-											con += c
-										}
-									}
-								}
-							}
-							if con != "" {
-								con = FilterDetailSpace(con)
-								newTmp["attachments"] = con
-							}
+							// attachments := mp["attachments"]
+							// con := ""
+							// if attachments != nil {
+							// 	am, _ := attachments.(map[string]interface{})
+							// 	if am != nil {
+							// 		for _, v1 := range am {
+							// 			vm, _ := v1.(map[string]interface{})
+							// 			if vm != nil {
+							// 				c, _ := vm["content"].(string)
+							// 				con += c
+							// 			}
+							// 		}
+							// 	}
+							// }
+							// if con != "" {
+							// 	con = FilterDetailSpace(con)
+							// 	newTmp["attachments"] = con
+							// }
 						}
 					} else {
 						if v == "detail" {

+ 231 - 34
udpcreateindex/src/bidingpurchasing.go

@@ -7,7 +7,8 @@ import (
 	"sync"
 	"unicode/utf8"
 
-	u "./util"
+	u "util"
+
 	"gopkg.in/mgo.v2/bson"
 )
 
@@ -37,18 +38,55 @@ func biddingPurchaingTask(q map[string]interface{}) {
 	i := 0
 	for tmp := make(map[string]interface{}); query.Next(tmp); i = i + 1 {
 		n++
-		if util.IntAll(tmp["extracttype"]) == -1 { //重复数据不生索引
+		if util.IntAll(tmp["extracttype"]) == -1 { // || util.IntAll(tmp["dataging"]) == 1 { //重复数据不生索引
 			tmp = make(map[string]interface{})
 			continue
 		}
 		newTmp := map[string]interface{}{} //最终生索引的数据
+		saveArr := []map[string]interface{}{}
 		//oss拼装filetext
-		filetext := getFileText(tmp)
-		newTmp["filetext"] = filetext
+		if filetext := getFileText(tmp); len(filetext) > 10 {
+			if site, _ := tmp["site"].(string); site == "中国招标投标公共服务平台" { //site:中国招标投标公共服务平台 detail替换成filetext 并加入标记filedetail=1
+				tmp["detail"] = filetext
+				saveArr = append(saveArr, map[string]interface{}{"_id": tmp["_id"]})
+				saveArr = append(saveArr, map[string]interface{}{
+					"$set": map[string]interface{}{
+						"filedetail": 1,
+						"detail":     filetext,
+					},
+				})
+			}
+			newTmp["filetext"] = filetext
+		}
 		//purchasing
-		newTmp["purchasing"] = tmp["purchasing"]
+		if purchasing, ok := tmp["purchasing"].(string); ok {
+			if len(purchasing) > 0 {
+				newTmp["purchasing"] = tmp["purchasing"]
+			}
+		}
 		//purchasinglist
-		newTmp["purchasinglist"] = tmp["purchasinglist"]
+		if purchasinglist, ok := tmp["purchasinglist"].([]interface{}); ok {
+			util.Debug(len(purchasinglist))
+			if len(purchasinglist) > 0 {
+				purchasinglist_new := []map[string]interface{}{}
+				for _, ls := range purchasinglist {
+					lsm_new := make(map[string]interface{})
+					lsm := ls.(map[string]interface{})
+					for _, pf := range purchasinglistFields {
+						if lsm[pf] != nil {
+							lsm_new[pf] = lsm[pf]
+						}
+					}
+					if lsm_new != nil && len(lsm_new) > 0 {
+						purchasinglist_new = append(purchasinglist_new, lsm_new)
+					}
+				}
+				util.Debug(len(purchasinglist_new), purchasinglist_new)
+				if len(purchasinglist_new) > 0 {
+					newTmp["purchasinglist"] = purchasinglist_new
+				}
+			}
+		}
 
 		//处理数据
 		if tmp["supervisorrate"] != nil { //临时处理supervisorrate抽取类型为string不生索引
@@ -74,24 +112,24 @@ func biddingPurchaingTask(q map[string]interface{}) {
 							}
 						}
 						newTmp[v] = newmap
-						attachments := mp["attachments"]
-						con := ""
-						if attachments != nil {
-							am, _ := attachments.(map[string]interface{})
-							if am != nil {
-								for _, v1 := range am {
-									vm, _ := v1.(map[string]interface{})
-									if vm != nil {
-										c, _ := vm["content"].(string)
-										con += c
-									}
-								}
-							}
-						}
-						con = FilterDetailSpace(con)
-						if con != "" {
-							newTmp["attachments"] = con
-						}
+						// attachments := mp["attachments"]
+						// con := ""
+						// if attachments != nil {
+						// 	am, _ := attachments.(map[string]interface{})
+						// 	if am != nil {
+						// 		for _, v1 := range am {
+						// 			vm, _ := v1.(map[string]interface{})
+						// 			if vm != nil {
+						// 				c, _ := vm["content"].(string)
+						// 				con += c
+						// 			}
+						// 		}
+						// 	}
+						// }
+						// con = FilterDetailSpace(con)
+						// if con != "" {
+						// 	newTmp["attachments"] = con
+						// }
 					}
 				} else {
 					if v == "detail" {
@@ -104,16 +142,19 @@ func biddingPurchaingTask(q map[string]interface{}) {
 			}
 		}
 		arrEs = append(arrEs, newTmp)
-		arrMgo = append(arrMgo, []map[string]interface{}{ //要更新数据
-			map[string]interface{}{
-				"_id": tmp["_id"],
-			},
-			map[string]interface{}{
-				"$set": map[string]interface{}{
-					"extract_state": 4,
-				},
-			},
-		})
+		if len(saveArr) > 0 {
+			arrMgo = append(arrMgo, saveArr) //要更新数据
+		}
+		// arrMgo = append(arrMgo, []map[string]interface{}{ //要更新数据
+		// 	map[string]interface{}{
+		// 		"_id": tmp["_id"],
+		// 	},
+		// 	map[string]interface{}{
+		// 		"$set": map[string]interface{}{
+		// 			"extract_state": 4,
+		// 		},
+		// 	},
+		// })
 		//批量更新
 		if len(arrMgo) >= savesizei-1 {
 			mgo.UpdateBulkAll(db, c, arrMgo...)
@@ -145,6 +186,162 @@ func biddingPurchaingTask(q map[string]interface{}) {
 	log.Println("create filetext index...over", n)
 }
 
+// site_attach_text is the scheduled task for site "中国招标投标公共服务平台".
+//
+// It iterates the documents matched by q in the configured bidding collection.
+// A document is skipped when extracttype == -1, when its site differs, when it
+// has no attach_text field, or when the text assembled by getFileText is 10
+// bytes or shorter. For every remaining document the assembled text replaces
+// detail, a MongoDB update ({filedetail: 1, detail: filetext}) is queued, and
+// an index document is queued for Elasticsearch. Both queues are flushed in
+// batches and once more after the loop.
+//
+// NOTE: when this task is used, the indexfields list in config.json should not
+// contain purchasing, purchasinglist or filetext.
+func site_attach_text(q map[string]interface{}) {
+	defer util.Catch()
+	// Mutex local to this call; it brackets the field-copy and flush section.
+	SaveUpdageLock := sync.Mutex{}
+	// Connection parameters.
+	c, _ := bidding["collect"].(string)   // bidding collection
+	db, _ := bidding["db"].(string)       // database
+	index, _ := bidding["index"].(string) // index alias
+	itype, _ := bidding["type"].(string)
+
+	session := mgo.GetMgoConn(86400)
+	defer mgo.DestoryMongoConn(session)
+	count, _ := session.DB(db).C(c).Find(&q).Count()
+	log.Println("site_attach_text:	", db, c, "查询语句:", q, "同步总数:", count, "elastic库:", index)
+
+	query := session.DB(db).C(c).Find(q).Select(bson.M{
+		"projectinfo.attachment": 0,
+		"contenthtml":            0,
+	}).Iter()
+	// Length 0, capacity savesizei. A non-zero length would pre-fill the batch
+	// with nil maps, which would be sent to BulkSave and would also trigger the
+	// flush threshold after the very first record.
+	arrEs := make([]map[string]interface{}, 0, savesizei)
+	arrMgo := [][]map[string]interface{}{}
+	var n int
+	var indexnum int
+	i := 0
+	for tmp := make(map[string]interface{}); query.Next(tmp); i = i + 1 {
+		n++
+		// Progress counter.
+		if n%savesizei == 0 {
+			log.Println("当前:", n)
+		}
+		site, _ := tmp["site"].(string)
+		if util.IntAll(tmp["extracttype"]) == -1 || site != "中国招标投标公共服务平台" || tmp["attach_text"] == nil {
+			tmp = make(map[string]interface{})
+			continue
+		}
+		newTmp := map[string]interface{}{} // document that will be indexed
+		saveArr := []map[string]interface{}{}
+
+		filetext := getFileText(tmp) // assemble filetext from attach_text
+		if len(filetext) > 10 {
+			tmp["detail"] = filetext // filetext replaces detail
+			saveArr = append(saveArr, map[string]interface{}{"_id": tmp["_id"]})
+			saveArr = append(saveArr, map[string]interface{}{
+				"$set": map[string]interface{}{
+					"filedetail": 1,
+					"detail":     filetext,
+				},
+			})
+			newTmp["filetext"] = filetext
+		} else {
+			// No usable attachment text: nothing to index for this document.
+			tmp = make(map[string]interface{})
+			continue
+		}
+
+		indexnum++
+
+		// purchasing: only indexed when it is a non-empty string.
+		if purchasing, ok := tmp["purchasing"].(string); ok {
+			if len(purchasing) > 0 {
+				newTmp["purchasing"] = tmp["purchasing"]
+			}
+		}
+		// purchasinglist: keep only the configured fields of each entry.
+		if purchasinglist, ok := tmp["purchasinglist"].([]interface{}); ok {
+			if len(purchasinglist) > 0 {
+				purchasinglist_new := []map[string]interface{}{}
+				for _, ls := range purchasinglist {
+					// Checked assertion: an entry that is not a map is skipped
+					// instead of panicking and aborting the whole run.
+					lsm, isMap := ls.(map[string]interface{})
+					if !isMap {
+						continue
+					}
+					lsm_new := make(map[string]interface{})
+					for _, pf := range purchasinglistFields {
+						if lsm[pf] != nil {
+							lsm_new[pf] = lsm[pf]
+						}
+					}
+					if len(lsm_new) > 0 {
+						purchasinglist_new = append(purchasinglist_new, lsm_new)
+					}
+				}
+				if len(purchasinglist_new) > 0 {
+					newTmp["purchasinglist"] = purchasinglist_new
+				}
+			}
+		}
+
+		// Temporary handling: a string-typed supervisorrate is not indexed.
+		if tmp["supervisorrate"] != nil {
+			if _, ok := tmp["supervisorrate"].(string); ok {
+				delete(tmp, "supervisorrate")
+			}
+		}
+		// Index handling for projectscope: cap it at 4000 runes once it is
+		// longer than ESLEN bytes.
+		ps, _ := tmp["projectscope"].(string)
+		if len(ps) > ESLEN {
+			// Clamp the bound: a string longer than ESLEN bytes can still hold
+			// fewer than 4000 runes, and slicing past the end would panic.
+			psRunes := []rune(ps)
+			if len(psRunes) > 4000 {
+				psRunes = psRunes[:4000]
+			}
+			tmp["projectscope"] = string(psRunes)
+		}
+		SaveUpdageLock.Lock()
+		for _, v := range biddingIndexFields { // fields to index
+			if tmp[v] != nil {
+				if "projectinfo" == v {
+					mp, _ := tmp[v].(map[string]interface{})
+					if mp != nil {
+						newmap := map[string]interface{}{}
+						for _, v1 := range projectinfoFields {
+							if mp[v1] != nil {
+								newmap[v1] = mp[v1]
+							}
+						}
+						newTmp[v] = newmap
+					}
+				} else {
+					if v == "detail" {
+						detail, _ := tmp[v].(string)
+						newTmp[v] = FilterDetail(detail)
+					} else {
+						newTmp[v] = tmp[v]
+					}
+				}
+			}
+		}
+		arrEs = append(arrEs, newTmp) // queue for indexing
+		if len(saveArr) > 0 {
+			arrMgo = append(arrMgo, saveArr) // queue for the MongoDB update
+		}
+		// Batched MongoDB update.
+		if len(arrMgo) >= savesizei-1 {
+			mgo.UpdateBulkAll(db, c, arrMgo...)
+			arrMgo = [][]map[string]interface{}{}
+		}
+		// Batched index write.
+		if len(arrEs) >= savesizei-1 {
+			tmps := arrEs
+			elastic.BulkSave(index, itype, &tmps, true)
+			arrEs = []map[string]interface{}{}
+		}
+		SaveUpdageLock.Unlock()
+		tmp = make(map[string]interface{})
+	}
+
+	// Flush whatever is left in both queues.
+	SaveUpdageLock.Lock()
+	if len(arrMgo) > 0 {
+		mgo.UpdateBulkAll(db, c, arrMgo...)
+	}
+	if len(arrEs) > 0 {
+		tmps := arrEs
+		elastic.BulkSave(index, itype, &tmps, true)
+	}
+	SaveUpdageLock.Unlock()
+	log.Println("create filetext index...over", n, indexnum)
+}
+
 func getFileText(tmp map[string]interface{}) (filetext string) {
 	if attchMap, ok := tmp["attach_text"].(map[string]interface{}); attchMap != nil && ok {
 		for _, tmpData1 := range attchMap {

+ 16 - 10
udpcreateindex/src/config.json

@@ -30,30 +30,30 @@
     },
     "bidding": {
         "db": "mxs",
-        "collect": "test1",
+        "collect": "test",
         "index": "bidding_v2",
         "type": "bidding",
         "extractdb": "mxs",
-        "extractcollect": "test2",
+        "extractcollect": "extract",
         "indexfields":[ 
-        "buyerzipcode","winnertel","winnerperson","contractcode","winneraddr","agencyaddr","buyeraddr","signaturedate","projectperiod","projectaddr","agencytel","agencyperson","buyerperson","agency","projectscope","projectcode","bidopentime","supervisorrate","buyertel","bidamount","winner","buyer","budget","projectname","bidstatus","buyerclass","topscopeclass","s_subscopeclass","area","city","district","s_winner","_id","title","detail","site","comeintime","href","infoformat","publishtime","s_sha","spidercode","subtype","toptype","projectinfo"
+        "buyerzipcode","winnertel","winnerperson","contractcode","winneraddr","agencyaddr","buyeraddr","signaturedate","projectperiod","projectaddr","agencytel","agencyperson","buyerperson","agency","projectscope","projectcode","bidopentime","supervisorrate","buyertel","bidamount","winner","buyer","budget","projectname","bidstatus","buyerclass","topscopeclass","s_subscopeclass","area","city","district","s_winner","_id","title","detail","site","comeintime","href","infoformat","publishtime","s_sha","spidercode","subtype","toptype","projectinfo","purchasing","purchasinglist","filetext"
         ],
-        "fields": "buyerzipcode,winnertel,winnerperson,contractcode,winneraddr,agencyaddr,buyeraddr,signaturedate,projectperiod,projectaddr,agencytel,agencyperson,buyerperson,agency,projectscope,projectcode,bidopentime,supervisorrate,buyertel,bidamount,winner,buyer,budget,projectname,buyerclass,topscopeclass,area,city,district,s_winner",
-        "projectinfo": "approvecode,approvecontent,approvestatus,approvetime,industry",
+        "fields": "buyerzipcode,winnertel,winnerperson,contractcode,winneraddr,agencyaddr,buyeraddr,signaturedate,projectperiod,projectaddr,agencytel,agencyperson,buyerperson,agency,projectscope,projectcode,bidopentime,supervisorrate,buyertel,bidamount,winner,buyer,budget,projectname,buyerclass,topscopeclass,area,city,district,s_winner,toptype,subtype,subscopeclass,s_subscopeclass,dataging",
+        "projectinfo": "approvecode,approvecontent,approvestatus,approvetime,approvedept,approvenumber,projecttype,approvecity",
+        "purchasinglist":"itemname,model,unitname,number",
         "multiIndex": ""
     },
     "filelength": 100000,
     "project": {
-		"addr": "192.168.3.207:27092",
-        "size": 2,
-        "db": "extract_kf",
-        "collect": "huawei_project",
+        "db": "mxs",
+        "collect": "test",
         "index": "projectset_v1",
         "type": "projectset"
     },
     "project2": {
 		"addr": "192.168.3.207:27092",
-        "db": "extract_kf",
+		"pool": 1,
+        "db": "mxs",
         "collect": "huawei_project",
         "index": "project_v2",
         "type": "project"
@@ -94,5 +94,11 @@
     "elastic": {
         "addr": "http://192.168.3.128:9800",
         "pool": 12
+    },
+    "elastic_other": {
+        "addr": "http://127.0.0.1:9800",
+        "pool": 12,
+        "index": "bidding_v2",
+        "type": "bidding"
     }
 }

+ 40 - 16
udpcreateindex/src/main.go

@@ -10,24 +10,28 @@ import (
 	"qfw/util/mongodb"
 	"strings"
 	"time"
-
-	u "./util"
+	u "util"
 )
 
 var (
-	Sysconfig          map[string]interface{} //配置文件
-	mgo                *mongodb.MongodbSim    //mongodb操作对象
-	extractmgo         *mongodb.MongodbSim    //mongodb操作对象
-	project2db         *mongodb.MongodbSim    //mongodb操作对象
-	mgostandard        *mongodb.MongodbSim    //mongodb操作对象
-	qyxydb             *mongodb.MongodbSim    //mongodb操作对象
-	udpclient          mu.UdpClient           //udp对象
-	updport            string
-	savesizei          = 500
-	biddingIndexFields = []string{"_id", "buyerclass", "s_winner", "title", "detail", "detail_bak", "area", "areaval", "site", "type", "amount", "bidopendate", "bidopentime", "buyer", "channel", "city", "comeintime", "contenthtml", "descript", "description", "extracttype", "href", "infoformat", "keywords", "projectcode", "projectname", "publishtime", "s_sha", "spidercode", "subtype", "summary", "toptype", "urltop", "winner", "agency", "budget", "bidamount", "s_subscopeclass", "projectscope", "bidstatus"}
-	projectinfoFields  []string
-	multiIndex         []string
-	BulkSize           = 400
+	Sysconfig            map[string]interface{} //配置文件
+	mgo                  *mongodb.MongodbSim    //mongodb操作对象
+	extractmgo           *mongodb.MongodbSim    //mongodb操作对象
+	project2db           *mongodb.MongodbSim    //mongodb操作对象
+	mgostandard          *mongodb.MongodbSim    //mongodb操作对象
+	qyxydb               *mongodb.MongodbSim    //mongodb操作对象
+	udpclient            mu.UdpClient           //udp对象
+	updport              string
+	savesizei            = 500
+	biddingIndexFields   = []string{"_id", "buyerclass", "s_winner", "title", "detail", "detail_bak", "area", "areaval", "site", "type", "amount", "bidopendate", "bidopentime", "buyer", "channel", "city", "comeintime", "contenthtml", "descript", "description", "extracttype", "href", "infoformat", "keywords", "projectcode", "projectname", "publishtime", "s_sha", "spidercode", "subtype", "summary", "toptype", "urltop", "winner", "agency", "budget", "bidamount", "s_subscopeclass", "projectscope", "bidstatus"}
+	projectinfoFields    []string
+	multiIndex           []string
+	purchasinglistFields []string
+	BulkSize             = 400
+	//bidding_other连接信息
+	bidding_other_es *elastic.Elastic
+	other_index      string
+	other_itype      string
 
 	winner, bidding, biddingback, project, project2, buyer, standard, qyxy_ent map[string]interface{}
 )
@@ -90,9 +94,22 @@ func init() {
 	}
 	mgostandard.InitPool()
 	log.Println(standard["addr"].(string))
-
+	//初始化es
+	//bidding
 	econf := Sysconfig["elastic"].(map[string]interface{})
 	elastic.InitElasticSize(econf["addr"].(string), util.IntAllDef(econf["pool"], 5))
+	//bidding_other
+	if Sysconfig["elastic_other"] != nil {
+		econf_other := Sysconfig["elastic_other"].(map[string]interface{})
+		other_index = econf_other["index"].(string)
+		other_itype = econf_other["type"].(string)
+		bidding_other_es = &elastic.Elastic{
+			S_esurl: econf_other["addr"].(string),
+			I_size:  util.IntAllDef(econf_other["pool"], 5),
+		}
+		bidding_other_es.InitElasticSize()
+	}
+	//
 	if bidding["indexfields"] != nil {
 		biddingIndexFields = util.ObjArrToStringArr(bidding["indexfields"].([]interface{}))
 	}
@@ -102,6 +119,12 @@ func init() {
 			projectinfoFields = strings.Split(pf, ",")
 		}
 	}
+	if bidding["purchasinglist"] != nil {
+		pcl := util.ObjToString(bidding["purchasinglist"])
+		if pcl != "" {
+			purchasinglistFields = strings.Split(pcl, ",")
+		}
+	}
 	if bidding["multiIndex"] != nil {
 		mi := util.ObjToString(bidding["multiIndex"])
 		if mi != "" {
@@ -109,6 +132,7 @@ func init() {
 		}
 	}
 	log.Println(projectinfoFields)
+	log.Println(purchasinglistFields)
 	//初始化oss
 	u.InitOss()
 }

+ 19 - 3
udpcreateindex/src/task.go

@@ -12,7 +12,7 @@ import (
 func task_index() {
 	c := cron.New()
 	c.AddFunc("20 30 5 * * *", func() { task_projects() })
-	c.AddFunc("0 15 * * * *", func() { task_biddingfile() }) //每两小时执行一次
+	//c.AddFunc("0 30 * * * *", func() { task_biddingfile() }) //每30分钟执行一次
 	//c.AddFunc("0 22 14 * * *", func() { task_qyxyindex() })
 	c.Start()
 }
@@ -21,8 +21,12 @@ func task_index() {
 func task_biddingfile() {
 	defer qutil.Catch()
 	q := map[string]interface{}{
-		"extract_state": map[string]interface{}{
-			"$eq": 3,
+		"comeintime": map[string]interface{}{
+			"$gte": time.Now().Unix() - 5400,
+			"$lte": time.Now().Unix() - 3600,
+		},
+		"attach_text": map[string]interface{}{
+			"$exists": true,
 		},
 	}
 	biddingPurchaingTask(q)
@@ -50,3 +54,15 @@ func task_qyxyindex() {
 	q := map[string]interface{}{}
 	qyxyTask(q)
 }
+
+// crontab runs site_attach_text (the task for site "中国招标投标公共服务平台")
+// over a fixed, inclusive _id window.
+func crontab() {
+	defer qutil.Catch()
+	// ObjectId bounds; per the original notes they stand for 2020-02-01 and 2020-04-23.
+	lower := qutil.StringTOBsonId("5e344f000000000000000000")
+	upper := qutil.StringTOBsonId("5ea1bb800000000000000000")
+	idWindow := map[string]interface{}{
+		"$gte": lower,
+		"$lte": upper,
+	}
+	site_attach_text(map[string]interface{}{"_id": idWindow})
+}

+ 105 - 1
udpfilterdup/src/README.md

@@ -1 +1,105 @@
-基于内存的信息重复过滤
+基于内存的信息重复过滤
+
+{
+    "udpport": ":11485",
+    "dupdays": 7,
+    "mongodb": {
+        "addr": "172.17.4.187:27083",
+        "pool": 10,
+        "db": "qfw",
+        "extract": "result_file_20200410",
+        "extract_back": "result_file_20200409",
+        "site": {
+            "dbname": "qfw",
+            "coll": "site"
+        }
+    },
+    "jkmail": {
+        "to": "zhangjinkun@topnet.net.cn",
+        "api": "http://10.171.112.160:19281/_send/_mail"
+    },
+    "nextNode": [
+        {
+            "addr": "172.17.145.179",
+            "port": 1782,
+            "stype": "project",
+            "memo": "合并项目"
+        },
+        {
+            "addr": "127.0.0.1",
+            "port": 1783,
+            "stype": "bidding",
+            "memo": "创建招标数据索引new"
+        }
+    ],
+
+    "threads": 1,
+    "isMerger": false,
+    "isSort":false,
+    "lowHeavy":false,
+    "timingTask":true,
+    "timingSpanDay": 3,
+    "timingPubScope": 720,
+    "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
+    "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
+    "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批)",
+    "specialtitle_2": "项目[((][0-9a-zA-Z一二三四五六七八九十零123456789][))]",
+    "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
+}
+
+
+
+
+
+
+{
+    "udpport": ":1785",
+    "dupdays": 5,
+    "mongodb": {
+        "addr": "172.17.4.187:27083",
+        "pool": 5,
+        "db": "qfw",
+        "extract": "result_file_20200410",
+        "extract_back": "result_file_20200409",
+        "site": {
+            "dbname": "qfw",
+            "coll": "site"
+        }
+    },
+    "jkmail": {
+        "to": "zhengkun@topnet.net.cn,zhangjinkun@topnet.net.cn",
+        "api": "http://10.171.112.160:19281/_send/_mail"
+    },
+    "nextNode": [
+        {
+            "addr": "172.17.145.179",
+            "port": 1782,
+            "stype": "project",
+            "memo": "合并项目"
+        },
+        {
+            "addr": "127.0.0.1",
+            "port": 1783,
+            "stype": "bidding",
+            "memo": "创建招标数据索引new"
+        }
+    ],
+    "threads": 1,
+    "isMerger": false,
+    "isSort":true,
+    "lowHeavy":false,
+    "timingTask":false,
+    "timingSpanDay": 3,
+    "timingPubScope": 720,
+    "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
+    "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
+    "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批)",
+    "specialtitle_2": "项目[((][0-9a-zA-Z一二三四五六七八九十零123456789][))]",
+    "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
+}
+
+
+
+
+
+

+ 15 - 16
udpfilterdup/src/config.json

@@ -1,34 +1,33 @@
 {
-    "udpport": ":1485",
+    "udpport": ":11995",
     "dupdays": 5,
     "mongodb": {
         "addr": "192.168.3.207:27092",
-        "pool": 10,
+        "pool": 5,
         "db": "extract_kf",
-        "extract": "a_testbidding_new",
+        "extract": "zk_Copy_of_zk_bidding_0506",
+        "extract_back": "zk_Copy_of_zk_bidding_0506",
         "site": {
-            "dbname": "qfw",
+            "dbname": "extract_kf",
             "coll": "site"
         }
     },
     "jkmail": {
-        "to": "zhangjinkun@topnet.net.cn",
+        "to": "zhengkun@topnet.net.cn,zhangjinkun@topnet.net.cn",
         "api": "http://10.171.112.160:19281/_send/_mail"
     },
-    "nextNode": [],
+    "nextNode": [
+    ],
+    "threads": 1,
     "isMerger": false,
-    "threads": 5,
-    "isSort":false,
-    "lowHeavy":true,
+    "isSort":true,
+    "lowHeavy":false,
+    "timingTask":false,
+    "timingSpanDay": 3,
+    "timingPubScope": 720,
     "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
     "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
     "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批)",
     "specialtitle_2": "项目[((][0-9a-zA-Z一二三四五六七八九十零123456789][))]",
     "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
-}
-
-
-
-
-
-
+}

+ 156 - 299
udpfilterdup/src/datamap.go

@@ -4,7 +4,6 @@ import (
 	"fmt"
 	"log"
 	qutil "qfw/util"
-	"qfw/util/mongodb"
 	"regexp"
 	"strings"
 	"sync"
@@ -30,10 +29,8 @@ type Info struct {
 	comeintime     int64   //入库时间
 	bidopentime    int64   //开标时间
 	bidopenaddress string  //开标地点
-
-	site string //站点
-	href string //正文的url
-
+	site 		   string //站点
+	href 		     string //正文的url
 	repeatid         string                 //重复id
 	titleSpecialWord bool                   //标题特殊词
 	specialWord      bool                   //再次判断的特殊词
@@ -51,21 +48,80 @@ type datamap struct {
 	days   int        //保留几天数据
 	data   map[string][]*Info
 	keymap []string
+	areakeys []string
 	keys   map[string]bool
 }
 
-//历史更新数据
-type historymap struct {
-	lock   sync.Mutex //锁
-	days   int        //保留几天数据
-	data   map[string][]*Info
-	keymap []string
-	keys   map[string]bool
+// TimedTaskDatamap rebuilds the data pool for the timed task.
+//
+// It sets the package-level datelimit to days*86400 and returns an empty pool
+// when lasttime is negative. Otherwise it reads documents from extract_back
+// with publishtime < lasttime, newest first. Documents with repeat == 1,
+// repeat == -1 or dataging == 1 are not pooled. Loading stops at the first
+// other document whose publishtime is not within datelimit seconds of lasttime.
+// Pooled entries are grouped under "<yyyyMMdd>_<subtype>_<area>", and each
+// distinct area is recorded once in areakeys.
+func TimedTaskDatamap(days int, lasttime int64) *datamap {
+	log.Println("数据池开始重新构建")
+	datelimit = qutil.Float64All(days * 86400)
+	dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, []string{}, map[string]bool{}}
+	if lasttime < 0 {
+		log.Println("数据池空数据")
+		return dm
+	}
+
+	start := int(time.Now().Unix())
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+	query := map[string]interface{}{
+		"publishtime": map[string]interface{}{"$lt": lasttime},
+	}
+	log.Println("query", query)
+	it := sess.DB(mgo.DbName).C(extract_back).Find(query).Sort("-publishtime").Iter()
+	n, continuSum := 0, 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		repeat := qutil.IntAll(tmp["repeat"])
+		excluded := repeat == 1 || repeat == -1 || qutil.IntAll(tmp["dataging"]) == 1
+		if !excluded {
+			pt := tmp["publishtime"]
+			// Results are sorted newest first, so the first document outside
+			// the window ends the scan.
+			if !(qutil.Float64All(lasttime-qutil.Int64All(pt)) < datelimit) {
+				break
+			}
+			continuSum++
+			info := NewInfo(tmp)
+			dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
+			k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
+			dm.data[k] = append(dm.data[k], info)
+			dm.keys[dkey] = true
+			// Record the area (province) once.
+			known := false
+			for _, area := range dm.areakeys {
+				if area == info.area {
+					known = true
+					break
+				}
+			}
+			if !known {
+				dm.areakeys = append(dm.areakeys, info.area)
+			}
+		}
+		if n%50000 == 0 {
+			log.Println("current 数据池:", n, continuSum)
+		}
+		tmp = make(map[string]interface{})
+	}
+
+	log.Printf("数据池构建完成::%d秒,%d个\n", int(time.Now().Unix())-start, n)
+
+	return dm
 }
+
 func NewDatamap(days int, lastid string) *datamap {
 	datelimit = qutil.Float64All(days * 86400)
-	dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, map[string]bool{}}
+	dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{},[]string{}, map[string]bool{}}
 	if lastid == "" {
 		return dm
 	}
@@ -105,6 +161,18 @@ func NewDatamap(days int, lastid string) *datamap {
 				data = append(data, info)
 				dm.data[k] = data
 				dm.keys[dkey] = true
+				//添加省
+				isAreaExist :=false
+				for _,v:= range dm.areakeys {
+					if v==info.area {
+						isAreaExist = true
+					}
+				}
+				if !isAreaExist {
+					areaArr := dm.areakeys
+					areaArr = append(areaArr,info.area)
+					dm.areakeys = areaArr
+				}
 			} else {
 				break
 			}
@@ -118,84 +186,6 @@ func NewDatamap(days int, lastid string) *datamap {
 	return dm
 }
 
-//构建新历史数据池
-func NewHistorymap(startid string, lastid string, startTime int64, lastTime int64) *historymap {
-	datelimit = qutil.Float64All(5 * 86400)
-	hm := &historymap{sync.Mutex{}, 5, map[string][]*Info{}, []string{}, map[string]bool{}}
-	if lastid == "" || startid == "" {
-		return hm
-	}
-	//取startid之前5天
-	sess_start := mgo.GetMgoConn()
-	defer mgo.DestoryMongoConn(sess_start) //lte  gte
-	it_start := sess_start.DB(mgo.DbName).C(extract).Find(mongodb.ObjToMQ(`{"_id":{"$lte":"`+startid+`"}}`,
-		true)).Sort("-_id").Iter()
-	m, n := 0, 0
-	for tmp_start := make(map[string]interface{}); it_start.Next(&tmp_start); {
-		pt_s := tmp_start["comeintime"]
-		if Is_Sort {
-			pt_s = tmp_start["publishtime"]
-		}
-		pt_time := qutil.Int64All(pt_s)
-		if pt_time <= 0 {
-			continue
-		}
-		if qutil.Float64All(startTime-pt_time) <= datelimit {
-			n++
-			info := NewInfo(tmp_start)
-			dkey := qutil.FormatDateWithObj(&pt_s, qutil.Date_yyyyMMdd)
-			k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
-			data := hm.data[k]
-			if data == nil {
-				data = []*Info{}
-			}
-			data = append(data, info)
-			hm.data[k] = data
-			hm.keys[dkey] = true
-		} else {
-			break
-		}
-		tmp_start = make(map[string]interface{})
-	}
-
-	log.Println("load history 前:", n)
-	//取lastid之后5天
-	sess_last := mgo.GetMgoConn()
-	defer mgo.DestoryMongoConn(sess_last) //lte  gte
-	it_last := sess_last.DB(mgo.DbName).C(extract).Find(mongodb.ObjToMQ(`{"_id":{"$gte":"`+lastid+`"}}`,
-		true)).Sort("_id").Iter()
-
-	for tmp_last := make(map[string]interface{}); it_last.Next(&tmp_last); {
-		pt_l := tmp_last["comeintime"]
-		if Is_Sort {
-			pt_l = tmp_last["publishtime"]
-		}
-		pt_time := qutil.Int64All(pt_l)
-		if pt_time <= 0 {
-			continue
-		}
-		if qutil.Float64All(pt_time-lastTime) <= datelimit {
-			m++
-			info := NewInfo(tmp_last)
-			dkey := qutil.FormatDateWithObj(&pt_l, qutil.Date_yyyyMMdd)
-			k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
-			data := hm.data[k]
-			if data == nil {
-				data = []*Info{}
-			}
-			data = append(data, info)
-			hm.data[k] = data
-			hm.keys[dkey] = true
-		} else {
-			break
-		}
-		tmp_last = make(map[string]interface{})
-	}
-
-	log.Println("load history 后:", m)
-	return hm
-}
-
 func NewInfo(tmp map[string]interface{}) *Info {
 	subtype := qutil.ObjToString(tmp["subtype"])
 	area := qutil.ObjToString(tmp["area"])
@@ -203,7 +193,12 @@ func NewInfo(tmp map[string]interface{}) *Info {
 		area = "全国"
 	}
 	info := &Info{}
-	info.id = BsonTOStringId(tmp["_id"])
+	if IdType {
+		info.id = qutil.ObjToString(tmp["_id"])
+	}else  {
+		info.id = BsonTOStringId(tmp["_id"])
+	}
+
 	info.title = qutil.ObjToString(tmp["title"])
 	info.area = area
 	info.subtype = subtype
@@ -243,10 +238,17 @@ func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
 	keys := []string{}
 	d.lock.Lock()
 	for k, _ := range d.keys { //不同时间段
-		keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
-		if info.area != "全国" { //这个后续可以不要
-			keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
+		if info.area=="全国" {
+			//匹配所有省
+			for _,v := range d.areakeys{
+				keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, v))
+			}
+		}else {
+			//匹配指定省
+			keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
 		}
+		keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
+
 	}
 	d.lock.Unlock()
 L:
@@ -255,11 +257,18 @@ L:
 		data := d.data[k]
 		d.lock.Unlock()
 		if len(data) > 0 { //对比v   找到同类型,同省或全国的数据作对比
+			//log.Println(info.area,info.subtype,k)
 			for _, v := range data {
 				reason = ""
 				if v.id == info.id { //正常重复
+					//log.Println("相同id",info.id)
 					return false, v, ""
 				}
+
+				//if v.id == "5c761a4fa5cb26b9b73d9512" &&info.id=="5c767bd1a5cb26b9b7a61597" {
+				//	log.Println("测试数据")
+				//}
+
 				if info.subtype == v.subtype {
 					if info.site != "" {
 						sitelock.Lock()
@@ -364,13 +373,14 @@ L:
 					}
 				}
 			}
+
 		}
 	}
 
 	//往预存数据 d 添加
 	if !b {
 		ct := info.comeintime
-		if Is_Sort {
+		if Is_Sort ||TimingTask{
 			ct = info.publishtime
 		}
 		dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
@@ -388,234 +398,81 @@ L:
 			data = append(data, info)
 			d.data[k] = data
 		}
-		d.lock.Unlock()
-	}
-
-	return
-}
-
-func (h *historymap) checkHistory(info *Info) (b bool, source *Info, reasons string) {
-	reason := ""
-	keys := []string{}
-	h.lock.Lock()
-	for k, _ := range h.keys { //不同时间段
-		keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
-		if info.area != "全国" { //这个后续可以不要
-			keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
-		}
-	}
-	h.lock.Unlock()
-
-L:
-	for _, k := range keys {
-		h.lock.Lock()
-		data := h.data[k]
-		h.lock.Unlock()
-		if len(data) > 0 { //对比v   找到同类型,同省或全国的数据作对比
-			for _, v := range data {
-				reason = ""
-				if v.id == info.id { //正常重复
-					return false, v, ""
-				}
-				if info.subtype == v.subtype {
-					if info.site != "" {
-						sitelock.Lock()
-						dict := SiteMap[info.site]
-						sitelock.Unlock()
-						if dict != nil {
-							if info.area == "全国" && dict["area"] != "" {
-								info.area = qutil.ObjToString(dict["area"])
-								info.city = qutil.ObjToString(dict["city"])
-							} else {
-								if info.city == "" && dict["city"] != "" {
-									info.area = qutil.ObjToString(dict["area"])
-									info.city = qutil.ObjToString(dict["city"])
-								}
-							}
-						}
-					}
-					//前置条件1 - 站点相关
-					if info.site != "" && info.site == v.site {
-						if info.href != "" && info.href == v.href {
-							reason = "href相同"
-							b = true
-							source = v
-							reasons = reason
-							break L
-						}
-						if info.href != "" && info.href != v.href {
-							reason = "href不同-"
-						}
-					}
-
-					//前置条件2 - 标题相关,有且一个关键词
-					if ((info.titleSpecialWord && !v.titleSpecialWord) || (info.specialWord && !v.specialWord)) &&
-						info.title != v.title && v.title != "" && info.title != "" {
-						continue
-					}
-
-					//前置条件3 - 标题相关,均含有关键词
-					if ((info.titleSpecialWord && v.titleSpecialWord) || (info.specialWord && v.specialWord)) &&
-						len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 && v.title != "" && info.title != "" {
-						if !(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
-							continue //无包含关系
-						}
-						if strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title) {
-							reason = reason + "标题关键词且包含关系"
-							//继续二级金额判断
-							if !againRepeat(v, info) {
-								b = true
-								source = v
-								reasons = reason
-								break
-							}
-
-						}
-					}
-
-					//新增快速数据过少判重
-					if LowHeavy {
-						repeat := false
-						if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
-							b = true
-							source = v
-							reasons = reason
-							break
-						}
-					}
-
-					//代理机构相同-非空相等
-					if v.agency != "" && info.agency != "" && v.agency == info.agency {
-						reason = reason + "同机构-"
-						repeat := false
-						if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
-							b = true
-							source = v
-							reasons = reason
-							break
-						}
-					} else {
-						reason = reason + "非同机构-"
-						if info.city != "" && info.city == v.city {
-							reason = reason + "同城-"
-							repeat := false
-							if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
-								b = true
-								source = v
-								reasons = reason
-								break
-							}
-						} else {
-							reason = reason + "不同城-"
-							repeat := false
-							if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
-								b = true
-								source = v
-								reasons = reason
-								break
-							}
-						}
-					}
-				}
-			}
-		}
-	}
 
-	//
-	if b {
-		if info.repeatid == source.id {
-			b = false //重复-无变化-不处理
-		}
-	} else {
-		if source != nil {
-			if source.repeatid != "" { //未判重-有变化--记录
-				b = true
-				reason = "未判重记录"
-				reasons = reason
+		//添加省
+		isAreaExist :=false
+		for _,v:= range d.areakeys {
+			if v==info.area {
+				isAreaExist = true
 			}
 		}
-	}
-	//往预存数据 d 添加
-	if !b {
-		ct := info.comeintime
-		if Is_Sort {
-			ct = info.publishtime
-		}
-		dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
-		k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
-		data := h.data[k]
-		if data == nil {
-			data = []*Info{info}
-			h.data[k] = data
-			if !h.keys[dkey] {
-				h.keys[dkey] = true
-				//h.update(ct)
-			}
-		} else {
-			data = append(data, info)
-			h.data[k] = data
+		if !isAreaExist {
+			areaArr := d.areakeys
+			areaArr = append(areaArr,info.area)
+			d.areakeys = areaArr
 		}
+
+		d.lock.Unlock()
 	}
+
 	return
 }
-
 //replaceSourceData swaps oldData for newData in the source data pool: a missing day_subtype_area bucket for newData is created with newData; otherwise the entry whose id equals oldData.id is overwritten in that bucket, and when no such entry exists oldData is removed from its own bucket.
-func (d *datamap) replaceSourceData(replaceData *Info, replaceId string) {
-	ct := replaceData.comeintime
-	if Is_Sort {
-		ct = replaceData.publishtime
+func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
+	ct := newData.comeintime
+	if Is_Sort||TimingTask { //the bucket date comes from publishtime in sorted or timed-task mode, from comeintime otherwise
+		ct = newData.publishtime
 	}
 	dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
-	k := fmt.Sprintf("%s_%s_%s", dkey, replaceData.subtype, replaceData.area)
+	k := fmt.Sprintf("%s_%s_%s", dkey, newData.subtype, newData.area)
 	d.lock.Lock()
 	data := d.data[k]
 	if data == nil {
-		data = []*Info{replaceData}
+		data = []*Info{newData}
 		d.data[k] = data
 		if !d.keys[dkey] {
 			d.keys[dkey] = true
 		}
 	} else {
-		//iterate and replace
+ 		//iterate and replace
+ 		isReplace := false
 		for k, v := range data {
-			if v.id == replaceId {
-				data[k] = replaceData
+			if v.id == oldData.id {
+				data[k] = newData //replace within the same day_subtype_area bucket
+				isReplace = true
 				break
 			}
 		}
-		d.data[k] = data
-	}
-	d.lock.Unlock()
-}
-
-func (h *historymap) replaceSourceData(replaceData *Info, replaceId string) {
-	ct := replaceData.comeintime
-	if Is_Sort {
-		ct = replaceData.publishtime
-	}
-	dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
-	k := fmt.Sprintf("%s_%s_%s", dkey, replaceData.subtype, replaceData.area)
-	h.lock.Lock()
-	data := h.data[k]
-	if data == nil {
-		data = []*Info{replaceData}
-		h.data[k] = data
-		if !h.keys[dkey] {
-			h.keys[dkey] = true
-		}
-	} else {
-		//iterate and replace
-		for k, v := range data {
-			if v.id == replaceId {
-				data[k] = replaceData
-				break
+		if !isReplace {
+			//add the new record, delete the old record
+			data = append(data,newData) //NOTE(review): this grown slice is not stored back into d.data[k] on this path (only the else branch below assigns it) - confirm newData is meant to be kept
+			ct_old := oldData.comeintime
+			if Is_Sort||TimingTask {
+				ct_old = oldData.publishtime
+			}
+			dkey_old := qutil.FormatDateByInt64(&ct_old, qutil.Date_yyyyMMdd)
+			k_old := fmt.Sprintf("%s_%s_%s", dkey_old, oldData.subtype, oldData.area)
+			data_old := d.data[k_old]
+			if len(data_old)==1 { //NOTE(review): the single entry is dropped without checking that its id equals oldData.id
+				delete(d.data ,k_old)
+			} else {
+				for k, v := range data_old {
+					if v.id == oldData.id {
+						//remove the matching entry
+						data_old = append(data_old[:k], data_old[k+1:]...)
+						break
+					}
+				}
+				d.data[k_old] = data_old
 			}
+		}else {
+			d.data[k] = data
 		}
-		h.data[k] = data
 	}
-	h.lock.Unlock()
+	d.lock.Unlock()
 }
 
+
 func (d *datamap) update(t int64) {
 	//每天0点清除历史数据
 	d.keymap = d.GetLatelyFiveDay(t)
@@ -689,7 +546,7 @@ func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
 		}else if isValue==1 {
 			isMeet := false
 			if isMeet, reason = judgeLowQualityData(v, info, reason); isMeet {
-				log.Println("符合低质量条件条件1",info.id,"--",v.id)
+				//log.Println("符合低质量条件条件1",info.id,"--",v.id)
 				reason = reason + "---有且一个要素组合"
 				return true, reason
 			}
@@ -988,7 +845,7 @@ func tenderRepeat_B(v *Info, info *Info, reason string) (bool, string) {
 		if n == 2 && m == 2 {
 			return false, reason
 		} else {
-			reason = reason + "满足招标B,选二,"
+			reason = reason + "满足招标B,选二,"
 			return true, reason
 		}
 	}

+ 288 - 255
udpfilterdup/src/main.go

@@ -8,6 +8,7 @@ import (
 	"encoding/json"
 	"flag"
 	"fmt"
+	"github.com/cron"
 	"log"
 	mu "mfw/util"
 	"net"
@@ -16,19 +17,20 @@ import (
 	"regexp"
 	"sync"
 	"time"
+
+	"gopkg.in/mgo.v2/bson"
 )
 
 var (
-	Sysconfig map[string]interface{} //配置文件
-	mconf     map[string]interface{} //mongodb配置信息
-	mgo       *MongodbSim            //mongodb操作对象
-	extract   string
-	udpclient mu.UdpClient             //udp对象
-	nextNode  []map[string]interface{} //下节点数组
-	dupdays   = 5                      //初始化判重范围
-	DM        *datamap                 //
-	HM        *historymap              //判重数据
-	lastid    = ""
+	Sysconfig    map[string]interface{} //配置文件
+	mconf        map[string]interface{} //mongodb配置信息
+	mgo          *MongodbSim            //mongodb操作对象
+	extract      string
+	extract_back string
+	udpclient    mu.UdpClient             //udp对象
+	nextNode     []map[string]interface{} //下节点数组
+	dupdays      = 5                      //初始化判重范围
+	DM           *datamap                 //
 
 	//正则筛选相关
 	FilterRegTitle   = regexp.MustCompile("^_$")
@@ -36,15 +38,20 @@ var (
 	FilterRegTitle_1 = regexp.MustCompile("^_$")
 	FilterRegTitle_2 = regexp.MustCompile("^_$")
 
-	isMerger         bool                              //是否合并
-	Is_Sort          bool                              //是否排序
-	threadNum        int                               //线程数量
-	SiteMap          map[string]map[string]interface{} //站点map
-	LowHeavy		 bool							   //低质量数据判重
-	sid, eid string                            //测试人员判重使用
+	isMerger       bool                              //是否合并
+	Is_Sort        bool                              //是否排序
+	threadNum      int                               //线程数量
+	SiteMap        map[string]map[string]interface{} //站点map
+	LowHeavy       bool                              //低质量数据判重
+	TimingTask     bool                              //是否定时任务
+	timingSpanDay  int64                             //时间跨度
+	timingPubScope int64                             //发布时间周期
+	sid,eid,lastid       string                      //测试人员判重使用
+	IdType         bool    //默认object类型
 )
 
 func init() {
+	//5ea9a4800000000000000000
 	flag.StringVar(&lastid, "id", "", "最后加载id") //以小于等于此id开始加载最近几天的数据
 	flag.StringVar(&sid, "sid", "", "开始id")
 	flag.StringVar(&eid, "eid", "", "结束id")
@@ -60,6 +67,8 @@ func init() {
 	}
 	mgo.InitPool()
 	extract = mconf["extract"].(string)
+	extract_back = mconf["extract_back"].(string)
+
 	dupdays = util.IntAllDef(Sysconfig["dupdays"], 3)
 	//加载数据
 	DM = NewDatamap(dupdays, lastid)
@@ -70,7 +79,11 @@ func init() {
 	isMerger = Sysconfig["isMerger"].(bool)
 	Is_Sort = Sysconfig["isSort"].(bool)
 	threadNum = util.IntAllDef(Sysconfig["threads"], 1)
-	LowHeavy =  Sysconfig["lowHeavy"].(bool)
+	LowHeavy = Sysconfig["lowHeavy"].(bool)
+	TimingTask = Sysconfig["timingTask"].(bool)
+	timingSpanDay = util.Int64All(Sysconfig["timingSpanDay"])
+	timingPubScope = util.Int64All(Sysconfig["timingPubScope"])
+
 	//站点配置
 	site := mconf["site"].(map[string]interface{})
 	SiteMap = make(map[string]map[string]interface{}, 0)
@@ -85,10 +98,11 @@ func init() {
 			"district": util.ObjToString(site_dict["district"]),
 			"sitetype": util.ObjToString(site_dict["sitetype"]),
 			"level":    util.ObjToString(site_dict["level"]),
+			"weight":   util.ObjToString(site_dict["weight"]),
 		}
 		SiteMap[util.ObjToString(site_dict["site"])] = data_map
 	}
-	log.Printf("站点加载用时:%d秒,%d个\n", int(time.Now().Unix())-start, len(SiteMap))
+	log.Printf("new站点加载用时:%d秒,%d个\n", int(time.Now().Unix())-start, len(SiteMap))
 }
 
 func main() {
@@ -97,34 +111,38 @@ func main() {
 	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}
 	udpclient.Listen(processUdpMsg)
 	log.Println("Udp服务监听", updport)
+	if TimingTask {
+		go timedTaskDay()
+	}
+
 	time.Sleep(99999 * time.Hour)
 }
 
 //mainT is an entry point for the test team: with TimingTask set it starts the scheduled jobs, otherwise it runs task once over a hard-coded id range.
 func mainT() {
-	/*
-		ObjectId("5da3f31aa5cb26b9b798d3aa")
-		ObjectId("5da418c4a5cb26b9b7e3e9a6")
-
-		ObjectId("5da3f2c5a5cb26b9b79847fc")
-		ObjectId("5db2735ba5cb26b9b7c99c6f")
-	*/
-	log.Println("测试开始")
-	sid = "5da3f2c5a5cb26b9b79847fc"
-	eid = "5db2735ba5cb26b9b7c99c6f"
-
-
 
-	mapinfo := map[string]interface{}{}
-	if sid == "" || eid == "" {
-		log.Println("sid,eid参数不能为空")
-		os.Exit(0)
+	if TimingTask {
+		log.Println("定时任务测试开始")
+		go timedTaskDay()
+		time.Sleep(99999 * time.Hour)
+	} else {
+		//2019-08-01 to 2019-08-17  712646 (presumably the record count for that range - confirm)
+		IdType = true
+		sid = "5d41607aa5cb26b9b734fe30"
+		eid = "5eb172e1f2c1a7850bad1c39"
+		log.Println("正常判重测试开始")
+		log.Println(sid, "---", eid)
+		mapinfo := map[string]interface{}{}
+		if sid == "" || eid == "" { //cannot be true here: both were just assigned non-empty literals above
+			log.Println("sid,eid参数不能为空")
+			os.Exit(0)
+		}
+		mapinfo["gtid"] = sid
+		mapinfo["lteid"] = eid
+		//mapinfo["stop"] = "true"
+		task([]byte{}, mapinfo)
+		time.Sleep(99999 * time.Second)
 	}
-	mapinfo["gtid"] = sid
-	mapinfo["lteid"] = eid
-	mapinfo["stop"] = "true"
-	task([]byte{}, mapinfo)
-	time.Sleep(10 * time.Second)
 }
 func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 	fmt.Println("接受的段数据")
@@ -138,17 +156,13 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 			udpclient.WriteUdp([]byte("err:"+err.Error()), mu.OP_NOOP, ra)
 		} else if mapInfo != nil {
 			taskType := util.ObjToString(mapInfo["stype"])
-			if taskType == "historyTask" {
-				//更新流程
-				go historyTask(data, mapInfo)
-			} else if taskType == "normalTask" {
+			if taskType == "normalTask" {
 				//判重流程
 				go task(data, mapInfo)
 			} else {
 				//其他
 				go task(data, mapInfo)
 			}
-
 			key, _ := mapInfo["key"].(string)
 			if key == "" {
 				key = "udpok"
@@ -175,6 +189,14 @@ func task(data []byte, mapInfo map[string]interface{}) {
 			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
 		},
 	}
+	if IdType {
+		q = map[string]interface{}{
+			"_id": map[string]interface{}{
+				"$gt":  mapInfo["gtid"].(string),
+				"$lte": mapInfo["lteid"].(string),
+			},
+		}
+	}
 	log.Println(mgo.DbName, extract, q)
 	sess := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess)
@@ -185,7 +207,6 @@ func task(data []byte, mapInfo map[string]interface{}) {
 		log.Println("排序:publishtime")
 		it = sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
 	}
-	//it = sess.DB(mgo.DbName).C(extract).Find(&q).Iter()
 	updateExtract := [][]map[string]interface{}{}
 	log.Println("线程数:", threadNum)
 	pool := make(chan bool, threadNum)
@@ -196,7 +217,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 		if n%10000 == 0 {
 			log.Println("current:", n, tmp["_id"], "repeateN:", repeateN)
 		}
-		if util.IntAll(tmp["repeat"]) == 1 {
+		if util.IntAll(tmp["repeat"]) == 1 || util.IntAll(tmp["repeat"]) == -1 {
 			tmp = make(map[string]interface{})
 			repeateN++
 			continue
@@ -209,7 +230,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 				wg.Done()
 			}()
 			info := NewInfo(tmp)
-			if !LowHeavy {	//是否进行低质量数据判重
+			if !LowHeavy { //是否进行低质量数据判重
 				if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
 					updateExtract = append(updateExtract, []map[string]interface{}{
 						map[string]interface{}{
@@ -217,7 +238,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 						},
 						map[string]interface{}{
 							"$set": map[string]interface{}{
-								"repeat": -1,//无效数据标签
+								"repeat": -1, //无效数据标签
 							},
 						},
 					})
@@ -238,26 +259,38 @@ func task(data []byte, mapInfo map[string]interface{}) {
 				var repeat_idMap = map[string]interface{}{} //记录判重的
 				var merge_idMap = map[string]interface{}{}  //记录合并的
 				repeat_idMap["_id"] = StringTOBsonId(info.id)
+				if IdType {
+					repeat_idMap["_id"] = info.id
+				}
 				merge_idMap["_id"] = StringTOBsonId(source.id)
-				repeat_id := source.id//初始化一个数据
+				repeat_id := source.id //初始化一个数据
 
-				if isMerger {//合并相关
+				if isMerger { //合并相关
 					basic_bool := basicDataScore(source, info)
 					if basic_bool {
 						//已原始数据为标准 - 对比数据打判重标签-
 						newData, mergeArr, is_replace = mergeDataFields(source, info)
-						DM.replaceSourceData(newData, source.id) //替换
+						DM.replaceSourceData(newData, source) //替换
 						//对比数据打重复标签的id,原始数据id的记录
 						repeat_idMap["_id"] = StringTOBsonId(info.id)
 						merge_idMap["_id"] = StringTOBsonId(source.id)
+
+						if IdType {
+							repeat_idMap["_id"] = info.id
+							merge_idMap["_id"] = source.id
+						}
 						repeat_id = source.id
 					} else {
 						//已对比数据为标准 ,数据池的数据打判重标签
 						newData, mergeArr, is_replace = mergeDataFields(info, source)
-						DM.replaceSourceData(newData, source.id) //替换
+						DM.replaceSourceData(newData, source) //替换
 						//原始数据打重复标签的id,   对比数据id的记录
 						repeat_idMap["_id"] = StringTOBsonId(source.id)
 						merge_idMap["_id"] = StringTOBsonId(info.id)
+						if IdType {
+							repeat_idMap["_id"] = source.id
+							merge_idMap["_id"] = info.id
+						}
 						repeat_id = info.id
 					}
 
@@ -305,15 +338,22 @@ func task(data []byte, mapInfo map[string]interface{}) {
 							merge_map,
 						})
 					}
-				}else { //高质量数据
+				} else { //高质量数据
 					basic_bool := basicDataScore(source, info)
 					if !basic_bool {
-						DM.replaceSourceData(info, source.id) //替换
+						DM.replaceSourceData(info, source) //替换
 						repeat_idMap["_id"] = StringTOBsonId(source.id)
+						if IdType {
+							repeat_idMap["_id"] = source.id
+						}
 						repeat_id = info.id
 					}
 				}
 
+
+				log.Println("最终结果","目标id:",repeat_idMap["_id"])
+
+
 				//重复数据打标签
 				updateExtract = append(updateExtract, []map[string]interface{}{
 					repeat_idMap,
@@ -342,6 +382,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 
 	//任务完成,开始发送广播通知下面节点
 	if n > repeateN && mapInfo["stop"] == nil {
+		log.Println("判重任务完成发送udp")
 		for _, to := range nextNode {
 			sid, _ := mapInfo["gtid"].(string)
 			eid, _ := mapInfo["lteid"].(string)
@@ -363,245 +404,201 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	}
 }
 
-//支持历史更新
-func historyTask(data []byte, mapInfo map[string]interface{}) {
+//timedTaskDay registers the two scheduled jobs (movedata and timedTaskOnce) on a new cron scheduler and starts it.
+func timedTaskDay() {
+	log.Println("部署定时任务")
+	c := cron.New()
+	c.AddFunc("0 0 1 * * ?", func() { movedata() })      //hour field is 1: once a day at 01:00, assuming a seconds-first 6-field spec (TODO confirm against the cron package in use)
+	c.AddFunc("0 0 */4 * * ?", func() { timedTaskOnce() }) //hour field is */4: every 4 hours on the hour under the same assumption, not once a day at 02:00
+	c.Start()
+	//timedTaskOnce()
+}
+func timedTaskOnce() {
 
-	fmt.Println("开始取历史时间段")
+	log.Println("开始一次定时任务")
 	defer util.Catch()
-	sess := mgo.GetMgoConn()
-	defer mgo.DestoryMongoConn(sess)
-
-	q:= map[string]interface{}{
+	//当前时间-8   -4 小时
+	now := time.Now()
+	log.Println(now)
+	preTime := time.Date(now.Year(), now.Month(), now.Day(), now.Hour()-8, 0, 0, 0, time.Local)
+	curTime := time.Date(now.Year(), now.Month(), now.Day(), now.Hour()-4, 0, 0, 0, time.Local)
+	log.Println(preTime,curTime)
+	task_sid := util.BsonIdToSId(bson.NewObjectIdWithTime(preTime))
+	task_eid := util.BsonIdToSId(bson.NewObjectIdWithTime(curTime))
+	between_time := curTime.Unix() - (86400 * timingPubScope)
+	log.Println("id区间:",task_sid, task_eid,"时间:", between_time)
+	//区间id
+	q_start := map[string]interface{}{
 		"_id": map[string]interface{}{
-			"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
-			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
+			"$gte": StringTOBsonId(task_sid),
+			"$lte": StringTOBsonId(task_eid),
 		},
 	}
-	it := sess.DB(mgo.DbName).C(extract).Find(&q).Iter()
-	minTime, maxTime := int64(0), int64(0)
-	for tmp := make(map[string]interface{}); it.Next(&tmp); {
-		//取出最大最小时间
-		info_time := tmp["comeintime"]
-		if Is_Sort {
-			info_time = tmp["publishtime"]
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+	it_start := sess.DB(mgo.DbName).C(extract).Find(&q_start).Sort("publishtime").Iter()
+	num,oknum, deterTime:= int64(0),int64(0),int64(0) //计数
+	updateExtract := [][]map[string]interface{}{}//批量更新mongo数组
+	pendAllArr:=[][]map[string]interface{}{}//待处理数组
+	dayArr := []map[string]interface{}{}
+	for tmp := make(map[string]interface{}); it_start.Next(&tmp); num++ {
+		if num%10000 == 0 {
+			log.Println("正序遍历:", num)
 		}
-		if minTime == 0 || maxTime == 0 && util.Int64All(info_time) != 0 {
-			minTime = util.Int64All(info_time)
-			maxTime = util.Int64All(info_time)
-		} else {
-			t := util.Int64All(info_time)
-			if t < minTime && t != 0 {
-				minTime = t
-			}
-			if t > maxTime && t != 0 {
-				maxTime = t
+		//取-符合-发布时间半年内的数据
+		if util.IntAll(tmp["dataging"]) == 1 {
+			pubtime := util.Int64All(tmp["publishtime"])
+			if pubtime > 0 && pubtime >= between_time {
+				oknum++
+				if deterTime==0 {
+					log.Println("找到第一条符合条件的数据")
+					deterTime = util.Int64All(tmp["publishtime"])
+					dayArr = append(dayArr,tmp)
+				}else {
+					if pubtime-deterTime >timingSpanDay*86400 {
+						//新数组重新构建,当前组数据加到全部组数据
+						pendAllArr = append(pendAllArr,dayArr)
+						dayArr = []map[string]interface{}{}
+						deterTime = util.Int64All(tmp["publishtime"])
+						dayArr = append(dayArr,tmp)
+					}else {
+						dayArr = append(dayArr,tmp)
+					}
+				}
+			}else {
+				//不在两年内的也清标记
+				updateExtract = append(updateExtract, []map[string]interface{}{
+					map[string]interface{}{
+						"_id": tmp["_id"],
+					},
+					map[string]interface{}{
+						"$set": map[string]interface{}{
+							"dataging": 0,
+						},
+					},
+				})
+				if len(updateExtract) > 50 {
+					mgo.UpSertBulk(extract, updateExtract...)
+					updateExtract = [][]map[string]interface{}{}
+				}
+
 			}
 		}
+		tmp = make(map[string]interface{})
 	}
-	//时间不正确时
-	if minTime == 0 && maxTime == 0 {
-		log.Println("段数据区间 不符合")
-		return
+
+
+	//批量更新标记
+	if len(updateExtract) > 0 {
+		mgo.UpSertBulk(extract, updateExtract...)
+		updateExtract = [][]map[string]interface{}{}
 	}
-	fmt.Println("最小时间==", minTime, "最大时间==", maxTime)
-	gtid, lteid := util.BsonIdToSId(mapInfo["gtid"].(string)), util.BsonIdToSId(mapInfo["lteid"].(string))
-	fmt.Println(gtid, lteid)
-	HM = NewHistorymap(gtid, lteid, minTime, maxTime)
 
-	fmt.Println("开始历史数据判重")
+	if len(dayArr)>0 {
+		pendAllArr = append(pendAllArr,dayArr)
+		dayArr = []map[string]interface{}{}
+	}
 
-	defer util.Catch()
-	//区间id
-	sess_history := mgo.GetMgoConn()
-	defer mgo.DestoryMongoConn(sess_history)
-	q_history := map[string]interface{}{
-		"_id": map[string]interface{}{
-			"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
-			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
-		},
+	log.Println("查询数量:",num,"符合条件:",oknum)
+
+	if len(pendAllArr) <= 0 {
+		log.Println("没找到dataging==1的数据")
+		return
 	}
-	log.Println(mgo.DbName, extract, q_history)
 
-	//是否排序
-	it_history := sess_history.DB(mgo.DbName).C(extract).Find(&q_history).Iter()
-	if Is_Sort {
-		it_history = sess_history.DB(mgo.DbName).C(extract).Find(&q_history).Sort("publishtime").Iter()
+	//测试分组数量是否正确
+	testNum:=0
+	for k,v:=range pendAllArr {
+		log.Println("第",k,"组--","数量:",len(v))
+		testNum = testNum+len(v)
 	}
-	updateExtract := [][]map[string]interface{}{}
-	log.Println("线程数:", threadNum)
-	pool := make(chan bool, threadNum)
-	wg := &sync.WaitGroup{}
+	log.Println("本地构建分组完成:",len(pendAllArr),"组","测试-总计数量:",testNum)
+
 	n, repeateN := 0, 0
-	for tmp := make(map[string]interface{}); it_history.Next(&tmp); n++ {
-		if n%10000 == 0 {
-			log.Println("current:", n, tmp["_id"], "repeateN:", repeateN)
-		}
-		pool <- true
-		wg.Add(1)
-		go func(tmp map[string]interface{}) {
-			defer func() {
-				<-pool
-				wg.Done()
-			}()
+	for k,v:=range pendAllArr {
+		//构建当前组的数据池
+		log.Println("构建第",k,"组---(数据池)")
+		DM = TimedTaskDatamap(dupdays, util.Int64All(v[0]["publishtime"]))
+		log.Println("开始遍历判重第",k,"组  共计数量:",len(v))
+		n = n+len(v)
+		log.Println("统计目前总数量:",n,"重复数量:",repeateN)
+		for _,tmp:=range v {
 			info := NewInfo(tmp)
-			if !LowHeavy {	//是否进行低质量数据判重
+			if !LowHeavy { //是否进行低质量数据判重
 				if invalidData(info.buyer, info.projectname, info.projectcode, info.contractnumber) {
+					log.Println("无效数据")
 					updateExtract = append(updateExtract, []map[string]interface{}{
 						map[string]interface{}{
 							"_id": tmp["_id"],
 						},
 						map[string]interface{}{
 							"$set": map[string]interface{}{
-								"repeat": -1,//无效数据标签
+								"repeat":   -1, //无效数据标签
+								"dataging": 0,
 							},
 						},
 					})
-					if len(updateExtract) > 500 {
+					if len(updateExtract) > 50 {
 						mgo.UpSertBulk(extract, updateExtract...)
 						updateExtract = [][]map[string]interface{}{}
 					}
-					return
+					continue
 				}
 			}
-			b, source, reason := HM.checkHistory(info)
+			b, source, reason := DM.check(info)
 			if b { //有重复,生成更新语句,更新抽取和更新招标
-				if reason == "未判重记录" {
-					fmt.Println("未判重记录")
-					//把info的数据判重的标签更换,并新增字段
-					HM.replaceSourceData(info, info.id) //替换即添加
-					updateExtract = append(updateExtract, []map[string]interface{}{
-						map[string]interface{}{
-							"_id": tmp["_id"],
-						},
-						map[string]interface{}{
-							"$set": map[string]interface{}{
-								"repeat":   0,
-								"repeatid": -2,
-							},
+				log.Println("判重结果", b, reason,"目标id",info.id)
+				repeateN++
+				//重复数据打标签
+				updateExtract = append(updateExtract, []map[string]interface{}{
+					map[string]interface{}{
+						"_id": tmp["_id"],
+					},
+					map[string]interface{}{
+						"$set": map[string]interface{}{
+							"repeat":        1,
+							"repeat_reason": reason,
+							"repeat_id":     source.id,
+							"dataging":      0,
 						},
-					})
-				} else {
-					repeateN++
-					var is_replace = false
-					var mergeArr = []int64{}                    //更改合并数组记录
-					var newData = &Info{}                       //更换新的数据池数据
-					var repeat_idMap = map[string]interface{}{} //记录判重的
-					var merge_idMap = map[string]interface{}{}  //记录合并的
-					repeat_idMap["_id"] = StringTOBsonId(info.id)
-					merge_idMap["_id"] = StringTOBsonId(source.id)
-					repeat_id := source.id
-					//以下合并相关
-					if isMerger {
-						basic_bool := basicDataScore(source, info)
-						if basic_bool {
-							//已原始数据为标准 - 对比数据打判重标签-
-							newData, mergeArr, is_replace = mergeDataFields(source, info)
-							HM.replaceSourceData(newData, source.id) //替换
-							//对比数据打重复标签的id,原始数据id的记录
-							repeat_idMap["_id"] = StringTOBsonId(info.id)
-							merge_idMap["_id"] = StringTOBsonId(source.id)
-							repeat_id = source.id
-						} else {
-							//已对比数据为标准 ,数据池的数据打判重标签
-							newData, mergeArr, is_replace = mergeDataFields(info, source)
-							HM.replaceSourceData(newData, source.id) //替换
-							//原始数据打重复标签的id,   对比数据id的记录
-							repeat_idMap["_id"] = StringTOBsonId(source.id)
-							merge_idMap["_id"] = StringTOBsonId(info.id)
-							repeat_id = info.id
-						}
-
-						merge_map := make(map[string]interface{}, 0)
-						if is_replace { //有过合并-更新数据
-							merge_map = map[string]interface{}{
-								"$set": map[string]interface{}{
-									"merge": newData.mergemap,
-								},
-							}
-
-							//更新合并后的数据
-							for _, value := range mergeArr {
-								if value == 0 {
-									merge_map["$set"].(map[string]interface{})["area"] = newData.area
-									merge_map["$set"].(map[string]interface{})["city"] = newData.city
-								} else if value == 1 {
-									merge_map["$set"].(map[string]interface{})["area"] = newData.area
-									merge_map["$set"].(map[string]interface{})["city"] = newData.city
-								} else if value == 2 {
-									merge_map["$set"].(map[string]interface{})["projectname"] = newData.projectname
-								} else if value == 3 {
-									merge_map["$set"].(map[string]interface{})["projectcode"] = newData.projectcode
-								} else if value == 4 {
-									merge_map["$set"].(map[string]interface{})["buyer"] = newData.buyer
-								} else if value == 5 {
-									merge_map["$set"].(map[string]interface{})["budget"] = newData.budget
-								} else if value == 6 {
-									merge_map["$set"].(map[string]interface{})["winner"] = newData.winner
-								} else if value == 7 {
-									merge_map["$set"].(map[string]interface{})["bidamount"] = newData.bidamount
-								} else if value == 8 {
-									merge_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
-								} else if value == 9 {
-									merge_map["$set"].(map[string]interface{})["contractnumber"] = newData.contractnumber
-								} else if value == 10 {
-									merge_map["$set"].(map[string]interface{})["publishtime"] = newData.publishtime
-								} else if value == 11 {
-									merge_map["$set"].(map[string]interface{})["agency"] = newData.agency
-								} else {
-								}
-							}
-							//模板数据更新
-							updateExtract = append(updateExtract, []map[string]interface{}{
-								merge_idMap,
-								merge_map,
-							})
-						}
-					}else { //高质量数据
-						basic_bool := basicDataScore(source, info)
-						if !basic_bool {
-							HM.replaceSourceData(info, source.id) //替换
-							repeat_idMap["_id"] = StringTOBsonId(source.id)
-							repeat_id = info.id
-						}
-					}
-
-					//重复数据打标签
-					updateExtract = append(updateExtract, []map[string]interface{}{
-						repeat_idMap,
-						map[string]interface{}{
-							"$set": map[string]interface{}{
-								"repeat":        1,
-								"repeat_reason": reason,
-								"repeat_id":     repeat_id,
-							},
+					},
+				})
+			}else {
+				updateExtract = append(updateExtract, []map[string]interface{}{
+					map[string]interface{}{
+						"_id": tmp["_id"],
+					},
+					map[string]interface{}{
+						"$set": map[string]interface{}{
+							"dataging": 0,//符合条件的都为dataging==0
 						},
-					})
-
-				}
+					},
+				})
+			}
+			if len(updateExtract) > 50 {
+				mgo.UpSertBulk(extract, updateExtract...)
+				updateExtract = [][]map[string]interface{}{}
 			}
-		}(tmp)
-		if len(updateExtract) > 500 {
-			mgo.UpSertBulk(extract, updateExtract...)
-			updateExtract = [][]map[string]interface{}{}
 		}
-		tmp = make(map[string]interface{})
 	}
-	wg.Wait()
+
+
+
 	if len(updateExtract) > 0 {
 		mgo.UpSertBulk(extract, updateExtract...)
-		//mgo.UpdateBulk(bidding, updateBidding...)
+		updateExtract = [][]map[string]interface{}{}
 	}
-	log.Println("this task over.", n, "repeateN:", repeateN, mapInfo["stop"])
+	log.Println("this timeTask over.", n, "repeateN:", repeateN)
 
-	//任务完成,开始发送广播通知下面节点
-	if n > repeateN && mapInfo["stop"] == nil {
+	//任务完成,开始发送广播通知下面节点 发udp 去升索引待定 + 合并
+	if n > repeateN {
 		for _, to := range nextNode {
-			sid, _ := mapInfo["gtid"].(string)
-			eid, _ := mapInfo["lteid"].(string)
-			key := sid + "-" + eid + "-" + util.ObjToString(to["stype"])
+			next_sid := util.BsonIdToSId(task_sid)
+			next_eid := util.BsonIdToSId(task_eid)
+			key := next_sid + "-" + next_eid + "-" + util.ObjToString(to["stype"])
 			by, _ := json.Marshal(map[string]interface{}{
-				"gtid":  sid,
-				"lteid": eid,
+				"gtid":  next_sid,
+				"lteid": next_eid,
 				"stype": util.ObjToString(to["stype"]),
 				"key":   key,
 			})
@@ -747,7 +744,7 @@ func basicDataScore(v *Info, info *Info) bool {
 	//先判断level
 	if dict_v != nil {
 		v_level := util.ObjToString(dict_v["level"])
-		if v_level == "中央" {
+		if v_level == "国家" {
 			v_score = 4
 		} else if v_level == "省级" {
 			v_score = 3
@@ -763,7 +760,7 @@ func basicDataScore(v *Info, info *Info) bool {
 
 	if dict_info != nil {
 		info_level := util.ObjToString(dict_info["level"])
-		if info_level == "中央" {
+		if info_level == "国家" {
 			info_score = 4
 		} else if info_level == "省级" {
 			info_score = 3
@@ -788,11 +785,11 @@ func basicDataScore(v *Info, info *Info) bool {
 	//判断sitetype
 	if dict_v != nil {
 		v_sitetype := util.ObjToString(dict_v["sitetype"])
-		if v_sitetype == "政府采购" || v_sitetype == "政府门户" {
+		if v_sitetype == "政府采购" {
 			v_score = 4
 		} else if v_sitetype == "公共资源" {
 			v_score = 3
-		} else if v_sitetype == "官方网站" {
+		} else if v_sitetype == "官方网站"|| v_sitetype == "政府门户" {
 			v_score = 2
 		} else if v_sitetype == "社会公共招标平台" || v_sitetype == "企业招标平台" {
 			v_score = 1
@@ -804,11 +801,11 @@ func basicDataScore(v *Info, info *Info) bool {
 
 	if dict_info != nil {
 		info_sitetype := util.ObjToString(dict_info["sitetype"])
-		if info_sitetype == "政府采购" || info_sitetype == "政府门户" {
+		if info_sitetype == "政府采购" {
 			info_score = 4
 		} else if info_sitetype == "公共资源" {
 			info_score = 3
-		} else if info_sitetype == "官方网站" {
+		} else if info_sitetype == "官方网站"|| info_sitetype == "政府门户" {
 			info_score = 2
 		} else if info_sitetype == "社会公共招标平台" || info_sitetype == "企业招标平台" {
 			info_score = 1
@@ -825,6 +822,17 @@ func basicDataScore(v *Info, info *Info) bool {
 		return false
 	}
 
+	if v_score == info_score {//同sitetype 情况下   分析weight
+		v_weight := util.IntAll(dict_v["weight"])
+		info_weight := util.IntAll(dict_info["weight"])
+		if v_weight>info_weight {
+			return true
+		}
+		if info_weight>v_weight {
+			return false
+		}
+	}
+
 	//网站评估
 	m, n := 0, 0
 	if v.projectname != "" {
@@ -922,3 +930,28 @@ func invalidData(d1 string, d2 string, d3 string, d4 string) bool {
 	}
 	return false
 }
+
+//movedata copies every document of the extract collection whose comeintime is earlier than local midnight today minus dupdays days into extract_back, then deletes those documents from extract (the cutoff uses dupdays only, not dupdays+5).
+func movedata() {
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+	year, month, day := time.Now().Date()
+	q := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$lt": time.Date(year, month, day, 0, 0, 0, 0, time.Local).Add(-time.Duration(dupdays) * 24 * time.Hour).Unix(), //cutoff as Unix seconds: today 00:00 local time minus dupdays days
+		},
+	}
+	log.Println(q)
+	it := sess.DB(mgo.DbName).C(extract).Find(&q).Iter()
+	index := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
+		mgo.Save(extract_back, tmp) //NOTE(review): the outcome of this save is not inspected before the delete below - confirm a failed save cannot lose data
+		tmp = map[string]interface{}{} //fresh map for the next document
+		if index%1000 == 0 {
+			log.Println("index", index)
+		}
+	}
+	log.Println("save to", extract_back, " ok index", index) //NOTE(review): the iterator's error state is not checked after the loop, so an aborted scan is logged the same as a complete one
+	delnum := mgo.Delete(extract, q) //deletes by the same query that selected the copied documents
+	log.Println("remove from ", extract, delnum)
+}

+ 1 - 1
udps/main.go

@@ -18,7 +18,7 @@ var startDate, endDate string
 
 func main() {
 	ip, p, tmptime, tmpkey, id1, id2, stype, q, bkey, param := "", 0, 0, "", "", "", "", "", "", ""
-	flag.StringVar(&startDate, "start", "", "开始日期2006-01-02")
+	flag.StringVar(&startDate, "start", "2020-04-30", "开始日期2006-01-02")
 	flag.StringVar(&endDate, "end", "", "结束日期2006-01-02")
 	flag.StringVar(&ip, "ip", "127.0.0.1", "ip")
 	flag.IntVar(&p, "p", 0, "端口")