瀏覽代碼

Merge branch 'dev3.4.1' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4.1

maxiaoshan 4 年之前
父節點
當前提交
d088b4a855
共有 40 個文件被更改,包括 5589 次插入 和 470 次删除
  1. 1 3
      data_quality/src/main.go
  2. 2 0
      fullproject/src_v1/init.go
  3. 6 3
      fullproject/src_v1/mgotool.go
  4. 3 0
      fullproject/src_v1/project.go
  5. 3 1
      fullproject/src_v1/task.go
  6. 31 3
      fullproject/src_v1/update.go
  7. 3 2
      qyxy/src/main.go
  8. 38 32
      qyxy/src/task.go
  9. 12 1
      src/jy/extract/extract.go
  10. 1 1
      src/res/fieldscore.json
  11. 180 180
      src/res/formattext.json
  12. 18 19
      standardata/src/standaragency.go
  13. 38 34
      standardata/src/standarbuyer.go
  14. 31 36
      standardata/src/standarwinner.go
  15. 7 69
      udpcreateindex/src/biddingall.go
  16. 1 1
      udpcreateindex/src/config.json
  17. 1 1
      udpfilterdup/src/config.json
  18. 94 23
      udpfilterdup/src/dataMethod.go
  19. 21 7
      udpfilterdup/src/dataMethodHeavy.go
  20. 22 2
      udpfilterdup/src/datamap.go
  21. 74 49
      udpfilterdup/src/main.go
  22. 794 0
      udpfilterdup/src3/README.md
  23. 38 0
      udpfilterdup/src3/config.json
  24. 258 0
      udpfilterdup/src3/dataMethod.go
  25. 467 0
      udpfilterdup/src3/dataMethodHeavy.go
  26. 420 0
      udpfilterdup/src3/dataMethodMerge.go
  27. 588 0
      udpfilterdup/src3/datamap.go
  28. 759 0
      udpfilterdup/src3/main.go
  29. 315 0
      udpfilterdup/src3/mgo.go
  30. 59 0
      udpfilterdup/src3/udptaskmap.go
  31. 62 0
      udpfilterdup/src3/updateMethod.go
  32. 23 0
      udpfusion/src/config.json
  33. 200 0
      udpfusion/src/main.go
  34. 315 0
      udpfusion/src/mgo.go
  35. 59 0
      udpfusion/src/sendmail.go
  36. 62 0
      udpfusion/src/updateFusion.go
  37. 62 0
      udpfusion/src/updateRecord.go
  38. 190 0
      udpfusion/src/weightFusion.go
  39. 328 0
      udpfusion/src/weightValue.go
  40. 3 3
      udps/main.go

+ 1 - 3
data_quality/src/main.go

@@ -106,11 +106,9 @@ func mainT() {
 func main() {
 
 
-
-
-
 	sid := "1f0000000000000000000000"
 	eid := "9f0000000000000000000000"
+
 	log.Println(sid, "---", eid)
 	mapinfo := map[string]interface{}{}
 	if sid == "" || eid == "" {

+ 2 - 0
fullproject/src_v1/init.go

@@ -17,6 +17,7 @@ var (
 	Sysconfig                                      map[string]interface{} //读取配置文件
 	MongoTool, MgoBidding                          *MongodbSim            //mongodb连接
 	ExtractColl, ProjectColl, BackupColl, SiteColl string                 //抽取表、项目表、项目快照表、站点表
+	ExtractColl1                                   string
 	Thread                                         int                    //配置项线程数
 	BlackList                                      []interface{}
 	BlaskListMap                                   map[string]bool
@@ -66,6 +67,7 @@ func init() {
 	MgoBidding.InitPool()
 
 	ExtractColl = Sysconfig["extractColl"].(string)
+	ExtractColl1 = Sysconfig["extractColl1"].(string)
 	ProjectColl = Sysconfig["projectColl"].(string)
 	BackupColl = Sysconfig["projectColl"].(string) + "_back"
 	SiteColl = Sysconfig["siteColl"].(string)

+ 6 - 3
fullproject/src_v1/mgotool.go

@@ -124,7 +124,10 @@ func (ms *MgoSess) Iter() *MgoIter {
 }
 
 func (ms *MgoSess) Count() (int64, error) {
-	return ms.M.C.Database(ms.Db).Collection(ms.Coll).CountDocuments(ms.M.Ctx, ms.Query)
+	if ms.Query != nil {
+		return ms.M.C.Database(ms.Db).Collection(ms.Coll).CountDocuments(ms.M.Ctx, ms.Query)
+	}
+	return ms.M.C.Database(ms.Db).Collection(ms.Coll).EstimatedDocumentCount(ms.M.Ctx)
 }
 
 type MongodbSim struct {
@@ -189,9 +192,9 @@ func (m *MongodbSim) UpSertBulk(c string, doc ...[]map[string]interface{}) bool
 		write.SetUpsert(true)
 		writes = append(writes, write)
 	}
-	_, e := coll.BulkWrite(m.Ctx, writes)
+	r, e := coll.BulkWrite(m.Ctx, writes)
 	if e != nil {
-		log.Println("mgo upsert error:", e.Error())
+		log.Println("mgo upsert error:", e.Error(), r)
 		return false
 	}
 	return true

+ 3 - 0
fullproject/src_v1/project.go

@@ -601,6 +601,9 @@ func (p *ProjectTask) NewProject(tmp map[string]interface{}, thisinfo *Info) (st
 			set["qualifies"] = strings.Join(str, ",")
 		}
 	}
+	if len(p1.EntIdList) > 0 {
+		set["entidlist"] = p1.EntIdList
+	}
 	p1.InfoFiled = make(map[string]InfoField)
 	infofield := InfoField{
 		Budget:       thisinfo.Budget,

+ 3 - 1
fullproject/src_v1/task.go

@@ -422,7 +422,7 @@ func (p *ProjectTask) delInfoPro(udpInfo map[string]interface{}) {
 	}
 	client := Es.GetEsConn()
 	defer Es.DestoryEsConn(client)
-	esquery := `{"query": {"bool": {"must": [{"match": {"ids": "`+infoid+`"}}]}}}`
+	esquery := `{"query": {"bool": {"must": [{"term": {"ids": "`+infoid+`"}}]}}}`
 	data := Es.Get(Index, Itype, esquery)
 	if len(*data) > 0 {
 		pid := util.ObjToString(((*data)[0])["_id"])
@@ -490,7 +490,9 @@ func (p *ProjectTask) enter(db, coll string, q map[string]interface{}) {
 						p.CommonMerge(tmp, info)
 					} else {
 						//信息错误,进行更新
+						p.mapBidLock.Lock()
 						countRepeat++
+						p.mapBidLock.Unlock()
 					}
 				}(tmp)
 			case <-over:

+ 31 - 3
fullproject/src_v1/update.go

@@ -88,6 +88,13 @@ func (p *ProjectTask) mergeAndModify(pInfoId string, index, position int, tmp ma
 			for _, v := range proList{
 				v1 := v.(map[string]interface{})
 				temp := MongoTool.FindById(ExtractColl, qu.ObjToString(v1["infoid"]))
+				if len(temp) == 0 {
+					temp = MongoTool.FindById(ExtractColl1, qu.ObjToString(v1["infoid"]))
+					if len(temp) == 0 {
+						qu.Debug("extract not find id...", v1["infoid"])
+						continue
+					}
+				}
 				tempInfo := ParseInfo(temp)
 				if flag {
 					merge := p.ReMerge(tempInfo, temp, tmpPro)
@@ -197,9 +204,9 @@ func (p *ProjectTask) delJudge(infoid, pid string) {
 		backupPro(tmpPro)
 		c := MongoTool.Delete(ProjectColl, pid)
 		if c > 0 {
-			//client := Es.GetEsConn()
-			//defer Es.DestoryEsConn(client)
-			//Es.DelById(Itype, Index, pid)
+			client := Es.GetEsConn()
+			defer Es.DestoryEsConn(client)
+			Es.DelById(Itype, Index, pid)
 		}
 		return
 	}
@@ -225,6 +232,13 @@ func (p *ProjectTask) delJudge(infoid, pid string) {
 		for _, v := range proList{
 			v1 := v.(map[string]interface{})
 			temp := MongoTool.FindById(ExtractColl, qu.ObjToString(v1["infoid"]))
+			if len(temp) == 0 {
+				temp = MongoTool.FindById(ExtractColl1, qu.ObjToString(v1["infoid"]))
+				if len(temp) == 0 {
+					qu.Debug("extract not find id...", v1["infoid"])
+					continue
+				}
+			}
 			tempInfo := ParseInfo(temp)
 			if flag {
 				merge := p.ReMerge(tempInfo, temp, tmpPro)
@@ -447,6 +461,13 @@ func (p *ProjectTask) innerMerge(infoList []interface{}, tmpPro map[string]inter
 	for k, m := range infoList{
 		m1 := m.(map[string]interface{})
 		temp := MongoTool.FindById(ExtractColl, qu.ObjToString(m1["infoid"]))
+		if len(temp) == 0 {
+			temp = MongoTool.FindById(ExtractColl1, qu.ObjToString(m1["infoid"]))
+			if len(temp) == 0 {
+				qu.Debug("extract not find id...", m1["infoid"])
+				continue
+			}
+		}
 		tempInfo := ParseInfo(temp)
 		if k == 0 {
 			p1 = p.newPro(temp, newP, tempInfo)
@@ -467,6 +488,13 @@ func (p *ProjectTask) innerMerge1(infoList []interface{}, infoid string, tmpPro
 			continue
 		}
 		temp := MongoTool.FindById(ExtractColl, qu.ObjToString(m1["infoid"]))
+		if len(temp) == 0 {
+			temp = MongoTool.FindById(ExtractColl1, qu.ObjToString(m1["infoid"]))
+			if len(temp) == 0 {
+				qu.Debug("extract not find id...", m1["infoid"])
+				continue
+			}
+		}
 		tempInfo := ParseInfo(temp)
 		if k == 0 {
 			p1 = p.newPro(temp, newP, tempInfo)

+ 3 - 2
qyxy/src/main.go

@@ -70,8 +70,9 @@ func init() {
 }
 
 func main() {
-	//go TimeTask()
-	QyxyStandard()
+	go TimeTask()
+	//QyxyStandard()
+	//HistoryQyxyStandard()
 	ch := make(chan bool, 1)
 	<-ch
 }

+ 38 - 32
qyxy/src/task.go

@@ -97,6 +97,9 @@ func QyxyStandard() bool {
 	wg := &sync.WaitGroup{}
 	lock := &sync.Mutex{} //控制读写
 	arr := [][]map[string]interface{}{}
+	//q := map[string]interface{}{
+	//	"company_name" : "北京教培师训网络科技股份有限公司",
+	//}
 	count, _ := sess.DB(Dbname).C(Dbcoll).Find(nil).Count()
 	log.Println("共查询:", count, "条")
 	if count == 0 {
@@ -377,23 +380,17 @@ func QyxyStandard() bool {
 			}
 			//es数据过滤
 			EsSaveFlag := true
-			company_name := qu.ObjToString(esMap["company_name"])
-			if len([]rune(company_name)) < 8 {
-				EsSaveFlag = false
-			}
-			if EsSaveFlag {
-				company_type := qu.ObjToString(esMap["company_type"])
-				if company_type == "" || company_type == "个体工商户" {
-					EsSaveFlag = false
-				}
-			}
-			if EsSaveFlag {
-				status := qu.ObjToString(esMap["company_status"])
-				if status != "正常" {
+			company_type := qu.ObjToString(esMap["company_type"])
+			if company_type == "个体工商户" {
+				esMap["company_type_int"] = 1
+				company_name := qu.ObjToString(esMap["company_name"])
+				if len([]rune(company_name)) < 5 {
 					EsSaveFlag = false
 				}
-			}
-			if EsSaveFlag {
+			}else if company_type == "" {
+				EsSaveFlag = false
+			}else {
+				esMap["company_type_int"] = 0
 				credit_no := strings.TrimSpace(qu.ObjToString(esMap["credit_no"]))
 				company_code := strings.TrimSpace(qu.ObjToString(esMap["company_code"]))
 				if credit_no == "" && company_code == "" {
@@ -417,7 +414,7 @@ func QyxyStandard() bool {
 			}
 			EsSaveAllCache <- esMap //所有数据保存
 			update = append(update, map[string]interface{}{"$set": mgoMap})
-			SaveHistoryName(tmp)
+			SaveHistoryName(tmp)		//保存曾用名
 			if len(update) == 2 {
 				arr = append(arr, update)
 			}
@@ -728,19 +725,20 @@ func HistoryQyxyStandard() {
 			}
 			//es数据过滤
 			EsSaveFlag := true
-			company_name := qu.ObjToString(esMap["company_name"])
-			if len([]rune(company_name)) < 8 {
-				EsSaveFlag = false
-			}
-			if EsSaveFlag {
-				company_type := qu.ObjToString(esMap["company_type"])
-				if company_type == "" || company_type == "个体工商户" {
+			company_type := qu.ObjToString(esMap["company_type"])
+			if company_type == "个体工商户" {
+				esMap["company_type_int"] = 1
+				company_name := qu.ObjToString(esMap["company_name"])
+				if len([]rune(company_name)) < 5 {
 					EsSaveFlag = false
 				}
-			}
-			if EsSaveFlag {
-				status := qu.ObjToString(esMap["company_status"])
-				if status != "正常" {
+			}else if company_type == "" {
+				EsSaveFlag = false
+			}else {
+				esMap["company_type_int"] = 0
+				credit_no := strings.TrimSpace(qu.ObjToString(esMap["credit_no"]))
+				company_code := strings.TrimSpace(qu.ObjToString(esMap["company_code"]))
+				if credit_no == "" && company_code == "" {
 					EsSaveFlag = false
 				}
 			}
@@ -766,9 +764,9 @@ func HistoryQyxyStandard() {
 				}
 				EsSaveCache <- esMap //过滤后数据保存
 			}
-			EsSaveAllCache <- esMap //所有数据保存
-			SaveHistoryName(tmp)
-			update = append(update, map[string]interface{}{"$set": mgoMap})
+			//EsSaveAllCache <- esMap //所有数据保存
+			//SaveHistoryName(tmp)
+			//update = append(update, map[string]interface{}{"$set": mgoMap})
 			if len(update) == 2 {
 				arr = append(arr, update)
 			}
@@ -896,10 +894,17 @@ func InitAddress() {
 	log.Println("Init Address...")
 	AddressMap = map[string]*City{}
 	AddressOldMap = map[string]*City{}
-	address, _ := Mgo.Find("address_new_2020", nil, nil, nil, false, -1, -1)
-	for _, tmp := range *address {
+
+	sess := Mgo.GetMgoConn()
+	defer Mgo.DestoryMongoConn(sess)
+	result := sess.DB(Dbname).C("address_new_2020").Find(nil).Iter()
+	count := 0
+	for tmp := make(map[string]interface{}); result.Next(&tmp); count++ {
 		code := qu.ObjToString(tmp["code"])
 		codeLen := len(code)
+		if codeLen > 6 {
+			continue
+		}
 		if t_code := CodeMap[codeLen]; t_code != "" { //新的address表补齐code
 			code = code + t_code
 		}
@@ -915,6 +920,7 @@ func InitAddress() {
 			AddressMap[code] = city
 		}
 	}
+	qu.Debug("Init Address end...", len(AddressMap), len(AddressOldMap))
 }
 
 func InitQyStype() {

+ 12 - 1
src/jy/extract/extract.go

@@ -1806,6 +1806,10 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 					delete(v, "winner")
 					delete(v, "bidamount")
 				}
+				j.Winnerorder = nil
+				if jf!= nil && jf.Winnerorder!= nil{
+					jf.Winnerorder = nil
+				}
 			}
 		}
 		//重新取出清理过后的中标候选人
@@ -2201,7 +2205,14 @@ func checkFields(tmp map[string]interface{}) map[string]interface{} {
 	if tmp["project_timeunit"]=="年" && tmp["project_duration"] == nil {
 		delete(tmp, "project_timeunit")
 	}
-
+	tmp["repeat"] = 0
+	if tmp["winner"] != nil && tmp["s_winner"] != nil {
+		strwin := qu.ObjToString(tmp["winner"])
+		strwin_s := qu.ObjToString(tmp["s_winner"])
+		if !strings.Contains(strwin_s, strwin) {
+			tmp["s_winner"] = strwin
+		}
+	}
 	return tmp
 }
 

+ 1 - 1
src/res/fieldscore.json

@@ -450,7 +450,7 @@
         "negativewords": [
             {
                 "describe": "出现中文汉字",
-                "regstr": "[\\u4e00-\\u9fa5]",
+                "regstr": "[\\u4e00-\\u9fa5]{2,10}",
                 "score": -10
             },  
             {

+ 180 - 180
src/res/formattext.json

@@ -1,182 +1,182 @@
 {
-    "all": [
-		{
-            "reg": "(项目名称)及(编号)[::](.+?),(.+)",
-            "separator": "$1:$3\n项目$2:$4",
-            "desc": ""
-        },
-        {
-            "reg": "([^项目概况|\u4e00-\u9fa5]{2,10})以?及([^招标内容|\u4e00-\u9fa5]{2,10})[::](.+?),(.+)",
-            "separator": "$1:$3\n$2:$4",
-            "desc": ""
-        },
-		{
-            "reg": "项目名称[::].+?([\r\n]编号[::为].+|[((]编号[::].+?[))])",
-            "separator": "编号[::为]__项目编号:",
-            "desc": ""
-        },
-        {
-            "reg": "[((][大小]写.*?[))]",
-            "separator": " ",
-            "desc": "替换掉无效的kv"
-        },
-        {
-            "reg": "(\\d+[,,.]+)+\\d+((百|千)?元|(百|千)?(万|亿)元?)",
-            "separator": "[,,]__",
-            "desc": "把金额里面的,号替换成,号 例如:8,88,8.8元 to 8888.8元"
-        },
-        {
-            "reg": "[^,,\\d](\\d{1,3}[,,]+)+\\d{3}",
-            "separator": "[,,]__",
-            "desc": "把金额里面的,号替换成,号 例如:8,88,8.8元 to 8888.8元"
-        },
-        {
-            "reg": "(.+?[((]((百|千)?元|(百|千)?(万|亿)元?)[))][::][\\d.]+)(.+?[::].+)",
-            "separator": "$1\n$6",
-            "desc": "两个kv连到一起的,指定k以换行符分隔开来"
-        },
-        {
-            "reg": "[((]¥[::][\\d,.]+[))]",
-            "separator": "¥[::]__",
-            "desc": "预中标价:壹佰柒拾捌万玖仟捌佰肆拾玖元整(¥:178984900)"
-        },
-        {
-            "reg": "(.{2,10}?[::][^、::,。\r\n]+?)(((采购)?代理机构|报价截止时间|成交供应商)[::].+)",
-            "separator": "$1\n$2",
-            "desc": "两个kv连到一起的,指定k以换行符分隔开来"
-        },
-        {
-            "reg": "[\r\n].{3,20}?[::]",
-            "separator": "([\u4e00-\u9fa5]+?)[\u3000\u2003\u00a0\\s]+__$1",
-            "desc": "例如:把采 购 人替换成采购人"
-        },
-        {
-            "reg_c": "([\u4e00-\u9fa5][^((,,。、.;;\r\n]{1,30}?[::][^\\s\u3000\u2003\u00a0,、。;;\r\n]+)([((])(.+?[::].+?)([))])",
-			"reg": "[((]([^::))\\r\\n]{2,10}[::][^::))\\r\\n]+)+[))]",
-            "separator": "[\\s\u3000\u2003\u00a0]__",
-            "desc": "项目名称:流通领域商品质量委托抽检(A 包:汽车配件及用品;B 包:家庭用品) "
-        },
-		{
-            "reg": "招标内容[::][((]共1包[))][\r\n]",
-            "separator": "[((]共1包[))][\r\n]__",
-            "desc": ""
-        },
-		{
-			"reg": "地[\\s\u3000\u2003\u00a0]+(址|点)|邮[\\s\u3000\u2003\u00a0]+政[\\s\u3000\u2003\u00a0]+编[\\s\u3000\u2003\u00a0]+码|邮[\\s\u3000\u2003\u00a0]+编|联[\\s\u3000\u2003\u00a0]+系[\\s\u3000\u2003\u00a0]+(人|方[\\s\u3000\u2003\u00a0]+式)|电[\\s\u3000\u2003\u00a0]+话|手[\\s\u3000\u2003\u00a0]+机|传[\\s\u3000\u2003\u00a0]+真|邮[\\s\u3000\u2003\u00a0]+箱|主[\\s\u3000\u2003\u00a0]+要[\\s\u3000\u2003\u00a0]+负[\\s\u3000\u2003\u00a0]+责[\\s\u3000\u2003\u00a0]+人",
-            "separator": "[\\s\u3000\u2003\u00a0]+__",
-            "desc": ""
-		},
-		{
-			"reg": "((地址|邮编)[::][^::\\s\u3000\u2003\u00a0]{5,})(联系人(姓名)?[::])",
-            "separator": "${1} ${3}",
-            "desc": ""
-		},
-		{
-			"reg": "(采购单位)(联系人)及(联系电话)[::](.+?)[::](.+)",
-            "separator": "$1$2:$4\n$1$3:$5",
-            "desc": ""
-		},
-		{
-			"reg": "([^((,,。、.;;::\r\n公司局政府卫生院]{0,8})(联系人|地址)[::]([^\\s\u3000\u2003\u00a0,,]+?)(联系)?(电话(/传真)?|手机|传真|邮编)[::](.+)",
-            "separator": "\n${1}${2}:${3}\n${1}${5}:${7}",
-            "desc": ""
-		},
-		{
-			"reg": "([\n))])(联系人)及(手机|电话)[::](.+?)[\\s\u3000\u2003\u00a0/,,]+(.+)",
-            "separator": "$1$2:$4\n$3:$5",
-            "desc": ""
-		},
-		{
-			"reg": "\\n(.{2,8})(联系地址)、(联系人)及(电话)[::]([^\\s]+)\\s+([^\\s]+)\\s+(.+)",
-            "separator": "\n$1$2:$5\n$1$3:$6\n$1$4:$7",
-            "desc": ""
-		},
-		{
-			"reg": "\\n(.{2,8})联系方式[::](.+?)\\s+\\+\\s+(.+)",
-            "separator": "\n${1}联系人:$2\n${1}联系方式:$3",
-            "desc": ""
-		},
-		{
-			"reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{0,8})联系方式:联系人:",
-            "separator": "${1}联系人:",
-            "desc": ""
-		},
-		{
-			"reg": "\\n(.{2,8})(联系人)、(联系电话)[::]([\u4e00-\u9fa5、]+)(.+)",
-            "separator": "\n${1}${2}:${4}\n${1}${3}:${5}",
-            "desc": ""
-		},
-		{
-			"reg": "(收货)(联系人)和(联系方式)[::](.+?)/(.+)",
-            "separator": "${1}${2}:${4}\n${1}${3}:${5}",
-            "desc": ""
-		},
-		{
-			"reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{2,8})联系人[::]([\u4e00-\u9fa5、]+)\\s+((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
-            "separator": "${1}联系人:${2}\n${1}联系方式:${3}",
-            "desc": ""
-		},
-		{
-			"reg": "(采购[^方式]{1,8})[::]([^::]{3,15})[,,]([^::]{2,5})((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
-            "separator": "${1}:${2}\n${1}联系人:${3}\n${1}联系方式:${4}",
-            "desc": ""
-		},
-		{
-			"reg": "((联系|负责)人(姓名)?)和(电话)[::]([\u4e00-\u9fa5\\s]+)(.+)",
-            "separator": "${1}:${5}\n${4}:${6}",
-            "desc": ""
-		},
-		{
-			"reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{0,8})(联系.{1,4})(和|及)(.{2,4})[::][\u3000\u2003\u00a0\\s]*([\u4e00-\u9fa5]{2,5})[::\\s\u3000\u2003\u00a0]*((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
-            "separator": "${1}${2}:${5}\n${1}${4}:${6}",
-            "desc": "采购人联系人和联系方式:雷蒙:13299985556 or 联系人及电话:  朱云鹏    13993240931"
-		},
-		{
-			"reg": "((招标|代理).{2,4})联系方式[::](.*)联系人[::](.+?)[\\s\u3000\u2003\u00a0]+联系方式[::]([\\d-转()()/、]+)",
-            "separator": "${1}联系人:${4} ${1}电话:${5}",
-            "desc": "采购项目联系方式:</td></tr><tr><td>        联系人:朱志强        联系方式:67897307"
-		},
-		{
-			"reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{0,8}?)(联系(方式|电话|人)和?)+[::]([^\\d::]{2,8}?)[((]?[\\s\u3000\u2003\u00a0]*((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
-            "separator": "${1}联系人:${4}\n${1}联系方式:${5}",
-            "desc": "采购人联系方式:李静  0311-66629799 or 联系方式:张先生 0917―2660282"
-		},
-		{
-			"reg": "[((]([^))]{2,8}联系人)[::](.+?)[,,]((联系)?(电话|手机)(号码)?)[::](.+)[))]",
-            "separator": "\n${1}:${2}\n${3}:${7}",
-            "desc": ""
-		},
-		{
-			"reg": "\n(.{0,8})联系(人|方式)([::](.+?))[,,]((联系)?(电话|手机)(号码)?[::](.+))",
-            "separator": "\n${1}联系人${3}\n${5}",
-            "desc": ""
-		}
-    ],
-    "kv": [
-        {
-            "reg": "</?td[^>]*>",
-            "separator": "",
-            "desc": "把td清理掉"
-        },
-        {
-            "reg": "</?t[rh][^>]*>",
-            "separator": "\n",
-            "desc": "tr或th替换成换行"
-        },
-        {
-            "reg": "[\\s\u3000\u2003\u00a0]+[^\r\n]([一二三四五六七八九十]+[、..]|\\d+、)",
-            "separator": "\n$1",
-            "desc": "给没有换行的序号添加换行"
-        },
-        {
-            "reg": "\n[\\d.\u3000\u2003\u00a0\\s]*(联系人)及(电话)[::](.+?)[\u3000\u2003\u00a0\\s]+(.+)",
-            "separator": "\n$1:$3\n$2:$4",
-            "desc": ""
-        },
-        {
-            "reg": "[^\\n::]{2,18}[::]\\s*详见[^,。,.::\\s]{2,18}",
-            "separator": "",
-            "desc": "替换掉无效的kv"
-        }
-    ]
+  "all": [
+    {
+      "reg": "(项目名称)及(编号)[::](.+?),(.+)",
+      "separator": "$1:$3\n项目$2:$4",
+      "desc": ""
+    },
+    {
+      "reg": "([^项目概况|\u4e00-\u9fa5]{2,10})以?及([^招标内容|\u4e00-\u9fa5]{2,10})[::](.+?),(.+)",
+      "separator": "$1:$3\n$2:$4",
+      "desc": ""
+    },
+    {
+      "reg": "项目名称[::].+?([\r\n]编号[::为].+|[((]编号[::].+?[))])",
+      "separator": "编号[::为]__项目编号:",
+      "desc": ""
+    },
+    {
+      "reg": "[((][大小]写.*?[))]",
+      "separator": " ",
+      "desc": "替换掉无效的kv"
+    },
+    {
+      "reg": "(\\d+[,,.]+)+\\d+((百|千)?元|(百|千)?(万|亿)元?)",
+      "separator": "[,,]__",
+      "desc": "把金额里面的,号替换成,号 例如:8,88,8.8元 to 8888.8元"
+    },
+    {
+      "reg": "[^,,\\d](\\d{1,3}[,,]+)+\\d{3}",
+      "separator": "[,,]__",
+      "desc": "把金额里面的,号替换成,号 例如:8,88,8.8元 to 8888.8元"
+    },
+    {
+      "reg": "(.+?[((]((百|千)?元|(百|千)?(万|亿)元?)[))][::][\\d.]+)(.+?[::].+)",
+      "separator": "$1\n$6",
+      "desc": "两个kv连到一起的,指定k以换行符分隔开来"
+    },
+    {
+      "reg": "[((]¥[::][\\d,.]+[))]",
+      "separator": "¥[::]__",
+      "desc": "预中标价:壹佰柒拾捌万玖仟捌佰肆拾玖元整(¥:178984900)"
+    },
+    {
+      "reg": "[\r\n].{3,20}?[::]",
+      "separator": "([\u4e00-\u9fa5]+?)[\u3000\u2003\u00a0\\s]+__$1",
+      "desc": "例如:把采 购 人替换成采购人"
+    },
+    {
+      "reg_c": "([\u4e00-\u9fa5][^((,,。、.;;\r\n]{1,30}?[::][^\\s\u3000\u2003\u00a0,、。;;\r\n]+)([((])(.+?[::].+?)([))])",
+      "reg": "[((]([^::))\\r\\n]{2,10}[::][^::))\\r\\n]+)+[))]",
+      "separator": "[\\s\u3000\u2003\u00a0]__",
+      "desc": "项目名称:流通领域商品质量委托抽检(A 包:汽车配件及用品;B 包:家庭用品) "
+    },
+    {
+      "reg": "招标内容[::][((]共1包[))][\r\n]",
+      "separator": "[((]共1包[))][\r\n]__",
+      "desc": ""
+    },
+    {
+      "reg": "地[\\s\u3000\u2003\u00a0]+(址|点)|邮[\\s\u3000\u2003\u00a0]+政[\\s\u3000\u2003\u00a0]+编[\\s\u3000\u2003\u00a0]+码|邮[\\s\u3000\u2003\u00a0]+编|联[\\s\u3000\u2003\u00a0]+系[\\s\u3000\u2003\u00a0]+(人|方[\\s\u3000\u2003\u00a0]+式)|电[\\s\u3000\u2003\u00a0]+话|手[\\s\u3000\u2003\u00a0]+机|传[\\s\u3000\u2003\u00a0]+真|邮[\\s\u3000\u2003\u00a0]+箱|主[\\s\u3000\u2003\u00a0]+要[\\s\u3000\u2003\u00a0]+负[\\s\u3000\u2003\u00a0]+责[\\s\u3000\u2003\u00a0]+人",
+      "separator": "[\\s\u3000\u2003\u00a0]+__",
+      "desc": ""
+    },
+    {
+      "reg": "((地址|邮编)[::][^::\\s\u3000\u2003\u00a0]{5,})(联系人(姓名)?[::])",
+      "separator": "${1} ${3}",
+      "desc": ""
+    },
+    {
+      "reg": "(采购单位)(联系人)及(联系电话)[::](.+?)[::](.+)",
+      "separator": "$1$2:$4\n$1$3:$5",
+      "desc": ""
+    },
+    {
+      "reg": "([^((,,。、.;;::\r\n公司局政府卫生院]{0,8})(联系人|地址)[::]([^\\s\u3000\u2003\u00a0,,]+?)(联系)?(电话(/传真)?|手机|传真|邮编)[::](.+)",
+      "separator": "\n${1}${2}:${3}\n${1}${5}:${7}",
+      "desc": ""
+    },
+    {
+      "reg": "([\n))])(联系人)及(手机|电话)[::](.+?)[\\s\u3000\u2003\u00a0/,,]+(.+)",
+      "separator": "$1$2:$4\n$3:$5",
+      "desc": ""
+    },
+    {
+      "reg": "\\n(.{2,8})(联系地址)、(联系人)及(电话)[::]([^\\s]+)\\s+([^\\s]+)\\s+(.+)",
+      "separator": "\n$1$2:$5\n$1$3:$6\n$1$4:$7",
+      "desc": ""
+    },
+    {
+      "reg": "\\n(.{2,8})联系方式[::](.+?)\\s+\\+\\s+(.+)",
+      "separator": "\n${1}联系人:$2\n${1}联系方式:$3",
+      "desc": ""
+    },
+    {
+      "reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{0,8})联系方式:联系人:",
+      "separator": "${1}联系人:",
+      "desc": ""
+    },
+    {
+      "reg": "\\n(.{2,8})(联系人)、(联系电话)[::]([\u4e00-\u9fa5、]+)(.+)",
+      "separator": "\n${1}${2}:${4}\n${1}${3}:${5}",
+      "desc": ""
+    },
+    {
+      "reg": "(收货)(联系人)和(联系方式)[::](.+?)/(.+)",
+      "separator": "${1}${2}:${4}\n${1}${3}:${5}",
+      "desc": ""
+    },
+    {
+      "reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{2,8})联系人[::]([\u4e00-\u9fa5、]+)\\s+((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
+      "separator": "${1}联系人:${2}\n${1}联系方式:${3}",
+      "desc": ""
+    },
+    {
+      "reg": "(采购[^方式]{1,8})[::]([^::]{3,15})[,,]([^::]{2,5})((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
+      "separator": "${1}:${2}\n${1}联系人:${3}\n${1}联系方式:${4}",
+      "desc": ""
+    },
+    {
+      "reg": "((联系|负责)人(姓名)?)和(电话)[::]([\u4e00-\u9fa5\\s]+)(.+)",
+      "separator": "${1}:${5}\n${4}:${6}",
+      "desc": ""
+    },
+    {
+      "reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{0,8})(联系.{1,4})(和|及)(.{2,4})[::][\u3000\u2003\u00a0\\s]*([\u4e00-\u9fa5]{2,5})[::\\s\u3000\u2003\u00a0]*((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
+      "separator": "${1}${2}:${5}\n${1}${4}:${6}",
+      "desc": "采购人联系人和联系方式:雷蒙:13299985556 or 联系人及电话:  朱云鹏    13993240931"
+    },
+    {
+      "reg": "((招标|代理).{2,4})联系方式[::](.*)联系人[::](.+?)[\\s\u3000\u2003\u00a0]+联系方式[::]([\\d-转()()/、]+)",
+      "separator": "${1}联系人:${4} ${1}电话:${5}",
+      "desc": "采购项目联系方式:</td></tr><tr><td>        联系人:朱志强        联系方式:67897307"
+    },
+    {
+      "reg": "(?s)([^((,,。、.;;::\\s\u3000\u2003\u00a0]{0,8}?)(联系(方式|电话|人)和?)+[::]([^\\d::]{2,8}?)[((]?[\\s\u3000\u2003\u00a0]*((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)",
+      "separator": "${1}联系人:${4}\n${1}联系方式:${5}",
+      "desc": "采购人联系方式:李静  0311-66629799 or 联系方式:张先生 0917―2660282"
+    },
+    {
+      "reg": "[((]([^))]{2,8}联系人)[::](.+?)[,,]((联系)?(电话|手机)(号码)?)[::](.+)[))]",
+      "separator": "\n${1}:${2}\n${3}:${7}",
+      "desc": ""
+    },
+    {
+      "reg": "\n(.{0,8})联系(人|方式)([::](.+?))[,,]((联系)?(电话|手机)(号码)?[::](.+))",
+      "separator": "\n${1}联系人${3}\n${5}",
+      "desc": ""
+    }
+  ],
+  "kv": [
+    {
+      "reg": "</?td[^>]*>",
+      "separator": "",
+      "desc": "把td清理掉"
+    },
+    {
+      "reg": "</?t[rh][^>]*>",
+      "separator": "\n",
+      "desc": "tr或th替换成换行"
+    },
+    {
+      "reg": "(代理机构)(.*)\n(联系人.*\n)(联系电话.*\n)(中标单位名称)(.*\n)(地址.*\n)",
+      "separator": "${1}${2}${1}${3}${1}${4}${5}${6}${5}${7}",
+      "desc": ""
+    },
+    {
+      "reg": "(中标单位)\n(名称.*\n)(地址.*\n)",
+      "separator": "${1}${2}${1}${3}",
+      "desc": ""
+    },
+    {
+      "reg": "\n[\\d.\u3000\u2003\u00a0\\s]*(联系人)及(电话)[::](.+?)[\u3000\u2003\u00a0\\s]+(.+)",
+      "separator": "\n$1:$3\n$2:$4",
+      "desc": ""
+    },
+    {
+      "reg": "[^\\n::]{2,18}[::]\\s*详见[^,。,.::\\s]{2,18}",
+      "separator": "",
+      "desc": "替换掉无效的kv"
+    }
+  ]
 }

+ 18 - 19
standardata/src/standaragency.go

@@ -100,7 +100,7 @@ func agencyStandarData(db string, query map[string]interface{}) {
 				data := winMegerIndustry(entid, v)
 				MongoTo.UpdateById(agencyent, entid,
 					map[string]interface{}{
-						"$set":  data,
+						"$set": data,
 						//"$push": map[string]interface{}{"contact": v},
 					},
 				)
@@ -170,28 +170,27 @@ func historyagency(db, fromcoll string) {
 								bs, _ := json.Marshal(ps)
 								redis.PutRedis("agency", agencybd, agency, bs, -1)
 							}
-						} else {
-							log.Println("jsonErr", err)
 						}
 					}
-				} else {
-					val := []map[string]interface{}{}
-					if agencytel != "" {
-						tmp := map[string]interface{}{
-							"contact_person": agencyperson,
-							"phone":          agencytel,
-							"topscopeclass":  comRepTopscopeclass(topscopeclass),
-							"infoid":         _id,
-						}
-						val = append(val, tmp)
+					return
+				}
+				val := []map[string]interface{}{}
+				if agencytel != "" {
+					tmp := map[string]interface{}{
+						"contact_person": agencyperson,
+						"phone":          agencytel,
+						"topscopeclass":  comRepTopscopeclass(topscopeclass),
+						"infoid":         _id,
 					}
-					bs, _ := json.Marshal(val)
-					redis.PutRedis("agency", agencybd, agency, bs, -1)
-					MongoTo.Save(agencyerr, map[string]interface{}{
-						"name":       agency,
-						"updatetime": time.Now().Unix(),
-					})
+					val = append(val, tmp)
 				}
+				bs, _ := json.Marshal(val)
+				redis.PutRedis("agency", agencybd, agency, bs, -1)
+				MongoTo.Save(agencyerr, map[string]interface{}{
+					"name":       agency,
+					"updatetime": time.Now().Unix(),
+				})
+
 			}
 		}(tmp)
 		tmp = map[string]interface{}{}

+ 38 - 34
standardata/src/standarbuyer.go

@@ -103,7 +103,7 @@ func buyerStandarData(db string, query map[string]interface{}) {
 				data := buyerMegerBuyerclass(entid, v)
 				MongoTo.UpdateById(buyerent, entid,
 					map[string]interface{}{
-						"$set":  data,
+						"$set": data,
 						//"$push": map[string]interface{}{"contact": v},
 					},
 				)
@@ -136,7 +136,16 @@ func historybuyer(db, fromcoll string) {
 				<-buyerchanbool
 			}()
 			buyer := qu.ObjToString(tmp["buyer"])
-			buyerclass := qu.ObjToString(tmp["buyerclass"])
+			buyerclass :=""
+			if tb ,ok := tmp["buyerclass"].(primitive.A);ok{
+				tbn := len(tb)
+				if tbn>0{
+					buyerclass = qu.ObjToString(tb[tbn-1])
+				}
+			}else if tbs ,ok :=tmp["buyerclass"].(string);ok{
+				buyerclass = tbs
+
+			}
 			topscopeclass, _ := tmp["topscopeclass"].(primitive.A)
 			if buyer != "" && utf8.RuneCountInString(buyer) > 4 {
 				buyerperson := qu.ObjToString(tmp["buyerperson"])
@@ -158,30 +167,29 @@ func historybuyer(db, fromcoll string) {
 							ps = append(ps, v)
 							bs, _ := json.Marshal(ps)
 							redis.PutRedis("buyer", buyerbd, buyer, bs, -1)
-						} else {
-							log.Println("jsonErr", err)
 						}
 					}
-				} else {
-					val := []map[string]interface{}{}
-					if buyerperson != "" || buyertel != "" {
-						tmp := map[string]interface{}{
-							"contact_person": buyerperson,
-							"phone":          buyertel,
-							"buyerclass":     buyerclass,
-							"topscopeclass":  comRepTopscopeclass(topscopeclass),
-							"infoid":         _id,
-						}
-						val = append(val, tmp)
+					return
+				}
+				val := []map[string]interface{}{}
+				if buyerperson != "" || buyertel != "" {
+					tmp := map[string]interface{}{
+						"contact_person": buyerperson,
+						"phone":          buyertel,
+						"buyerclass":     buyerclass,
+						"topscopeclass":  comRepTopscopeclass(topscopeclass),
+						"infoid":         _id,
 					}
-					bs, _ := json.Marshal(val)
-					redis.PutRedis("buyer", buyerbd, buyer, bs, -1)
-					MongoTo.Save(buyererr, map[string]interface{}{
-						"name":       buyer,
-						"buyerclass": buyerclass,
-						"updatetime": time.Now().Unix(),
-					})
+					val = append(val, tmp)
 				}
+				bs, _ := json.Marshal(val)
+				redis.PutRedis("buyer", buyerbd, buyer, bs, -1)
+				MongoTo.Save(buyererr, map[string]interface{}{
+					"name":       buyer,
+					"buyerclass": buyerclass,
+					"updatetime": time.Now().Unix(),
+				})
+
 			}
 		}(tmp)
 		tmp = map[string]interface{}{}
@@ -246,24 +254,20 @@ func buyerMegerBuyerclass(id string, ps map[string]interface{}) map[string]inter
 		return nil
 	}
 	data := map[string]interface{}{}
-	buyerclass := tmp["buyerclass"].(primitive.A)
-	tmpbuyerclass := map[string]bool{}
-	for _, v := range buyerclass {
-		tt := qu.ObjToString(v)
-		tmpbuyerclass[tt] = true
-	}
-	tmpbuyerclass[qu.ObjToString(ps["buyerclass"])] = true
-	newbuyerclass := []interface{}{}
-	for k, _ := range tmpbuyerclass {
-		newbuyerclass = append(newbuyerclass, k)
+	if buyerclass, ok := tmp["buyerclass"].(primitive.A); ok {
+		bn := len(buyerclass)
+		if bn > 0 {
+			data["buyerclass"] = qu.ObjToString(buyerclass[bn-1])
+		}
+	} else if sb, ok := tmp["buyerclass"].(string); ok {
+		data["buyerclass"] = sb
 	}
-	data["buyerclass"] = newbuyerclass
 	data["updatetime"] = time.Now().Unix()
 	//contact
 	contact := tmp["contact"].(primitive.A)
 	contact = append(contact, ps)
 	//bid_contact
-	bid_contacts,contacts := bid_contact(contact)
+	bid_contacts, contacts := bid_contact(contact)
 	if len(bid_contacts) > 0 {
 		data["bid_contact"] = bid_contacts
 	}

+ 31 - 36
standardata/src/standarwinner.go

@@ -50,7 +50,7 @@ func winnerStandarData(db string, query map[string]interface{}) {
 					province = qu.ObjToString(data["province"])
 					if province == "" { //省份为空,buyer优先提取区域信息再company_address
 						province, city, district = GetProvinceCityDistrict([]string{winner}) //先buyer
-						if province == "" {                                                 //再address
+						if province == "" {                                                  //再address
 							if address := qu.ObjToString(data["company_address"]); address != "" {
 								province, city, district = GetProvinceCityDistrict([]string{address})
 							}
@@ -82,9 +82,9 @@ func winnerStandarData(db string, query map[string]interface{}) {
 						"topscopeclass": comRepTopscopeclass(topscopeclass),
 						"check":         comMarkdata(winner, "winner"),
 						"updatetime":    time.Now().Unix(),
-						"province":   province,
-						"city":       city,
-						"district":   district,
+						"province":      province,
+						"city":          city,
+						"district":      district,
 					})
 				}
 			}
@@ -99,7 +99,7 @@ func winnerStandarData(db string, query map[string]interface{}) {
 				data := winMegerIndustry(entid, v)
 				MongoTo.UpdateById(winnerent, entid,
 					map[string]interface{}{
-						"$set":  data,
+						"$set": data,
 						//"$push": map[string]interface{}{"contact": v},
 					},
 				)
@@ -141,7 +141,8 @@ func historywinner(db, fromcoll string) {
 	log.Println("history  start")
 	sess := MongoFrom.GetMgoConn()
 	defer MongoFrom.Close()
-	it := sess.DB(db).C(fromcoll).Find(map[string]interface{}{}).Select(bson.M{"repeat": 1, "winner": 1, "winnertel": 1, "winnerperson": 1, "topscopeclass": 1}).Sort("_id").Iter()
+	it := sess.DB(db).C(fromcoll).Find(map[string]interface{}{}).Select(bson.M{
+		"repeat": 1, "winner": 1, "winnertel": 1, "winnerperson": 1, "topscopeclass": 1}).Sort("_id").Iter()
 	index := 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
 		if qu.IntAll(tmp["repeat"]) > 0 { //重复数据跳过
@@ -175,29 +176,28 @@ func historywinner(db, fromcoll string) {
 							bs, _ := json.Marshal(ps)
 							redis.PutRedis("winner", winnerbd, winner, bs, -1)
 							//log.Println(_id, index, winner)
-						} else {
-							log.Println("jsonErr", err)
 						}
 					}
-				} else {
-					val := []map[string]interface{}{}
-					if winnerperson != "" || winnertel != "" {
-						tmp := map[string]interface{}{
-							"contact_person": winnerperson,
-							"phone":          winnertel,
-							"topscopeclass":  comRepTopscopeclass(topscopeclass),
-							"infoid":         _id,
-						}
-						val = append(val, tmp)
+					return
+				}
+				val := []map[string]interface{}{}
+				if winnerperson != "" || winnertel != "" {
+					tmp := map[string]interface{}{
+						"contact_person": winnerperson,
+						"phone":          winnertel,
+						"topscopeclass":  comRepTopscopeclass(topscopeclass),
+						"infoid":         _id,
 					}
-					bs, _ := json.Marshal(val)
-					redis.PutRedis("winner", winnerbd, winner, bs, -1)
-					MongoTo.Save(winnererr, map[string]interface{}{
-						"name":          winner,
-						"topscopeclass": comRepTopscopeclass(topscopeclass),
-						"updatetime":    time.Now().Unix(),
-					})
+					val = append(val, tmp)
 				}
+				bs, _ := json.Marshal(val)
+				redis.PutRedis("winner", winnerbd, winner, bs, -1)
+				MongoTo.Save(winnererr, map[string]interface{}{
+					"name":          winner,
+					"topscopeclass": comRepTopscopeclass(topscopeclass),
+					"updatetime":    time.Now().Unix(),
+				})
+
 			}
 		}(tmp)
 		tmp = map[string]interface{}{}
@@ -262,8 +262,8 @@ func winMegerIndustry(id string, ps map[string]interface{}) map[string]interface
 		return nil
 	}
 	data := map[string]interface{}{}
-	industry,ok := tmp["industry"].(primitive.A)
-	if ok{
+	industry, ok := tmp["industry"].(primitive.A)
+	if ok {
 		tmpindustry := map[string]bool{}
 		for _, v := range industry {
 			tt := qu.ObjToString(v)
@@ -286,7 +286,7 @@ func winMegerIndustry(id string, ps map[string]interface{}) map[string]interface
 	contact := tmp["contact"].(primitive.A)
 	contact = append(contact, ps)
 	//bid_contact
-	bid_contacts,contacts := bid_contact(contact)
+	bid_contacts, contacts := bid_contact(contact)
 	if len(bid_contacts) > 0 {
 		data["bid_contact"] = bid_contacts
 	}
@@ -437,15 +437,10 @@ func comHisMegerNewData(name, datatype string, ps []map[string]interface{}) map[
 		data["industry"] = industry
 	} else if datatype == "buyer" {
 		data["buyer_name"] = name
-		tmpbuyerclass := map[string]bool{}
-		for _, p := range ps {
-			tmpbuyerclass[qu.ObjToString(p["buyerclass"])] = true
-		}
-		buyerclass := []interface{}{}
-		for k, _ := range tmpbuyerclass {
-			buyerclass = append(buyerclass, k)
+		lennum := len(ps)
+		if lennum > 1 {
+			data["buyerclass"] = qu.ObjToString(ps[lennum-1]["buyerclass"])
 		}
-		data["buyerclass"] = buyerclass
 		data["ranks"] = ""
 		data["type"] = ""
 		data["address"] = ""

+ 7 - 69
udpcreateindex/src/biddingall.go

@@ -90,6 +90,13 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 		// 	tmp = make(map[string]interface{})
 		// 	continue
 		// }
+		//if qutil.IntAll(tmp["repeat"]) != 0 {
+		//	esQ := `{"query": {"bool": {"must": [{"term": {"id": "`+ mongodb.BsonIdToSId(tmp["_id"]) +`"}}]}}}`
+		//	esData := elastic.Get(index, itype, esQ)
+		//	if len(*esData) > 0 {
+		//		elastic.DelById(index, itype, mongodb.BsonIdToSId(tmp["_id"]))
+		//	}
+		//}
 
 		// if sensitive := qutil.ObjToString(tmp["sensitive"]); sensitive != "" { //bidding中有敏感词,不生索引
 		// 	tmp = make(map[string]interface{})
@@ -483,75 +490,6 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 	UpdatesLock.Unlock()
 	log.Println(mapInfo, "create bidding index...over", n)
 }
-func preNUm(data byte) int {
-	var mask byte = 0x80
-	var num int = 0
-	//8bit中首个0bit前有多少个1bits
-	for i := 0; i < 8; i++ {
-		if (data & mask) == mask {
-			num++
-			mask = mask >> 1
-		} else {
-			break
-		}
-	}
-	return num
-}
-
-func isGBK(data []byte) bool {
-	length := len(data)
-	var i int = 0
-	for i < length {
-		if data[i] <= 0x7f {
-			//编码0~127,只有一个字节的编码,兼容ASCII码
-			i++
-			continue
-		} else {
-			//大于127的使用双字节编码,落在gbk编码范围内的字符
-			if data[i] >= 0x81 &&
-				data[i] <= 0xfe &&
-				data[i+1] >= 0x40 &&
-				data[i+1] <= 0xfe &&
-				data[i+1] != 0xf7 {
-				i += 2
-				continue
-			} else {
-				return false
-			}
-		}
-	}
-	return true
-}
-
-func isUtf8(data []byte) bool {
-	i := 0
-	for i < len(data) {
-		if (data[i] & 0x80) == 0x00 {
-			// 0XXX_XXXX
-			i++
-			continue
-		} else if num := preNUm(data[i]); num > 2 {
-			// 110X_XXXX 10XX_XXXX
-			// 1110_XXXX 10XX_XXXX 10XX_XXXX
-			// 1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
-			// 1111_10XX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
-			// 1111_110X 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
-			// preNUm() 返回首个字节的8个bits中首个0bit前面1bit的个数,该数量也是该字符所使用的字节数
-			i++
-			for j := 0; j < num-1; j++ {
-				//判断后面的 num - 1 个字节是不是都是10开头
-				if (data[i] & 0xc0) != 0x80 {
-					return false
-				}
-				i++
-			}
-		} else {
-			//其他情况说明不是utf-8
-			return false
-		}
-	}
-	return true
-}
 
 //更新extract表
 func UpdateExtract() {

+ 1 - 1
udpcreateindex/src/config.json

@@ -61,7 +61,7 @@
     "projectinfomap": {
       "approvecode": "string", "approvecontent": "string", "approvestatus": "string", "approvetime": "string", "approvedept": "string", "approvenumber": "string", "projecttype": "string", "approvecity": "string"
     },
-    "purchasinglist": "itemname,brandname,model,unitname,number",
+    "purchasinglist": "itemname,brandname,model,unitname,number,unitprice,totalprice",
     "purchasinglistmap": {
       "itemname": "string", "brandname": "string", "model": "string", "unitname": "string", "number": "float64", "unitprice": "float64", "totalprice": "float64"
     },

+ 1 - 1
udpfilterdup/src/config.json

@@ -15,7 +15,7 @@
     "task_mongodb": {
         "task_addrName": "192.168.3.207:27092",
         "task_dbName": "zhengkun",
-        "task_collName": "site",
+        "task_collName": "test",
         "pool": 10
     },
     "jkmail": {

+ 94 - 23
udpfilterdup/src/dataMethod.go

@@ -2,13 +2,12 @@ package main
 
 import (
 	"math"
+	qutil "qfw/util"
 	"regexp"
 	"strings"
-	qutil "qfw/util"
 )
 
 
-
 //完善判重数据检测-前置条件
 func convertArabicNumeralsAndLetters(data string) string {
 	newData :=data
@@ -76,31 +75,92 @@ func againRepeat(v *Info, info *Info) bool {
 	return false
 }
 
-////站点再次判断
-//func againSite(v *Info, info *Info) bool {
-//
-//	if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
-//		return true
-//	}
-//	if isBidWinningAmount(v.bidamount,info.bidamount) && v.bidamount != 0 && info.bidamount != 0{
-//		return true
-//	}
-//	if deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "" {
-//		return true
-//	}
-//	if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
-//		return true
-//	}
-//	if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
-//		return true
-//	}
-//
-//	return false
-//}
+// againContainSpecialWord is the follow-up comparison applied when both
+// records contain a special keyword. It returns true as soon as one of the
+// checks below fires (evaluated top to bottom) and false when none does.
+func againContainSpecialWord(v *Info, info *Info) bool {
+	switch {
+	case isBidopentimeInterval(info.bidopentime, v.bidopentime):
+		// bid-opening-time check; the rule itself lives in isBidopentimeInterval
+		return true
+	case v.budget != info.budget && v.budget != 0 && info.budget != 0:
+		// both budgets are set and they differ
+		return true
+	case isBidWinningAmount(v.bidamount, info.bidamount) && v.bidamount != 0 && info.bidamount != 0:
+		// both bid amounts are set and isBidWinningAmount flags the pair
+		return true
+	case deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "":
+		// both winners are set and differ after whitespace cleanup
+		return true
+	case v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber:
+		// both contract numbers are set and they differ
+		return true
+	case v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode:
+		// both project codes are set and they differ
+		return true
+	case dealTitleSpecial(v.title, info.title):
+		// section/lot markers extracted from the two titles differ
+		return true
+	}
+	return false
+}
 
+// Section/lot marker patterns searched for in a title. They are compiled
+// once at package scope instead of on every dealTitleSpecial call.
+var (
+	// marker word first, then an optionally bracketed number/letter
+	titleSectionWordFirstRe = regexp.MustCompile("(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?")
+	// number/letter first, then the marker word
+	titleSectionNumberFirstRe = regexp.MustCompile("[0-9a-zA-Z一二三四五六七八九十零123456789](包|标段|标包)")
+)
+
+// extractTitleSection returns the cleaned section/lot marker found in title,
+// or "" when neither pattern matches. The word-first pattern is tried before
+// the number-first one.
+func extractTitleSection(title string) string {
+	section := titleSectionWordFirstRe.FindString(title)
+	if section == "" {
+		section = titleSectionNumberFirstRe.FindString(title)
+	}
+	if section == "" {
+		return ""
+	}
+	// Clean the match: whitespace, brackets, then numeral/letter conversion.
+	section = deleteExtraSpace(section)
+	// NOTE(review): each bracket is listed twice, as in the original code;
+	// presumably one of each pair was a full-width variant - confirm.
+	for _, bracket := range []string{"(", "(", ")", ")"} {
+		section = strings.Replace(section, bracket, "", -1)
+	}
+	return convertArabicNumeralsAndLetters(section)
+}
+
+// dealTitleSpecial extracts the section/lot marker from each title and
+// reports whether the two cleaned markers differ. Two titles with no marker
+// compare as equal (false); a marker on only one side compares as different.
+func dealTitleSpecial(title1 string, title2 string) bool {
+	return extractTitleSection(title1) != extractTitleSection(title2)
+}
 
 
 //删除中标单位字符串中多余的空格(含tab)
@@ -129,6 +189,17 @@ func isBidWinningAmount(f1 float64 ,f2 float64) bool {
 }
 
 
+// isTimeIntervalPeriod reports whether i1 and i2 are strictly less than
+// 172800 apart (48 hours when the values are Unix seconds).
+func isTimeIntervalPeriod(i1 int64, i2 int64) bool {
+	const maxGap = 172800.0 // 48h expressed in seconds
+	gap := math.Abs(float64(i1 - i2))
+	return gap < maxGap
+}
+
+
 //开标时间区间为一天
 func isBidopentimeInterval(i1 int64 ,i2 int64) bool {
 	if i1==0||i2==0 {

+ 21 - 7
udpfilterdup/src/dataMethodHeavy.go

@@ -66,7 +66,18 @@ func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
 //判重方法2
 func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
 	isMeet := false
-	if v.agency == info.agency && v.agency != "" && info.agency != "" {
+	isAgency :=false
+	//招标类-代理机构不同-广泛前后缀比较
+	if v.agency != info.agency && v.agency != "" && info.agency != "" {
+		//新增一层判断
+		if strings.Contains(v.agency, info.agency) || strings.Contains(info.agency, v.agency) {
+			isAgency = true
+		}else {
+			return false, reason
+		}
+	}
+
+	if (v.agency == info.agency && v.agency != "" && info.agency != "")|| isAgency {
 		if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
 			info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
 			info.subtype == "变更" || info.subtype == "其他" {
@@ -122,11 +133,7 @@ func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
 		}
 	}
 
-	//不同
-	if v.agency != info.agency && v.agency != "" && info.agency != "" {
-		return false, reason
-	}
-	//机构最少一个为空
+	//机构最少一个为空
 	if v.agency == "" || info.agency == "" {
 		var repeat = false
 		if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
@@ -409,9 +416,16 @@ func contractRepeat_C(v *Info, info *Info) bool {
 
 //快速低质量数据判重
 func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
-	if !isTheSameDay(v.publishtime,info.publishtime) {
+	//if !isTheSameDay(v.publishtime,info.publishtime) {
+	//	return false,reason
+	//}
+
+	//区间间隔48小时(isTimeIntervalPeriod 的阈值为 172800 秒)
+	if !isTimeIntervalPeriod(v.publishtime,info.publishtime) {
 		return false,reason
 	}
+
+
 	//首先判定是否为低质量数据    info目标数据
 	if info.title!=""&&(info.agency==""||v.agency=="")&&
 		info.title==v.title&&info.projectcode==""&&info.contractnumber==""&&info.buyer=="" {

+ 22 - 2
udpfilterdup/src/datamap.go

@@ -141,6 +141,12 @@ func NewDatamap(days int, lastid string) *datamap {
 	nowTime := time.Now().Unix()//当前时间的时间戳
 	n, continuSum := 0, 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+
+		//source := util.ObjToMap(tmp["jsondata"]) //修复临时添加
+		//if util.IntAll((*source)["sourcewebsite"]) == 1 {
+		//	continue
+		//}
+
 		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1{
 
 		} else {
@@ -192,6 +198,12 @@ func NewDatamap(days int, lastid string) *datamap {
 //数据构建
 func NewInfo(tmp map[string]interface{}) *Info {
 	subtype := qutil.ObjToString(tmp["subtype"])
+	if subtype=="招标"||subtype=="邀标"||subtype=="询价"||
+		subtype=="竞谈"||subtype=="竞价" {
+		subtype = "招标"
+	}
+
+
 	area := qutil.ObjToString(tmp["area"])
 	if area == "A" {
 		area = "全国"
@@ -350,10 +362,9 @@ L:
 						}else {
 							if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
 								//无包含关系-即不相等
-								if againRepeat(v, info) {
+								if againContainSpecialWord(v, info) {
 									continue
 								}
-
 							}
 						}
 					}
@@ -575,6 +586,15 @@ func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
 }
 
 
+// currentTotalCount returns the number of entries currently held in the
+// pool: the sum of len over every bucket in d.data.
+// NOTE(review): d.data is read without taking any lock here - confirm no
+// other goroutine mutates it while this runs.
+func (d *datamap) currentTotalCount() int {
+	total := 0
+	for _, bucket := range d.data {
+		// len already yields an int, so no conversion helper is needed
+		total += len(bucket)
+	}
+	return total
+}
+
+
 
 
 

+ 74 - 49
udpfilterdup/src/main.go

@@ -58,7 +58,7 @@ var (
 
 
 func init() {
-	//return
+
 	flag.StringVar(&lastid, "id", "", "增量加载的lastid") //增量
 	flag.StringVar(&gtid, "gtid", "", "历史增量的起始id")	//历史
 	flag.StringVar(&gtept, "gtept", "", "全量gte发布时间")//全量区间pt
@@ -250,27 +250,10 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	wg := &sync.WaitGroup{}
 	n, repeateN := 0, 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
-		if n%10000 == 0 {
+		if n%1000 == 0 {
 			log.Println("current:", n, tmp["_id"],tmp["publishtime"], "repeateN:", repeateN)
 		}
-		//source := util.ObjToMap(tmp["jsondata"]) //前置-jsondata判重
-		//if util.IntAll((*source)["sourcewebsite"]) == 1 {
-		//	repeateN++
-		//	Update.updatePool <- []map[string]interface{}{
-		//		map[string]interface{}{
-		//			"_id": tmp["_id"],
-		//		},
-		//		map[string]interface{}{
-		//			"$set": map[string]interface{}{
-		//				"repeat": 1,
-		//				"dataging":0,
-		//				"repeat_reason": "sourcewebsite为1,重复",
-		//			},
-		//		},
-		//	}
-		//	tmp = make(map[string]interface{})
-		//	continue
-		//}
+
 		if util.IntAll(tmp["repeat"]) == 1 {
 			repeateN++
 			tmp = make(map[string]interface{})
@@ -319,7 +302,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 							"repeat":        1,
 							"repeat_reason": reason,
 							"repeat_id":     source.id,
-							"dataging":		 "0",
+							"dataging":		 0,
 						},
 					},
 				}
@@ -330,9 +313,14 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	wg.Wait()
 
 	log.Println("this task over.", n, "repeateN:", repeateN, mapInfo["stop"])
+	log.Println("当前数据池的数量:",DM.currentTotalCount())
 
 	time.Sleep(30 * time.Second)
 
+	//更新Ocr的标记
+	updateOcrFileData(mapInfo["lteid"].(string))
+
+
 	//任务完成,开始发送广播通知下面节点
 	if n >= repeateN && mapInfo["stop"] == nil {
 		log.Println("判重任务完成发送udp")
@@ -357,6 +345,47 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	}
 }
 
+// updateOcrFileData marks the task-collection record whose "lteid" equals
+// cur_lteid as processed by setting is_repeat_status=1 and is_repeat_time to
+// the current Unix time. Records are walked in descending _id order and only
+// the first match is updated; if none matches, a warning is logged and
+// nothing is written.
+func updateOcrFileData(cur_lteid string) {
+	log.Println("开始更新Ocr表-标记", cur_lteid)
+	conn := task_mgo.GetMgoConn()
+	defer task_mgo.DestoryMongoConn(conn)
+
+	// Empty filter: every record is scanned, newest _id first.
+	filter := map[string]interface{}{}
+	iter := conn.DB(task_mgo.DbName).C(task_collName).Find(&filter).Sort("-_id").Iter()
+
+	var pending [][]map[string]interface{}
+	for doc := make(map[string]interface{}); iter.Next(&doc); {
+		docID := BsonTOStringId(doc["_id"])
+		if util.ObjToString(doc["lteid"]) == cur_lteid {
+			log.Println("找到该lteid数据", cur_lteid, docID)
+			selector := map[string]interface{}{
+				"_id": doc["_id"],
+			}
+			change := map[string]interface{}{
+				"$set": map[string]interface{}{
+					"is_repeat_status": 1,
+					"is_repeat_time":   util.Int64All(time.Now().Unix()),
+				},
+			}
+			pending = append(pending, []map[string]interface{}{selector, change})
+			break
+		}
+		// fresh map so fields of this record cannot leak into the next decode
+		doc = make(map[string]interface{})
+	}
+
+	if len(pending) == 0 {
+		log.Println("出现异常问题,查询不到ocr的lteid", cur_lteid)
+		return
+	}
+	task_mgo.UpSertBulk(task_collName, pending...)
+}
+
 //历史判重
 func historyTaskDay() {
 	defer util.Catch()
@@ -379,18 +408,32 @@ func historyTaskDay() {
 		//查询表最后一个id
 		task_sess := task_mgo.GetMgoConn()
 		defer task_mgo.DestoryMongoConn(task_sess)
-		q:=map[string]interface{}{
-			"isused":true,
-		}
+		q:=map[string]interface{}{}
 		between_time := time.Now().Unix() - (86400 * timingPubScope)//两年周期
 		it_last := task_sess.DB(task_mgo.DbName).C(task_collName).Find(&q).Sort("-_id").Iter()
+
+		isRepeatStatus:=false
 		for tmp := make(map[string]interface{}); it_last.Next(&tmp); {
-			lteid = util.ObjToString(tmp["gtid"])
-			log.Println("查询的最后一个任务Id:",lteid)
-			break
+			is_repeat_status:=util.IntAll(tmp["is_repeat_status"])
+			if is_repeat_status == 1 {
+				lteid = util.ObjToString(tmp["lteid"])
+				log.Println("查询的最后一个已标记的任务lteid:",lteid)
+				isRepeatStatus = true
+				tmp = make(map[string]interface{})
+				break
+			}else  {
+				tmp = make(map[string]interface{})
+			}
 		}
 
-		log.Println("查询完毕-先睡眠5分钟",gtid,lteid)
+		if !isRepeatStatus {
+			log.Println("查询不到有标记的lteid数据")
+			log.Println("睡眠5分钟 gtid:",gtid,"lteid:",lteid)
+			time.Sleep(5 * time.Minute)
+			continue
+		}
+
+		log.Println("查询完毕-找到有标记的lteid-先睡眠5分钟",gtid,lteid)
 		time.Sleep(5 * time.Minute)
 
 		sess := mgo.GetMgoConn()//连接器
@@ -411,26 +454,6 @@ func historyTaskDay() {
 			if num%10000 == 0 {
 				log.Println("正序遍历:", num)
 			}
-			//source := util.ObjToMap(tmp["jsondata"])
-			//if util.IntAll((*source)["sourcewebsite"]) == 1 {
-			//	outnum++
-			//	Update.updatePool <- []map[string]interface{}{//重复数据打标签
-			//		map[string]interface{}{
-			//			"_id": tmp["_id"],
-			//		},
-			//		map[string]interface{}{
-			//			"$set": map[string]interface{}{
-			//				"repeat": 1,
-			//				"dataging": 0,
-			//				"history_updatetime":util.Int64All(time.Now().Unix()),
-			//				"repeat_reason": "sourcewebsite为1 重复",
-			//			},
-			//		},
-			//	}
-			//	tmp = make(map[string]interface{})
-			//	continue
-			//}
-
 			//取-符合-发布时间X年内的数据
 			if util.IntAll(tmp["dataging"]) == 1 {
 				pubtime := util.Int64All(tmp["publishtime"])
@@ -641,7 +664,7 @@ func judgeIsCurIds (gtid string,lteid string,curid string) bool {
 	gt_time, _ := strconv.ParseInt(gtid[:8], 16, 64)
 	lte_time, _ := strconv.ParseInt(lteid[:8], 16, 64)
 	cur_time, _ := strconv.ParseInt(curid[:8], 16, 64)
-	if cur_time>gt_time&&cur_time<=lte_time {
+	if cur_time>=gt_time&&cur_time<=lte_time {
 		return true
 	}
 	return false
@@ -732,5 +755,7 @@ func moveOnceTimeOut()  {
 
 
 
+
+
 
 

+ 794 - 0
udpfilterdup/src3/README.md

@@ -0,0 +1,794 @@
+
+{
+    "udpport": ":19097",
+    "dupdays": 7,
+    "mongodb": {
+        "addr": "172.17.4.85:27080",
+        "pool": 10,
+        "db": "qfw",
+        "extract": "result_20200715",
+        "extract_back": "result_20200714",
+        "site": {
+            "dbname": "qfw",
+            "coll": "site"
+        }
+    },
+    "task_mongodb": {
+        "task_addrName": "172.17.4.187:27081",
+        "task_dbName": "qfw",
+        "task_collName": "ocr_flie_over",
+        "pool": 5
+        
+    },
+    "jkmail": {
+        "to": "zhengkun@topnet.net.cn,zhangjinkun@topnet.net.cn",
+        "api": "http://10.171.112.160:19281/_send/_mail"
+    },
+    "nextNode": [
+        {
+            "addr": "172.17.4.194",
+            "port": 1782,
+            "stype": "project",
+            "memo": "合并项目"
+        },
+        {
+            "addr": "127.0.0.1",
+            "port": 1783,
+            "stype": "bidding",
+            "memo": "创建招标数据索引new"
+        }
+    ],
+    "threads": 1,
+    "isMerger": false,
+    "lowHeavy":true,
+    "timingTask":false,
+    "timingSpanDay": 5,
+    "timingPubScope": 720,
+    "specialwords": "(重招|重新招标|勘察|施工|监理|总承包|土石方|可研)",
+    "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
+    "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批|期)",
+    "specialtitle_2": "项目[((][0-9a-zA-Z一二三四五六七八九十零123456789][))]",
+    "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
+}
+
+
+
+
+
+
+mgo = &MongodbSim{
+		MongodbAddr: "172.17.4.187:27083",
+		DbName:      "qfw",
+		Size:        10,
+	}
+mgo.InitPool()
+	return
+	
+// moveTimeoutData registers moveOnceTimeOut on a cron scheduler and starts it.
+// NOTE(review): the spec "0 0 0 * * ?" presumably means midnight every day
+// with a leading seconds field - confirm against the cron library in use.
+func moveTimeoutData() {
+	log.Println("部署迁移定时任务")
+	scheduler := cron.New()
+	scheduler.AddFunc("0 0 0 * * ?", moveOnceTimeOut)
+	scheduler.Start()
+}
+
+// moveOnceTimeOut performs one migration pass: every record in
+// "result_20200714" whose comeintime is earlier than two years ago (same
+// month, day and hour, minutes zeroed) is saved into "result_20200713" and
+// then deleted from the source collection by id.
+func moveOnceTimeOut() {
+	log.Println("执行一次迁移超时数据")
+
+	conn := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(conn)
+
+	// Cutoff: two calendar years back, truncated to the hour, local time.
+	current := time.Now()
+	cutoff := time.Date(current.Year()-2, current.Month(), current.Day(), current.Hour(), 0, 0, 0, time.Local).Unix()
+	expired := map[string]interface{}{
+		"comeintime": map[string]interface{}{"$lt": cutoff},
+	}
+	log.Println(expired)
+
+	cursor := conn.DB(mgo.DbName).C("result_20200714").Find(&expired).Iter()
+	moved := 0
+	for doc := make(map[string]interface{}); cursor.Next(&doc); moved++ {
+		// progress line every 10000 records
+		if moved%10000 == 0 {
+			log.Println("index", moved)
+		}
+		id := BsonTOStringId(doc["_id"])
+		mgo.Save("result_20200713", doc)
+		mgo.DeleteById("result_20200714", id)
+		// fresh map so the next decode starts clean
+		doc = map[string]interface{}{}
+	}
+	log.Println("save and delete", " ok index", moved)
+}
+
+
+
+
+
+{
+    "udpport": ":1785",
+    "dupdays": 7,
+    "mongodb": {
+        "addr": "172.17.4.85:27080",
+        "pool": 10,
+        "db": "qfw",
+        "extract": "result_20200715",
+        "extract_back": "result_20200714",
+        "site": {
+            "dbname": "qfw",
+            "coll": "site"
+        }
+    },
+    "jkmail": {
+        "to": "zhengkun@topnet.net.cn,zhangjinkun@topnet.net.cn",
+        "api": "http://172.17.145.179:19281/_send/_mail"
+    },
+    "nextNode": [
+        {
+            "addr": "127.0.0.1",
+            "port": 1783,
+            "stype": "bidding",
+            "memo": "创建招标数据索引new"
+        }
+    ],
+    "threads": 1,
+    "isMerger": false,
+    "lowHeavy":true,
+    "timingTask":false,
+    "timingSpanDay": 5,
+    "timingPubScope": 720,
+    "specialwords": "(重招|重新招标|勘察|施工|监理|总承包|土石方|可研)",
+    "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
+    "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批|期)",
+    "specialtitle_2": "项目[((][0-9a-zA-Z一二三四五六七八九十零123456789][))]",
+    "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
+}
+
+func historyTaskDay() {
+	defer util.Catch()
+
+	for {
+		start:=time.Now().Unix()
+
+		if gtid=="" {
+			log.Println("请传gtid,否则无法运行")
+			os.Exit(0)
+			return
+		}
+		if lteid!="" {
+			//先进行数据迁移
+			log.Println("开启一次迁移任务",gtid,lteid)
+			moveHistoryData(gtid,lteid)
+			gtid = lteid //替换数据
+		}
+
+		//查询表最后一个id
+		task_sess := task_mgo.GetMgoConn()
+		defer task_mgo.DestoryMongoConn(task_sess)
+		q:=map[string]interface{}{
+			"isused":true,
+		}
+		between_time := time.Now().Unix() - (86400 * timingPubScope)//两年周期
+		it_last := task_sess.DB(task_mgo.DbName).C(task_collName).Find(&q).Sort("-_id").Iter()
+		for tmp := make(map[string]interface{}); it_last.Next(&tmp); {
+			lteid = util.ObjToString(tmp["gtid"])
+			log.Println("查询的最后一个任务Id:",lteid)
+			break
+		}
+
+		log.Println("查询完毕-先睡眠5分钟",gtid,lteid)
+		time.Sleep(5 * time.Minute)
+
+		sess := mgo.GetMgoConn()//连接器
+		defer mgo.DestoryMongoConn(sess)
+		//开始判重
+		q = map[string]interface{}{
+			"_id": map[string]interface{}{
+				"$gt": StringTOBsonId(gtid),
+				"$lte": StringTOBsonId(lteid),
+			},
+		}
+		log.Println("历史判重查询条件:",q,"时间:", between_time)
+		it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
+		num,oknum,outnum, deterTime:= int64(0),int64(0),int64(0),int64(0) //计数
+		updateExtract := [][]map[string]interface{}{}//批量更新mongo数组
+		pendAllArr:=[][]map[string]interface{}{}//待处理数组
+		dayArr := []map[string]interface{}{}
+		for tmp := make(map[string]interface{}); it.Next(&tmp); num++ {
+			if num%10000 == 0 {
+				log.Println("正序遍历:", num)
+			}
+			source := util.ObjToMap(tmp["jsondata"])
+			if util.IntAll((*source)["sourcewebsite"]) == 1 {
+				outnum++
+				updatelock.Lock()
+				updateExtract = append(updateExtract, []map[string]interface{}{
+					map[string]interface{}{
+						"_id": tmp["_id"],
+					},
+					map[string]interface{}{
+						"$set": map[string]interface{}{
+							"repeat": 1,
+							"dataging": 0,
+							"repeat_reason": "sourcewebsite为1 重复",
+						},
+					},
+				})
+				if len(updateExtract) >= 200 {
+					log.Println("sourcewebsite,批量更新")
+					mgo.UpSertBulk(extract, updateExtract...)
+					updateExtract = [][]map[string]interface{}{}
+				}
+
+				updatelock.Unlock()
+
+
+				tmp = make(map[string]interface{})
+				continue
+			}
+
+			//取-符合-发布时间X年内的数据
+			updatelock.Lock()
+			if util.IntAll(tmp["dataging"]) == 1 {
+				pubtime := util.Int64All(tmp["publishtime"])
+				if pubtime > 0 && pubtime >= between_time {
+					oknum++
+					if deterTime==0 {
+						log.Println("找到第一条符合条件的数据")
+						deterTime = util.Int64All(tmp["publishtime"])
+						dayArr = append(dayArr,tmp)
+					}else {
+						if pubtime-deterTime >timingSpanDay*86400 {
+							//新数组重新构建,当前组数据加到全部组数据
+							pendAllArr = append(pendAllArr,dayArr)
+							dayArr = []map[string]interface{}{}
+							deterTime = util.Int64All(tmp["publishtime"])
+							dayArr = append(dayArr,tmp)
+						}else {
+							dayArr = append(dayArr,tmp)
+						}
+					}
+				}else {
+					outnum++
+					//不在两年内的也清标记
+					updateExtract = append(updateExtract, []map[string]interface{}{
+						map[string]interface{}{
+							"_id": tmp["_id"],
+						},
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"dataging": 0,
+							},
+						},
+					})
+					if len(updateExtract) >= 200 {
+						log.Println("不在周期内符合dataging==1,批量更新")
+						mgo.UpSertBulk(extract, updateExtract...)
+						updateExtract = [][]map[string]interface{}{}
+					}
+
+				}
+			}
+
+			updatelock.Unlock()
+
+			tmp = make(map[string]interface{})
+		}
+
+
+		//批量更新标记
+		updatelock.Lock()
+
+		if len(updateExtract) > 0 {
+			log.Println("分组后,最后更新不进行判重的数据:",len(updateExtract),oknum+outnum)
+			mgo.UpSertBulk(extract, updateExtract...)
+			updateExtract = [][]map[string]interface{}{}
+		}
+
+		updatelock.Unlock()
+
+
+		if len(dayArr)>0 {
+			pendAllArr = append(pendAllArr,dayArr)
+			dayArr = []map[string]interface{}{}
+		}
+
+		log.Println("查询数量:",num,"符合条件:",oknum)
+
+		if len(pendAllArr) <= 0 {
+			log.Println("没找到dataging==1的数据")
+		}
+
+		//测试分组数量是否正确
+		testNum:=0
+		for k,v:=range pendAllArr {
+			log.Println("第",k,"组--","数量:",len(v))
+			testNum = testNum+len(v)
+		}
+		log.Println("本地构建分组完成:",len(pendAllArr),"组","测试-总计数量:",testNum)
+
+		n, repeateN := 0, 0
+		log.Println("线程数:",threadNum)
+		pool := make(chan bool, threadNum)
+		wg := &sync.WaitGroup{}
+		for k,v:=range pendAllArr { //每组结束更新一波数据
+			pool <- true
+			wg.Add(1)
+			go func(k int, v []map[string]interface{}) {
+				defer func() {
+					<-pool
+					wg.Done()
+				}()
+				//每组临时数组 -  互不干扰
+				groupUpdateExtract := [][]map[string]interface{}{}
+				//
+				groupOtherExtract := [][]map[string]interface{}{}
+
+				//构建当前组的数据池
+				log.Println("构建第", k, "组---(数据池)")
+				//当前组的第一个发布时间
+				first_pt := util.Int64All(v[len(v)-1]["publishtime"])
+				curTM := TimedTaskDatamap(dupdays+int(timingSpanDay)+1, first_pt+86400, int(k))
+				log.Println("开始遍历判重第", k, "组  共计数量:", len(v))
+				n = n + len(v)
+				log.Println("统计目前总数量:", n, "重复数量:", repeateN)
+				for _, tmp := range v {
+					info := NewInfo(tmp)
+					b, source, reason := curTM.check(info)
+					if b { //有重复,生成更新语句,更新抽取和更新招标
+						repeateN++
+						//重复数据打标签
+						repeat_ids:=source.repeat_ids
+						repeat_ids =  append(repeat_ids,info.id)
+						source.repeat_ids = repeat_ids
+						//替换数据池-更新
+						DM.replacePoolData(source)
+						updatelock.Lock()
+
+
+						//更新数据源-   14 或者 15
+						//判断是否在当前段落
+						if judgeIsCurIds(gtid,lteid,source.id) {
+							groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{//重复数据打标签
+								map[string]interface{}{
+									"_id": StringTOBsonId(source.id),
+								},
+								map[string]interface{}{
+									"$set": map[string]interface{}{
+										"repeat_ids": repeat_ids,
+									},
+								},
+							})
+						}else {
+							groupOtherExtract = append(groupOtherExtract, []map[string]interface{}{//重复数据打标签
+								map[string]interface{}{
+									"_id": StringTOBsonId(source.id),
+								},
+								map[string]interface{}{
+									"$set": map[string]interface{}{
+										"repeat_ids": repeat_ids,
+									},
+								},
+							})
+						}
+						groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{
+							map[string]interface{}{
+								"_id": tmp["_id"],
+							},
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"repeat":        1,
+									"repeat_reason": reason,
+									"repeat_id":     source.id,
+									"dataging":      0,
+								},
+							},
+						})
+
+						if len(groupUpdateExtract) >= 500 {
+							mgo.UpSertBulk(extract, groupUpdateExtract...)
+							groupUpdateExtract = [][]map[string]interface{}{}
+						}
+
+						if len(groupOtherExtract) >= 500 {
+							mgo.UpSertBulk(extract_back, groupOtherExtract...)
+							groupOtherExtract = [][]map[string]interface{}{}
+						}
+
+						updatelock.Unlock()
+
+
+					} else {
+						updatelock.Lock()
+
+						groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{
+							map[string]interface{}{
+								"_id": tmp["_id"],
+							},
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"dataging": 0, //符合条件的都为dataging==0
+								},
+							},
+						})
+
+						if len(groupUpdateExtract) >= 500 {
+							mgo.UpSertBulk(extract, groupUpdateExtract...)
+							groupUpdateExtract = [][]map[string]interface{}{}
+						}
+						updatelock.Unlock()
+					}
+				}
+				//每组数据结束-更新数据
+				updatelock.Lock()
+				if len(groupUpdateExtract) > 0 {
+					mgo.UpSertBulk(extract, groupUpdateExtract...)
+				}
+
+				if len(groupOtherExtract) > 0 {
+					mgo.UpSertBulk(extract_back, groupOtherExtract...)
+				}
+				updatelock.Unlock()
+
+			}(k, v)
+
+		}
+
+		wg.Wait()
+
+
+		//任务完成,开始发送广播通知下面节点 发udp 去升索引待定 + 合并
+		if n >= repeateN && gtid!=lteid{
+			for _, to := range nextNode {
+				next_sid := util.BsonIdToSId(gtid)
+				next_eid := util.BsonIdToSId(lteid)
+				key := next_sid + "-" + next_eid + "-" + util.ObjToString(to["stype"])
+				by, _ := json.Marshal(map[string]interface{}{
+					"gtid":  next_sid,
+					"lteid": next_eid,
+					"stype": util.ObjToString(to["stype"]),
+					"key":   key,
+				})
+				addr := &net.UDPAddr{
+					IP:   net.ParseIP(to["addr"].(string)),
+					Port: util.IntAll(to["port"]),
+				}
+				node := &udpNode{by, addr, time.Now().Unix(), 0}
+				udptaskmap.Store(key, node)
+				udpclient.WriteUdp(by, mu.OP_TYPE_DATA, addr)
+			}
+		}
+
+		end:=time.Now().Unix()
+
+		log.Println("this timeTask over.", n, "repeateN:", repeateN,gtid,lteid)
+		log.Println(gtid,lteid)
+		if end-start<60*5 {
+			log.Println("睡眠.............")
+			time.Sleep(5 * time.Minute)
+		}
+		log.Println("继续下一段的历史判重")
+	}
+}func historyTaskDay() {
+ 	// historyTaskDay runs the historical duplicate check in an endless loop.
+ 	// Each pass: optionally migrates the previous id segment, reads a new
+ 	// upper-bound id from the task collection, scans the extract records whose
+ 	// _id lies in (gtid, lteid] ordered by publishtime, groups the records
+ 	// flagged dataging==1 into publish-time windows, checks every group for
+ 	// duplicates in a bounded number of goroutines, bulk-writes the resulting
+ 	// flags and finally notifies the nextNode entries over UDP.
+ 	//
+ 	// NOTE(review): the two `defer ...DestoryMongoConn` calls below sit inside
+ 	// the `for` loop, which has no break; deferred calls only run when the
+ 	// function returns, so one pair is queued per pass — confirm whether the
+ 	// sessions should be released at the end of each pass instead.
+ 	defer util.Catch()
+ 
+ 	for {
+ 		start:=time.Now().Unix()
+ 
+ 		if gtid=="" {
+ 			log.Println("请传gtid,否则无法运行")
+ 			os.Exit(0)
+ 			return // NOTE(review): presumably unreachable, os.Exit ends the process
+ 		}
+ 		if lteid!="" {
+ 			// Migrate the data of the previous segment first.
+ 			log.Println("开启一次迁移任务",gtid,lteid)
+ 			moveHistoryData(gtid,lteid)
+ 			gtid = lteid // the previous upper bound becomes the new lower bound
+ 		}
+ 
+ 		// Look up the newest entry of the task collection.
+ 		task_sess := task_mgo.GetMgoConn()
+ 		defer task_mgo.DestoryMongoConn(task_sess)
+ 		q:=map[string]interface{}{
+ 			"isused":true,
+ 		}
+ 		between_time := time.Now().Unix() - (86400 * timingPubScope)// oldest accepted publish time (original comment: two-year period)
+ 		it_last := task_sess.DB(task_mgo.DbName).C(task_collName).Find(&q).Sort("-_id").Iter()
+ 		// Only the first document (highest _id) is read; its "gtid" field becomes lteid.
+ 		for tmp := make(map[string]interface{}); it_last.Next(&tmp); {
+ 			lteid = util.ObjToString(tmp["gtid"])
+ 			log.Println("查询的最后一个任务Id:",lteid)
+ 			break
+ 		}
+ 
+ 		log.Println("查询完毕-先睡眠5分钟",gtid,lteid)
+ 		time.Sleep(5 * time.Minute)
+ 
+ 		sess := mgo.GetMgoConn()// session used for the scan below
+ 		defer mgo.DestoryMongoConn(sess)
+ 		// Start the duplicate check: select gtid < _id <= lteid.
+ 		q = map[string]interface{}{
+ 			"_id": map[string]interface{}{
+ 				"$gt": StringTOBsonId(gtid),
+ 				"$lte": StringTOBsonId(lteid),
+ 			},
+ 		}
+ 		log.Println("历史判重查询条件:",q,"时间:", between_time)
+ 		it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
+ 		num,oknum,outnum, deterTime:= int64(0),int64(0),int64(0),int64(0) // counters; deterTime is the publish time that opened the current group
+ 		updateExtract := [][]map[string]interface{}{}// pending bulk updates for records that are not duplicate-checked
+ 		pendAllArr:=[][]map[string]interface{}{}// all groups waiting to be checked
+ 		dayArr := []map[string]interface{}{}
+ 		for tmp := make(map[string]interface{}); it.Next(&tmp); num++ {
+ 			if num%10000 == 0 {
+ 				log.Println("正序遍历:", num)
+ 			}
+ 			source := util.ObjToMap(tmp["jsondata"])
+ 			// Records whose jsondata.sourcewebsite is 1 are marked as duplicates
+ 			// directly and skipped.
+ 			if util.IntAll((*source)["sourcewebsite"]) == 1 {
+ 				outnum++
+ 				updatelock.Lock()
+ 				updateExtract = append(updateExtract, []map[string]interface{}{
+ 					map[string]interface{}{
+ 						"_id": tmp["_id"],
+ 					},
+ 					map[string]interface{}{
+ 						"$set": map[string]interface{}{
+ 							"repeat": 1,
+ 							"dataging": 0,
+ 							"repeat_reason": "sourcewebsite为1 重复",
+ 						},
+ 					},
+ 				})
+ 				if len(updateExtract) >= 200 {
+ 					log.Println("sourcewebsite,批量更新")
+ 					mgo.UpSertBulk(extract, updateExtract...)
+ 					updateExtract = [][]map[string]interface{}{}
+ 				}
+ 
+ 				updatelock.Unlock()
+ 
+ 
+ 				tmp = make(map[string]interface{})
+ 				continue
+ 			}
+ 
+ 			// Keep the records whose publish time falls inside the accepted window.
+ 			updatelock.Lock()
+ 			if util.IntAll(tmp["dataging"]) == 1 {
+ 				pubtime := util.Int64All(tmp["publishtime"])
+ 				if pubtime > 0 && pubtime >= between_time {
+ 					oknum++
+ 					if deterTime==0 {
+ 						log.Println("找到第一条符合条件的数据")
+ 						deterTime = util.Int64All(tmp["publishtime"])
+ 						dayArr = append(dayArr,tmp)
+ 					}else {
+ 						if pubtime-deterTime >timingSpanDay*86400 {
+ 							// Span exceeded: close the current group, add it to the
+ 							// list of all groups and open a new one with this record.
+ 							pendAllArr = append(pendAllArr,dayArr)
+ 							dayArr = []map[string]interface{}{}
+ 							deterTime = util.Int64All(tmp["publishtime"])
+ 							dayArr = append(dayArr,tmp)
+ 						}else {
+ 							dayArr = append(dayArr,tmp)
+ 						}
+ 					}
+ 				}else {
+ 					outnum++
+ 					// Records outside the window also get their dataging flag cleared.
+ 					updateExtract = append(updateExtract, []map[string]interface{}{
+ 						map[string]interface{}{
+ 							"_id": tmp["_id"],
+ 						},
+ 						map[string]interface{}{
+ 							"$set": map[string]interface{}{
+ 								"dataging": 0,
+ 							},
+ 						},
+ 					})
+ 					if len(updateExtract) >= 200 {
+ 						log.Println("不在周期内符合dataging==1,批量更新")
+ 						mgo.UpSertBulk(extract, updateExtract...)
+ 						updateExtract = [][]map[string]interface{}{}
+ 					}
+ 
+ 				}
+ 			}
+ 
+ 			updatelock.Unlock()
+ 
+ 			// Fresh map so the record stored in dayArr is not overwritten by the next Next call.
+ 			tmp = make(map[string]interface{})
+ 		}
+ 
+ 
+ 		// Flush the remaining flag updates.
+ 		updatelock.Lock()
+ 
+ 		if len(updateExtract) > 0 {
+ 			log.Println("分组后,最后更新不进行判重的数据:",len(updateExtract),oknum+outnum)
+ 			mgo.UpSertBulk(extract, updateExtract...)
+ 			updateExtract = [][]map[string]interface{}{}
+ 		}
+ 
+ 		updatelock.Unlock()
+ 
+ 
+ 		// The last, still open group also has to be queued.
+ 		if len(dayArr)>0 {
+ 			pendAllArr = append(pendAllArr,dayArr)
+ 			dayArr = []map[string]interface{}{}
+ 		}
+ 
+ 		log.Println("查询数量:",num,"符合条件:",oknum)
+ 
+ 		if len(pendAllArr) <= 0 {
+ 			log.Println("没找到dataging==1的数据")
+ 		}
+ 
+ 		// Sanity check: log the size of every group and the overall total.
+ 		testNum:=0
+ 		for k,v:=range pendAllArr {
+ 			log.Println("第",k,"组--","数量:",len(v))
+ 			testNum = testNum+len(v)
+ 		}
+ 		log.Println("本地构建分组完成:",len(pendAllArr),"组","测试-总计数量:",testNum)
+ 
+ 		// NOTE(review): n and repeateN are updated inside the goroutines below
+ 		// without holding updatelock; with threadNum > 1 this looks like a data
+ 		// race — confirm and guard or use atomics.
+ 		n, repeateN := 0, 0
+ 		log.Println("线程数:",threadNum)
+ 		pool := make(chan bool, threadNum) // limits the number of groups processed at once
+ 		wg := &sync.WaitGroup{}
+ 		for k,v:=range pendAllArr { // each group flushes its own updates when it finishes
+ 			pool <- true
+ 			wg.Add(1)
+ 			go func(k int, v []map[string]interface{}) {
+ 				defer func() {
+ 					<-pool
+ 					wg.Done()
+ 				}()
+ 				// Per-group buffer of updates written to the extract collection.
+ 				groupUpdateExtract := [][]map[string]interface{}{}
+ 				// Per-group buffer of updates written to the extract_back collection.
+ 				groupOtherExtract := [][]map[string]interface{}{}
+ 
+ 				// Build the data pool for this group.
+ 				log.Println("构建第", k, "组---(数据池)")
+ 				// NOTE(review): the original comment calls this the group's first
+ 				// publish time, but the code reads the LAST element of the group
+ 				// (the scan is sorted ascending by publishtime) — confirm intent.
+ 				first_pt := util.Int64All(v[len(v)-1]["publishtime"])
+ 				curTM := TimedTaskDatamap(dupdays+int(timingSpanDay)+1, first_pt+86400, int(k))
+ 				log.Println("开始遍历判重第", k, "组  共计数量:", len(v))
+ 				n = n + len(v)
+ 				log.Println("统计目前总数量:", n, "重复数量:", repeateN)
+ 				for _, tmp := range v {
+ 					info := NewInfo(tmp)
+ 					b, source, reason := curTM.check(info)
+ 					if b { // duplicate found: queue updates for both the source and this record
+ 						repeateN++
+ 						// Append this record's id to the source's list of duplicates.
+ 						repeat_ids:=source.repeat_ids
+ 						repeat_ids =  append(repeat_ids,info.id)
+ 						source.repeat_ids = repeat_ids
+ 						// Put the updated source back into the data pool.
+ 						// NOTE(review): this goes through the package-level DM, not curTM — confirm.
+ 						DM.replacePoolData(source)
+ 						updatelock.Lock()
+ 
+ 
+ 						// Update the source record (original comment: "14 or 15", meaning not visible here).
+ 						// Route the update depending on whether the source id is in the current segment.
+ 						if judgeIsCurIds(gtid,lteid,source.id) {
+ 							groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{// tag the source with its duplicate ids
+ 								map[string]interface{}{
+ 									"_id": StringTOBsonId(source.id),
+ 								},
+ 								map[string]interface{}{
+ 									"$set": map[string]interface{}{
+ 										"repeat_ids": repeat_ids,
+ 									},
+ 								},
+ 							})
+ 						}else {
+ 							groupOtherExtract = append(groupOtherExtract, []map[string]interface{}{// tag the source with its duplicate ids
+ 								map[string]interface{}{
+ 									"_id": StringTOBsonId(source.id),
+ 								},
+ 								map[string]interface{}{
+ 									"$set": map[string]interface{}{
+ 										"repeat_ids": repeat_ids,
+ 									},
+ 								},
+ 							})
+ 						}
+ 						// Mark the current record itself as a duplicate of source.
+ 						groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{
+ 							map[string]interface{}{
+ 								"_id": tmp["_id"],
+ 							},
+ 							map[string]interface{}{
+ 								"$set": map[string]interface{}{
+ 									"repeat":        1,
+ 									"repeat_reason": reason,
+ 									"repeat_id":     source.id,
+ 									"dataging":      0,
+ 								},
+ 							},
+ 						})
+ 
+ 						if len(groupUpdateExtract) >= 500 {
+ 							mgo.UpSertBulk(extract, groupUpdateExtract...)
+ 							groupUpdateExtract = [][]map[string]interface{}{}
+ 						}
+ 
+ 						if len(groupOtherExtract) >= 500 {
+ 							mgo.UpSertBulk(extract_back, groupOtherExtract...)
+ 							groupOtherExtract = [][]map[string]interface{}{}
+ 						}
+ 
+ 						updatelock.Unlock()
+ 
+ 
+ 					} else {
+ 						updatelock.Lock()
+ 
+ 						groupUpdateExtract = append(groupUpdateExtract, []map[string]interface{}{
+ 							map[string]interface{}{
+ 								"_id": tmp["_id"],
+ 							},
+ 							map[string]interface{}{
+ 								"$set": map[string]interface{}{
+ 									"dataging": 0, // every checked record ends up with dataging==0
+ 								},
+ 							},
+ 						})
+ 
+ 						if len(groupUpdateExtract) >= 500 {
+ 							mgo.UpSertBulk(extract, groupUpdateExtract...)
+ 							groupUpdateExtract = [][]map[string]interface{}{}
+ 						}
+ 						updatelock.Unlock()
+ 					}
+ 				}
+ 				// Group finished: flush whatever is left in both buffers.
+ 				updatelock.Lock()
+ 				if len(groupUpdateExtract) > 0 {
+ 					mgo.UpSertBulk(extract, groupUpdateExtract...)
+ 				}
+ 
+ 				if len(groupOtherExtract) > 0 {
+ 					mgo.UpSertBulk(extract_back, groupOtherExtract...)
+ 				}
+ 				updatelock.Unlock()
+ 
+ 			}(k, v)
+ 
+ 		}
+ 
+ 		wg.Wait()
+ 
+ 
+ 		// Task finished: notify the downstream nodes over UDP (original note:
+ 		// index upgrade pending + merge). Skipped when the segment is empty (gtid==lteid).
+ 		if n >= repeateN && gtid!=lteid{
+ 			for _, to := range nextNode {
+ 				next_sid := util.BsonIdToSId(gtid)
+ 				next_eid := util.BsonIdToSId(lteid)
+ 				key := next_sid + "-" + next_eid + "-" + util.ObjToString(to["stype"])
+ 				by, _ := json.Marshal(map[string]interface{}{
+ 					"gtid":  next_sid,
+ 					"lteid": next_eid,
+ 					"stype": util.ObjToString(to["stype"]),
+ 					"key":   key,
+ 				})
+ 				addr := &net.UDPAddr{
+ 					IP:   net.ParseIP(to["addr"].(string)),
+ 					Port: util.IntAll(to["port"]),
+ 				}
+ 				// The message is stored in udptaskmap under key before it is sent.
+ 				node := &udpNode{by, addr, time.Now().Unix(), 0}
+ 				udptaskmap.Store(key, node)
+ 				udpclient.WriteUdp(by, mu.OP_TYPE_DATA, addr)
+ 			}
+ 		}
+ 
+ 		end:=time.Now().Unix()
+ 
+ 		log.Println("this timeTask over.", n, "repeateN:", repeateN,gtid,lteid)
+ 		log.Println(gtid,lteid)
+ 		// A pass that took less than five minutes is followed by a five-minute pause.
+ 		if end-start<60*5 {
+ 			log.Println("睡眠.............")
+ 			time.Sleep(5 * time.Minute)
+ 		}
+ 		log.Println("继续下一段的历史判重")
+ 	}
+ }

+ 38 - 0
udpfilterdup/src3/config.json

@@ -0,0 +1,38 @@
+{
+    "udpport": ":17859",
+    "dupdays": 7,
+    "mongodb": {
+        "addr": "192.168.3.207:27092",
+        "pool": 10,
+        "db": "zhengkun",
+        "extract": "test",
+        "extract_back": "test",
+        "site": {
+            "dbname": "zhengkun",
+            "coll": "site"
+        }
+    },
+    "task_mongodb": {
+        "task_addrName": "192.168.3.207:27092",
+        "task_dbName": "zhengkun",
+        "task_collName": "test",
+        "pool": 10
+    },
+    "jkmail": {
+        "to": "zhengkun@topnet.net.cn,zhangjinkun@topnet.net.cn",
+        "api": "http://10.171.112.160:19281/_send/_mail"
+    },
+    "nextNode": [
+    ],
+    "threads": 1,
+    "isMerger": false,
+    "lowHeavy":true,
+    "timingTask":false,
+    "timingSpanDay": 4,
+    "timingPubScope": 720,
+    "specialwords": "(重招|重新招标|勘察|施工|监理|总承包|土石方|可研)",
+    "specialtitle_0": "(包|标段|标包)[((]?[0-9a-zA-Z一二三四五六七八九十零123456789][))]?",
+    "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包|批|期)",
+    "specialtitle_2": "项目[((][0-9a-zA-Z一二三四五六七八九十零123456789][))]",
+    "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
+}

+ 258 - 0
udpfilterdup/src3/dataMethod.go

@@ -0,0 +1,258 @@
+package main
+
+import (
+	"math"
+	"regexp"
+	"strings"
+	qutil "qfw/util"
+)
+
+
+
+//完善判重数据检测-前置条件
+func convertArabicNumeralsAndLetters(data string) string {
+	newData :=data
+	res1, _ := regexp.Compile("[a-zA-Z]+");
+	if res1.MatchString(data) {
+		newData = res1.ReplaceAllStringFunc(data, strings.ToUpper);
+	}
+	res2, _ := regexp.Compile("[0-9]+");
+	if res2.MatchString(newData) {
+		arr1:=[]string {"0","1","2","3","4","5","6","7","8","9"}
+		arr2:=[]string {"零","一","二","三","四","五","六","七","八","九"}
+		for i:=0 ;i<len(arr1) ;i++  {
+			resTemp ,_:=regexp.Compile(arr1[i])
+			newData= resTemp.ReplaceAllString(newData, arr2[i]);
+		}
+	}
+	return newData
+}
+
+func dealWithSpecialPhrases(str1 string,str2 string) (string,string) {
+	newStr1:=str1
+	newStr2:=str2
+	res, _ := regexp.Compile("重新招标");
+	if res.MatchString(newStr1) {
+		newStr1 = res.ReplaceAllString(newStr1,"重招");
+	}
+	if res.MatchString(newStr2) {
+		newStr2 = res.ReplaceAllString(newStr2,"重招");
+	}
+	return newStr1,newStr2
+}
+// dealWithSpecialWordNumber reports how many of the two records (0, 1 or 2)
+// have a special keyword flag set, either titleSpecialWord or specialWord.
+func dealWithSpecialWordNumber(info *Info, v *Info) int {
+	flagged := 0
+	for _, rec := range []*Info{info, v} {
+		if rec.titleSpecialWord || rec.specialWord {
+			flagged++
+		}
+	}
+	return flagged
+}
+
+// againRepeat is the second-pass check used for keyword records. It returns
+// true as soon as one field pair contradicts the other: bid-opening times
+// that isBidopentimeInterval reports as different, or budget, bid amount,
+// winner, contract number or project code that are filled on both records
+// but do not agree. It returns false when nothing contradicts.
+func againRepeat(v *Info, info *Info) bool {
+	// bothSetButDiffer is true when two text fields are non-empty and unequal.
+	bothSetButDiffer := func(a, b string) bool {
+		return a != "" && b != "" && a != b
+	}
+	switch {
+	case isBidopentimeInterval(info.bidopentime, v.bidopentime):
+		return true
+	case v.budget != 0 && info.budget != 0 && v.budget != info.budget:
+		return true
+	case v.bidamount != 0 && info.bidamount != 0 && isBidWinningAmount(v.bidamount, info.bidamount):
+		return true
+	case v.winner != "" && info.winner != "" && deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner):
+		return true
+	case bothSetButDiffer(v.contractnumber, info.contractnumber):
+		return true
+	case bothSetButDiffer(v.projectcode, info.projectcode):
+		return true
+	}
+	return false
+}
+
+////站点再次判断
+//func againSite(v *Info, info *Info) bool {
+//
+//	if v.budget != info.budget && v.budget != 0 && info.budget != 0 {
+//		return true
+//	}
+//	if isBidWinningAmount(v.bidamount,info.bidamount) && v.bidamount != 0 && info.bidamount != 0{
+//		return true
+//	}
+//	if deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner) && v.winner != "" && info.winner != "" {
+//		return true
+//	}
+//	if v.contractnumber != "" && info.contractnumber != "" && v.contractnumber != info.contractnumber {
+//		return true
+//	}
+//	if v.projectcode != "" && info.projectcode != "" && v.projectcode != info.projectcode {
+//		return true
+//	}
+//
+//	return false
+//}
+
+
+
+
+
+
+//删除中标单位字符串中多余的空格(含tab)
+func deleteExtraSpace(s string) string {
+	//删除字符串中的多余空格,有多个空格时,仅保留一个空格
+	s1 := strings.Replace(s, "  ", " ", -1)      //替换tab为空格
+	regstr := "\\s{2,}"                          //两个及两个以上空格的正则表达式
+	reg, _ := regexp.Compile(regstr)             //编译正则表达式
+	s2 := make([]byte, len(s1))                  //定义字符数组切片
+	copy(s2, s1)                                 //将字符串复制到切片
+	spc_index := reg.FindStringIndex(string(s2)) //在字符串中搜索
+	for len(spc_index) > 0 {                     //找到适配项
+		s2 = append(s2[:spc_index[0]+1], s2[spc_index[1]:]...) //删除多余空格
+		spc_index = reg.FindStringIndex(string(s2))            //继续在字符串中搜索
+	}
+	return string(s2)
+}
+
+//中标金额倍率:10000
+func isBidWinningAmount(f1 float64 ,f2 float64) bool {
+
+	if f1==f2||f1*10000==f2||f2*10000==f1 {
+		return false
+	}
+	return true
+}
+
+
+// isBidopentimeInterval reports whether two bid-opening timestamps should be
+// treated as different. It returns false when either value is 0, true when
+// the two values format to different yyyyMMdd dates, and for the same date
+// true only when they are more than 43200 seconds (twelve hours) apart.
+func isBidopentimeInterval(i1 int64, i2 int64) bool {
+	if i1 == 0 || i2 == 0 {
+		return false
+	}
+	// Format copies so the helper only ever sees addresses of locals.
+	first, second := i1, i2
+	if qutil.FormatDateByInt64(&first, qutil.Date_yyyyMMdd) != qutil.FormatDateByInt64(&second, qutil.Date_yyyyMMdd) {
+		return true
+	}
+	return math.Abs(float64(i1-i2)) > 43200.0
+}
+
+// isTheSameDay reports whether two timestamps format to the same yyyyMMdd
+// date. A zero value on either side always yields false.
+func isTheSameDay(i1 int64, i2 int64) bool {
+	if i1 == 0 || i2 == 0 {
+		return false
+	}
+	first, second := i1, i2
+	return qutil.FormatDateByInt64(&first, qutil.Date_yyyyMMdd) == qutil.FormatDateByInt64(&second, qutil.Date_yyyyMMdd)
+}
+
+
+
+// leadingElementSame is the pre-check that treats two records as duplicates
+// when at least five of six elements are filled on info and identical on v:
+// project name, buyer, a number (contract number for the subtypes 合同/验收/违规,
+// project code otherwise), title, agency and winner.
+func leadingElementSame(v *Info, info *Info) bool {
+	same := 0
+	tally := func(ok bool) {
+		if ok {
+			same++
+		}
+	}
+
+	tally(info.projectname != "" && v.projectname == info.projectname)
+	tally(info.buyer != "" && v.buyer == info.buyer)
+	switch info.subtype {
+	case "合同", "验收", "违规":
+		tally(info.contractnumber != "" && v.contractnumber == info.contractnumber)
+	default:
+		tally(info.projectcode != "" && v.projectcode == info.projectcode)
+	}
+	tally(info.title != "" && v.title == info.title)
+	tally(info.agency != "" && v.agency == info.agency)
+	tally(info.winner != "" && v.winner == info.winner)
+
+	return same >= 5
+}
+
+// buyerIsContinue is the buyer-priority check. It returns true when the two
+// records were not published on the same day, or when title, project name,
+// contract number or project code is filled on both records but differs.
+// Budget, bid amount and winner are deliberately not compared here (those
+// checks were commented out in the previous version).
+func buyerIsContinue(v *Info, info *Info) bool {
+	if !isTheSameDay(info.publishtime, v.publishtime) {
+		return true
+	}
+	// conflict is true when both values are present and unequal.
+	conflict := func(a, b string) bool {
+		return a != "" && b != "" && a != b
+	}
+	return conflict(v.title, info.title) ||
+		conflict(v.projectname, info.projectname) ||
+		conflict(v.contractnumber, info.contractnumber) ||
+		conflict(v.projectcode, info.projectcode)
+}
+
+
+
+//无效数据
+func invalidData(d1 string, d2 string, d3 string, d4 string) bool {
+	var n int
+	if d1 != "" {
+		n++
+	}
+	if d2 != "" {
+		n++
+	}
+	if d3 != "" {
+		n++
+	}
+	if d4 != "" {
+		n++
+	}
+	if n == 0 {
+		return true
+	}
+	return false
+}

+ 467 - 0
udpfilterdup/src3/dataMethodHeavy.go

@@ -0,0 +1,467 @@
+package main
+
+import "strings"
+
+//判重方法1
+func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
+
+	isMeet := false
+	if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
+		info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
+		info.subtype == "变更" || info.subtype == "其他" {
+		//招标结果
+		if isMeet, reason = tenderRepeat_A(v, info, reason); isMeet {
+			if tenderRepeat_C(v, info) {
+				return false, reason
+			} else {
+				reason = reason + "---招标类"
+				return true, reason
+			}
+		} else {
+			return false, reason
+		}
+
+	} else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
+		//中标结果
+		if isMeet, reason = winningRepeat_A(v, info, reason); isMeet {
+			if winningRepeat_C(v, info) {
+				return false, reason
+			} else {
+				reason = reason + "---中标类"
+				return true, reason
+			}
+		} else {
+			return false, reason
+		}
+
+	} else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
+		//合同
+		if isMeet, reason = contractRepeat_A(v, info, reason); isMeet {
+			if contractRepeat_C(v, info) {
+				return false, reason
+			} else {
+				reason = reason + "---合同类"
+				return true, reason
+			}
+		} else {
+			return false, reason
+		}
+	} else {
+		//招标结果
+		if isMeet, reason = tenderRepeat_A(v, info, reason); isMeet {
+			if tenderRepeat_C(v, info) {
+				return false, reason
+			} else {
+				reason = reason + "---类别空-招标类"
+				return true, reason
+			}
+		} else {
+			return false, reason
+		}
+	}
+
+	return false, reason
+}
+
+// quickHeavyMethodTwo is duplicate-check method 2, driven by the agency field.
+//   - If at least one agency is empty, the decision is delegated to
+//     quickHeavyMethodOne and a marker is appended to reason on success.
+//   - If both agencies are filled but differ, the pair is not a duplicate.
+//   - If both agencies are filled and equal, the "B" rule of the family chosen
+//     by info.subtype must match and the family's "C" rule must find no
+//     contradiction.
+func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
+	if v.agency == "" || info.agency == "" {
+		var dup bool
+		if dup, reason = quickHeavyMethodOne(v, info, reason); dup {
+			return true, reason + "---机构最少一个空"
+		}
+		return false, reason
+	}
+	if v.agency != info.agency {
+		return false, reason
+	}
+
+	// From here on both agencies are present and identical.
+	var matched bool
+	switch info.subtype {
+	case "招标", "邀标", "询价", "竞谈", "单一", "竞价", "变更", "其他":
+		if matched, reason = tenderRepeat_B(v, info, reason); !matched || tenderRepeat_C(v, info) {
+			return false, reason
+		}
+		return true, reason + "---招标类"
+	case "中标", "成交", "废标", "流标":
+		if matched, reason = winningRepeat_B(v, info, reason); !matched || winningRepeat_C(v, info) {
+			return false, reason
+		}
+		return true, reason + "---中标类"
+	case "合同", "验收", "违规":
+		if matched, reason = contractRepeat_B(v, info, reason); !matched || contractRepeat_C(v, info) {
+			return false, reason
+		}
+		return true, reason + "---合同类"
+	default:
+		if matched, reason = tenderRepeat_B(v, info, reason); !matched || tenderRepeat_C(v, info) {
+			return false, reason
+		}
+		return true, reason + "---类别空-招标类"
+	}
+}
+
+// tenderRepeat_A is tender rule A. It evaluates seven element matches between
+// v and info (p1 project name, p2 buyer, p3 project code or contract number
+// of at least 5 bytes, p4 budget, p9 bid-opening time, p10 bid-opening
+// address, p11 title containment for titles longer than 10 runes) and
+// succeeds when one of the accepted three-element combinations holds. The
+// combination p1+p3+p11 is only accepted when info.subtype is non-empty.
+// On success the labels of all matched elements are appended to reason.
+func tenderRepeat_A(v *Info, info *Info, reason string) (bool, string) {
+	var labels []string
+	// note records the label of a matched element and passes the flag through.
+	note := func(ok bool, label string) bool {
+		if ok {
+			labels = append(labels, label)
+		}
+		return ok
+	}
+
+	p1 := note(v.projectname != "" && v.projectname == info.projectname, "p1-名称-")
+	p2 := note(v.buyer != "" && v.buyer == info.buyer, "p2-单位-")
+	p3 := note((v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5) ||
+		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5),
+		"p3-编号组-")
+	p4 := note(v.budget != 0 && v.budget == info.budget, "p4-预算-")
+	p9 := note(v.bidopentime != 0 && v.bidopentime == info.bidopentime, "p9-开标时间相同-")
+	p10 := note(v.bidopenaddress != "" && v.bidopenaddress == info.bidopenaddress, "p10-开标地点-")
+	p11 := note(len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
+		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)),
+		"p11-标题-")
+
+	accepted := info.subtype != "" && p1 && p3 && p11
+	if !accepted {
+		combos := [][3]bool{
+			{p1, p2, p3}, {p1, p2, p4}, {p1, p2, p9},
+			{p1, p2, p10}, {p1, p2, p11}, {p1, p3, p9}, {p1, p3, p10},
+			{p1, p4, p9}, {p1, p4, p10}, {p2, p3, p4},
+			{p2, p3, p9}, {p2, p3, p10}, {p2, p3, p11},
+			{p2, p4, p9}, {p2, p4, p10}, {p2, p4, p11},
+			{p3, p4, p9}, {p3, p4, p10}, {p3, p4, p11},
+			{p4, p9, p10}, {p4, p9, p11}, {p9, p10, p11},
+		}
+		for _, c := range combos {
+			if c[0] && c[1] && c[2] {
+				accepted = true
+				break
+			}
+		}
+	}
+	if !accepted {
+		return false, reason
+	}
+	return true, reason + "满足招标A,3要素组合-" + strings.Join(labels, "") + ","
+}
+
+// tenderRepeat_B is tender rule B. It counts matching elements (project name,
+// buyer, code/contract number of at least 5 bytes, budget, bid-opening time,
+// title containment) and succeeds when at least two match, unless the only
+// two matches are project name and title. The bid-opening address is not
+// counted, although the reason text still says "six choose two".
+func tenderRepeat_B(v *Info, info *Info, reason string) (bool, string) {
+	total, nameOrTitle := 0, 0
+
+	if v.projectname != "" && v.projectname == info.projectname {
+		total++
+		nameOrTitle++
+	}
+	if v.buyer != "" && v.buyer == info.buyer {
+		total++
+	}
+	codeMatch := v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5
+	contractMatch := v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5
+	if codeMatch || contractMatch {
+		total++
+	}
+	if v.budget != 0 && v.budget == info.budget {
+		total++
+	}
+	if v.bidopentime != 0 && v.bidopentime == info.bidopentime {
+		total++
+	}
+	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
+		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
+		total++
+		nameOrTitle++
+	}
+
+	// Too few matches, or nothing but name + title: not enough evidence.
+	if total < 2 || (total == 2 && nameOrTitle == 2) {
+		return false, reason
+	}
+	return true, reason + "满足招标B,六选二,"
+}
+
+// tenderRepeat_C is the tender contradiction check. It returns true when both
+// budgets are non-zero and unequal, or when both bid-opening times are
+// non-zero and isBidopentimeInterval reports them as different.
+func tenderRepeat_C(v *Info, info *Info) bool {
+	budgetsConflict := v.budget != 0 && info.budget != 0 && v.budget != info.budget
+	if budgetsConflict {
+		return true
+	}
+	return v.bidopentime != 0 && info.bidopentime != 0 &&
+		isBidopentimeInterval(info.bidopentime, v.bidopentime)
+}
+
+// winningRepeat_A is award rule A. It evaluates six element matches between v
+// and info (p1 project name, p2 buyer, p3 project code or contract number of
+// at least 5 bytes, p5 bid amount equal per isBidWinningAmount, p6 winner
+// after deleteExtraSpace, p11 title containment for titles longer than 10
+// runes) and succeeds when one of the accepted three-element combinations
+// holds. On success the labels of all matched elements are appended to reason.
+func winningRepeat_A(v *Info, info *Info, reason string) (bool, string) {
+	var labels []string
+	// note records the label of a matched element and passes the flag through.
+	note := func(ok bool, label string) bool {
+		if ok {
+			labels = append(labels, label)
+		}
+		return ok
+	}
+
+	p1 := note(v.projectname != "" && v.projectname == info.projectname, "p1-项目名称-")
+	p2 := note(v.buyer != "" && v.buyer == info.buyer, "p2-单位-")
+	p3 := note((v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5) ||
+		(v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5),
+		"p3-编号组--")
+	p5 := note(v.bidamount != 0 && !isBidWinningAmount(v.bidamount, info.bidamount), "p5-中标金-")
+	p6 := note(v.winner != "" && deleteExtraSpace(v.winner) == deleteExtraSpace(info.winner), "p6-中标人-")
+	p11 := note(len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
+		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)),
+		"p11-标题-")
+
+	combos := [][3]bool{
+		{p1, p2, p3}, {p1, p2, p5}, {p1, p2, p6},
+		{p1, p2, p11}, {p1, p3, p11},
+		{p1, p3, p5}, {p1, p3, p6}, {p1, p5, p6},
+		{p2, p3, p5}, {p2, p3, p6}, {p2, p3, p11},
+		{p2, p5, p6}, {p2, p5, p11}, {p2, p6, p11},
+		{p3, p5, p6}, {p3, p5, p11}, {p3, p6, p11},
+		{p5, p6, p11},
+	}
+	for _, c := range combos {
+		if c[0] && c[1] && c[2] {
+			return true, reason + "满足中标A,3要素组合-" + strings.Join(labels, "") + ","
+		}
+	}
+	return false, reason
+}
+
+// winningRepeat_B is award rule B. It counts matching elements (project name,
+// buyer, code/contract number of at least 5 bytes, bid amount, winner, title
+// containment) and succeeds when at least two match, unless the only two
+// matches are project name and title.
+func winningRepeat_B(v *Info, info *Info, reason string) (bool, string) {
+	total, nameOrTitle := 0, 0
+
+	if v.projectname != "" && v.projectname == info.projectname {
+		total++
+		nameOrTitle++
+	}
+	if v.buyer != "" && v.buyer == info.buyer {
+		total++
+	}
+	codeMatch := v.projectcode != "" && v.projectcode == info.projectcode && len(v.projectcode) >= 5
+	contractMatch := v.contractnumber != "" && v.contractnumber == info.contractnumber && len(v.contractnumber) >= 5
+	if codeMatch || contractMatch {
+		total++
+	}
+	if v.bidamount != 0 && !isBidWinningAmount(v.bidamount, info.bidamount) {
+		total++
+	}
+	if v.winner != "" && deleteExtraSpace(v.winner) == deleteExtraSpace(info.winner) {
+		total++
+	}
+	if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
+		(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
+		total++
+		nameOrTitle++
+	}
+
+	// Too few matches, or nothing but name + title: not enough evidence.
+	if total < 2 || (total == 2 && nameOrTitle == 2) {
+		return false, reason
+	}
+	return true, reason + "满足中标B.六选二,"
+}
+
+// winningRepeat_C is the award contradiction check. When both bid amounts are
+// non-zero and differ per isBidWinningAmount, the pair contradicts unless the
+// same project code or contract number AND the same winner are present on
+// both records (guards against a wrongly extracted amount). Otherwise the
+// pair contradicts when both winners are filled and differ after
+// deleteExtraSpace.
+// NOTE(review): the guard compares winners verbatim while the final check
+// normalises them with deleteExtraSpace — confirm this is intended.
+func winningRepeat_C(v *Info, info *Info) bool {
+	amountsConflict := v.bidamount != 0 && info.bidamount != 0 &&
+		isBidWinningAmount(v.bidamount, info.bidamount)
+	if amountsConflict {
+		sameNumber := (v.projectcode != "" && info.projectcode != "" && v.projectcode == info.projectcode) ||
+			(v.contractnumber != "" && info.contractnumber != "" && v.contractnumber == info.contractnumber)
+		sameWinner := v.winner != "" && info.winner != "" && v.winner == info.winner
+		return !(sameNumber && sameWinner)
+	}
+	return v.winner != "" && info.winner != "" &&
+		deleteExtraSpace(v.winner) != deleteExtraSpace(info.winner)
+}
+
+// contractRepeat_A is contract rule A: it succeeds when tender rule A matches
+// and, failing that, when award rule A matches. The reason returned is the
+// one produced by the last rule that was evaluated.
+func contractRepeat_A(v *Info, info *Info, reason string) (bool, string) {
+	matched, reason := tenderRepeat_A(v, info, reason)
+	if !matched {
+		matched, reason = winningRepeat_A(v, info, reason)
+	}
+	return matched, reason
+}
+
+// contractRepeat_B is contract rule B: it succeeds when tender rule B matches
+// and, failing that, when award rule B matches. The reason returned is the
+// one produced by the last rule that was evaluated.
+func contractRepeat_B(v *Info, info *Info, reason string) (bool, string) {
+	matched, reason := tenderRepeat_B(v, info, reason)
+	if !matched {
+		matched, reason = winningRepeat_B(v, info, reason)
+	}
+	return matched, reason
+}
+
+// contractRepeat_C is the contract contradiction check. It returns true when
+// the tender or the award contradiction check fires, or when contract number
+// or project code is filled on both records but differs.
+func contractRepeat_C(v *Info, info *Info) bool {
+	if tenderRepeat_C(v, info) || winningRepeat_C(v, info) {
+		return true
+	}
+	// Contract-specific addition: the identifying numbers must not disagree.
+	contractDiffers := v.contractnumber != "" && info.contractnumber != "" &&
+		v.contractnumber != info.contractnumber
+	codeDiffers := v.projectcode != "" && info.projectcode != "" &&
+		v.projectcode != info.projectcode
+	return contractDiffers || codeDiffers
+}
+
+
+
+
+
+
+
+
+
+//快速低质量数据判重
+func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
+	if !isTheSameDay(v.publishtime,info.publishtime) {
+		return false,reason
+	}
+	//首先判定是否为低质量数据    info目标数据
+	if info.title!=""&&(info.agency==""||v.agency=="")&&
+		info.title==v.title&&info.projectcode==""&&info.contractnumber==""&&info.buyer=="" {
+		isValue:=0//五要素判断
+		if info.projectname != "" {//项目名称
+			isValue++
+		}
+		if info.budget != 0 {//预算
+			isValue++
+		}
+		if info.winner != ""{//中标单位
+			isValue++
+		}
+		if info.bidamount != 0 {//中标金额
+			isValue++
+		}
+		if isValue==0 {
+			reason = reason + "---低质量-要素均为空-标题满足"
+			return true, reason
+		}else if isValue==1 {
+			isMeet := false
+			if isMeet, reason = judgeLowQualityData(v, info, reason); isMeet {
+				reason = reason + "---低质量-有且一个要素组合"
+				return true, reason
+			}
+		}else {
+
+		}
+	}
+	return false,reason
+}
+
+
+// judgeLowQualityData looks for the first element shared by both records, in
+// the order project name, budget, winner, bid amount, and appends the
+// matching element's tag to reason. It returns false with reason untouched
+// when none of the four matches.
+func judgeLowQualityData(v *Info, info *Info, reason string) (bool, string) {
+	switch {
+	case info.projectname != "" && info.projectname == v.projectname: // project name
+		return true, reason + "---项目名称"
+	case info.budget != 0 && info.budget == v.budget: // budget
+		return true, reason + "---预算"
+	case v.winner != "" && info.winner == v.winner: // winner
+		return true, reason + "---中标单位"
+	case v.bidamount != 0 && info.bidamount == v.bidamount: // bid amount
+		return true, reason + "---中标金额"
+	}
+	return false, reason
+}

+ 420 - 0
udpfilterdup/src3/dataMethodMerge.go

@@ -0,0 +1,420 @@
+package main
+
+import "qfw/util"
+
+
+// mergeDataFields back-fills empty fields of source with the values found in
+// info and records where each value came from.
+//
+// For every field that is empty (""/0) on source and set on info it
+//   - copies the value onto source,
+//   - stores {<field>: value, "id": info.id} under the field name in the
+//     merge record (source.mergemap), and
+//   - adds the field to the "$set" document of the returned update map.
+//
+// The "$set" document always carries the full merge record under "merge",
+// even when nothing was filled. The returned bool reports whether at least
+// one field was filled. source is modified in place and returned.
+func mergeDataFields(source *Info, info *Info) (*Info, map[string]interface{}, bool) {
+	set := map[string]interface{}{}
+	update_map := map[string]interface{}{
+		"$set": set,
+	}
+	mergeMap := source.mergemap
+	if mergeMap == nil {
+		// Writing into a nil map panics; start a fresh merge record instead.
+		mergeMap = map[string]interface{}{}
+	}
+	isReplace := false
+
+	// fill records one back-filled field in both the merge record and the
+	// Mongo update document.
+	fill := func(field string, value interface{}) {
+		mergeMap[field] = map[string]interface{}{
+			field: value,
+			"id":  info.id,
+		}
+		set[field] = value
+		isReplace = true
+	}
+
+	// project name
+	if source.projectname == "" && info.projectname != "" {
+		fill("projectname", info.projectname)
+		source.projectname = info.projectname
+	}
+	// project code
+	if source.projectcode == "" && info.projectcode != "" {
+		fill("projectcode", info.projectcode)
+		source.projectcode = info.projectcode
+	}
+	// buyer
+	if source.buyer == "" && info.buyer != "" {
+		fill("buyer", info.buyer)
+		source.buyer = info.buyer
+	}
+	// budget
+	if source.budget == 0 && info.budget != 0 {
+		fill("budget", info.budget)
+		source.budget = info.budget
+	}
+	// winning bidder
+	if source.winner == "" && info.winner != "" {
+		fill("winner", info.winner)
+		source.winner = info.winner
+	}
+	// bid amount
+	if source.bidamount == 0 && info.bidamount != 0 {
+		fill("bidamount", info.bidamount)
+		source.bidamount = info.bidamount
+	}
+	// bid opening time
+	if source.bidopentime == 0 && info.bidopentime != 0 {
+		fill("bidopentime", info.bidopentime)
+		source.bidopentime = info.bidopentime
+	}
+	// contract number
+	if source.contractnumber == "" && info.contractnumber != "" {
+		fill("contractnumber", info.contractnumber)
+		source.contractnumber = info.contractnumber
+	}
+	// agency
+	if source.agency == "" && info.agency != "" {
+		fill("agency", info.agency)
+		source.agency = info.agency
+	}
+
+	source.mergemap = mergeMap
+	set["merge"] = mergeMap
+
+	return source, update_map, isReplace
+}
+
+
+
+
+
+
+
+// mergeDataFieldsArr back-fills empty fields of source from info and keeps an
+// audit trail in source.mergemap.
+//
+// Every field that gets overwritten has its previous value saved in a
+// per-call snapshot, and a numeric code for the field is appended to the
+// returned slice (0 site-derived region, 1 region, 2 project name, 3 project
+// code, 4 buyer, 5 budget, 6 winner, 7 bid amount, 8 bid opening time,
+// 9 contract number, 10 publish time, 11 agency). When anything changed,
+// source.mergemap["total_num"] is incremented and the snapshot is stored in
+// source.mergemap under info.id. source is modified in place and returned
+// together with the codes and a flag telling whether anything changed.
+func mergeDataFieldsArr(source *Info, info *Info) (*Info, []int64, bool) {
+
+	snapshot := map[string]interface{}{} // previous values of the fields touched in this call
+	codes := []int64{}                   // codes of the touched fields, in processing order
+	changed := false
+
+	// touch saves the previous value under key, appends the field code and
+	// marks this call as having modified source.
+	touch := func(code int64, key string, previous interface{}) {
+		snapshot[key] = previous
+		codes = append(codes, code)
+		changed = true
+	}
+
+	// 1. region (area/city)
+	switch {
+	case source.area == "" || source.area == "全国":
+		// source has no concrete region: take info's when it has one.
+		if info.area != "" && info.area != "全国" {
+			snapshot["city"] = source.city
+			touch(1, "area", source.area)
+			source.area, source.city = info.area, info.city
+		}
+	case source.is_site:
+		// The region on source was derived from the site dictionary: record
+		// it and clear the marker. The region itself is left untouched.
+		snapshot["site_city"] = source.city
+		touch(0, "site_area", source.area)
+		source.is_site = false
+	}
+
+	// 2. project name
+	if info.projectname != "" && source.projectname == "" {
+		touch(2, "projectname", source.projectname)
+		source.projectname = info.projectname
+	}
+	// 3. project code
+	if info.projectcode != "" && source.projectcode == "" {
+		touch(3, "projectcode", source.projectcode)
+		source.projectcode = info.projectcode
+	}
+	// 4. buyer
+	if info.buyer != "" && source.buyer == "" {
+		touch(4, "buyer", source.buyer)
+		source.buyer = info.buyer
+	}
+	// 5. budget
+	if info.budget != 0 && source.budget == 0 {
+		touch(5, "budget", source.budget)
+		source.budget = info.budget
+	}
+	// 6. winning bidder
+	if info.winner != "" && source.winner == "" {
+		touch(6, "winner", source.winner)
+		source.winner = info.winner
+	}
+	// 7. bid amount
+	if info.bidamount != 0 && source.bidamount == 0 {
+		touch(7, "bidamount", source.bidamount)
+		source.bidamount = info.bidamount
+	}
+	// 8. bid opening time
+	if info.bidopentime != 0 && source.bidopentime == 0 {
+		touch(8, "bidopentime", source.bidopentime)
+		source.bidopentime = info.bidopentime
+	}
+	// 9. contract number
+	if info.contractnumber != "" && source.contractnumber == "" {
+		touch(9, "contractnumber", source.contractnumber)
+		source.contractnumber = info.contractnumber
+	}
+	// 10. publish time
+	if info.publishtime != 0 && source.publishtime == 0 {
+		touch(10, "publishtime", source.publishtime)
+		source.publishtime = info.publishtime
+	}
+	// 11. agency
+	if info.agency != "" && source.agency == "" {
+		touch(11, "agency", source.agency)
+		source.agency = info.agency
+	}
+
+	if changed {
+		// Bump the running merge counter and remember its value in the snapshot.
+		next := util.Int64All(source.mergemap["total_num"]) + 1
+		source.mergemap["total_num"] = next
+		snapshot["num"] = util.Int64All(source.mergemap["total_num"])
+		// The snapshot is keyed by the id of the record the values came from.
+		source.mergemap[info.id] = snapshot
+	}
+
+	return source, codes, changed
+}
+
+// basicDataScore decides which of two records should be preferred. It returns
+// true when v wins and false when info wins.
+//
+// The comparison is a cascade; the first step that yields a difference decides:
+//  1. site level:  国家 (4) > 省级 (3) > 市级 (2) > 县区 (1) > other (0) > unknown (-1)
+//  2. site type:   政府采购 (4) > 公共资源 (3) > 官方网站|政府门户 (2) >
+//     社会公共招标平台|企业招标平台 (1) > other (0)
+//  3. the site's configured "weight"
+//  4. number of populated elements on the record (agency and city count double)
+//  5. publish time: v wins when its publishtime is >= info's
+//
+// Site attributes are looked up in SiteMap by the record's site name.
+func basicDataScore(v *Info, info *Info) bool {
+
+	dict_v := SiteMap[v.site]
+	dict_info := SiteMap[info.site]
+
+	// levelRank maps the site's "level" to a rank. A missing dictionary entry
+	// or an empty level leaves the current score untouched.
+	levelRank := func(dict map[string]interface{}, current int) int {
+		if dict == nil {
+			return current
+		}
+		switch util.ObjToString(dict["level"]) {
+		case "国家":
+			return 4
+		case "省级":
+			return 3
+		case "市级":
+			return 2
+		case "县区":
+			return 1
+		case "":
+			return current
+		default:
+			return 0
+		}
+	}
+
+	// typeRank maps the site's "sitetype" to a rank with the same rules.
+	// NOTE(review): an empty sitetype keeps the score carried over from the
+	// level step (original behaviour, preserved here) — confirm this is intended.
+	typeRank := func(dict map[string]interface{}, current int) int {
+		if dict == nil {
+			return current
+		}
+		switch util.ObjToString(dict["sitetype"]) {
+		case "政府采购":
+			return 4
+		case "公共资源":
+			return 3
+		case "官方网站", "政府门户":
+			return 2
+		case "社会公共招标平台", "企业招标平台":
+			return 1
+		case "":
+			return current
+		default:
+			return 0
+		}
+	}
+
+	// elementScore counts the populated elements of one record.
+	elementScore := func(r *Info) int {
+		score := 0
+		if r.projectname != "" {
+			score++
+		}
+		if r.buyer != "" {
+			score++
+		}
+		if r.projectcode != "" || r.contractnumber != "" {
+			score++
+		}
+		if r.budget != 0 {
+			score++
+		}
+		if r.bidamount != 0 {
+			score++
+		}
+		if r.winner != "" {
+			score++
+		}
+		if r.bidopentime != 0 {
+			score++
+		}
+		if r.bidopenaddress != "" {
+			score++
+		}
+		if r.agency != "" {
+			score += 2
+		}
+		if r.city != "" {
+			score += 2
+		}
+		return score
+	}
+
+	// 1. site level. The original assigned v_score in the info branch for an
+	// unrecognised level, which corrupted v's score; each side now only
+	// updates its own score.
+	v_score := levelRank(dict_v, -1)
+	info_score := levelRank(dict_info, -1)
+	if v_score != info_score {
+		return v_score > info_score
+	}
+
+	// 2. site type
+	v_score = typeRank(dict_v, v_score)
+	info_score = typeRank(dict_info, info_score)
+	if v_score != info_score {
+		return v_score > info_score
+	}
+
+	// 3. same site type: compare the configured weight. Indexing a nil
+	// dictionary yields nil, which util.IntAll receives as-is.
+	v_weight := util.IntAll(dict_v["weight"])
+	info_weight := util.IntAll(dict_info["weight"])
+	if v_weight != info_weight {
+		return v_weight > info_weight
+	}
+
+	// 4. element completeness
+	m, n := elementScore(v), elementScore(info)
+	if m != n {
+		return m > n
+	}
+
+	// 5. tie-break on publish time
+	return v.publishtime >= info.publishtime
+}

+ 588 - 0
udpfilterdup/src3/datamap.go

@@ -0,0 +1,588 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	qutil "qfw/util"
+	"reflect"
+	"regexp"
+	"strings"
+	"sync"
+	"time"
+)
+
+// Info is the in-memory view of one announcement record used for duplicate
+// detection. NewInfo fills it from a MongoDB document.
+type Info struct {
+	id    string // document id (string form of _id)
+	title string // title
+
+	area           string  // province ("全国" stands for nationwide)
+	city           string  // city
+	subtype        string  // announcement sub-type
+	buyer          string  // buyer
+	agency         string  // agency
+	winner         string  // winning bidder
+	budget         float64 // budget amount
+	bidamount      float64 // winning bid amount
+	projectname    string  // project name
+	projectcode    string  // project code
+	contractnumber string  // contract number
+	publishtime    int64   // publish time
+	comeintime     int64   // time the record entered the database
+	bidopentime    int64   // bid opening time
+	bidopenaddress string  // bid opening address
+	site 		   string // source site
+	href 		     string // URL of the original page
+	repeatid         string                 // id of the record this one duplicates
+	titleSpecialWord bool                   // title matches one of the special-title patterns
+	specialWord      bool                   // title matches the special-word pattern used for the second check
+	mergemap         map[string]interface{} // merge record
+	is_site          bool                   // whether area/city were taken from the site dictionary
+	repeat_ids        []string               // ids of all records found to duplicate this one
+
+}
+
+// datelimit is the look-back window in seconds (initially five days). Both
+// TimedTaskDatamap and NewDatamap overwrite it.
+var datelimit = float64(432000) // five days
+// sitelock is taken around the SiteMap lookup in (*datamap).check.
+var sitelock sync.Mutex         // lock
+
+// datamap is the in-memory pool used for general duplicate detection.
+// NOTE: it is constructed with positional composite literals, so the field
+// order must not change.
+type datamap struct {
+	lock   sync.Mutex // guards the fields below
+	days   int        // number of days of data to keep
+	data   map[string][]*Info // records bucketed by "<yyyyMMdd>_<subtype>_<area>"
+	keymap []string // day keys currently retained; rebuilt by update
+	areakeys []string // distinct area values present in the pool
+	keys   map[string]bool // set of day keys present in the pool
+}
+
+// TimedTaskDatamap builds the comparison pool for the scheduled (history) mode.
+//
+// days sets the look-back window: the package-level datelimit is overwritten
+// with days*86400 seconds. lasttime is the upper bound (exclusive) on
+// publishtime; a negative value returns an empty pool. numIndex is only used
+// in the final log line.
+//
+// Records are read from the extract_back collection newest first. Records
+// flagged repeat=1/-1 or dataging=1, records whose publishtime is a string,
+// and records published in the future are skipped. Loading stops at the first
+// usable record that is datelimit or more seconds older than lasttime.
+func TimedTaskDatamap(days int, lasttime int64, numIndex int) *datamap {
+	datelimit = qutil.Float64All(days * 86400)
+	dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, []string{}, map[string]bool{}}
+	if lasttime < 0 {
+		log.Println("数据池空数据")
+		return dm
+	}
+	start := int(time.Now().Unix())
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+	query := map[string]interface{}{"publishtime": map[string]interface{}{
+		"$lt": lasttime,
+	}}
+	log.Println("query", query)
+	it := sess.DB(mgo.DbName).C(extract_back).Find(query).Sort("-publishtime").Iter()
+	// The loop below can leave early, so release the cursor explicitly and
+	// surface any iteration error instead of dropping it.
+	defer func() {
+		if err := it.Close(); err != nil {
+			log.Println("iter close err:", err)
+		}
+	}()
+	n := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		flagged := qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 ||
+			qutil.IntAll(tmp["dataging"]) == 1
+		if !flagged {
+			if fmt.Sprint(reflect.TypeOf(tmp["publishtime"])) == "string" {
+				continue
+			}
+			pt := tmp["publishtime"]
+			pt_time := qutil.Int64All(pt)
+
+			if pt_time > time.Now().Unix() {
+				continue
+			}
+			if qutil.Float64All(lasttime-pt_time) >= datelimit {
+				// Sorted newest first: everything after this is outside the window.
+				break
+			}
+			info := NewInfo(tmp)
+			dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
+			k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
+			dm.data[k] = append(dm.data[k], info)
+			dm.keys[dkey] = true
+			// remember the area if it has not been seen yet
+			isAreaExist := false
+			for _, v := range dm.areakeys {
+				if v == info.area {
+					isAreaExist = true
+					break
+				}
+			}
+			if !isAreaExist {
+				dm.areakeys = append(dm.areakeys, info.area)
+			}
+		}
+
+		tmp = make(map[string]interface{})
+	}
+
+	log.Printf("第%d组:数据池构建完成:%d秒,%d个\n", numIndex, int(time.Now().Unix())-start, n)
+
+	return dm
+}
+
+// NewDatamap builds the comparison pool for the incremental mode.
+//
+// days sets the retention: the package-level datelimit is overwritten with
+// days*86400*2 seconds. lastid is the upper bound (inclusive) on _id; an empty
+// value returns an empty pool without touching the database.
+//
+// Records are read from the extract collection newest first (by publishtime).
+// Records flagged repeat=1/-1, records whose publishtime is a string, and
+// records published in the future are skipped. Loading stops at the first
+// usable record older than datelimit seconds relative to the start time.
+func NewDatamap(days int, lastid string) *datamap {
+	datelimit = qutil.Float64All(days * 86400 * 2)
+	dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, []string{}, map[string]bool{}}
+	if lastid == "" {
+		log.Println("不构建数据池")
+		return dm
+	}
+	// initial load
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+	query := map[string]interface{}{"_id": map[string]interface{}{
+		"$lte": StringTOBsonId(lastid),
+	}}
+	log.Println("query", query)
+	it := sess.DB(mgo.DbName).C(extract).Find(query).Sort("-publishtime").Iter()
+	// The loop below can leave early, so release the cursor explicitly and
+	// surface any iteration error instead of dropping it.
+	defer func() {
+		if err := it.Close(); err != nil {
+			log.Println("iter close err:", err)
+		}
+	}()
+	nowTime := time.Now().Unix() // timestamp taken once, before loading starts
+	n, continuSum := 0, 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		flagged := qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1
+		if !flagged {
+			if fmt.Sprint(reflect.TypeOf(tmp["publishtime"])) == "string" {
+				continue
+			}
+			pt := tmp["publishtime"]
+			pt_time := qutil.Int64All(pt)
+			if pt_time > time.Now().Unix() {
+				continue
+			}
+			if qutil.Float64All(nowTime-pt_time) > datelimit {
+				// Sorted newest first: everything after this is outside the window.
+				break
+			}
+			continuSum++
+			info := NewInfo(tmp)
+			dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
+			k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
+			dm.data[k] = append(dm.data[k], info)
+			dm.keys[dkey] = true
+			// remember the area if it has not been seen yet
+			isAreaExist := false
+			for _, v := range dm.areakeys {
+				if v == info.area {
+					isAreaExist = true
+					break
+				}
+			}
+			if !isAreaExist {
+				dm.areakeys = append(dm.areakeys, info.area)
+			}
+		}
+		if n%10000 == 0 {
+			log.Println("当前 n:", n, "数量:", continuSum, tmp["_id"])
+		}
+		tmp = make(map[string]interface{})
+	}
+	log.Println("load data:", n, "总数:", continuSum)
+	return dm
+}
+
+// NewInfo converts a raw MongoDB document into an *Info.
+//
+// An area value of "A" is normalised to "全国". The two title flags are
+// computed from the package-level FilterRegTitle* patterns as they are at
+// call time. mergemap and repeat_ids are always non-nil on the result, and
+// is_site starts out false.
+func NewInfo(tmp map[string]interface{}) *Info {
+	area := qutil.ObjToString(tmp["area"])
+	if area == "A" {
+		area = "全国"
+	}
+	title := qutil.ObjToString(tmp["title"])
+
+	info := &Info{
+		id:             BsonTOStringId(tmp["_id"]),
+		title:          title,
+		area:           area,
+		subtype:        qutil.ObjToString(tmp["subtype"]),
+		buyer:          qutil.ObjToString(tmp["buyer"]),
+		projectname:    qutil.ObjToString(tmp["projectname"]),
+		contractnumber: qutil.ObjToString(tmp["contractnumber"]),
+		projectcode:    qutil.ObjToString(tmp["projectcode"]),
+		city:           qutil.ObjToString(tmp["city"]),
+		agency:         qutil.ObjToString(tmp["agency"]),
+		winner:         qutil.ObjToString(tmp["winner"]),
+		budget:         qutil.Float64All(tmp["budget"]),
+		bidamount:      qutil.Float64All(tmp["bidamount"]),
+		publishtime:    qutil.Int64All(tmp["publishtime"]),
+		comeintime:     qutil.Int64All(tmp["comeintime"]),
+		bidopentime:    qutil.Int64All(tmp["bidopentime"]),
+		bidopenaddress: qutil.ObjToString(tmp["bidopenaddress"]),
+		site:           qutil.ObjToString(tmp["site"]),
+		href:           qutil.ObjToString(tmp["href"]),
+		repeatid:       qutil.ObjToString(tmp["repeatid"]),
+		specialWord:    FilterRegTitle.MatchString(title),
+		titleSpecialWord: FilterRegTitle_0.MatchString(title) ||
+			FilterRegTitle_1.MatchString(title) ||
+			FilterRegTitle_2.MatchString(title),
+		repeat_ids: make([]string, 0),
+		is_site:    false,
+	}
+
+	// ObjToMap hands back a pointer. Check it before dereferencing so a
+	// document without a usable "merge" field cannot cause a nil-pointer
+	// panic; such documents get an empty merge record.
+	if merged := qutil.ObjToMap(tmp["merge"]); merged != nil && *merged != nil {
+		info.mergemap = *merged
+	} else {
+		info.mergemap = make(map[string]interface{}, 0)
+	}
+
+	return info
+}
+
+// check compares info against the pooled records of the same subtype and
+// decides whether it is a duplicate.
+//
+// Returns:
+//   b       - true when a pooled record was judged to be the same announcement
+//   source  - the pooled record that matched (also set, with b == false, when
+//             a pooled record has the same id as info)
+//   reasons - text describing which rule matched
+//
+// Candidate buckets are every known day combined with info's subtype and
+// either info's area or, when info.area is "全国", every known area; the
+// "全国" bucket of each day is always included. When no duplicate is found,
+// info is appended to the pool under its own day/subtype/area bucket.
+//
+// Side effect: while scanning, info.area/info.city may be overwritten from
+// the site dictionary (and info.is_site set).
+//
+// NOTE(review): buckets are read under d.lock but iterated after it is
+// released — confirm callers tolerate concurrent modification of a bucket.
+func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
+	reason := ""
+	keys := []string{}
+	d.lock.Lock()
+	for k, _ := range d.keys { // one pass per day present in the pool
+		if info.area=="全国" {
+			// nationwide record: match against every known area
+			for _,v := range d.areakeys{
+				keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, v))
+			}
+		}else {
+			// match against the record's own area
+			keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
+		}
+		keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
+
+	}
+
+
+	d.lock.Unlock()
+L:
+	for _, k := range keys {
+		d.lock.Lock()
+		data := d.data[k]
+		d.lock.Unlock()
+		if len(data) > 0 { // compare against v: same subtype, same area or nationwide
+			for _, v := range data {
+				reason = ""
+				if v.id == info.id { // the record itself is already in the pool
+					return false, v, ""
+				}
+
+				// buyer has high priority: both set and different -> let buyerIsContinue decide whether to skip
+				if info.buyer!=""&&v.buyer!=""&&info.buyer!=v.buyer {
+					if buyerIsContinue(v,info) {
+						continue
+					}
+				}
+
+				if info.site != "" {// temporarily fill area/city from the site dictionary
+					sitelock.Lock()
+					dict := SiteMap[info.site]
+					sitelock.Unlock()
+					if dict != nil {
+						if (info.area == "全国" && dict["area"] != "")||
+							(info.city == "" && dict["city"] != ""){
+							info.is_site = true
+							info.area = qutil.ObjToString(dict["area"])
+							info.city = qutil.ObjToString(dict["city"])
+						}
+					}
+				}
+
+
+				// precondition - same site
+				if info.site != "" && info.site == v.site {
+					if info.href != "" && info.href == v.href {
+						reason = "同站点-href相同"
+						b = true
+						source = v
+						reasons = reason
+						break L
+					}
+					if info.href != "" && info.href != v.href { // to be optimised
+						if v.title==info.title{
+							if !againRepeat(v, info) {// second-level check for same-site records
+								reason = "同站点-href不同-标题相同等"
+								b = true
+								source = v
+								reasons = reason
+								break L
+							}else {
+								continue
+							}
+						}else {
+							if againRepeat(v, info) {// second-level check for same-site records
+								continue
+							}
+						}
+					}
+				}
+
+				specialNum:= dealWithSpecialWordNumber(info,v)
+				// precondition - title: specialNum == 1 (original note: exactly one keyword)
+				if specialNum==1 {
+					if againRepeat(v, info) {
+						continue
+					}
+				}
+				// precondition 3 - title: specialNum == 2 (original note: both contain keywords)
+				if specialNum==2 {
+					if len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 &&
+						v.title != "" && info.title != "" {
+						letter1,letter2:=v.title,info.title
+						// NOTE(review): this pattern is constant but compiled for every candidate
+						res, _ := regexp.Compile("[0-9a-zA-Z]+");
+						if res.MatchString(letter1)||res.MatchString(letter2) {
+							letter1=convertArabicNumeralsAndLetters(letter1)
+							letter2=convertArabicNumeralsAndLetters(letter2)
+						}
+						if strings.Contains(letter1,"重新招标")|| strings.Contains(letter2,"重新招标"){
+							letter1,letter2=dealWithSpecialPhrases(letter1,letter2)
+						}
+						if letter1==letter2 {
+							reason = reason + "标题关键词相等关系"
+							if !againRepeat(v, info) {// second-level (amount) check
+								b = true
+								source = v
+								reasons = reason
+								break L
+							}
+						}else {
+							if !(strings.Contains(letter1, letter2) || strings.Contains(letter2, letter1)) {
+								// neither title contains the other, i.e. they differ
+								if againRepeat(v, info) {
+									continue
+								}
+							}
+						}
+					}
+				}
+
+
+				// precondition - all five leading elements equal
+				if leadingElementSame(v,info) {
+					reason = "五要素-相同-满足"
+					b = true
+					source = v
+					reasons = reason
+					break L
+				}
+
+
+
+				// optional fast check for sparse (low quality) records
+				if LowHeavy {
+					repeat := false
+					if repeat, reason = fastLowQualityHeavy(v, info, reason); repeat {
+						b = true
+						source = v
+						reasons = reason
+						break L
+					}
+				}
+
+				// same agency: both set and equal
+				if v.agency != "" && info.agency != "" && v.agency == info.agency {
+					reason = reason + "同机构-"
+					repeat := false
+					if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
+						b = true
+						source = v
+						reasons = reason
+						break L
+					}
+				} else {
+					reason = reason + "非同机构-"
+					if info.city != "" && info.city == v.city {
+						reason = reason + "同城-"
+						repeat := false
+						if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
+							b = true
+							source = v
+							reasons = reason
+							break L
+						}
+					} else {
+						reason = reason + "不同城-"
+						repeat := false
+						if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
+							b = true
+							source = v
+							reasons = reason
+							break L
+						}
+					}
+				}
+			}
+
+		}
+	}
+
+	// not a duplicate: add info to the pool d
+	if !b {
+		ct := info.publishtime
+		dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
+		k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
+		d.lock.Lock()
+		data := d.data[k]
+		if data == nil {
+			data = []*Info{info}
+			d.data[k] = data
+			if !d.keys[dkey] {
+				// first record of a new day: register the day and let update prune the pool
+				d.keys[dkey] = true
+				d.update(ct)
+			}
+		} else {
+			data = append(data, info)
+			d.data[k] = data
+		}
+
+		// remember the area if it has not been seen yet
+		isAreaExist :=false
+		for _,v:= range d.areakeys {
+			if v==info.area {
+				isAreaExist = true
+			}
+		}
+		if !isAreaExist {
+			areaArr := d.areakeys
+			areaArr = append(areaArr,info.area)
+			d.areakeys = areaArr
+		}
+
+		d.lock.Unlock()
+	}
+
+	return
+}
+
+// update prunes the pool so that only the retained days remain. It does
+// nothing in timed-task mode. Otherwise the retained day list (d.keymap) is
+// rebuilt — GetLatelyFiveDay in full mode, GetLatelyFiveDayDouble in
+// incremental mode — and every bucket and day key outside that list is
+// deleted. The method takes no lock itself.
+func (d *datamap) update(t int64) {
+
+	if TimingTask {
+		return
+	}
+
+	if IsFull {
+		d.keymap = d.GetLatelyFiveDay(t) // full run
+	} else {
+		d.keymap = d.GetLatelyFiveDayDouble(t) // incremental run
+	}
+
+	retained := map[string]bool{}
+	for _, day := range d.keymap {
+		retained[day] = true
+	}
+	// Bucket keys start with the 8-character day.
+	for bucket := range d.data {
+		if day := bucket[:8]; !retained[day] {
+			delete(d.data, bucket)
+		}
+	}
+	for day := range d.keys {
+		if !retained[day] {
+			delete(d.keys, day)
+		}
+	}
+
+}
+
+// GetLatelyFiveDay returns d.days consecutive day keys (qutil.Date_yyyyMMdd
+// layout), starting at the day of the Unix timestamp t and walking backwards
+// one day at a time.
+func (d *datamap) GetLatelyFiveDay(t int64) []string {
+	days := make([]string, d.days)
+	cursor := time.Unix(t, 0)
+	for idx := range days {
+		days[idx] = cursor.Format(qutil.Date_yyyyMMdd)
+		cursor = cursor.AddDate(0, 0, -1)
+	}
+	return days
+}
+
+// GetLatelyFiveDayDouble returns 2*d.days consecutive day keys
+// (qutil.Date_yyyyMMdd layout) for the incremental mode, starting at the
+// current day and walking backwards one day at a time.
+// NOTE(review): the parameter t is not used; the window is anchored at
+// time.Now() — confirm this is intended.
+func (d *datamap) GetLatelyFiveDayDouble(t int64) []string { // incremental: double window
+	total := d.days * 2
+	days := make([]string, total)
+	cursor := time.Now()
+	for idx := 0; idx < total; idx++ {
+		days[idx] = cursor.Format(qutil.Date_yyyyMMdd)
+		cursor = cursor.AddDate(0, 0, -1)
+	}
+	return days
+}
+
+
+
+// replacePoolData swaps the pooled record that has the same id as newData for
+// newData itself. The bucket is located from newData's publish day, subtype
+// and area; when no record with that id is in the bucket, the bucket is left
+// as it was. The whole operation runs under d.lock.
+func (d *datamap) replacePoolData(newData *Info) {
+	d.lock.Lock()
+	defer d.lock.Unlock()
+
+	published := newData.publishtime
+	day := qutil.FormatDateByInt64(&published, qutil.Date_yyyyMMdd)
+	bucketKey := fmt.Sprintf("%s_%s_%s", day, newData.subtype, newData.area)
+
+	bucket := d.data[bucketKey]
+	for idx := range bucket {
+		if bucket[idx].id == newData.id { // replace in place
+			bucket[idx] = newData
+			break
+		}
+	}
+	d.data[bucketKey] = bucket
+}
+
+
+
+// replaceSourceData removes oldData from the pool and inserts newData.
+//
+// oldData is removed from the bucket derived from its own publish day,
+// subtype and area (matched by id). newData is appended to the bucket derived
+// from its publish day, subtype and area; when that creates the first bucket
+// of a day not yet registered, the day is registered and update is called to
+// prune the pool. newData's area is added to the known areas if missing.
+//
+// The whole operation runs under d.lock. The original only took the lock for
+// the insert half, leaving the removal's map access unsynchronised.
+func (d *datamap) replaceSourceData(newData *Info, oldData *Info) {
+	d.lock.Lock()
+	defer d.lock.Unlock()
+
+	// remove the old record from the pool
+	ct_old := oldData.publishtime
+	dkey_old := qutil.FormatDateByInt64(&ct_old, qutil.Date_yyyyMMdd)
+	k_old := fmt.Sprintf("%s_%s_%s", dkey_old, oldData.subtype, oldData.area)
+	data_old := d.data[k_old]
+	for i, v := range data_old {
+		if v.id == oldData.id { // drop the matching old record
+			data_old = append(data_old[:i], data_old[i+1:]...)
+			break
+		}
+	}
+	d.data[k_old] = data_old
+
+	// add the new record
+	ct := newData.publishtime
+	dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
+	k := fmt.Sprintf("%s_%s_%s", dkey, newData.subtype, newData.area)
+	data := d.data[k]
+	if data == nil {
+		d.data[k] = []*Info{newData}
+		if !d.keys[dkey] {
+			d.keys[dkey] = true
+			d.update(ct)
+		}
+	} else {
+		d.data[k] = append(data, newData)
+	}
+
+	// remember the area if it has not been seen yet
+	isAreaExist := false
+	for _, v := range d.areakeys {
+		if v == newData.area {
+			isAreaExist = true
+			break
+		}
+	}
+	if !isAreaExist {
+		d.areakeys = append(d.areakeys, newData.area)
+	}
+}
+
+
+
+
+
+
+
+

+ 759 - 0
udpfilterdup/src3/main.go

@@ -0,0 +1,759 @@
+package main
+
+/**
+招标信息判重
+**/
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"github.com/cron"
+	"gopkg.in/mgo.v2/bson"
+	"log"
+	mu "mfw/util"
+	"net"
+	"os"
+	"qfw/util"
+	"regexp"
+	"strconv"
+	"sync"
+	"time"
+)
+
+
+// Package-level state. Most of it is populated once in init from the
+// configuration file and the command-line flags.
+var (
+	Sysconfig    map[string]interface{} // configuration file contents
+	mconf        map[string]interface{} // "mongodb" section of the configuration
+	mgo          *MongodbSim            // MongoDB handle for the data collections
+	task_mgo     *MongodbSim            // MongoDB handle for the task database
+	task_collName	string // collection name read from task_mongodb.task_collName
+	extract      string // collection name read from mongodb.extract
+	extract_back string // collection name read from mongodb.extract_back
+	udpclient    mu.UdpClient             // UDP client
+	nextNode     []map[string]interface{} // downstream nodes to notify
+	dupdays      = 7                      // initial duplicate-detection range in days (overwritten in init)
+	DM           *datamap                 // in-memory comparison pool
+	Update		 *updateInfo // update pool; its updatePool channel receives the pending database updates
+	// title filter patterns; "^_$" placeholders until init compiles the configured ones
+	FilterRegTitle   = regexp.MustCompile("^_$")
+	FilterRegTitle_0 = regexp.MustCompile("^_$")
+	FilterRegTitle_1 = regexp.MustCompile("^_$")
+	FilterRegTitle_2 = regexp.MustCompile("^_$")
+
+	isMerger       bool                              // whether merging is enabled
+	threadNum      int                               // number of worker goroutines
+	SiteMap        map[string]map[string]interface{} // site dictionary keyed by site name
+	LowHeavy       bool                              // enable the low-quality duplicate check
+	TimingTask     bool                              // whether this instance runs as a timed task
+	timingSpanDay  int64                             // time span in days
+	timingPubScope int64                             // publish-time period
+	gtid,lastid,gtept,ltept string			// command-line inputs
+	lteid	string							// attribute for the history/incremental mode
+	IsFull		   bool								// whether this is a full run
+	updatelock 		sync.Mutex         // lock
+
+)
+
+
+
+// init wires the process up before main runs: it parses the command-line
+// flags, reads the configuration, opens the task and data MongoDB pools,
+// compiles the title filter patterns, builds the in-memory comparison pool,
+// starts the update worker and loads the site dictionary.
+//
+// The title patterns are compiled BEFORE the pool is built. NewDatamap calls
+// NewInfo, which evaluates those patterns; compiling them afterwards (as the
+// code did before) classified every pre-loaded record with the "^_$"
+// placeholders.
+func init() {
+
+	flag.StringVar(&lastid, "id", "", "增量加载的lastid") // incremental
+	flag.StringVar(&gtid, "gtid", "", "历史增量的起始id")   // history
+	flag.StringVar(&gtept, "gtept", "", "全量gte发布时间") // full run: publish-time range
+	flag.StringVar(&ltept, "ltept", "", "全量lte发布时间") // full run: publish-time range
+
+	flag.Parse()
+
+	util.ReadConfig(&Sysconfig)
+
+	task_mconf := Sysconfig["task_mongodb"].(map[string]interface{})
+	task_mgo = &MongodbSim{
+		MongodbAddr: task_mconf["task_addrName"].(string),
+		DbName:      task_mconf["task_dbName"].(string),
+		Size:        util.IntAllDef(task_mconf["task_pool"], 10),
+	}
+	task_mgo.InitPool()
+	task_collName = task_mconf["task_collName"].(string)
+
+	nextNode = util.ObjArrToMapArr(Sysconfig["nextNode"].([]interface{}))
+	mconf = Sysconfig["mongodb"].(map[string]interface{})
+	mgo = &MongodbSim{
+		MongodbAddr: mconf["addr"].(string),
+		DbName:      mconf["db"].(string),
+		Size:        util.IntAllDef(mconf["pool"], 10),
+	}
+	mgo.InitPool()
+
+	extract = mconf["extract"].(string)
+	extract_back = mconf["extract_back"].(string)
+
+	// Title patterns must be in place before any Info is built.
+	FilterRegTitle = regexp.MustCompile(util.ObjToString(Sysconfig["specialwords"]))
+	FilterRegTitle_0 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_0"]))
+	FilterRegTitle_1 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_1"]))
+	FilterRegTitle_2 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_2"]))
+
+	dupdays = util.IntAllDef(Sysconfig["dupdays"], 3)
+	// load the comparison pool
+	DM = NewDatamap(dupdays, lastid)
+	// update pool
+	Update = newUpdatePool()
+	go Update.updateData()
+
+	isMerger = Sysconfig["isMerger"].(bool)
+	threadNum = util.IntAllDef(Sysconfig["threads"], 1)
+	LowHeavy = Sysconfig["lowHeavy"].(bool)
+	TimingTask = Sysconfig["timingTask"].(bool)
+	timingSpanDay = util.Int64All(Sysconfig["timingSpanDay"])
+	timingPubScope = util.Int64All(Sysconfig["timingPubScope"])
+
+	// site dictionary
+	site := mconf["site"].(map[string]interface{})
+	SiteMap = make(map[string]map[string]interface{}, 0)
+	start := int(time.Now().Unix())
+	sess_site := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess_site)
+	res_site := sess_site.DB(site["dbname"].(string)).C(site["coll"].(string)).Find(map[string]interface{}{}).Sort("_id").Iter()
+	for site_dict := make(map[string]interface{}); res_site.Next(&site_dict); {
+		data_map := map[string]interface{}{
+			"area":     util.ObjToString(site_dict["area"]),
+			"city":     util.ObjToString(site_dict["city"]),
+			"district": util.ObjToString(site_dict["district"]),
+			"sitetype": util.ObjToString(site_dict["sitetype"]),
+			"level":    util.ObjToString(site_dict["level"]),
+			"weight":   util.ObjToString(site_dict["weight"]),
+		}
+		SiteMap[util.ObjToString(site_dict["site"])] = data_map
+	}
+	// Surface a failed or truncated load instead of silently using a partial dictionary.
+	if err := res_site.Close(); err != nil {
+		log.Println("site iter close err:", err)
+	}
+	log.Printf("new站点加载用时:%d秒,%d个\n", int(time.Now().Unix())-start, len(SiteMap))
+}
+
+
+// main starts the background map check and the UDP listener, then picks the
+// run mode:
+//   - timed task:   start historyTaskDay in the background;
+//   - full run:     when both -gtept and -ltept are given, run one task over
+//                   the fixed id range synchronously;
+//   - incremental:  otherwise, just wait for UDP messages.
+//
+// In every mode the process then sleeps indefinitely.
+func main() {
+	go checkMapJob()
+	updport := Sysconfig["udpport"].(string)
+	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}
+	udpclient.Listen(processUdpMsg)
+	log.Println("Udp服务监听", updport)
+
+	switch {
+	case TimingTask:
+		log.Println("正常历史部署")
+		go historyTaskDay()
+	case gtept != "" && ltept != "":
+		log.Println("全量判重-准备开始")
+		IsFull = true // full duplicate-detection run
+		sid, eid := "1fffffffffffffffffffffff", "9fffffffffffffffffffffff"
+		mapinfo := map[string]interface{}{}
+		if sid == "" || eid == "" {
+			log.Println("sid,eid参数不能为空")
+			os.Exit(0)
+		}
+		mapinfo["gtid"] = sid
+		mapinfo["lteid"] = eid
+		mapinfo["stop"] = "true"
+		task([]byte{}, mapinfo)
+		time.Sleep(99999 * time.Hour)
+	default:
+		// regular incremental deployment
+		log.Println("正常增量部署")
+	}
+
+	time.Sleep(99999 * time.Hour)
+}
+
+// mainT is an alternative entry point for the test team. In timed-task mode
+// it starts historyTaskDay in the background; otherwise it runs one full
+// duplicate-detection task over the fixed id range. Either way it then sleeps
+// indefinitely.
+func mainT() {
+
+	if TimingTask {
+		go historyTaskDay()
+		time.Sleep(99999 * time.Hour)
+		return
+	}
+
+	IsFull = true // full duplicate-detection run
+	sid, eid := "1fffffffffffffffffffffff", "9fffffffffffffffffffffff"
+	mapinfo := map[string]interface{}{}
+	if sid == "" || eid == "" {
+		log.Println("sid,eid参数不能为空")
+		os.Exit(0)
+	}
+	mapinfo["gtid"] = sid
+	mapinfo["lteid"] = eid
+	mapinfo["stop"] = "true"
+
+	log.Println("测试:全量判重-准备开始")
+	task([]byte{}, mapinfo)
+
+	time.Sleep(99999 * time.Hour)
+}
+// processUdpMsg is the UDP callback.
+//
+// OP_TYPE_DATA (message from the upstream node): the payload is decoded as
+// JSON. A decode error is sent back to the sender; a non-nil object starts a
+// task in the background and is acknowledged with its "key" value ("udpok"
+// when the key is missing or empty).
+//
+// OP_NOOP (acknowledgement from a downstream node): a non-empty payload is
+// treated as a key and removed from udptaskmap.
+func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
+	fmt.Println("接受的段数据")
+	switch act {
+	case mu.OP_TYPE_DATA: // data from the previous node
+		var mapInfo map[string]interface{}
+		err := json.Unmarshal(data, &mapInfo)
+		log.Println("err:", err, "mapInfo:", mapInfo)
+		if err != nil {
+			udpclient.WriteUdp([]byte("err:"+err.Error()), mu.OP_NOOP, ra)
+			return
+		}
+		if mapInfo == nil {
+			return
+		}
+		go task(data, mapInfo)
+		ack := "udpok"
+		if key, _ := mapInfo["key"].(string); key != "" {
+			ack = key
+		}
+		udpclient.WriteUdp([]byte(ack), mu.OP_NOOP, ra)
+	case mu.OP_NOOP: // reply from the next node
+		if ok := string(data); ok != "" {
+			log.Println("ok:", ok)
+			udptaskmap.Delete(ok)
+		}
+	}
+}
+
+// task runs one duplicate-detection pass.
+//
+// mapInfo must carry "gtid" and "lteid" (string ids bounding the _id range,
+// exclusive/inclusive). In full mode with both -gtept and -ltept set, the
+// query is replaced by a publishtime range instead. Records are processed in
+// publishtime order by at most threadNum concurrent workers; each one is
+// checked against the pool DM. For every duplicate found, two updates are
+// queued on Update.updatePool: the matched source record gets the extended
+// repeat_ids list, and the duplicate gets repeat=1 plus reason and source id.
+//
+// After all workers finish, the function waits 30 seconds, updates the OCR
+// marker and — unless mapInfo contains "stop" — notifies every node in
+// nextNode over UDP. The data parameter is not used.
+//
+// The duplicate counter is shared with the workers and is therefore guarded
+// by a mutex; it was previously incremented without synchronisation.
+// NOTE(review): source.repeat_ids is still updated by workers without a lock.
+func task(data []byte, mapInfo map[string]interface{}) {
+	log.Println("开始数据判重")
+	defer util.Catch()
+	// id range
+	q := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
+			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
+		},
+	}
+	// full run
+	if IsFull && gtept != "" && ltept != "" {
+		log.Println("执行全量分段模式")
+		log.Println(gtept, "---", ltept)
+		q = map[string]interface{}{
+			"publishtime": map[string]interface{}{
+				"$gte": util.Int64All(gtept),
+				"$lte": util.Int64All(ltept),
+			},
+		}
+	}
+
+	log.Println("查询条件:", mgo.DbName, extract, q)
+
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+	it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
+	pool := make(chan bool, threadNum)
+	wg := &sync.WaitGroup{}
+
+	n, repeateN := 0, 0
+	var countLock sync.Mutex // guards repeateN
+	addRepeat := func() {
+		countLock.Lock()
+		repeateN++
+		countLock.Unlock()
+	}
+	readRepeat := func() int {
+		countLock.Lock()
+		defer countLock.Unlock()
+		return repeateN
+	}
+
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		if n%100 == 0 {
+			log.Println("current:", n, tmp["_id"], tmp["publishtime"], "repeateN:", readRepeat())
+		}
+
+		// already marked as duplicate
+		if util.IntAll(tmp["repeat"]) == 1 {
+			addRepeat()
+			tmp = make(map[string]interface{})
+			continue
+		}
+
+		if util.IntAll(tmp["dataging"]) == 1 && !IsFull {
+			tmp = make(map[string]interface{})
+			continue
+		}
+
+		pool <- true
+		wg.Add(1)
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-pool
+				wg.Done()
+			}()
+			info := NewInfo(tmp)
+			// regular duplicate check
+			b, source, reason := DM.check(info)
+			if !b {
+				return
+			}
+			addRepeat()
+			var updateID = map[string]interface{}{} // selector for the duplicate record
+			updateID["_id"] = StringTOBsonId(info.id)
+			repeat_ids := source.repeat_ids
+			repeat_ids = append(repeat_ids, info.id)
+			source.repeat_ids = repeat_ids
+			// write the updated source record back into the pool
+			DM.replacePoolData(source)
+
+			Update.updatePool <- []map[string]interface{}{ // tag the source record
+				map[string]interface{}{
+					"_id": StringTOBsonId(source.id),
+				},
+				map[string]interface{}{
+					"$set": map[string]interface{}{
+						"repeat_ids": repeat_ids,
+					},
+				},
+			}
+			Update.updatePool <- []map[string]interface{}{ // tag the duplicate record
+				updateID,
+				map[string]interface{}{
+					"$set": map[string]interface{}{
+						"repeat":        1,
+						"repeat_reason": reason,
+						"repeat_id":     source.id,
+						"dataging":      0,
+					},
+				},
+			}
+		}(tmp)
+		tmp = make(map[string]interface{})
+	}
+	// Report a failed or truncated read instead of treating it as a clean finish.
+	if err := it.Close(); err != nil {
+		log.Println("iter close err:", err)
+	}
+	wg.Wait()
+
+	total := readRepeat()
+	log.Println("this task over.", n, "repeateN:", total, mapInfo["stop"])
+
+	time.Sleep(30 * time.Second)
+
+	// update the OCR marker
+	updateOcrFileData(mapInfo["lteid"].(string))
+
+	// task finished: notify the downstream nodes
+	if n >= total && mapInfo["stop"] == nil {
+		log.Println("判重任务完成发送udp")
+		for _, to := range nextNode {
+			sid, _ := mapInfo["gtid"].(string)
+			eid, _ := mapInfo["lteid"].(string)
+			key := sid + "-" + eid + "-" + util.ObjToString(to["stype"])
+			by, _ := json.Marshal(map[string]interface{}{
+				"gtid":  sid,
+				"lteid": eid,
+				"stype": util.ObjToString(to["stype"]),
+				"key":   key,
+			})
+			addr := &net.UDPAddr{
+				IP:   net.ParseIP(to["addr"].(string)),
+				Port: util.IntAll(to["port"]),
+			}
+			node := &udpNode{by, addr, time.Now().Unix(), 0}
+			udptaskmap.Store(key, node)
+			udpclient.WriteUdp(by, mu.OP_TYPE_DATA, addr)
+		}
+	}
+}
+
+func updateOcrFileData(cur_lteid string)  {
+	//更新ocr 分类表-判重的状态
+	log.Println("开始更新Ocr表-标记",cur_lteid)
+	task_sess := task_mgo.GetMgoConn()
+	defer task_mgo.DestoryMongoConn(task_sess)
+	q_task:=map[string]interface{}{}
+	it_last := task_sess.DB(task_mgo.DbName).C(task_collName).Find(&q_task).Sort("-_id").Iter()
+	isUpdateOcr:=false
+	updateOcrFile:=[][]map[string]interface{}{}
+	for tmp := make(map[string]interface{}); it_last.Next(&tmp); {
+		cur_id := BsonTOStringId(tmp["_id"])
+		lteid:=util.ObjToString(tmp["lteid"])
+		if (lteid==cur_lteid) { //需要更新
+			log.Println("找到该lteid数据",cur_lteid,cur_id)
+			isUpdateOcr = true
+			updateOcrFile = append(updateOcrFile, []map[string]interface{}{//重复数据打标签
+				map[string]interface{}{
+					"_id": tmp["_id"],
+				},
+				map[string]interface{}{
+					"$set": map[string]interface{}{
+						"is_repeat_status": 1,
+						"is_repeat_time" : util.Int64All(time.Now().Unix()),
+					},
+				},
+			})
+			tmp = make(map[string]interface{})
+			break
+		}else {
+			tmp = make(map[string]interface{})
+		}
+	}
+	if !isUpdateOcr {
+		log.Println("出现异常问题,查询不到ocr的lteid",cur_lteid)
+	}else {
+		if len(updateOcrFile) > 0 {
+			task_mgo.UpSertBulk(task_collName, updateOcrFile...)
+		}
+	}
+}
+
+// historyTaskDay loops forever: it migrates the previous segment, waits for a task record flagged is_repeat_status==1, de-duplicates the dataging==1 records of (gtid, lteid] in publishtime groups and notifies the downstream nodes over UDP.
+func historyTaskDay() {
+	defer util.Catch()
+
+	for {
+		start:=time.Now().Unix()
+
+		if gtid=="" {
+			log.Println("请传gtid,否则无法运行")
+			os.Exit(0)
+			return
+		}
+		if lteid!="" {
+			// migrate the previous segment first
+			log.Println("开启一次迁移任务",gtid,lteid)
+			moveHistoryData(gtid,lteid)
+			gtid = lteid // the old upper bound becomes the new lower bound
+		}
+
+		// look up the newest task record that is already flagged
+		task_sess := task_mgo.GetMgoConn()
+		defer task_mgo.DestoryMongoConn(task_sess) // NOTE(review): deferred inside an endless loop, so it only runs when the function returns — confirm intent
+		q:=map[string]interface{}{}
+		between_time := time.Now().Unix() - (86400 * timingPubScope)// publishtime lower bound: now minus timingPubScope days (original note: two-year window)
+		it_last := task_sess.DB(task_mgo.DbName).C(task_collName).Find(&q).Sort("-_id").Iter()
+
+		isRepeatStatus:=false
+		for tmp := make(map[string]interface{}); it_last.Next(&tmp); {
+			is_repeat_status:=util.IntAll(tmp["is_repeat_status"])
+			if is_repeat_status == 1 {
+				lteid = util.ObjToString(tmp["lteid"])
+				log.Println("查询的最后一个已标记的任务lteid:",lteid)
+				isRepeatStatus = true
+				tmp = make(map[string]interface{})
+				break
+			}else  {
+				tmp = make(map[string]interface{})
+			}
+		}
+
+		if !isRepeatStatus {
+			log.Println("查询不到有标记的lteid数据")
+			log.Println("睡眠5分钟 gtid:",gtid,"lteid:",lteid)
+			time.Sleep(5 * time.Minute)
+			continue
+		}
+
+		log.Println("查询完毕-找到有标记的lteid-先睡眠5分钟",gtid,lteid)
+		time.Sleep(5 * time.Minute)
+
+		sess := mgo.GetMgoConn()// query session
+		defer mgo.DestoryMongoConn(sess) // NOTE(review): same defer-in-loop pattern as above
+		// start the de-duplication of the segment
+		q = map[string]interface{}{
+			"_id": map[string]interface{}{
+				"$gt": StringTOBsonId(gtid),
+				"$lte": StringTOBsonId(lteid),
+			},
+		}
+		log.Println("历史判重查询条件:",q,"时间:", between_time)
+		it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
+		num,oknum,outnum, deterTime:= int64(0),int64(0),int64(0),int64(0) // counters; deterTime is the publishtime that opened the current group
+		pendAllArr:=[][]map[string]interface{}{}// all groups waiting to be processed
+		dayArr := []map[string]interface{}{}
+		for tmp := make(map[string]interface{}); it.Next(&tmp); num++ {
+			if num%10000 == 0 {
+				log.Println("正序遍历:", num)
+			}
+			// keep only dataging==1 records whose publishtime is at or after between_time
+			if util.IntAll(tmp["dataging"]) == 1 {
+				pubtime := util.Int64All(tmp["publishtime"])
+				if pubtime > 0 && pubtime >= between_time {
+					oknum++
+					if deterTime==0 {
+						log.Println("找到第一条符合条件的数据")
+						deterTime = util.Int64All(tmp["publishtime"])
+						dayArr = append(dayArr,tmp)
+					}else {
+						if pubtime-deterTime >timingSpanDay*86400 {
+							// span exceeded: close the current group and open a new one starting with this record
+							pendAllArr = append(pendAllArr,dayArr)
+							dayArr = []map[string]interface{}{}
+							deterTime = util.Int64All(tmp["publishtime"])
+							dayArr = append(dayArr,tmp)
+						}else {
+							dayArr = append(dayArr,tmp)
+						}
+					}
+				}else {
+					outnum++
+					// records outside the window also get their dataging flag cleared
+					Update.updatePool <- []map[string]interface{}{// clear the flag on the out-of-window record
+						map[string]interface{}{
+							"_id": tmp["_id"],
+						},
+						map[string]interface{}{
+							"$set": map[string]interface{}{
+								"dataging": 0,
+								"history_updatetime":util.Int64All(time.Now().Unix()),
+							},
+						},
+					}
+				}
+			}
+			tmp = make(map[string]interface{})
+		}
+
+		if len(dayArr)>0 {
+			pendAllArr = append(pendAllArr,dayArr)
+			dayArr = []map[string]interface{}{}
+		}
+
+		log.Println("查询数量:",num,"符合条件:",oknum,"未在两年内:",outnum)
+
+		if len(pendAllArr) <= 0 {
+			log.Println("没找到dataging==1的数据")
+		}
+
+		// sanity check: log the size of every group and the total
+		testNum:=0
+		for k,v:=range pendAllArr {
+			log.Println("第",k,"组--","数量:",len(v))
+			testNum = testNum+len(v)
+		}
+		log.Println("本地构建分组完成:",len(pendAllArr),"组","测试-总计数量:",testNum)
+
+		n, repeateN := 0, 0
+		log.Println("线程数:",threadNum)
+		pool := make(chan bool, threadNum)
+		wg := &sync.WaitGroup{}
+		for k,v:=range pendAllArr { // one goroutine per group; leftover cross-collection updates are flushed when the group ends
+			pool <- true
+			wg.Add(1)
+			go func(k int, v []map[string]interface{}) {
+				defer func() {
+					<-pool
+					wg.Done()
+				}()
+				// updates for source records whose id lies outside the current segment (written to extract_back)
+				groupOtherExtract := [][]map[string]interface{}{}
+
+				// build the data pool for the current group
+				log.Println("构建第", k, "组---(数据池)")
+				// original note says "first publishtime of the group"; NOTE(review): the code reads the LAST element of v — confirm
+				first_pt := util.Int64All(v[len(v)-1]["publishtime"])
+				curTM := TimedTaskDatamap(dupdays+int(timingSpanDay)+1, first_pt+86400, int(k))
+				log.Println("开始遍历判重第", k, "组  共计数量:", len(v))
+				n = n + len(v) // NOTE(review): n and repeateN are written by several goroutines without synchronization — confirm
+				log.Println("统计目前总数量:", n, "重复数量:", repeateN)
+				for _, tmp := range v {
+					info := NewInfo(tmp)
+					b, source, reason := curTM.check(info)
+					if b { // duplicate found: queue the updates for the source record and for this record
+						repeateN++
+						// record this id in the source record's repeat_ids
+						repeat_ids:=source.repeat_ids
+						repeat_ids =  append(repeat_ids,info.id)
+						source.repeat_ids = repeat_ids
+
+						updatelock.Lock()
+						// replace the entry in the data pool (update)
+						DM.replacePoolData(source)
+						// update the source record
+						// decide whether the source id belongs to the current segment
+						if judgeIsCurIds(gtid,lteid,source.id) {
+							Update.updatePool <- []map[string]interface{}{// source is in the current segment: update via the shared update pool
+								map[string]interface{}{
+									"_id": StringTOBsonId(source.id),
+								},
+								map[string]interface{}{
+									"$set": map[string]interface{}{
+										"repeat_ids": repeat_ids,
+									},
+								},
+							}
+						}else {
+							groupOtherExtract = append(groupOtherExtract, []map[string]interface{}{// source is outside the segment: collect for extract_back
+								map[string]interface{}{
+									"_id": StringTOBsonId(source.id),
+								},
+								map[string]interface{}{
+									"$set": map[string]interface{}{
+										"repeat_ids": repeat_ids,
+									},
+								},
+							})
+						}
+						Update.updatePool <- []map[string]interface{}{// tag the duplicate record
+							map[string]interface{}{
+								"_id": tmp["_id"],
+							},
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"repeat":        1,
+									"repeat_reason": reason,
+									"repeat_id":     source.id,
+									"dataging":      0,
+									"history_updatetime":util.Int64All(time.Now().Unix()),
+								},
+							},
+						}
+						if len(groupOtherExtract) >= 500 {
+							mgo.UpSertBulk(extract_back, groupOtherExtract...)
+							groupOtherExtract = [][]map[string]interface{}{}
+						}
+
+						updatelock.Unlock()
+
+
+					} else {
+						Update.updatePool <- []map[string]interface{}{// not a duplicate: only clear the dataging flag
+							map[string]interface{}{
+								"_id": tmp["_id"],
+							},
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"dataging": 0, // every processed in-window record ends with dataging==0
+									"history_updatetime":util.Int64All(time.Now().Unix()),
+								},
+							},
+						}
+					}
+				}
+				// end of the group: flush the remaining cross-collection updates
+				updatelock.Lock()
+				if len(groupOtherExtract) > 0 {
+					mgo.UpSertBulk(extract_back, groupOtherExtract...)
+				}
+				updatelock.Unlock()
+
+			}(k, v)
+
+		}
+
+		wg.Wait()
+
+		log.Println("this timeTask over.", n, "repeateN:", repeateN,gtid,lteid)
+
+		time.Sleep(30 * time.Second)
+		// task finished: notify the downstream nodes over UDP (original note: index upgrade still to be decided, plus merge)
+		if n >= repeateN && gtid!=lteid{
+			for _, to := range nextNode {
+				next_sid := util.BsonIdToSId(gtid)
+				next_eid := util.BsonIdToSId(lteid)
+				key := next_sid + "-" + next_eid + "-" + util.ObjToString(to["stype"])
+				by, _ := json.Marshal(map[string]interface{}{
+					"gtid":  next_sid,
+					"lteid": next_eid,
+					"stype": util.ObjToString(to["stype"]),
+					"key":   key,
+				})
+				addr := &net.UDPAddr{
+					IP:   net.ParseIP(to["addr"].(string)),
+					Port: util.IntAll(to["port"]),
+				}
+				node := &udpNode{by, addr, time.Now().Unix(), 0}
+				udptaskmap.Store(key, node)
+				udpclient.WriteUdp(by, mu.OP_TYPE_DATA, addr)
+			}
+		}
+
+		end:=time.Now().Unix()
+
+		log.Println(gtid,lteid)
+		if end-start<60*5 { // a round that took less than five minutes is followed by an extra five-minute pause
+			log.Println("睡眠.............")
+			time.Sleep(5 * time.Minute)
+		}
+		log.Println("继续下一段的历史判重")
+	}
+}
+
+// judgeIsCurIds reports whether curid falls inside the ID segment bounded by
+// gtid and lteid.
+//
+// Only the leading 8 hex characters of each ID are compared, parsed as a
+// base-16 integer (for ObjectID hex strings this prefix is presumably the
+// creation timestamp), and both bounds are inclusive. An ID that is shorter
+// than 8 characters or whose prefix is not valid hex makes the function return
+// false; previously a short ID panicked on the slice expression and an
+// unparsable one was silently compared as 0.
+func judgeIsCurIds(gtid string, lteid string, curid string) bool {
+	// prefix parses the first 8 hex characters of id.
+	prefix := func(id string) (int64, bool) {
+		if len(id) < 8 {
+			return 0, false
+		}
+		v, err := strconv.ParseInt(id[:8], 16, 64)
+		return v, err == nil
+	}
+
+	gtTime, ok := prefix(gtid)
+	if !ok {
+		return false
+	}
+	lteTime, ok := prefix(lteid)
+	if !ok {
+		return false
+	}
+	curTime, ok := prefix(curid)
+	if !ok {
+		return false
+	}
+	return curTime >= gtTime && curTime <= lteTime
+}
+
+// moveHistoryData copies every extract document with startid < _id <= endid
+// into extract_back, then deletes from extract all documents whose comeintime
+// is older than local midnight of the day the call started minus
+// (dupdays+1)*48 hours.
+func moveHistoryData(startid string, endid string) {
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+
+	// the reference day is fixed before the copy loop starts
+	year, month, day := time.Now().Date()
+	midnight := time.Date(year, month, day, 0, 0, 0, 0, time.Local)
+
+	segment := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt":  StringTOBsonId(startid),
+			"$lte": StringTOBsonId(endid),
+		},
+	}
+	log.Println(segment)
+
+	copied := 0
+	iter := sess.DB(mgo.DbName).C(extract).Find(&segment).Iter()
+	for doc := make(map[string]interface{}); iter.Next(&doc); doc = map[string]interface{}{} {
+		mgo.Save(extract_back, doc)
+		if copied%1000 == 0 {
+			log.Println("index", copied)
+		}
+		copied++
+	}
+	log.Println("save to", extract_back, " ok index", copied)
+
+	retention := time.Duration(dupdays+1) * 48 * time.Hour
+	expired := map[string]interface{}{
+		"comeintime": map[string]interface{}{
+			"$lt": midnight.Add(-retention).Unix(),
+		},
+	}
+	removed := mgo.Delete(extract, expired)
+	log.Println("remove from ", extract, removed)
+
+}
+
+// moveTimeoutData registers moveOnceTimeOut with a cron scheduler using the
+// spec "0 0 0 * * ?" and starts the scheduler.
+func moveTimeoutData() {
+	log.Println("部署迁移定时任务")
+	scheduler := cron.New()
+	scheduler.AddFunc("0 0 0 * * ?", moveOnceTimeOut)
+	scheduler.Start()
+}
+
+// moveOnceTimeOut moves every document of "result_20200714" whose _id is lower
+// than the ObjectID generated for local midnight two years ago into
+// "result_20200713": each document is saved to the target collection and then
+// deleted from the source by _id.
+func moveOnceTimeOut() {
+	log.Println("执行一次迁移超时数据")
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+
+	today := time.Now()
+	cutoff := time.Date(today.Year()-2, today.Month(), today.Day(), 0, 0, 0, 0, time.Local)
+	cutoffID := util.BsonIdToSId(bson.NewObjectIdWithTime(cutoff))
+	filter := map[string]interface{}{
+		"_id": map[string]interface{}{"$lt": StringTOBsonId(cutoffID)},
+	}
+
+	moved := 0
+	iter := sess.DB(mgo.DbName).C("result_20200714").Find(&filter).Iter()
+	for doc := make(map[string]interface{}); iter.Next(&doc); doc = map[string]interface{}{} {
+		if moved%10000 == 0 {
+			log.Println("index", moved)
+		}
+		id := BsonTOStringId(doc["_id"])
+		mgo.Save("result_20200713", doc)
+		mgo.DeleteById("result_20200714", id)
+		moved++
+	}
+	log.Println("save and delete", " ok index", moved)
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

+ 315 - 0
udpfilterdup/src3/mgo.go

@@ -0,0 +1,315 @@
+package main
+
+import (
+	"context"
+	"log"
+	"time"
+
+	"go.mongodb.org/mongo-driver/bson"
+	"go.mongodb.org/mongo-driver/bson/primitive"
+	"go.mongodb.org/mongo-driver/mongo"
+	"go.mongodb.org/mongo-driver/mongo/options"
+)
+
+type MgoSess struct { // MgoSess collects the parameters of one find query; Iter executes it.
+	Db     string // database name, set by DB
+	Coll   string // collection name, set by C
+	Query  interface{} // filter document, set by Find
+	Sorts  []string // sort keys, set by Sort; Iter treats a "-" prefix as descending
+	fields interface{} // projection, set by Select
+	limit  int64 // result limit, applied by Iter only when > 0
+	skip   int64 // number of documents to skip, applied by Iter only when > 0
+	M      *MongodbSim // owning client wrapper, set by GetMgoConn
+}
+
+type MgoIter struct { // MgoIter iterates over the result of a find query.
+	Cursor *mongo.Cursor // driver cursor; stays nil when the query failed
+}
+
+// Next advances the cursor and decodes the current document into result.
+// It returns false when there is no cursor, when the cursor has no further
+// document, or when decoding fails (the error is logged); in the last two
+// cases the cursor is closed.
+func (mt *MgoIter) Next(result interface{}) bool {
+	if mt.Cursor == nil {
+		return false
+	}
+	if !mt.Cursor.Next(nil) {
+		mt.Cursor.Close(nil)
+		return false
+	}
+	if err := mt.Cursor.Decode(result); err != nil {
+		log.Println("mgo cur err", err.Error())
+		mt.Cursor.Close(nil)
+		return false
+	}
+	return true
+
+}
+
+func (ms *MgoSess) DB(name string) *MgoSess { // DB sets the database name and returns ms for chaining.
+	ms.Db = name
+	return ms
+}
+
+func (ms *MgoSess) C(name string) *MgoSess { // C sets the collection name and returns ms for chaining.
+	ms.Coll = name
+	return ms
+}
+
+func (ms *MgoSess) Find(q interface{}) *MgoSess { // Find stores the filter q (nothing is sent until Iter) and returns ms for chaining.
+	ms.Query = q
+	return ms
+}
+
+func (ms *MgoSess) Select(fields interface{}) *MgoSess { // Select stores the projection and returns ms for chaining.
+	ms.fields = fields
+	return ms
+}
+
+func (ms *MgoSess) Limit(limit int64) *MgoSess { // Limit stores the result limit and returns ms for chaining.
+	ms.limit = limit
+	return ms
+}
+func (ms *MgoSess) Skip(skip int64) *MgoSess { // Skip stores the number of documents to skip and returns ms for chaining.
+	ms.skip = skip
+	return ms
+}
+
+func (ms *MgoSess) Sort(sorts ...string) *MgoSess { // Sort replaces the sort keys and returns ms for chaining.
+	ms.Sorts = sorts
+	return ms
+}
+
+// Iter runs the accumulated find query and returns an iterator over the result.
+//
+// skip and limit are applied only when positive and the batch size is fixed at
+// 100. Sort keys are applied in the order they were passed to Sort: a "-" prefix
+// sorts descending, a "+" prefix or no prefix sorts ascending. The sort document
+// is built as an ordered bson.D because a bson.M has no defined key order, which
+// makes multi-key sorts unpredictable; empty keys are skipped instead of
+// panicking on the prefix slice. When the query fails the error is logged and
+// the returned iterator has a nil Cursor.
+func (ms *MgoSess) Iter() *MgoIter {
+	it := &MgoIter{}
+	find := options.Find()
+	if ms.skip > 0 {
+		find.SetSkip(ms.skip)
+	}
+	if ms.limit > 0 {
+		find.SetLimit(ms.limit)
+	}
+	find.SetBatchSize(100)
+	if len(ms.Sorts) > 0 {
+		sort := make(bson.D, 0, len(ms.Sorts))
+		for _, k := range ms.Sorts {
+			if k == "" {
+				continue
+			}
+			field, direction := k, 1
+			switch k[0] {
+			case '-':
+				field, direction = k[1:], -1
+			case '+':
+				field = k[1:]
+			}
+			if field == "" {
+				// a bare "-" or "+" names no field
+				continue
+			}
+			sort = append(sort, bson.E{Key: field, Value: direction})
+		}
+		if len(sort) > 0 {
+			find.SetSort(sort)
+		}
+	}
+	if ms.fields != nil {
+		find.SetProjection(ms.fields)
+	}
+	cur, err := ms.M.C.Database(ms.Db).Collection(ms.Coll).Find(ms.M.Ctx, ms.Query, find)
+	if err != nil {
+		log.Println("mgo find err", err.Error())
+	} else {
+		it.Cursor = cur
+	}
+	return it
+}
+
+type MongodbSim struct { // MongodbSim wraps a mongo.Client together with a channel that bounds concurrent helper calls.
+	MongodbAddr string // address appended to "mongodb://" by InitPool
+	Size        int // driver max pool size and capacity of pool
+	//	MinSize     int
+	DbName   string // database used by the helper methods
+	C        *mongo.Client // set by InitPool when connecting succeeds
+	Ctx      context.Context // context passed to the query and write calls
+	ShortCtx context.Context // context used for mongo.Connect in InitPool
+	pool     chan bool // semaphore: Open sends, Close receives
+}
+
+// GetMgoConn returns a fresh query builder bound to m. It does not take a slot
+// of the pool semaphore.
+func (m *MongodbSim) GetMgoConn() *MgoSess {
+	return &MgoSess{M: m}
+}
+
+// DestoryMongoConn detaches ms from its MongodbSim. Nothing else is released;
+// the pool semaphore is not touched.
+func (m *MongodbSim) DestoryMongoConn(ms *MgoSess) {
+	// Clearing the local parameter itself would have no effect on the caller,
+	// so only the back-reference is reset.
+	ms.M = nil
+}
+
+// InitPool creates the semaphore channel and the two contexts, then connects
+// to "mongodb://"+MongodbAddr with a 3 s connect timeout, a max pool size of
+// Size and a 2 h max connection idle time. On failure the error is only
+// logged and m.C stays unset.
+//
+// NOTE(review): both cancel functions are discarded, and ShortCtx expires one
+// minute after this call — confirm nothing reuses it later.
+func (m *MongodbSim) InitPool() {
+	opts := options.Client().
+		SetConnectTimeout(3 * time.Second).
+		ApplyURI("mongodb://" + m.MongodbAddr).
+		SetMaxPoolSize(uint64(m.Size))
+	m.pool = make(chan bool, m.Size)
+	opts.SetMaxConnIdleTime(2 * time.Hour)
+
+	m.Ctx, _ = context.WithTimeout(context.Background(), 99999*time.Hour)
+	m.ShortCtx, _ = context.WithTimeout(context.Background(), 1*time.Minute)
+
+	client, err := mongo.Connect(m.ShortCtx, opts)
+	if err != nil {
+		log.Println("mgo init error:", err.Error())
+		return
+	}
+	m.C = client
+	log.Println("init success")
+}
+
+func (m *MongodbSim) Open() { // Open takes one slot of the pool channel; the send blocks while the channel is full.
+	m.pool <- true
+}
+func (m *MongodbSim) Close() { // Close gives back one slot of the pool channel.
+	<-m.pool
+}
+
+//批量插入
+func (m *MongodbSim) UpSertBulk(c string, doc ...[]map[string]interface{}) (map[int64]interface{}, bool) {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	var writes []mongo.WriteModel
+	for _, d := range doc {
+		write := mongo.NewUpdateOneModel()
+		write.SetFilter(d[0])
+		write.SetUpdate(d[1])
+		write.SetUpsert(true)
+		writes = append(writes, write)
+	}
+	r, e := coll.BulkWrite(m.Ctx, writes)
+	if e != nil {
+		log.Println("mgo upsert error:", e.Error())
+		return nil, false
+	}
+	//	else {
+	//		if r.UpsertedCount != int64(len(doc)) {
+	//			log.Println("mgo upsert uncomplete:uc/dc", r.UpsertedCount, len(doc))
+	//		}
+	//		return true
+	//	}
+	return r.UpsertedIDs, true
+}
+
+//批量插入
+func (m *MongodbSim) SaveBulk(c string, doc ...map[string]interface{}) bool {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	var writes []mongo.WriteModel
+	for _, d := range doc {
+		write := mongo.NewInsertOneModel()
+		write.SetDocument(d)
+		writes = append(writes, write)
+	}
+	_, e := coll.BulkWrite(m.Ctx, writes)
+	if e != nil {
+		log.Println("mgo savebulk error:", e.Error())
+		return false
+	}
+	return true
+}
+
+// Save inserts doc into collection c and returns the inserted _id, or nil when
+// the insert fails.
+func (m *MongodbSim) Save(c string, doc map[string]interface{}) interface{} {
+	m.Open()
+	defer m.Close()
+	res, err := m.C.Database(m.DbName).Collection(c).InsertOne(m.Ctx, doc)
+	if err == nil {
+		return res.InsertedID
+	}
+	return nil
+}
+
+// UpdateById applies the update document doc to the document of collection c
+// whose _id is the ObjectID parsed from id, and reports whether the call
+// returned without error.
+func (m *MongodbSim) UpdateById(c, id string, doc map[string]interface{}) bool {
+	m.Open()
+	defer m.Close()
+	target := m.C.Database(m.DbName).Collection(c)
+	byID := map[string]interface{}{"_id": StringTOBsonId(id)}
+	_, err := target.UpdateOne(m.Ctx, byID, doc)
+	return err == nil
+}
+
+// DeleteById deletes the document of collection c whose _id is the ObjectID
+// parsed from id and returns the number of deleted documents (0 on error).
+func (m *MongodbSim) DeleteById(c, id string) int64 {
+	m.Open()
+	defer m.Close()
+	target := m.C.Database(m.DbName).Collection(c)
+	byID := map[string]interface{}{"_id": StringTOBsonId(id)}
+	res, err := target.DeleteOne(m.Ctx, byID)
+	if err == nil {
+		return res.DeletedCount
+	}
+	return 0
+}
+
+// Delete removes every document of collection c that matches query and returns
+// the number of deleted documents (0 on error).
+func (m *MongodbSim) Delete(c string, query map[string]interface{}) int64 {
+	m.Open()
+	defer m.Close()
+	res, err := m.C.Database(m.DbName).Collection(c).DeleteMany(m.Ctx, query)
+	if err == nil {
+		return res.DeletedCount
+	}
+	return 0
+}
+
+// FindById returns the document of collection c whose _id is the ObjectID
+// parsed from id. The decode error is ignored, so the returned map is empty
+// when the lookup or the decoding fails.
+func (m *MongodbSim) FindById(c, id string) map[string]interface{} {
+	m.Open()
+	defer m.Close()
+	target := m.C.Database(m.DbName).Collection(c)
+	found := map[string]interface{}{}
+	byID := map[string]interface{}{"_id": StringTOBsonId(id)}
+	target.FindOne(m.Ctx, byID).Decode(&found)
+	return found
+}
+
+// FindOne returns one document of collection c matching query. The decode
+// error is ignored, so the returned map is empty when the lookup or the
+// decoding fails.
+func (m *MongodbSim) FindOne(c string, query map[string]interface{}) map[string]interface{} {
+	m.Open()
+	defer m.Close()
+	found := map[string]interface{}{}
+	m.C.Database(m.DbName).Collection(c).FindOne(m.Ctx, query).Decode(&found)
+	return found
+}
+
+// Find returns all documents of collection c matching query, using the given
+// sort and projection (either may be nil).
+//
+// Errors are logged and returned to the caller. The previous implementation
+// called log.Fatal, which terminated the whole process and left its own
+// "return nil, err" statements unreachable.
+func (m *MongodbSim) Find(c string, query map[string]interface{}, sort, fields interface{}) ([]map[string]interface{}, error) {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	op := options.Find()
+	if sort != nil {
+		op.SetSort(sort)
+	}
+	if fields != nil {
+		op.SetProjection(fields)
+	}
+	r, err := coll.Find(m.Ctx, query, op)
+	if err != nil {
+		log.Println("mgo find error:", err.Error())
+		return nil, err
+	}
+	var results []map[string]interface{}
+	if err = r.All(m.Ctx, &results); err != nil {
+		log.Println("mgo find decode error:", err.Error())
+		return nil, err
+	}
+	return results, nil
+}
+
+// NewObjectId returns a newly generated ObjectID for use as _id.
+func NewObjectId() primitive.ObjectID {
+	return primitive.NewObjectID()
+}
+
+// StringTOBsonId converts the hex string id into an ObjectID. The parse error
+// is deliberately dropped: the function returns whatever ObjectIDFromHex
+// yields for an invalid string.
+func StringTOBsonId(id string) primitive.ObjectID {
+	oid, _ := primitive.ObjectIDFromHex(id)
+	return oid
+}
+
+func BsonTOStringId(id interface{}) string { // BsonTOStringId returns the hex form of id; the unchecked assertion panics when id is not a primitive.ObjectID.
+	return id.(primitive.ObjectID).Hex()
+}

+ 59 - 0
udpfilterdup/src3/udptaskmap.go

@@ -0,0 +1,59 @@
+package main
+
+import (
+	"fmt"
+	"io/ioutil"
+	"log"
+	mu "mfw/util"
+	"net"
+	"net/http"
+	"sync"
+	"time"
+)
+
+var udptaskmap = &sync.Map{} // pending UDP notifications: key string -> *udpNode
+var tomail string // alert recipient, read from the "jkmail" config by checkMapJob
+var api string // mail API endpoint, read from the "jkmail" config by checkMapJob
+
+type udpNode struct { // udpNode is one UDP message kept for possible re-sending.
+	data      []byte // payload sent with WriteUdp
+	addr      *net.UDPAddr // destination of the message
+	timestamp int64 // Unix seconds compared against now by checkMapJob
+	retry     int // number of re-send rounds counted by checkMapJob
+}
+
+// checkMapJob reads the mail settings from Sysconfig["jkmail"] and then scans
+// udptaskmap once a minute, forever. An entry older than 120 s gets its retry
+// counter increased: up to 5 retries the message is sent again, after that the
+// entry is removed and a failure mail is requested through the HTTP api.
+// Entries older than 10 s are only logged.
+func checkMapJob() {
+	// original note: mail cannot be sent from inside the Aliyun intranet
+	if jkmail, _ := Sysconfig["jkmail"].(map[string]interface{}); jkmail != nil {
+		tomail, _ = jkmail["to"].(string)
+		api, _ = jkmail["api"].(string)
+	}
+	log.Println("start checkMapJob", tomail, Sysconfig["jkmail"])
+
+	inspect := func(k, v interface{}) bool {
+		now := time.Now().Unix()
+		node, _ := v.(*udpNode)
+		age := now - node.timestamp
+		switch {
+		case age > 120:
+			node.retry++
+			if node.retry <= 5 {
+				log.Println("udp重发", k)
+				udpclient.WriteUdp(node.data, mu.OP_TYPE_DATA, node.addr)
+				break
+			}
+			log.Println("udp重试失败", k)
+			udptaskmap.Delete(k)
+			res, err := http.Get(fmt.Sprintf("%s?to=%s&title=%s&body=%s", api, tomail, "extract-send-fail", k.(string)))
+			if err != nil {
+				break
+			}
+			defer res.Body.Close()
+			read, err := ioutil.ReadAll(res.Body)
+			log.Println("邮件发发送:", string(read), err)
+		case age > 10:
+			log.Println("udp任务超时中..", k)
+		}
+		return true
+	}
+
+	for {
+		udptaskmap.Range(inspect)
+		time.Sleep(60 * time.Second)
+	}
+}

+ 62 - 0
udpfilterdup/src3/updateMethod.go

@@ -0,0 +1,62 @@
+package main
+
+import (
+	"log"
+	"time"
+)
+
+type updateInfo struct { // updateInfo batches queued update pairs for bulk writing (see updateData).
+
+	// channel of pending updates (update or insert)
+	updatePool chan []map[string]interface{}
+	// number of updates written per bulk call
+	saveSize   	int
+
+}
+
+
+
+
+var sp = make(chan bool, 5) // semaphore: at most 5 bulk writes started by updateData run at the same time
+
+// newUpdatePool returns an updateInfo with a 50000-entry update channel and a
+// batch size of 500.
+func newUpdatePool() *updateInfo {
+	return &updateInfo{
+		updatePool: make(chan []map[string]interface{}, 50000),
+		saveSize:   500,
+	}
+}
+
+
+// updateData drains updatePool forever. Received updates are collected into a
+// batch of saveSize entries; a full batch is written with mgo.UpSertBulk in a
+// background goroutine. When nothing arrives for 10 seconds, a partially
+// filled batch is written the same way.
+func (update *updateInfo) updateData() {
+	log.Println("开始不断监听--待更新数据")
+	batch := make([][]map[string]interface{}, update.saveSize)
+	filled := 0
+
+	// flush hands the filled part of the batch to a background writer (bounded
+	// by sp) and starts a fresh batch.
+	flush := func() {
+		sp <- true
+		go func(rows [][]map[string]interface{}) {
+			defer func() {
+				<-sp
+			}()
+			mgo.UpSertBulk(extract, rows...)
+		}(batch[:filled])
+		batch = make([][]map[string]interface{}, update.saveSize)
+		filled = 0
+	}
+
+	for {
+		select {
+		case value := <-update.updatePool:
+			batch[filled] = value
+			filled++
+			if filled == update.saveSize {
+				flush()
+			}
+		case <-time.After(10 * time.Second): // idle: check for leftovers every 10 seconds
+			if filled > 0 {
+				flush()
+			}
+		}
+	}
+}

+ 23 - 0
udpfusion/src/config.json

@@ -0,0 +1,23 @@
+{
+  "udpport": ":17330",
+  "mongodb": {
+    "addrName": "192.168.3.207:27092",
+    "dbName": "zhengkun",
+    "collName": "test",
+    "pool": 10,
+    "site": {
+      "site_dbname": "qfw",
+      "site_coll": "site"
+    }
+  },
+  "fusion_coll_name":"fusiondata",
+  "record_coll_name":"recorddata",
+  "":"",
+  "jkmail": {
+    "to": "zhengkun@topnet.net.cn",
+    "api": "http://10.171.112.160:19281/_send/_mail"
+  },
+  "nextNode": [
+
+  ]
+}

+ 200 - 0
udpfusion/src/main.go

@@ -0,0 +1,200 @@
+package main
+
+import (
+	"encoding/json"
+	"log"
+	mu "mfw/util"
+	"net"
+	"os"
+	"qfw/common/src/qfw/util"
+	qu "qfw/util"
+	"time"
+)
+
+
+var (
+	sysconfig    map[string]interface{} // configuration file content
+	mgo          *MongodbSim            // MongoDB helper object
+	udpclient    mu.UdpClient             // UDP client
+	nextNode     []map[string]interface{} // downstream nodes to notify
+	coll_name,fusion_coll_name,record_coll_name 	 string // collection names read from the configuration by initMgo
+)
+
+
+
+// initMgo builds the global MongoDB helper from sysconfig["mongodb"] (pool size
+// defaults to 10), initialises its pool and loads the three collection names.
+// Missing or non-string settings make the type assertions panic.
+func initMgo() {
+	mongoConf := sysconfig["mongodb"].(map[string]interface{})
+	log.Println(mongoConf)
+
+	mgo = &MongodbSim{
+		MongodbAddr: mongoConf["addrName"].(string),
+		DbName:      mongoConf["dbName"].(string),
+		Size:        qu.IntAllDef(mongoConf["pool"], 10),
+	}
+	mgo.InitPool()
+
+	coll_name = mongoConf["collName"].(string)
+	fusion_coll_name = sysconfig["fusion_coll_name"].(string)
+	record_coll_name = sysconfig["record_coll_name"].(string)
+}
+
+
+func init() { // init loads the configuration and sets up the MongoDB helper.
+	// load the configuration file
+	qu.ReadConfig(&sysconfig)
+	initMgo()
+	log.Println("采用udp模式")
+}
+
+
+// mainT starts the retry monitor, listens for UDP messages on the configured
+// "udpport" with processUdpMsg as handler, and then sleeps.
+func mainT() {
+	go checkMapJob()
+	listenAddr := sysconfig["udpport"].(string)
+	udpclient = mu.UdpClient{Local: listenAddr, BufSize: 1024}
+	udpclient.Listen(processUdpMsg)
+	log.Println("Udp服务监听", listenAddr)
+	time.Sleep(99999 * time.Hour)
+}
+
+// main is a quick-test entry point: it calls startTask directly with a hard-coded ID range instead of listening on UDP.
+func main() {
+
+	sid := "1f0000000000000000000000"
+	eid := "9f0000000000000000000000"
+	//log.Println(sid, "---", eid)
+	mapinfo := map[string]interface{}{}
+	if sid == "" || eid == "" { // never true while both ids are the non-empty literals above
+		log.Println("sid,eid参数不能为空")
+		os.Exit(0)
+	}
+	mapinfo["gtid"] = sid
+	mapinfo["lteid"] = eid
+	startTask([]byte{}, mapinfo)
+	time.Sleep(99999 * time.Hour)
+
+}
+
+
+// processUdpMsg handles one UDP message.
+//
+// OP_TYPE_DATA (message from the upstream node): the JSON payload is decoded;
+// a decode error is reported back to the sender. Otherwise a "fusion" task is
+// started in the background (other types are only logged) and the message key,
+// or "udpok" when there is none, is sent back as acknowledgement.
+// OP_NOOP (acknowledgement from a downstream node): the acknowledged key is
+// removed from udptaskmap.
+func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
+	switch act {
+	case mu.OP_TYPE_DATA:
+		var mapInfo map[string]interface{}
+		err := json.Unmarshal(data, &mapInfo)
+		log.Println("err:", err, "mapInfo:", mapInfo)
+		if err != nil {
+			udpclient.WriteUdp([]byte("err:"+err.Error()), mu.OP_NOOP, ra)
+			return
+		}
+		if mapInfo == nil {
+			return
+		}
+		if qu.ObjToString(mapInfo["stype"]) == "fusion" {
+			go startTask(data, mapInfo)
+		} else {
+			log.Println("未知类型:融合异常... ...")
+		}
+		ack, _ := mapInfo["key"].(string)
+		if ack == "" {
+			ack = "udpok"
+		}
+		udpclient.WriteUdp([]byte(ack), mu.OP_NOOP, ra)
+	case mu.OP_NOOP:
+		if ok := string(data); ok != "" {
+			log.Println("ok:", ok)
+			udptaskmap.Delete(ok)
+		}
+	}
+}
+
+
+
+// startTask is the fusion entry point. NOTE(review): the unconditional return in the middle makes everything after it unreachable, including the deferred Catch, the ID-range scan and sendUdp — looks like work in progress; confirm.
+func startTask(data []byte, mapInfo map[string]interface{}) {
+
+	// iterate over the data
+	log.Println("开始融合流程")
+
+
+	// group the data and fuse per group
+
+	// build the data
+	weight :=NewWeightData([]string{},"")
+	// organise the data: select ranking and template
+	weight.analyzeBuildStandardData()
+	log.Println("筛选出模拟数据:",weight.templateid)
+	weight.dealWithMultipleFusionStruct()
+	// perform the fusion
+
+
+
+
+	return
+
+
+	defer qu.Catch()
+	// ID range: gtid < _id <= lteid
+	q := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt":  StringTOBsonId(mapInfo["gtid"].(string)),
+			"$lte": StringTOBsonId(mapInfo["lteid"].(string)),
+		},
+	}
+	log.Println("查询条件:",q)
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+	it := sess.DB(mgo.DbName).C(coll_name).Find(&q).Iter()
+	updateExtract := [][]map[string]interface{}{}
+	index:=0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
+		if index%10000 == 0 {
+			log.Println("当前数量:", index, tmp["_id"])
+		}
+
+		//log.Println(we)
+
+
+
+		tmp = make(map[string]interface{})
+	}
+
+
+	if len(updateExtract) >0 {
+		mgo.UpSertBulk(coll_name, updateExtract...)
+
+	}
+
+
+	log.Println("task fusion over - 总计数量",index)
+
+	time.Sleep(30 * time.Second)
+
+	// task finished: broadcast a notification to the downstream nodes
+
+	sendUdp(mapInfo)
+
+
+
+}
+
+// sendUdp notifies every node in nextNode that the segment described by
+// mapinfo ("gtid", "lteid") is finished. Each message is stored in udptaskmap
+// under "<gtid>-<lteid>-<stype>" before it is sent.
+func sendUdp(mapinfo map[string]interface{}) {
+	sid, _ := mapinfo["gtid"].(string)
+	eid, _ := mapinfo["lteid"].(string)
+	for _, to := range nextNode {
+		stype := util.ObjToString(to["stype"])
+		key := sid + "-" + eid + "-" + stype
+		payload, _ := json.Marshal(map[string]interface{}{
+			"gtid":  sid,
+			"lteid": eid,
+			"stype": stype,
+			"key":   key,
+		})
+		target := &net.UDPAddr{
+			IP:   net.ParseIP(to["addr"].(string)),
+			Port: util.IntAll(to["port"]),
+		}
+		udptaskmap.Store(key, &udpNode{payload, target, time.Now().Unix(), 0})
+		udpclient.WriteUdp(payload, mu.OP_TYPE_DATA, target)
+	}
+}

+ 315 - 0
udpfusion/src/mgo.go

@@ -0,0 +1,315 @@
+package main
+
+import (
+	"context"
+	"log"
+	"time"
+
+	"go.mongodb.org/mongo-driver/bson"
+	"go.mongodb.org/mongo-driver/bson/primitive"
+	"go.mongodb.org/mongo-driver/mongo"
+	"go.mongodb.org/mongo-driver/mongo/options"
+)
+
+type MgoSess struct { // MgoSess collects the parameters of one find query; Iter executes it.
+	Db     string // database name, set by DB
+	Coll   string // collection name, set by C
+	Query  interface{} // filter document, set by Find
+	Sorts  []string // sort keys, set by Sort; Iter treats a "-" prefix as descending
+	fields interface{} // projection, set by Select
+	limit  int64 // result limit, applied by Iter only when > 0
+	skip   int64 // number of documents to skip, applied by Iter only when > 0
+	M      *MongodbSim // owning client wrapper, set by GetMgoConn
+}
+
+type MgoIter struct { // MgoIter iterates over the result of a find query.
+	Cursor *mongo.Cursor // driver cursor; stays nil when the query failed
+}
+
+// Next advances the cursor and decodes the current document into result.
+// It returns false when there is no cursor, when the cursor has no further
+// document, or when decoding fails (the error is logged); in the last two
+// cases the cursor is closed.
+func (mt *MgoIter) Next(result interface{}) bool {
+	if mt.Cursor == nil {
+		return false
+	}
+	if !mt.Cursor.Next(nil) {
+		mt.Cursor.Close(nil)
+		return false
+	}
+	if err := mt.Cursor.Decode(result); err != nil {
+		log.Println("mgo cur err", err.Error())
+		mt.Cursor.Close(nil)
+		return false
+	}
+	return true
+
+}
+
+func (ms *MgoSess) DB(name string) *MgoSess { // DB sets the database name and returns ms for chaining.
+	ms.Db = name
+	return ms
+}
+
+func (ms *MgoSess) C(name string) *MgoSess { // C sets the collection name and returns ms for chaining.
+	ms.Coll = name
+	return ms
+}
+
+func (ms *MgoSess) Find(q interface{}) *MgoSess { // Find stores the filter q (nothing is sent until Iter) and returns ms for chaining.
+	ms.Query = q
+	return ms
+}
+
+func (ms *MgoSess) Select(fields interface{}) *MgoSess { // Select stores the projection and returns ms for chaining.
+	ms.fields = fields
+	return ms
+}
+
+func (ms *MgoSess) Limit(limit int64) *MgoSess { // Limit stores the result limit and returns ms for chaining.
+	ms.limit = limit
+	return ms
+}
+func (ms *MgoSess) Skip(skip int64) *MgoSess { // Skip stores the number of documents to skip and returns ms for chaining.
+	ms.skip = skip
+	return ms
+}
+
+func (ms *MgoSess) Sort(sorts ...string) *MgoSess { // Sort replaces the sort keys and returns ms for chaining.
+	ms.Sorts = sorts
+	return ms
+}
+
+// Iter runs the accumulated find query and returns an iterator over the result.
+//
+// skip and limit are applied only when positive and the batch size is fixed at
+// 100. Sort keys are applied in the order they were passed to Sort: a "-" prefix
+// sorts descending, a "+" prefix or no prefix sorts ascending. The sort document
+// is built as an ordered bson.D because a bson.M has no defined key order, which
+// makes multi-key sorts unpredictable; empty keys are skipped instead of
+// panicking on the prefix slice. When the query fails the error is logged and
+// the returned iterator has a nil Cursor.
+func (ms *MgoSess) Iter() *MgoIter {
+	it := &MgoIter{}
+	find := options.Find()
+	if ms.skip > 0 {
+		find.SetSkip(ms.skip)
+	}
+	if ms.limit > 0 {
+		find.SetLimit(ms.limit)
+	}
+	find.SetBatchSize(100)
+	if len(ms.Sorts) > 0 {
+		sort := make(bson.D, 0, len(ms.Sorts))
+		for _, k := range ms.Sorts {
+			if k == "" {
+				continue
+			}
+			field, direction := k, 1
+			switch k[0] {
+			case '-':
+				field, direction = k[1:], -1
+			case '+':
+				field = k[1:]
+			}
+			if field == "" {
+				// a bare "-" or "+" names no field
+				continue
+			}
+			sort = append(sort, bson.E{Key: field, Value: direction})
+		}
+		if len(sort) > 0 {
+			find.SetSort(sort)
+		}
+	}
+	if ms.fields != nil {
+		find.SetProjection(ms.fields)
+	}
+	cur, err := ms.M.C.Database(ms.Db).Collection(ms.Coll).Find(ms.M.Ctx, ms.Query, find)
+	if err != nil {
+		log.Println("mgo find err", err.Error())
+	} else {
+		it.Cursor = cur
+	}
+	return it
+}
+
+type MongodbSim struct { // MongodbSim wraps a mongo.Client together with a channel that bounds concurrent helper calls.
+	MongodbAddr string // address appended to "mongodb://" by InitPool
+	Size        int // driver max pool size and capacity of pool
+	//	MinSize     int
+	DbName   string // database used by the helper methods
+	C        *mongo.Client // set by InitPool when connecting succeeds
+	Ctx      context.Context // context passed to the query and write calls
+	ShortCtx context.Context // context used for mongo.Connect in InitPool
+	pool     chan bool // semaphore: Open sends, Close receives
+}
+
+// GetMgoConn returns a fresh query builder bound to m. It does not take a slot
+// of the pool semaphore.
+func (m *MongodbSim) GetMgoConn() *MgoSess {
+	return &MgoSess{M: m}
+}
+
+// DestoryMongoConn detaches ms from its MongodbSim. Nothing else is released;
+// the pool semaphore is not touched.
+func (m *MongodbSim) DestoryMongoConn(ms *MgoSess) {
+	// Clearing the local parameter itself would have no effect on the caller,
+	// so only the back-reference is reset.
+	ms.M = nil
+}
+
+// InitPool creates the semaphore channel and the two contexts, then connects
+// to "mongodb://"+MongodbAddr with a 3 s connect timeout, a max pool size of
+// Size and a 2 h max connection idle time. On failure the error is only
+// logged and m.C stays unset.
+//
+// NOTE(review): both cancel functions are discarded, and ShortCtx expires one
+// minute after this call — confirm nothing reuses it later.
+func (m *MongodbSim) InitPool() {
+	opts := options.Client().
+		SetConnectTimeout(3 * time.Second).
+		ApplyURI("mongodb://" + m.MongodbAddr).
+		SetMaxPoolSize(uint64(m.Size))
+	m.pool = make(chan bool, m.Size)
+	opts.SetMaxConnIdleTime(2 * time.Hour)
+
+	m.Ctx, _ = context.WithTimeout(context.Background(), 99999*time.Hour)
+	m.ShortCtx, _ = context.WithTimeout(context.Background(), 1*time.Minute)
+
+	client, err := mongo.Connect(m.ShortCtx, opts)
+	if err != nil {
+		log.Println("mgo init error:", err.Error())
+		return
+	}
+	m.C = client
+	log.Println("init success")
+}
+
+func (m *MongodbSim) Open() { // Open takes one slot of the pool channel; the send blocks while the channel is full.
+	m.pool <- true
+}
+func (m *MongodbSim) Close() { // Close gives back one slot of the pool channel.
+	<-m.pool
+}
+
+//批量插入
+func (m *MongodbSim) UpSertBulk(c string, doc ...[]map[string]interface{}) (map[int64]interface{}, bool) {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	var writes []mongo.WriteModel
+	for _, d := range doc {
+		write := mongo.NewUpdateOneModel()
+		write.SetFilter(d[0])
+		write.SetUpdate(d[1])
+		write.SetUpsert(true)
+		writes = append(writes, write)
+	}
+	r, e := coll.BulkWrite(m.Ctx, writes)
+	if e != nil {
+		log.Println("mgo upsert error:", e.Error())
+		return nil, false
+	}
+	//	else {
+	//		if r.UpsertedCount != int64(len(doc)) {
+	//			log.Println("mgo upsert uncomplete:uc/dc", r.UpsertedCount, len(doc))
+	//		}
+	//		return true
+	//	}
+	return r.UpsertedIDs, true
+}
+
+//批量插入
+func (m *MongodbSim) SaveBulk(c string, doc ...map[string]interface{}) bool {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	var writes []mongo.WriteModel
+	for _, d := range doc {
+		write := mongo.NewInsertOneModel()
+		write.SetDocument(d)
+		writes = append(writes, write)
+	}
+	_, e := coll.BulkWrite(m.Ctx, writes)
+	if e != nil {
+		log.Println("mgo savebulk error:", e.Error())
+		return false
+	}
+	return true
+}
+
+// Save inserts doc into collection c and returns the inserted _id, or nil when
+// the insert fails.
+func (m *MongodbSim) Save(c string, doc map[string]interface{}) interface{} {
+	m.Open()
+	defer m.Close()
+	res, err := m.C.Database(m.DbName).Collection(c).InsertOne(m.Ctx, doc)
+	if err == nil {
+		return res.InsertedID
+	}
+	return nil
+}
+
+// UpdateById applies the update document doc to the document of collection c
+// whose _id is the ObjectID parsed from id, and reports whether the call
+// returned without error.
+func (m *MongodbSim) UpdateById(c, id string, doc map[string]interface{}) bool {
+	m.Open()
+	defer m.Close()
+	target := m.C.Database(m.DbName).Collection(c)
+	byID := map[string]interface{}{"_id": StringTOBsonId(id)}
+	_, err := target.UpdateOne(m.Ctx, byID, doc)
+	return err == nil
+}
+
+// DeleteById deletes the document of collection c whose _id is the ObjectID
+// parsed from id and returns the number of deleted documents (0 on error).
+func (m *MongodbSim) DeleteById(c, id string) int64 {
+	m.Open()
+	defer m.Close()
+	target := m.C.Database(m.DbName).Collection(c)
+	byID := map[string]interface{}{"_id": StringTOBsonId(id)}
+	res, err := target.DeleteOne(m.Ctx, byID)
+	if err == nil {
+		return res.DeletedCount
+	}
+	return 0
+}
+
+// Delete removes every document of collection c that matches query and returns
+// the number of deleted documents (0 on error).
+func (m *MongodbSim) Delete(c string, query map[string]interface{}) int64 {
+	m.Open()
+	defer m.Close()
+	res, err := m.C.Database(m.DbName).Collection(c).DeleteMany(m.Ctx, query)
+	if err == nil {
+		return res.DeletedCount
+	}
+	return 0
+}
+
+// FindById returns the document of collection c whose _id is the ObjectID
+// parsed from id. The decode error is ignored, so the returned map is empty
+// when the lookup or the decoding fails.
+func (m *MongodbSim) FindById(c, id string) map[string]interface{} {
+	m.Open()
+	defer m.Close()
+	target := m.C.Database(m.DbName).Collection(c)
+	found := map[string]interface{}{}
+	byID := map[string]interface{}{"_id": StringTOBsonId(id)}
+	target.FindOne(m.Ctx, byID).Decode(&found)
+	return found
+}
+
+// FindOne returns one document of collection c matching query. The decode
+// error is ignored, so the returned map is empty when the lookup or the
+// decoding fails.
+func (m *MongodbSim) FindOne(c string, query map[string]interface{}) map[string]interface{} {
+	m.Open()
+	defer m.Close()
+	found := map[string]interface{}{}
+	m.C.Database(m.DbName).Collection(c).FindOne(m.Ctx, query).Decode(&found)
+	return found
+}
+
+// Find returns all documents of collection c matching query, using the given
+// sort and projection (either may be nil).
+//
+// Errors are logged and returned to the caller. The previous implementation
+// called log.Fatal, which terminated the whole process and left its own
+// "return nil, err" statements unreachable.
+func (m *MongodbSim) Find(c string, query map[string]interface{}, sort, fields interface{}) ([]map[string]interface{}, error) {
+	m.Open()
+	defer m.Close()
+	coll := m.C.Database(m.DbName).Collection(c)
+	op := options.Find()
+	if sort != nil {
+		op.SetSort(sort)
+	}
+	if fields != nil {
+		op.SetProjection(fields)
+	}
+	r, err := coll.Find(m.Ctx, query, op)
+	if err != nil {
+		log.Println("mgo find error:", err.Error())
+		return nil, err
+	}
+	var results []map[string]interface{}
+	if err = r.All(m.Ctx, &results); err != nil {
+		log.Println("mgo find decode error:", err.Error())
+		return nil, err
+	}
+	return results, nil
+}
+
+// NewObjectId returns a newly generated ObjectID for use as _id.
+func NewObjectId() primitive.ObjectID {
+	return primitive.NewObjectID()
+}
+
+// StringTOBsonId converts the hex string id into an ObjectID. The parse error
+// is deliberately dropped: the function returns whatever ObjectIDFromHex
+// yields for an invalid string.
+func StringTOBsonId(id string) primitive.ObjectID {
+	oid, _ := primitive.ObjectIDFromHex(id)
+	return oid
+}
+
+func BsonTOStringId(id interface{}) string { // BsonTOStringId returns the hex form of id; the unchecked assertion panics when id is not a primitive.ObjectID.
+	return id.(primitive.ObjectID).Hex()
+}

+ 59 - 0
udpfusion/src/sendmail.go

@@ -0,0 +1,59 @@
+package main
+
+import (
+	"fmt"
+	"io/ioutil"
+	"log"
+	mu "mfw/util"
+	"net"
+	"net/http"
+	"sync"
+	"time"
+)
+
+// udptaskmap holds the pending UDP tasks; checkMapJob iterates it and reads
+// each value as a *udpNode.
+var udptaskmap = &sync.Map{}
+// tomail is the alert recipient, filled by checkMapJob from sysconfig["jkmail"]["to"].
+var tomail string
+// api is the mail endpoint URL, filled by checkMapJob from sysconfig["jkmail"]["api"].
+var api string
+
+// udpNode is one pending UDP task as stored in udptaskmap.
+type udpNode struct {
+	// data is the payload that checkMapJob re-sends through udpclient.WriteUdp.
+	data      []byte
+	// addr is the UDP destination used for the re-send.
+	addr      *net.UDPAddr
+	// timestamp is compared against time.Now().Unix() (seconds) to detect timeouts.
+	timestamp int64
+	// retry counts the re-send attempts made by checkMapJob.
+	retry     int
+}
+
+// checkMapJob loads the alert-mail settings from sysconfig["jkmail"] and then
+// loops forever, scanning udptaskmap once every 60 seconds:
+//   - a task older than 120s is re-sent over UDP, up to 5 retries;
+//   - after the 5th retry the task is removed and an alert mail is requested
+//     through the configured HTTP api;
+//   - a task older than 10s is only logged as timing out.
+//
+// Entries whose value is not a *udpNode are skipped instead of causing a nil
+// dereference, non-string keys no longer panic when building the alert URL,
+// and a failed alert request is logged rather than silently ignored.
+func checkMapJob() {
+	// Original author's remark: mail cannot be sent from the Aliyun intranet.
+	jkmail, _ := sysconfig["jkmail"].(map[string]interface{})
+	if jkmail != nil {
+		tomail, _ = jkmail["to"].(string)
+		api, _ = jkmail["api"].(string)
+	}
+	log.Println("start checkMapJob", tomail, sysconfig["jkmail"])
+	for {
+		udptaskmap.Range(func(k, v interface{}) bool {
+			node, ok := v.(*udpNode)
+			if !ok || node == nil {
+				// Not a task we understand; leave it alone and keep scanning.
+				return true
+			}
+			age := time.Now().Unix() - node.timestamp
+			switch {
+			case age > 120:
+				node.retry++
+				if node.retry <= 5 {
+					log.Println("udp重发", k)
+					udpclient.WriteUdp(node.data, mu.OP_TYPE_DATA, node.addr)
+					return true
+				}
+				log.Println("udp重试失败", k)
+				udptaskmap.Delete(k)
+				// %v renders a string key exactly like %s and tolerates other key types.
+				res, err := http.Get(fmt.Sprintf("%s?to=%s&title=%s&body=%v", api, tomail, "extract-send-fail", k))
+				if err != nil {
+					log.Println("邮件发送失败:", err)
+					return true
+				}
+				// The defer runs when this callback returns, i.e. once per entry.
+				defer res.Body.Close()
+				read, err := ioutil.ReadAll(res.Body)
+				log.Println("邮件发发送:", string(read), err)
+			case age > 10:
+				log.Println("udp任务超时中..", k)
+			}
+			return true
+		})
+		time.Sleep(60 * time.Second)
+	}
+}

+ 62 - 0
udpfusion/src/updateFusion.go

@@ -0,0 +1,62 @@
+package main
+
+import (
+	"log"
+	"time"
+)
+
+// updateFusionInfo carries the queue and batch size used by updateFusionData.
+type updateFusionInfo struct {
+
+	// updatePool is the channel of update-or-insert payloads.
+	updatePool chan []map[string]interface{}
+	// saveSize is the number of payloads collected before a bulk write is triggered.
+	saveSize   	int
+
+}
+
+
+
+
+// sp_f is a 5-slot buffered channel; updateFusionData sends to it before
+// starting a bulk-upsert goroutine and receives from it when that goroutine ends.
+var sp_f = make(chan bool, 5)
+
+// newUpdateFusionPool creates an updateFusionInfo with a 50000-entry queue
+// and a batch size of 500.
+func newUpdateFusionPool() *updateFusionInfo {
+	return &updateFusionInfo{
+		updatePool: make(chan []map[string]interface{}, 50000),
+		saveSize:   500,
+	}
+}
+
+
+// updateFusionData loops forever draining updatePool into batches and
+// bulk-upserting them into fusion_coll_name. A batch is written when it
+// reaches saveSize entries, or when nothing arrives for 10 seconds and the
+// batch is non-empty. Writes run in goroutines gated by sp_f.
+func (update *updateFusionInfo) updateFusionData() {
+	log.Println("开始不断监听--待更新数据")
+	batch := make([][]map[string]interface{}, update.saveSize)
+	filled := 0
+
+	// flush hands the filled part of the batch to a background upsert and
+	// starts a fresh batch, so the goroutine owns the old backing array.
+	flush := func() {
+		sp_f <- true
+		go func(dataArr [][]map[string]interface{}) {
+			defer func() {
+				<-sp_f
+			}()
+			mgo.UpSertBulk(fusion_coll_name, dataArr...)
+		}(batch[:filled])
+		batch = make([][]map[string]interface{}, update.saveSize)
+		filled = 0
+	}
+
+	for {
+		select {
+		case value := <-update.updatePool:
+			batch[filled] = value
+			filled++
+			if filled == update.saveSize {
+				flush()
+			}
+		case <-time.After(10 * time.Second): // idle: check for a partial batch
+			if filled > 0 {
+				flush()
+			}
+		}
+	}
+}

+ 62 - 0
udpfusion/src/updateRecord.go

@@ -0,0 +1,62 @@
+package main
+
+import (
+	"log"
+	"time"
+)
+
+// updateRecordInfo carries the queue and batch size used by updateRecordData.
+type updateRecordInfo struct {
+
+	// updatePool is the channel of update-or-insert payloads.
+	updatePool chan []map[string]interface{}
+	// saveSize is the number of payloads collected before a bulk write is triggered.
+	saveSize   	int
+
+}
+
+
+
+
+// sp_r is a 5-slot buffered channel; updateRecordData sends to it before
+// starting a bulk-upsert goroutine and receives from it when that goroutine ends.
+var sp_r = make(chan bool, 5)
+
+// newUpdateRecordPool creates an updateRecordInfo with a 50000-entry queue
+// and a batch size of 500.
+func newUpdateRecordPool() *updateRecordInfo {
+	return &updateRecordInfo{
+		updatePool: make(chan []map[string]interface{}, 50000),
+		saveSize:   500,
+	}
+}
+
+
+// updateRecordData loops forever draining updatePool into batches and
+// bulk-upserting them into record_coll_name. A batch is written when it
+// reaches saveSize entries, or when nothing arrives for 10 seconds and the
+// batch is non-empty. Writes run in goroutines gated by sp_r.
+func (update *updateRecordInfo) updateRecordData() {
+	log.Println("开始不断监听--待更新数据")
+	batch := make([][]map[string]interface{}, update.saveSize)
+	filled := 0
+
+	// flush hands the filled part of the batch to a background upsert and
+	// starts a fresh batch, so the goroutine owns the old backing array.
+	flush := func() {
+		sp_r <- true
+		go func(dataArr [][]map[string]interface{}) {
+			defer func() {
+				<-sp_r
+			}()
+			mgo.UpSertBulk(record_coll_name, dataArr...)
+		}(batch[:filled])
+		batch = make([][]map[string]interface{}, update.saveSize)
+		filled = 0
+	}
+
+	for {
+		select {
+		case value := <-update.updatePool:
+			batch[filled] = value
+			filled++
+			if filled == update.saveSize {
+				flush()
+			}
+		case <-time.After(10 * time.Second): // idle: check for a partial batch
+			if filled > 0 {
+				flush()
+			}
+		}
+	}
+}

+ 190 - 0
udpfusion/src/weightFusion.go

@@ -0,0 +1,190 @@
+package main
+
+import (
+	"log"
+	qu "qfw/util"
+	"time"
+	"go.mongodb.org/mongo-driver/bson/primitive"
+)
+
+// dealWithAddFusionStruct builds the document for a newly created fusion
+// record from the template record (weight.data[weight.templateid]).
+//
+// The template's data map is modified in place and returned: "_id" is removed
+// so a new id is assigned on insert, and the early/lately publish and
+// come-in times, the current update time and the related id lists are set.
+// It returns nil when the template record is missing (for example when
+// templateid is empty), instead of dereferencing a nil entry.
+func (weight *weightDataMap) dealWithAddFusionStruct() map[string]interface{} {
+	log.Println(weight.saveids)
+	log.Println(weight.templateid)
+	log.Println(len(weight.data))
+
+	tpl := weight.data[weight.templateid]
+	if tpl == nil || tpl.data == nil {
+		return nil
+	}
+	// Single-record case: the template document is the fusion document.
+	dict := tpl.data
+
+	// A new _id is generated on insert.
+	delete(dict, "_id")
+
+	// With one source record the earliest and latest times coincide.
+	dict["early_publishtime"] = qu.IntAll(dict["publishtime"])
+	dict["lately_publishtime"] = qu.IntAll(dict["publishtime"])
+	dict["early_comeintime"] = qu.IntAll(dict["comeintime"])
+	dict["lately_comeintime"] = qu.IntAll(dict["comeintime"])
+	// Current update time. The original assigned this key twice (the second
+	// time labelled "fusion creation time"); one assignment is equivalent.
+	dict["current_updatetime"] = qu.IntAll(time.Now().Unix())
+	// All related ids, and the ids kept for the fusion.
+	dict["fusion_allids"] = weight.allids
+	dict["fusion_saveids"] = weight.saveids
+
+	return dict
+}
+
+// dealWithMultipleFusionStruct builds the fusion document for several source
+// records, starting from the template record's data map (modified in place).
+//
+// It returns the fusion document twice; the second value is the slot for the
+// fusion detail data, which currently is the same map.
+// It returns (nil, nil) when the template record is missing.
+func (weight *weightDataMap) dealWithMultipleFusionStruct() (map[string]interface{}, map[string]interface{}) {
+	tpl := weight.data[weight.templateid]
+	if tpl == nil || tpl.data == nil {
+		return nil, nil
+	}
+	dict := tpl.data
+
+	// Earliest / latest publish time across the saved records.
+	dict["early_publishtime"], dict["lately_publishtime"] = weight.dealWithTimeData("publishtime")
+	// Earliest / latest come-in time across the saved records.
+	dict["early_comeintime"], dict["lately_comeintime"] = weight.dealWithTimeData("comeintime")
+	// Current update time.
+	// The original immediately overwrote this key with "" under a comment
+	// about the fusion creation time, wiping the timestamp just written.
+	// TODO: store the creation time (taken from the existing fusion record)
+	// under its own key once that lookup exists.
+	dict["current_updatetime"] = qu.IntAll(time.Now().Unix())
+
+	// All related ids, and the ids kept for the fusion.
+	dict["fusion_allids"] = weight.allids
+	dict["fusion_saveids"] = weight.saveids
+
+	// Handling for the remaining fields is not implemented yet.
+
+	return dict, dict
+}
+
+// dealWithStructData inspects the structured fields of the template record
+// and of the other saved records, tracking the longest "winnerorder" and
+// "package" arrays it sees.
+// NOTE(review): the selected arrays are never written back or returned, and
+// attach_text is only logged — this looks like an unfinished implementation.
+func (weight *weightDataMap)dealWithStructData()  {
+
+	// Data of the template record (the entry keyed by templateid).
+	templateid:=weight.templateid
+	templateTmp:=weight.data[templateid].data
+
+	// "winnerorder" (original comment labels it "contacts"): start from the template's array.
+	winnerCount:=qu.IntAll(0)
+	winnerArr,b:=make([]interface{},0),false
+	if winnerArr,b = templateTmp["winnerorder"].(primitive.A);b {
+		winnerCount = qu.IntAll(len(winnerArr))
+	}
+
+	// "package" (sub-packages): start from the template's array.
+	packageCount:=qu.IntAll(0)
+	packageArr,b:=make([]interface{},0),false
+	if packageArr,b = templateTmp["package"].(primitive.A);b {
+		packageCount = qu.IntAll(len(packageArr))
+	}
+
+	// Attachments: "attach_text".
+	/*
+		Example of the stored shape (nested maps keyed by string indexes):
+		"attach_text" : {
+		    "1" : {
+		        "0" : {
+		            "file_name" : "政采贷融资.doc",
+		            "attach_url" : "d5ca0944-6af1-11eb-a8bb-0242ac120002"
+		        }
+		    },
+		    "0" : {
+		        "0" : {
+		            "file_name" : "01永嘉县人民医院发光免疫试剂采购及设备租赁项目公开招标文件(电子招标).doc",
+		            "attach_url" : "7827b2d4-6adb-11eb-bd40-0242ac120002"
+		        }
+		    }
+		},
+	*/
+	attach_text:=make(map[string]interface{},0)
+	if attach_text,b = templateTmp["attach_text"].(primitive.M);b {
+
+	}
+	log.Println(attach_text)
+
+
+	// Walk the other saved records (the template itself is skipped).
+	for _,value:=range weight.saveids {
+		if templateid == value {
+			continue
+		}
+		// winnerorder: keep the array with the most entries.
+		tmp:=weight.data[value].data
+		if arr_1,b := tmp["winnerorder"].(primitive.A);b {
+			count:=qu.IntAll(len(arr_1))
+			if count > winnerCount {
+				winnerCount = count
+				winnerArr = arr_1
+			}
+		}
+
+		// package: keep the array with the most entries.
+		if arr_2,b := tmp["package"].(primitive.A);b {
+			count:=qu.IntAll(len(arr_2))
+			if count > packageCount {
+				packageCount = count
+				packageArr = arr_2
+			}
+		}
+
+
+
+
+	}
+
+
+
+
+
+}
+
+
+
+
+
+
+// dealWithTimeData gathers the integer value stored under key in every saved
+// record and returns the smallest and largest of them, in that order.
+func (weight *weightDataMap) dealWithTimeData(key string) (int, int) {
+	stamps := make([]int, 0, len(weight.saveids))
+	for _, id := range weight.saveids {
+		stamps = append(stamps, qu.IntAll(weight.data[id].data[key]))
+	}
+	// sortTimeArrMethod yields (min, max).
+	return sortTimeArrMethod(stamps)
+}
+// sortTimeArrMethod sorts arr in place in ascending order, logs it, and
+// returns its smallest and largest element as (earliest, latest).
+// An empty slice yields (0, 0) instead of panicking on an out-of-range index.
+func sortTimeArrMethod(arr []int) (int, int) {
+	if len(arr) == 0 {
+		return 0, 0
+	}
+	for i := 0; i < len(arr); i++ {
+		for j := i + 1; j < len(arr); j++ {
+			if arr[i] > arr[j] {
+				arr[i], arr[j] = arr[j], arr[i]
+			}
+		}
+	}
+	log.Println(arr)
+	return arr[0], arr[len(arr)-1]
+}

+ 328 - 0
udpfusion/src/weightValue.go

@@ -0,0 +1,328 @@
+package main
+
+import (
+	"log"
+	"math/rand"
+	"sync"
+	"time"
+)
+
+// weightInfo holds the ranking inputs and result for one source record.
+type weightInfo struct {
+	// maxLevel marks a record that analyzeBuildStandardData ranks first (step 1).
+	maxLevel 		bool
+	// minLevel marks a record that analyzeBuildStandardData ranks last (step 2).
+	minLevel		bool
+	// siteLevel is the value returned by analyzeTheSite for the record.
+	siteLevel		int
+	// qualityScore is the value returned by analyzeTheElements for the record.
+	qualityScore 	int
+	// ranking is the assigned rank; analyzeTheSoureData initialises it to -1 (no rank).
+	ranking			int
+	// data is the source document itself.
+	data      		map[string]interface{}
+}
+
+
+// weightDataMap groups the records considered for one fusion (original
+// comment: "general data de-duplication").
+type weightDataMap struct {
+	lock   sync.Mutex // guards the ranking pass in analyzeBuildStandardData
+	// data maps a record id to its weightInfo.
+	data   map[string]*weightInfo
+	// allids lists every related record id.
+	allids    []string
+	// saveids lists the record ids kept for the fusion.
+	saveids    []string
+	templateid 	string 	// template id: the record ranked first
+}
+
+// NewWeightData loads the records named by arr from coll_name and wraps each
+// one that has more than two fields in a weightInfo, keyed by its id.
+//
+// NOTE(review): test scaffolding is still active — the arr argument is
+// replaced by a fixed list of six sample ids before anything is loaded.
+// (A commented-out variant that filled in mock scores instead of querying
+// the database used to follow the loading loop.)
+func NewWeightData(arr []string, templateid string) *weightDataMap {
+	// Test override: fixed sample ids.
+	arr = []string{"5638baccaf53745d9a000994", "5638baccaf53745d9a000995", "5638baccaf53745d9a000998",
+		"603717b8fc702705550b8df4", "603717b8fc702705550b8df5", "603717b8fc702705550b8df6"}
+
+	loaded := make(map[string]*weightInfo, 0)
+	for _, id := range arr {
+		// len of a nil map is 0, so this also rejects a nil result.
+		if doc := mgo.FindById(coll_name, id); len(doc) > 2 {
+			loaded[id] = analyzeTheSoureData(doc)
+		}
+	}
+
+	return &weightDataMap{
+		data:       loaded,
+		allids:     []string{},
+		saveids:    []string{},
+		templateid: templateid,
+	}
+}
+
+// analyzeTheSoureData scores one source document and wraps it in a
+// weightInfo: site level from analyzeTheSite, quality score from
+// analyzeTheElements, both level flags false and ranking -1 (no rank yet).
+func analyzeTheSoureData(tmp map[string]interface{}) *weightInfo {
+	info := &weightInfo{
+		maxLevel: false,
+		minLevel: false,
+		ranking:  -1,
+		data:     tmp,
+	}
+	// Site level first, then quality score (same call order as before).
+	info.siteLevel = analyzeTheSite(tmp)
+	info.qualityScore = analyzeTheElements(tmp)
+	return info
+}
+
+// analyzeBuildStandardData ranks every record in weight.data and records the
+// result: each weightInfo.ranking is set, templateid becomes the id ranked 1
+// ("" when there are no records), and allids/saveids are filled with every id.
+//
+// Ranking runs in three steps:
+//  1. records flagged maxLevel take the first ranks;
+//  2. records flagged minLevel (and not maxLevel) take the last ranks;
+//  3. all remaining records fill the ranks after step 1.
+//
+// Within each step the order comes from dealWithGroupScores (quality score,
+// then site level). The mutex is released with defer so that a panic during
+// ranking cannot leave it locked.
+func (weight *weightDataMap) analyzeBuildStandardData() {
+	weight.lock.Lock()
+	defer weight.lock.Unlock()
+
+	data := weight.data
+
+	// Flatten the map into parallel slices: position i in each slice refers
+	// to the same record. Map iteration order is not fixed, so positions
+	// differ between runs.
+	arrAllIds := make([]string, 0, len(data))
+	arrSaveIds := make([]string, 0, len(data))
+	arrMaxLevel := make([]bool, 0, len(data))
+	arrMinLevel := make([]bool, 0, len(data))
+	arrSiteLevel := make([]int, 0, len(data))
+	arrQualityScore := make([]int, 0, len(data))
+	arrRanking := make([]int, 0, len(data)) // the rank being computed
+
+	for k, v := range data {
+		// Every record is kept; a precondition that excludes records from
+		// the fusion is not implemented yet.
+		arrAllIds = append(arrAllIds, k)
+		arrSaveIds = append(arrSaveIds, k)
+
+		arrMaxLevel = append(arrMaxLevel, v.maxLevel)
+		arrMinLevel = append(arrMinLevel, v.minLevel)
+		arrSiteLevel = append(arrSiteLevel, v.siteLevel)
+		arrQualityScore = append(arrQualityScore, v.qualityScore)
+		arrRanking = append(arrRanking, v.ranking)
+	}
+
+	log.Println("初始排名:", arrRanking)
+	log.Println("初始质量:", arrQualityScore)
+	log.Println("初始站点:", arrSiteLevel)
+
+	// Step 1: max-weight records take ranks 1..m.
+	isMaxIndexArr := make([]int, 0) // positions of max-weight records
+	isMaxIndexValueArr := make([]int, 0)
+	for k, v := range arrMaxLevel {
+		if v {
+			arrRanking[k] = 1
+			isMaxIndexArr = append(isMaxIndexArr, k)
+			isMaxIndexValueArr = append(isMaxIndexValueArr, arrQualityScore[k])
+		}
+	}
+	nextRank := 1 // next rank to hand out from the top
+
+	if len(isMaxIndexArr) >= 1 {
+		log.Println("进行最大权重...")
+		for _, v := range dealWithGroupScores(isMaxIndexArr, isMaxIndexValueArr, arrSiteLevel) {
+			arrRanking[v] = nextRank
+			nextRank++
+		}
+	} else {
+		log.Println("无最大权重-质量-站点排序")
+	}
+
+	log.Println("第一步:经过最高权重比较得出--", arrRanking)
+
+	// Step 2: min-weight records (not also max-weight) take the last ranks.
+	isMinIndexArr := make([]int, 0)
+	isMinIndexValueArr := make([]int, 0)
+	for k, v := range arrMinLevel {
+		if v && !arrMaxLevel[k] {
+			isMinIndexArr = append(isMinIndexArr, k)
+			isMinIndexValueArr = append(isMinIndexValueArr, arrQualityScore[k])
+		}
+	}
+
+	if len(isMinIndexArr) >= 1 {
+		log.Println("进行最小权重...")
+		rankIndexArr := dealWithGroupScores(isMinIndexArr, isMinIndexValueArr, arrSiteLevel)
+		// Hand out ranks from the bottom: the worst record gets the last rank.
+		lastRank := len(arrSaveIds)
+		for i := len(rankIndexArr) - 1; i >= 0; i-- {
+			arrRanking[rankIndexArr[i]] = lastRank
+			lastRank--
+		}
+	} else {
+		log.Println("无最小权重-质量-站点排序")
+	}
+
+	log.Println("第二步:经过最小权重比较得出--", arrRanking)
+
+	// Step 3: records still unranked (-1) continue after the step-1 ranks.
+	isQuaIndexArr := make([]int, 0)
+	isQuaIndexValueArr := make([]int, 0)
+	for k, v := range arrRanking {
+		if v == -1 {
+			isQuaIndexArr = append(isQuaIndexArr, k)
+			isQuaIndexValueArr = append(isQuaIndexValueArr, arrQualityScore[k])
+		}
+	}
+	if len(isQuaIndexArr) >= 1 {
+		log.Println("进行质量-站点组合...")
+		for _, v := range dealWithGroupScores(isQuaIndexArr, isQuaIndexValueArr, arrSiteLevel) {
+			arrRanking[v] = nextRank
+			nextRank++
+		}
+	} else {
+		log.Println("不需要进行质量-站点组合...")
+	}
+
+	log.Println("第三步:经过质量-站点权重比较得出--", arrRanking)
+
+	// Write the ranks back; the record ranked 1 becomes the template.
+	templateID := ""
+	for k, v := range arrRanking {
+		id := arrSaveIds[k]
+		data[id].ranking = v
+		if v == 1 {
+			templateID = id
+		}
+	}
+
+	weight.data = data
+	weight.templateid = templateID
+	weight.allids = arrAllIds
+	weight.saveids = arrSaveIds
+}
+
+// dealWithGroupScores orders the record positions in indexArr.
+//
+// scoreArr[i] is the quality score of position indexArr[i]; siteArr is
+// indexed by position. Positions are ordered by score, highest first;
+// positions that share a score are ordered by site level, highest first.
+// indexArr and scoreArr are reordered in place as a side effect.
+func dealWithGroupScores(indexArr []int, scoreArr []int, siteArr []int) []int {
+	// After this call both slices are sorted by score, descending, so equal
+	// scores sit next to each other.
+	scores, positions := sortGroupInt(scoreArr, indexArr)
+
+	ordered := make([]int, 0)
+	for start := 0; start < len(scores); {
+		// Find the end of the run of equal scores beginning at start.
+		end := start + 1
+		for end < len(scores) && scores[end] == scores[start] {
+			end++
+		}
+
+		// Copy the run so sorting it does not disturb the caller's slice.
+		group := make([]int, 0)
+		for i := start; i < end; i++ {
+			group = append(group, positions[i])
+		}
+
+		if len(group) > 1 {
+			// Break the tie by site level.
+			sites := make([]int, 0)
+			for _, pos := range group {
+				sites = append(sites, siteArr[pos])
+			}
+			_, group = sortGroupInt(sites, group)
+		}
+		ordered = append(ordered, group...)
+		start = end
+	}
+	return ordered
+}
+
+// sortNormalInt sorts arrValue in place from largest to smallest and returns
+// the same slice (used for plain ordering, e.g. site levels).
+func sortNormalInt(arrValue []int) []int {
+	for i := 1; i < len(arrValue); i++ {
+		cur := arrValue[i]
+		j := i - 1
+		// Shift smaller elements right until cur's slot is found.
+		for j >= 0 && arrValue[j] < cur {
+			arrValue[j+1] = arrValue[j]
+			j--
+		}
+		arrValue[j+1] = cur
+	}
+	return arrValue
+}
+
+
+// sortGroupInt sorts arrValue in place from largest to smallest and applies
+// the identical swaps to arrIndex, so each value stays paired with its index.
+// Both (reordered) slices are returned.
+func sortGroupInt(arrValue []int, arrIndex []int) ([]int, []int) {
+	n := len(arrValue)
+	for head := 0; head < n; head++ {
+		for probe := head + 1; probe < n; probe++ {
+			if arrValue[probe] <= arrValue[head] {
+				continue
+			}
+			// A larger value was found later: bring it (and its index) forward.
+			arrValue[head], arrValue[probe] = arrValue[probe], arrValue[head]
+			arrIndex[head], arrIndex[probe] = arrIndex[probe], arrIndex[head]
+		}
+	}
+	return arrValue, arrIndex
+}
+
+
+
+
+
+
+
+// analyzeTheSite returns the site score for a record.
+// The intended scale is levels 1-5 (per the original note), but this is
+// still test code: it ignores tmp and returns a random value in [0, 10),
+// re-seeding the global generator from the current time on every call.
+func analyzeTheSite(tmp map[string]interface{}) int {
+	seed := time.Now().UnixNano()
+	rand.Seed(seed)
+	level := rand.Intn(10)
+	return level
+}
+
+// analyzeTheElements returns the total quality score for a record.
+// This is still test code: it ignores tmp and returns a random value in
+// [0, 100), re-seeding the global generator from the current time on every call.
+func analyzeTheElements(tmp map[string]interface{}) int {
+	seed := time.Now().UnixNano()
+	rand.Seed(seed)
+	score := rand.Intn(100)
+	return score
+}

+ 3 - 3
udps/main.go

@@ -18,12 +18,12 @@ func main() {
 	ip, p, tmptime, tmpkey, id1, id2, stype, q, bkey, param, ids := "", 0, 0, "", "", "", "", "", "", "", ""
 	flag.StringVar(&startDate, "start", "", "开始日期2006-01-02")
 	flag.StringVar(&endDate, "end", "", "结束日期2006-01-02")
-	flag.StringVar(&ip, "ip", "192.168.3.205", "ip")
+	flag.StringVar(&ip, "ip", "127.0.0.1", "ip")
 	flag.IntVar(&p, "p", 1484, "端口")
 	flag.IntVar(&tmptime, "tmptime", 0, "时间查询")
 	flag.StringVar(&tmpkey, "tmpkey", "", "时间字段")
-	flag.StringVar(&id1, "gtid", "5a86328a40d2d9bbe88e4138", "gtid")
-	flag.StringVar(&id2, "lteid", "5a8f8cd340d2d9bbe8a21400", "lteid")
+	flag.StringVar(&id1, "gtid", "5e8fae3485a9271abf2b8330", "gtid")
+	flag.StringVar(&id2, "lteid", "5fe50860f0f9d716c17d1b99", "lteid")
 	flag.StringVar(&ids, "ids", "", "id1,id2")
 	flag.StringVar(&stype, "stype", "biddingall", "stype,传递类型")
 	flag.StringVar(&bkey, "bkey", "", "bkey,加上此参数表示不生关键词和摘要")