Browse Source

修改时间等细节

apple 5 năm trước cách đây
mục cha
commit
5ee7625038

+ 3 - 2
udpfilterdup/src/config.json

@@ -1,11 +1,11 @@
 {
-    "udpport": ":1485",
+    "udpport": ":1488",
     "dupdays": 5,
     "mongodb": {
         "addr": "192.168.3.207:27092",
         "pool": 5,
         "db": "extract_kf",
-        "extract": "ceshi_info",
+        "extract": "zk",
         "site": {
             "dbname": "zhaolongyue",
             "coll": "site"
@@ -17,6 +17,7 @@
     },
     "nextNode": [],
     "isMerger": false,
+    "threads": 1,
     "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
     "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包)",
     "specialtitle_2": "项目([0-9a-zA-Z一二三四五六七八九十零123456789])",

+ 146 - 145
udpfilterdup/src/datamap.go

@@ -5,7 +5,6 @@ import (
 	"log"
 	qutil "qfw/util"
 	"qfw/util/mongodb"
-	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -31,7 +30,6 @@ type Info struct {
 	site        string  //站点
 	href        string  //正文的url
 	repeatid    string  //重复id
-
 	titleSpecialWord bool                   //标题特殊词
 	specialWord      bool                   //再次判断的特殊词
 	mergemap         map[string]interface{} //合并记录
@@ -75,20 +73,17 @@ func NewDatamap(days int, lastid string) *datamap {
 		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 {
 			continuSum++
 		} else {
-			cm := tmp["comeintime"] //时间单位?
-			comeintime := qutil.Int64All(cm)
-			if comeintime == 0 {
-				id := qutil.BsonIdToSId(tmp["_id"])[0:8]
-				comeintime, _ = strconv.ParseInt(id, 16, 64)
+			pt:=tmp["publishtime"]
+			pt_time:=qutil.Int64All(pt)
+			if pt_time<=0 {
+				continue
 			}
 			if now1 == 0 {
-				now1 = comeintime
+				now1 = pt_time
 			}
-			if qutil.Float64All(now1-comeintime) < datelimit {
+			if qutil.Float64All(now1-pt_time) < datelimit {
 				info := NewInfo(tmp)
-				//时间字符串
-				dkey := qutil.FormatDateWithObj(&cm, qutil.Date_yyyyMMdd)
-				//拼接的一个时间字符串 xxxx_类型_省份
+				dkey := qutil.FormatDateWithObj(&pt, qutil.Date_yyyyMMdd)
 				k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
 				data := dm.data[k]
 				if data == nil {
@@ -124,17 +119,15 @@ func NewHistorymap(startid string, lastid string, startTime int64, lastTime int6
 		true)).Sort("-_id").Iter()
 	m, n := 0, 0
 	for tmp_start := make(map[string]interface{}); it_start.Next(&tmp_start); {
-		cm := tmp_start["comeintime"]
-		comeintime := qutil.Int64All(tmp_start["comeintime"])
-		if comeintime == 0 {
-			id := qutil.BsonIdToSId(tmp_start["_id"])[0:8]
-			comeintime, _ = strconv.ParseInt(id, 16, 64)
+		pt_s:=tmp_start["publishtime"]
+		pt_time:=qutil.Int64All(pt_s)
+		if pt_time<=0 {
+			continue
 		}
-
-		if qutil.Float64All(startTime-comeintime) <= datelimit {
+		if qutil.Float64All(startTime-pt_time) <= datelimit {
 			n++
 			info := NewInfo(tmp_start)
-			dkey := qutil.FormatDateWithObj(&cm, qutil.Date_yyyyMMdd)
+			dkey := qutil.FormatDateWithObj(&pt_s, qutil.Date_yyyyMMdd)
 			k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
 			data := hm.data[k]
 			if data == nil {
@@ -157,17 +150,15 @@ func NewHistorymap(startid string, lastid string, startTime int64, lastTime int6
 		true)).Sort("_id").Iter()
 
 	for tmp_last := make(map[string]interface{}); it_last.Next(&tmp_last); {
-		cm := tmp_last["comeintime"]
-		comeintime := qutil.Int64All(tmp_last["comeintime"])
-		if comeintime == 0 {
-			id := qutil.BsonIdToSId(tmp_last["_id"])[0:8]
-			comeintime, _ = strconv.ParseInt(id, 16, 64)
+		pt_l:=tmp_last["publishtime"]
+		pt_time:=qutil.Int64All(pt_l)
+		if pt_time<=0 {
+			continue
 		}
-
-		if qutil.Float64All(comeintime-lastTime) <= datelimit {
+		if qutil.Float64All(pt_time-lastTime) <= datelimit {
 			m++
 			info := NewInfo(tmp_last)
-			dkey := qutil.FormatDateWithObj(&cm, qutil.Date_yyyyMMdd)
+			dkey := qutil.FormatDateWithObj(&pt_l, qutil.Date_yyyyMMdd)
 			k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
 			data := hm.data[k]
 			if data == nil {
@@ -209,7 +200,6 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	info.publishtime = qutil.Int64All(tmp["publishtime"])
 	info.bidopentime = qutil.Int64All(tmp["bidopentime"])
 	info.agencyaddr = qutil.ObjToString(tmp["agencyaddr"])
-	//info.detail = qutil.ObjToString(tmp["detail"])
 	info.site = qutil.ObjToString(tmp["site"])
 	info.href = qutil.ObjToString(tmp["href"])
 	info.repeatid = qutil.ObjToString(tmp["repeatid"])
@@ -220,16 +210,13 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	if info.mergemap == nil {
 		info.mergemap = make(map[string]interface{}, 0)
 	}
-
 	return info
 }
-
 //判重方法
 func (d *datamap) check(info *Info) (b bool, source *Info, reason string) {
 	keys := []string{}
 	d.lock.Lock()
 	for k, _ := range d.keys { //不同时间段
-		//...代码
 		keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
 		if info.area != "全国" { //这个后续可以不要
 			keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
@@ -247,15 +234,12 @@ L:
 				if v.id == info.id { //正常重复
 					return false, v, ""
 				}
-				//类型分组
 				if info.subtype == v.subtype {
-					//站点配置--
 					if info.site != "" {
 						sitelock.Lock()
 						dict := SiteMap[info.site]
 						sitelock.Unlock()
 						if dict != nil {
-							//临时改变--具体值
 							if info.area == "全国" && dict["area"] != "" {
 								info.area = qutil.ObjToString(dict["area"])
 								info.city = qutil.ObjToString(dict["city"])
@@ -267,8 +251,7 @@ L:
 							}
 						}
 					}
-
-					//前置条件1  	站点相关
+					//前置条件1 - 站点相关
 					if info.site != "" && info.site == v.site {
 						if info.href != "" && info.href == v.href {
 							reason = "href相同"
@@ -277,17 +260,17 @@ L:
 							break L
 						}
 						if info.href != "" && info.href != v.href {
-							reason = "href不同"
+							reason = "href不同-"
 						}
 					}
 
-					//前置条件2  标题相关 - 有且一个关键词
+					//前置条件2 - 标题相关,有且一个关键词
 					if ((info.titleSpecialWord && !v.titleSpecialWord) || (info.specialWord && !v.specialWord)) &&
 						info.title != v.title && v.title != "" && info.title != "" {
 						continue
 					}
 
-					//前置条件3 	标题相关 - 均含有关键词
+					//前置条件3 - 标题相关,均含有关键词
 					if ((info.titleSpecialWord && v.titleSpecialWord) || (info.specialWord && v.specialWord)) &&
 						len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 && v.title != "" && info.title != "" {
 						if !(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
@@ -359,35 +342,34 @@ L:
 	return
 }
 
-func (h *historymap) checkHistory(info *Info) (b bool, source *Info, reasons string) {
-	h.lock.Lock()
-	defer h.lock.Unlock()
+func (h *historymap) checkHistory(info *Info) (b bool, source *Info, reason string) {
 	keys := []string{}
-	//不同时间段
-	for k, _ := range h.keys {
-		//...代码
+	h.lock.Lock()
+	for k, _ := range h.keys { //不同时间段
 		keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
 		if info.area != "全国" { //这个后续可以不要
 			keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, "全国"))
 		}
 	}
+	h.lock.Unlock()
+
 L:
 	for _, k := range keys {
+		h.lock.Lock()
 		data := h.data[k]
+		h.lock.Unlock()
 		if len(data) > 0 { //对比v   找到同类型,同省或全国的数据作对比
 			for _, v := range data {
-				reason := ""
+				reason = ""
 				if v.id == info.id { //正常重复
 					return false, v, ""
 				}
-				//类型分组
 				if info.subtype == v.subtype {
-					//站点配置--
 					if info.site != "" {
+						sitelock.Lock()
 						dict := SiteMap[info.site]
-
+						sitelock.Unlock()
 						if dict != nil {
-							//临时改变--具体值
 							if info.area == "全国" && dict["area"] != "" {
 								info.area = qutil.ObjToString(dict["area"])
 								info.city = qutil.ObjToString(dict["city"])
@@ -399,28 +381,26 @@ L:
 							}
 						}
 					}
-
-					//前置条件1  	站点相关
+					//前置条件1 - 站点相关
 					if info.site != "" && info.site == v.site {
 						if info.href != "" && info.href == v.href {
 							reason = "href相同"
 							b = true
 							source = v
-							reasons = reason
 							break L
 						}
 						if info.href != "" && info.href != v.href {
-							reason = "href不同"
+							reason = "href不同-"
 						}
 					}
 
-					//前置条件2  标题相关 - 有且一个关键词
+					//前置条件2 - 标题相关,有且一个关键词
 					if ((info.titleSpecialWord && !v.titleSpecialWord) || (info.specialWord && !v.specialWord)) &&
 						info.title != v.title && v.title != "" && info.title != "" {
 						continue
 					}
 
-					//前置条件3 	标题相关 - 均含有关键词
+					//前置条件3 - 标题相关,均含有关键词
 					if ((info.titleSpecialWord && v.titleSpecialWord) || (info.specialWord && v.specialWord)) &&
 						len([]rune(v.title)) > 10 && len([]rune(info.title)) > 10 && v.title != "" && info.title != "" {
 						if !(strings.Contains(v.title, info.title) || strings.Contains(info.title, v.title)) {
@@ -430,7 +410,6 @@ L:
 							reason = "标题关键词且包含关系"
 							b = true
 							source = v
-							reasons = reason
 							break L
 						}
 					}
@@ -471,28 +450,20 @@ L:
 
 	//
 	if b {
-		//判重
 		if info.repeatid == source.id {
-			//重复-无变化-不处理
-			b = false
-		} else {
-			if source.id != "" {
-				//重复-有变化-覆盖记录处理
-			}
+			b = false//重复-无变化-不处理
 		}
 	} else {
 		if source != nil {
-			if source.repeatid != "" {
-				//未判重-有变化--记录
+			if source.repeatid != "" {//未判重-有变化--记录
 				b = true
-				reasons = "未判重记录"
+				reason = "未判重记录"
 			}
 		}
 	}
-
 	//往预存数据 d 添加
 	if !b {
-		ct, _ := strconv.ParseInt(info.id[:8], 16, 64)
+		ct := info.publishtime
 		dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
 		k := fmt.Sprintf("%s_%s_%s", dkey, info.subtype, info.area)
 		data := h.data[k]
@@ -513,7 +484,7 @@ L:
 
 //替换原始数据池
 func (d *datamap) replaceSourceData(replaceData *Info, replaceId string) {
-	ct, _ := strconv.ParseInt(replaceId[:8], 16, 64)
+	ct := replaceData.publishtime
 	dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
 	k := fmt.Sprintf("%s_%s_%s", dkey, replaceData.subtype, replaceData.area)
 	d.lock.Lock()
@@ -523,7 +494,6 @@ func (d *datamap) replaceSourceData(replaceData *Info, replaceId string) {
 		d.data[k] = data
 		if !d.keys[dkey] {
 			d.keys[dkey] = true
-			d.update(ct)
 		}
 	} else {
 		//遍历替换
@@ -539,16 +509,16 @@ func (d *datamap) replaceSourceData(replaceData *Info, replaceId string) {
 }
 
 func (h *historymap) replaceSourceData(replaceData *Info, replaceId string) {
-	ct, _ := strconv.ParseInt(replaceId[:8], 16, 64)
+	ct := replaceData.publishtime
 	dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
 	k := fmt.Sprintf("%s_%s_%s", dkey, replaceData.subtype, replaceData.area)
+	h.lock.Lock()
 	data := h.data[k]
 	if data == nil {
 		data = []*Info{replaceData}
 		h.data[k] = data
 		if !h.keys[dkey] {
 			h.keys[dkey] = true
-			//h.update(ct)
 		}
 	} else {
 		//遍历替换
@@ -558,19 +528,69 @@ func (h *historymap) replaceSourceData(replaceData *Info, replaceId string) {
 				break
 			}
 		}
-
 		h.data[k] = data
 	}
+	h.lock.Unlock()
+}
+
+// update recomputes the retained date window from t (d.keymap = d.GetLatelyFiveDay(t)) and then
+// deletes every entry of d.data and d.keys whose date key is not in that window.
+func (d *datamap) update(t int64) {
+	// Original note: clear historical data at 00:00 every day. NOTE(review): no lock is taken here — confirm callers hold d.lock.
+	d.keymap = d.GetLatelyFiveDay(t)
+	m := map[string]bool{} // set of date keys that must be kept
+	for _, v := range d.keymap {
+		m[v] = true
+	}
+	all, all1 := 0, 0 // record counts before / after pruning; only consumed by the commented-out log below
+	for k, v := range d.data {
+		all += len(v)
+		if !m[k[:8]] { // the first 8 bytes of a data key are matched against the date keys; assumes every key has at least 8 bytes
+			delete(d.data, k)
+		}
+	}
+	for k, _ := range d.keys {
+		if !m[k] { // d.keys is keyed by the date string itself, so no slicing is needed
+			delete(d.keys, k)
+		}
+	}
+	for _, v := range d.data {
+		all1 += len(v)
+	}
+	//log.Println("更新前后数据:", all, all1)
 }
 
-//以下为判重   -   一揽子的方法
+func (d *datamap) GetLatelyFiveDay(t int64) []string { // returns d.days formatted date strings, newest first, starting at the day of t
+	array := make([]string, d.days) // despite the method name, the window length is d.days rather than a fixed five
+	now := time.Unix(t, 0) // t is interpreted as Unix seconds
+	for i := 0; i < d.days; i++ {
+		array[i] = now.Format(qutil.Date_yyyyMMdd) // layout comes from qutil; presumably yyyyMMdd — confirm against qutil
+		now = now.AddDate(0, 0, -1) // step back one calendar day
+	}
+	return array
+}
+
+
+
+
+
+
+
+/*
+**************************
+******* 以下为判重 ********
+**************************
+*/
+
 //判重方法1
 func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
+
+	isMeet:=false
 	if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
 		info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
 		info.subtype == "变更" || info.subtype == "其他" {
 		//招标结果
-		if tenderRepeat_A(v, info, reason) {
+		if isMeet, reason = tenderRepeat_A(v, info, reason); isMeet {
 			if tenderRepeat_C(v, info) {
 				return false, reason
 			} else {
@@ -583,7 +603,10 @@ func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
 
 	} else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
 		//中标结果
-		if winningRepeat_A(v, info, reason) {
+		if isMeet, reason = tenderRepeat_A(v, info, reason); isMeet {
+
+		}
+		if isMeet, reason = winningRepeat_A(v, info, reason);isMeet {
 			if winningRepeat_C(v, info) {
 				return false, reason
 			} else {
@@ -596,7 +619,7 @@ func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
 
 	} else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
 		//合同
-		if contractRepeat_A(v, info, reason) {
+		if isMeet, reason = contractRepeat_A(v, info, reason);isMeet {
 			if contractRepeat_C(v, info) {
 				return false, reason
 			} else {
@@ -608,7 +631,7 @@ func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
 		}
 	} else {
 		//招标结果
-		if tenderRepeat_A(v, info, reason) {
+		if isMeet, reason = tenderRepeat_A(v, info, reason);isMeet {
 			if tenderRepeat_C(v, info) {
 				return false, reason
 			} else {
@@ -625,14 +648,13 @@ func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
 
 //判重方法2
 func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
-	//相同
+	isMeet:=false
 	if v.agency == info.agency && v.agency != "" && info.agency != "" {
-
 		if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
 			info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
 			info.subtype == "变更" || info.subtype == "其他" {
 			//招标结果
-			if tenderRepeat_B(v, info, reason) {
+			if isMeet, reason =  tenderRepeat_B(v, info, reason);isMeet {
 				if tenderRepeat_C(v, info) { //有不同
 					return false, reason
 				} else {
@@ -645,7 +667,7 @@ func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
 
 		} else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
 			//中标结果
-			if winningRepeat_B(v, info, reason) {
+			if isMeet, reason =  winningRepeat_B(v, info, reason);isMeet {
 				if winningRepeat_C(v, info) { //有不同
 					return false, reason
 				} else {
@@ -658,7 +680,7 @@ func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
 
 		} else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
 			//合同
-			if contractRepeat_B(v, info, reason) {
+			if isMeet, reason = contractRepeat_B(v, info, reason);isMeet {
 				if contractRepeat_C(v, info) { //有不同
 					return false, reason
 				} else {
@@ -670,7 +692,7 @@ func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
 			}
 		} else {
 			//招标结果
-			if tenderRepeat_B(v, info, reason) {
+			if isMeet, reason = tenderRepeat_B(v, info, reason);isMeet {
 				if tenderRepeat_C(v, info) { //有不同
 					return false, reason
 				} else {
@@ -702,7 +724,7 @@ func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
 }
 
 //招标_A
-func tenderRepeat_A(v *Info, info *Info, reason string) bool {
+func tenderRepeat_A(v *Info, info *Info, reason string) (bool ,string) {
 
 	var ss string
 	p1, p2, p3, p4, p9, p10, p11 := false, false, false, false, false, false, false
@@ -744,13 +766,13 @@ func tenderRepeat_A(v *Info, info *Info, reason string) bool {
 		(p3 && p4 && p9) || (p3 && p4 && p10) || (p3 && p4 && p11) ||
 		(p4 && p9 && p10) || (p4 && p9 && p11) || (p9 && p10 && p11) {
 		reason = reason + "满足招标A,3要素组合-" + ss + ","
-		return true
+		return true,reason
 	}
-	return false
+	return false,reason
 }
 
 //招标_B
-func tenderRepeat_B(v *Info, info *Info, reason string) bool {
+func tenderRepeat_B(v *Info, info *Info, reason string) (bool,string) {
 
 	m, n := 0, 0
 	if v.projectname != "" && v.projectname == info.projectname {
@@ -779,13 +801,13 @@ func tenderRepeat_B(v *Info, info *Info, reason string) bool {
 	}
 	if m >= 2 {
 		if n == 2 && m == 2 {
-			return false
+			return false,reason
 		} else {
 			reason = reason + "满足招标B,七选二,"
-			return true
+			return true,reason
 		}
 	}
-	return false
+	return false,reason
 }
 
 //招标_C
@@ -808,7 +830,7 @@ func tenderRepeat_C(v *Info, info *Info) bool {
 }
 
 //中标_A
-func winningRepeat_A(v *Info, info *Info, reason string) bool {
+func winningRepeat_A(v *Info, info *Info, reason string) (bool,string) {
 
 	var ss string
 	p1, p2, p3, p5, p6, p11 := false, false, false, false, false, false
@@ -845,14 +867,14 @@ func winningRepeat_A(v *Info, info *Info, reason string) bool {
 		(p3 && p5 && p6) || (p3 && p5 && p11) || (p3 && p6 && p11) ||
 		(p5 && p6 && p11) {
 		reason = reason + "满足中标A,3要素组合-" + ss + ","
-		return true
+		return true,reason
 	}
 
-	return false
+	return false,reason
 }
 
 //中标_B
-func winningRepeat_B(v *Info, info *Info, reason string) bool {
+func winningRepeat_B(v *Info, info *Info, reason string) (bool,string) {
 
 	m, n := 0, 0
 	if v.projectname != "" && v.projectname == info.projectname {
@@ -878,13 +900,13 @@ func winningRepeat_B(v *Info, info *Info, reason string) bool {
 	}
 	if m >= 2 {
 		if n == 2 && m == 2 {
-			return false
+			return false,reason
 		} else {
 			reason = reason + "满足中标B.六选二,"
-			return true
+			return true,reason
 		}
 	}
-	return false
+	return false,reason
 }
 
 //中标_C
@@ -902,27 +924,29 @@ func winningRepeat_C(v *Info, info *Info) bool {
 }
 
 // contractRepeat_A (contract, rule A) tries tenderRepeat_A and then winningRepeat_A on the pair (v, info).
 // The new version returns true with the matching helper's reason string, or false with the reason returned by winningRepeat_A.
-func contractRepeat_A(v *Info, info *Info, reason string) bool {
+func contractRepeat_A(v *Info, info *Info, reason string) (bool,string) {
 
-	if tenderRepeat_A(v, info, reason) {
-		return true
+	isMeet := false // reused for the result of both helper calls below
+	if isMeet, reason = tenderRepeat_A(v, info, reason);isMeet { // reason is overwritten with the helper's returned text
+		return true,reason
 	}
-	if winningRepeat_A(v, info, reason) {
-		return true
+	if isMeet, reason = winningRepeat_A(v, info, reason);isMeet { // receives the reason already returned by tenderRepeat_A
+		return true,reason
 	}
-	return false
+	return false,reason
 }
 
 // contractRepeat_B (contract, rule B) tries tenderRepeat_B and then winningRepeat_B on the pair (v, info).
 // The new version returns true with the matching helper's reason string, or false with the reason returned by winningRepeat_B.
-func contractRepeat_B(v *Info, info *Info, reason string) bool {
+func contractRepeat_B(v *Info, info *Info, reason string) (bool,string) {
 
-	if tenderRepeat_B(v, info, reason) {
-		return true
+	isMeet := false // reused for the result of both helper calls below
+	if isMeet, reason = tenderRepeat_B(v, info, reason);isMeet { // reason is overwritten with the helper's returned text
+		return true,reason
 	}
-	if winningRepeat_B(v, info, reason) {
-		return true
+	if isMeet, reason = winningRepeat_B(v, info, reason);isMeet { // receives the reason already returned by tenderRepeat_B
+		return true,reason
 	}
-	return false
+	return false,reason
 }
 
 //合同_C
@@ -937,37 +961,14 @@ func contractRepeat_C(v *Info, info *Info) bool {
 	return false
 }
 
-func (d *datamap) update(t int64) {
-	//每天0点清除历史数据
-	d.keymap = d.GetLatelyFiveDay(t)
-	m := map[string]bool{}
-	for _, v := range d.keymap {
-		m[v] = true
-	}
-	all, all1 := 0, 0
-	for k, v := range d.data {
-		all += len(v)
-		if !m[k[:8]] {
-			delete(d.data, k)
-		}
-	}
-	for k, _ := range d.keys {
-		if !m[k] {
-			delete(d.keys, k)
-		}
-	}
-	for _, v := range d.data {
-		all1 += len(v)
-	}
-	//log.Println("更新前后数据:", all, all1)
-}
 
-func (d *datamap) GetLatelyFiveDay(t int64) []string {
-	array := make([]string, d.days)
-	now := time.Unix(t, 0)
-	for i := 0; i < d.days; i++ {
-		array[i] = now.Format(qutil.Date_yyyyMMdd)
-		now = now.AddDate(0, 0, -1)
-	}
-	return array
-}
+
+
+
+
+
+
+
+
+
+

+ 21 - 40
udpfilterdup/src/main.go

@@ -23,7 +23,6 @@ var (
 	Sysconfig map[string]interface{} //配置文件
 	mconf     map[string]interface{} //mongodb配置信息
 	mgo       *mongodb.MongodbSim    //mongodb操作对象
-	//siteMgo      *mongodb.MongodbSim
 	extract   string
 	udpclient mu.UdpClient             //udp对象
 	nextNode  []map[string]interface{} //下节点数组
@@ -32,16 +31,17 @@ var (
 	HM        *historymap              //判重数据
 	lastid    = ""
 	/*
-		5da3f2c5a5cb26b9b79847fc
+		5da3f31aa5cb26b9b798d3aa
 	*/
 	//正则筛选相关
 	FilterRegTitle   = regexp.MustCompile("^_$")
 	FilterRegTitle_1 = regexp.MustCompile("^_$")
 	FilterRegTitle_2 = regexp.MustCompile("^_$")
 
+
 	isMerger bool                              //是否合并
+	threadNum int								   //线程数量
 	SiteMap  map[string]map[string]interface{} //站点map
-
 	idtype, sid, eid string //测试人员判重使用
 )
 
@@ -71,6 +71,7 @@ func init() {
 	FilterRegTitle_1 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_1"]))
 	FilterRegTitle_2 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_2"]))
 	isMerger = Sysconfig["isMerger"].(bool)
+	threadNum = util.IntAllDef(Sysconfig["threads"], 1)
 
 	//站点配置
 	site := mconf["site"].(map[string]interface{})
@@ -90,7 +91,6 @@ func init() {
 		SiteMap[util.ObjToString(site_dict["site"])] = data_map
 	}
 	fmt.Printf("用时:%d秒,%d个", int(time.Now().Unix())-start, len(SiteMap))
-
 }
 
 func main() {
@@ -129,7 +129,6 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 		if err != nil {
 			udpclient.WriteUdp([]byte("err:"+err.Error()), mu.OP_NOOP, ra)
 		} else if mapInfo != nil {
-
 			taskType := util.ObjToString(mapInfo["stype"])
 			if taskType == "historyTask" {
 				//更新流程
@@ -180,16 +179,20 @@ func task(data []byte, mapInfo map[string]interface{}) {
 			},
 		}
 	}
+	log.Println(extract,mgo.DbName,q)
 	it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
 	updateExtract := [][]map[string]interface{}{}
-	pool := make(chan bool, 4)
+	log.Println("线程数:",threadNum)
+	pool := make(chan bool, threadNum)
+
 	wg := &sync.WaitGroup{}
-	mapLock := &sync.Mutex{}
+	//mapLock := &sync.Mutex{}
 	n, repeateN := 0, 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
 		if n%10000 == 0 {
 			log.Println("current:", n, tmp["_id"], "repeateN:", repeateN)
 		}
+
 		pool <- true
 		wg.Add(1)
 		go func(tmp map[string]interface{}) {
@@ -200,7 +203,6 @@ func task(data []byte, mapInfo map[string]interface{}) {
 			info := NewInfo(tmp)
 			//是否为无效数据
 			if invalidData(info.buyer, info.projectname, info.projectcode) {
-				mapLock.Lock()
 				updateExtract = append(updateExtract, []map[string]interface{}{
 					map[string]interface{}{
 						"_id": tmp["_id"],
@@ -211,15 +213,11 @@ func task(data []byte, mapInfo map[string]interface{}) {
 						},
 					},
 				})
-
 				if len(updateExtract) > 500 {
 					mgo.UpdateBulk(extract, updateExtract...)
 					updateExtract = [][]map[string]interface{}{}
 				}
-				mapLock.Unlock()
 			} else {
-				//判重原因 reason  tmp["_id"] 对比id   id原始id
-				//mapLock.Lock()
 				b, source, reason := DM.check(info)
 				if b { //有重复,生成更新语句,更新抽取和更新招标
 					repeateN++
@@ -229,8 +227,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 					repeat_id := source.id
 					id_map["_id"] = util.StringTOBsonId(info.id)
 					if isMerger {
-						//需要合并相关操作
-						//合并操作--评功权重打分-合并完替换原始数据池
+						//需要合并相关操作-合并操作--评功权重打分-合并完替换原始数据池
 						basic_bool := basicDataScore(source, info)
 						if basic_bool {
 							//已原始数据为标准-对比数据打判重标签
@@ -254,14 +251,10 @@ func task(data []byte, mapInfo map[string]interface{}) {
 							"repeatid":      repeat_id,
 						},
 					}
-
 					if isMerger {
-						//合并记录
 						if len(newData.mergemap) > 0 {
 							update_map["$set"].(map[string]interface{})["merge"] = newData.mergemap
-							//fmt.Println("合并长度:",len(newData.mergemap))
 						}
-
 						//更新合并后的数据
 						for _, value := range mergeArr {
 							if value == 1 {
@@ -282,10 +275,8 @@ func task(data []byte, mapInfo map[string]interface{}) {
 							} else if value == 8 {
 								update_map["$set"].(map[string]interface{})["bidopentime"] = newData.bidopentime
 							} else {
-
 							}
 						}
-
 					}
 					//构建数据库更新用到的
 					updateExtract = append(updateExtract, []map[string]interface{}{
@@ -380,13 +371,11 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 	}
 	it_task := sess.DB(mgo.DbName).C(extract).Find(&q_task).Iter()
 	updateExtract := [][]map[string]interface{}{}
-	pool := make(chan bool, 16)
+	pool := make(chan bool, threadNum)
 	wg := &sync.WaitGroup{}
-	mapLock := &sync.Mutex{}
+	//mapLock := &sync.Mutex{}
 	n, repeateN := 0, 0
-
 	for tmp := make(map[string]interface{}); it_task.Next(&tmp); n++ {
-
 		if n%10000 == 0 {
 			log.Println("current:", n, tmp["_id"], "repeateN:", repeateN)
 		}
@@ -398,9 +387,8 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 				wg.Done()
 			}()
 			info := NewInfo(tmp)
-			//是否为无效数据
 			if invalidData(info.buyer, info.projectname, info.projectcode) {
-				mapLock.Lock()
+				//mapLock.Lock()
 				updateExtract = append(updateExtract, []map[string]interface{}{
 					map[string]interface{}{
 						"_id": tmp["_id"],
@@ -415,7 +403,7 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 					mgo.UpdateBulk(extract, updateExtract...)
 					updateExtract = [][]map[string]interface{}{}
 				}
-				mapLock.Unlock()
+				//mapLock.Unlock()
 			} else {
 				b, source, reason := HM.checkHistory(info)
 				if b { //有重复,生成更新语句,更新抽取和更新招标
@@ -443,8 +431,7 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 						id_map["_id"] = util.StringTOBsonId(info.id)
 
 						if isMerger {
-							//需要合并相关操作
-							//合并操作--评功权重打分-合并完替换原始数据池
+							//需要合并相关操作-合并操作--评功权重打分-合并完替换原始数据池
 							basic_bool := basicDataScore(source, info)
 							if basic_bool {
 								//已原始数据为标准-对比数据打判重标签
@@ -460,7 +447,6 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 								repeat_id = info.id
 							}
 						}
-
 						var update_map = map[string]interface{}{
 							"$set": map[string]interface{}{
 								"repeat_reason": reason,
@@ -468,14 +454,11 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 								"repeatid":      repeat_id,
 							},
 						}
-
 						if isMerger {
 							//合并记录
 							if len(newData.mergemap) > 0 {
 								update_map["$set"].(map[string]interface{})["merge"] = newData.mergemap
-								//fmt.Println("合并长度:",len(newData.mergemap))
 							}
-
 							//更新合并后的数据
 							for _, value := range mergeArr {
 								if value == 1 {
@@ -499,7 +482,6 @@ func historyTask(data []byte, mapInfo map[string]interface{}) {
 
 								}
 							}
-
 						}
 						//构建数据库更新用到的
 						updateExtract = append(updateExtract, []map[string]interface{}{
@@ -659,7 +641,7 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 		source.bidamount = info.bidamount
 		mergeArr = append(mergeArr, 7)
 	}
-	//8、开时间-地点
+	//8、开时间-地点
 	if source.bidopentime == 0 && info.bidopentime != 0 {
 		var arr []int64
 		if source.mergemap["bidopentime"] == nil {
@@ -681,13 +663,12 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 //权重评估
 func basicDataScore(v *Info, info *Info) bool {
 
-	//权重评估
 	/*
-			网站优先级判定规则:
-		    1、中央>省>市>县区
-		    2、政府采购>公共资源>采购单位官网>招标代理公司/平台
+	  权重评估
+	  网站优先级判定规则:
+	  1、中央>省>市>县区
+	  2、政府采购>公共资源>采购单位官网>招标代理公司/平台
 	*/
-
 	v_score, info_score := -1, -1
 	dict_v := SiteMap[v.site]
 	dict_info := SiteMap[info.site]

+ 0 - 688
udpprojectset/src/heavy_test.go

@@ -1,688 +0,0 @@
-package main
-
-import (
-	"fmt"
-	"github.com/tealeg/xlsx"
-	"log"
-	"qfw/util"
-	"qfw/util/mongodb"
-	"testing"
-)
-
-var (
-	mgo          *mongodb.MongodbSim    // MongoDB handle; assigned and pooled inside Test_crawlerExtractitCompare
-	//mgo_copy          *mongodb.MongodbSim    // second MongoDB handle (disabled)
-)
-// Test_crawlerExtractitCompare compares the extracted fields of two result
-// collections (zheng_test1_jd1 vs zheng_test1_jd2), grouped by spider code, and
-// writes the differences plus a summary to xlsx files. (Per-spider extraction statistics.)
-func Test_crawlerExtractitCompare(t *testing.T) {
-
-	mgo = &mongodb.MongodbSim{
-		MongodbAddr: "192.168.3.207:27092",
-		DbName:      "extract_kf",
-		Size:        util.IntAllDef(15, 10),
-	}
-	mgo.InitPool()
-	// Pass 1: map every document id in zheng_test_1 to its spidercode.
-	sess := mgo.GetMgoConn()
-	defer mgo.DestoryMongoConn(sess)
-	it :=sess.DB("extract_kf").C("zheng_test_1").Find(nil).Sort("_id").Iter()
-	n:=0
-	crawlerMap := make(map[string]string,0)
-	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
-		if n%10000==0 {
-			log.Println("当前n:",n)
-		}
-
-		//if n>2000 {
-		//	break
-		//}
-		crawlerMap[util.BsonIdToSId(tmp["_id"])] = util.ObjToString(tmp["spidercode"])
-	}
-	// Pass 2: group the zheng_test1_jd1 documents by spidercode.
-	sess_1 := mgo.GetMgoConn()
-	defer mgo.DestoryMongoConn(sess_1)
-	it_1 :=sess_1.DB("extract_kf").C("zheng_test1_jd1").Find(nil).Sort("_id").Iter()
-	n1:=0
-	crawlerMap_1 := make(map[string][]map[string]interface{},0)
-
-	for tmp := make(map[string]interface{});it_1.Next(&tmp);n1++{
-		if n1%10000==0 {
-			log.Println("当前n1:",n1)
-		}
-
-		//if n1>2000 {
-		//	break
-		//}
-
-		// Field snapshot of this document; it is stored under the document's spider code below.
-		dic :=map[string]interface{}{
-			"_id":util.BsonIdToSId(tmp["_id"]),
-			"href":util.ObjToString(tmp["href"]),
-			"title":util.ObjToString(tmp["title"]),
-			"buyer":util.ObjToString(tmp["buyer"]),
-			"agency":util.ObjToString(tmp["agency"]),
-			"winner":util.ObjToString(tmp["winner"]),
-			"budget":util.ObjToString(tmp["budget"]),
-			"bidamount":util.ObjToString(tmp["bidamount"]),
-			"projectname":util.ObjToString(tmp["projectname"]),
-			"projectcode":util.ObjToString(tmp["projectcode"]),
-			"publishtime":util.ObjToString(tmp["publishtime"]),
-			"bidopentime":util.ObjToString(tmp["bidopentime"]),
-			"agencyaddr":util.ObjToString(tmp["agencyaddr"]),
-		}
-		value :=crawlerMap[util.BsonIdToSId(tmp["_id"])] // spidercode from pass 1; "" when the id was not seen there
-		arr := crawlerMap_1[value]
-		if arr==nil {
-			crawlerMap_1[value] = make([]map[string]interface{},0)
-			crawlerMap_1[value] = append(crawlerMap_1[value],dic)
-		}else {
-			crawlerMap_1[value] = append(crawlerMap_1[value],dic)
-		}
-
-	}
-	// Pass 3: same grouping for zheng_test1_jd2 (title and the time/address fields are not captured here).
-	sess_2 :=mgo.GetMgoConn()
-	defer mgo.DestoryMongoConn(sess_2)
-	it_2 :=sess_2.DB("extract_kf").C("zheng_test1_jd2").Find(nil).Sort("_id").Iter()
-	n2:=0
-	crawlerMap_2 := make(map[string][]map[string]interface{})
-	for tmp := make(map[string]interface{}); it_2.Next(&tmp); n2++ {
-		if n2%10000==0 {
-			log.Println("当前n2:",n2)
-		}
-
-		//if n2>1000 {
-		//	break
-		//}
-
-		// Field snapshot of this document; it is stored under the document's spider code below.
-		dic :=map[string]interface{}{
-			"_id":util.BsonIdToSId(tmp["_id"]),
-			"href":util.ObjToString(tmp["href"]),
-			"buyer":util.ObjToString(tmp["buyer"]),
-			"agency":util.ObjToString(tmp["agency"]),
-			"winner":util.ObjToString(tmp["winner"]),
-			"budget":util.ObjToString(tmp["budget"]),
-			"bidamount":util.ObjToString(tmp["bidamount"]),
-			"projectname":util.ObjToString(tmp["projectname"]),
-			"projectcode":util.ObjToString(tmp["projectcode"]),
-		}
-		value :=crawlerMap[util.BsonIdToSId(tmp["_id"])] // spidercode from pass 1; "" when the id was not seen there
-		arr := crawlerMap_2[value]
-		if arr==nil {
-			crawlerMap_2[value] = make([]map[string]interface{},0)
-			crawlerMap_2[value] = append(crawlerMap_2[value],dic)
-		}else {
-			crawlerMap_2[value] = append(crawlerMap_2[value],dic)
-		}
-	}
-
-	log.Println("爬虫类个数分别为:",len(crawlerMap_1),len(crawlerMap_2))
-
-
-
-	// Stop unless both collections produced the same, non-zero number of spider groups.
-	if len(crawlerMap_1)!=len(crawlerMap_2)||len(crawlerMap_1)==0 {
-		return
-	}
-	var list = []string{ // fields that are compared, in sheet/column order
-		"buyer",
-		"agency",
-		"winner",
-		"budget",
-		"bidamount",
-		"projectname",
-		"projectcode",
-	}
-	// Spider codes that are compared; groups for any other code are ignored.
-	var crawlerArr = []string{
-		"a_zgzfcgw_zfcghtgg_new",
-		"gd_gdszfcgw_dscght",
-		"a_zgzfcgw_bid_tender_new",
-		"a_ztxygjzbtbzxyxgs_zbxx",
-		"sd_zgsdzfcgw_xxgk_sxhtgk",
-	}
-
-
-	// Counters per spider: "same"/"diff" hold one slot per entry of list, "total" holds the row count.
-	AnaNumMap :=map[string]map[string][]int{
-		"a_zgzfcgw_zfcghtgg_new": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
-		"gd_gdszfcgw_dscght": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
-		"a_zgzfcgw_bid_tender_new": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
-		"a_ztxygjzbtbzxyxgs_zbxx": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
-		"sd_zgsdzfcgw_xxgk_sxhtgk": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
-	}
-
-	fmt.Println(len(AnaNumMap))
-	// Compare the two collections spider by spider.
-	for _,v:=range crawlerArr {
-		if crawlerMap_1[v]==nil||crawlerMap_2[v]==nil {
-			continue
-		}
-		// Fetch the grouped rows of both collections.
-		arr_1 := crawlerMap_1[v]
-		arr_2 := crawlerMap_2[v]
-
-		// NOTE(review): rows are paired by index j below; an arr_2 shorter than arr_1 would panic — confirm both collections hold the same documents in the same order.
-		log.Println("数据总量:",len(arr_1))
-		AnaNumMap[v]["total"][0] = len(arr_1)
-		f :=xlsx.NewFile()
-		// One sheet per compared field (7 in list); a sheet is only added once a first difference is found.
-		for i:=0;i<len(list) ;i++  {
-			isTitle :=false
-			row:=0
-			for j:=0;j<len(arr_1);j++ {
-				string_1 := fmt.Sprint(arr_1[j][list[i]])
-				string_2 := fmt.Sprint(arr_2[j][list[i]])
-				if string_1!=string_2 {
-					if !isTitle{
-						sheet, _ := f.AddSheet(list[i])
-						sheet.Cell(row, 0).Value = "_id"
-						sheet.Cell(row, 1).Value = "href"
-						sheet.Cell(row, 2).Value = fmt.Sprint(list[i])+"_V1"
-						sheet.Cell(row, 3).Value = fmt.Sprint(list[i])+"_V2"
-						isTitle = true
-						row++
-					}
-					sheet :=f.Sheet[list[i]]
-					sheet.Cell(row, 0).Value = util.BsonIdToSId(arr_1[j]["_id"]) // NOTE(review): "_id" was already stored as a string above — confirm BsonIdToSId accepts that
-					sheet.Cell(row, 1).Value = util.ObjToString(arr_1[j]["href"])
-					sheet.Cell(row, 2).Value = string_1
-					sheet.Cell(row, 3).Value = string_2
-					row++
-					AnaNumMap[v]["diff"][i] = AnaNumMap[v]["diff"][i]+1
-				}else {
-					AnaNumMap[v]["same"][i] = AnaNumMap[v]["same"][i]+1
-				}
-			}
-		}
-
-		path:="zk_"+v+".xlsx"
-		error := f.Save(path) // NOTE: this local shadows the builtin error type for the rest of the scope
-		if error != nil {
-			log.Println("保存xlsx失败:", error)
-			return
-		}
-	}
-
-
-	// Summary workbook: one row per spider code with "same~diff" counts for every compared field.
-	f :=xlsx.NewFile()
-	sheet, _ := f.AddSheet("摘要")
-	for i:=0;i<len(list) ;i++ {
-		sheet.Cell(1, i+3).Value = list[i]
-	}
-
-	for i:=0;i<len(crawlerArr) ;i++ {
-		sheet.Cell(i+2, 0).Value = crawlerArr[i]
-		total:= fmt.Sprint(AnaNumMap[crawlerArr[i]]["total"][0])
-		sheet.Cell(i+2, 1).Value = total
-		same:=AnaNumMap[crawlerArr[i]]["same"]
-		diff:=AnaNumMap[crawlerArr[i]]["diff"]
-		for j:=0;j<len(same) ;j++  {
-			sheet.Cell(i+2, j+3).Value = fmt.Sprint(same[j])+"~"+fmt.Sprint(diff[j])
-		}
-
-	}
-	sheet.Cell(1, 0).Value = "爬虫代码"
-	sheet.Cell(1, 1).Value = "数据总量"
-	sheet.Cell(1, 2).Value = "相同字段对比"
-	sheet.Cell(2, 2).Value = "相同数量~不同数量"
-
-
-
-	fmt.Println(AnaNumMap)
-
-
-
-	path:="摘要"+".xlsx"
-	error := f.Save(path) // NOTE: this local shadows the builtin error type for the rest of the scope
-	if error != nil {
-		log.Println("保存xlsx失败:", error)
-	}
-
-}
-
-
-
-//对比判重区别
-//func Test_heavy(t *testing.T) {
-
-	//mapinfo := map[string]interface{}{
-	//	"gtid":  "586b6d7061a0721f15b8f264",
-	//	"lteid": "5e0b2b780cf41612e0639460",
-	//}
-	//task([]byte{}, mapinfo)
-
-	//extract,extract_copy:="a_testbidding_new","a_testbidding"
-	//
-	//sess := mgo.GetMgoConn()
-	//defer mgo.DestoryMongoConn(sess)
-	//res_copy := sess.DB("extract_kf").C(extract_copy).Find(nil).Iter()
-	//
-	//m1 :=map[string]int{} //老版本
-	//m2 :=map[string]int{} //新版本
-	//
-	//i:=0
-	//j:=0
-	//for v1 := make(map[string]interface{}); res_copy.Next(&v1); i++{
-	//	if i%2000==0 {
-	//		log.Println("当前i:",i)
-	//	}
-	//	m1[(v1["_id"].(bson.ObjectId).Hex())]= util.IntAll(v1["repeat"])
-	//}
-	//
-	//sesss := mgo.GetMgoConn()
-	//defer mgo.DestoryMongoConn(sesss)
-	//res := sesss.DB("extract_kf").C(extract).Find(nil).Iter()
-	//
-	//
-	//for v2 := make(map[string]interface{}); res.Next(&v2); j++{
-	//	if j%2000==0 {
-	//		log.Println("当前j:",j)
-	//	}
-	//	m2[(v2["_id"].(bson.ObjectId).Hex())]= util.IntAll(v2["repeat"])
-	//}
-	//
-	//fmt.Println(len(m1),len(m2))
-	//n1:=0
-	//n2:=0
-	//n3:=0
-	//n4:=0
-	//n5:=0
-	//n6:=0
-	//
-	//var arr1 []string
-	//var arr2 []string
-	//for k,v:=range m1{
-	//
-	//	if m2[k]==1&&v==0{//0:1
-	//		n1++
-	//		arr2 = append(arr2,fmt.Sprintf("目标_id:%s",k))
-	//	}
-	//	if m2[k]==0&&v==1{ //1:0
-	//		n2++
-	//		arr1 = append(arr1,fmt.Sprintf("目标_id:%s",k))
-	//	}
-	//	if m2[k]==0&&v==0{ //0:0
-	//		n3++
-	//	}
-	//	if m2[k]==1&&v==1{//1:1
-	//		n4++
-	//	}
-	//	if m2[k]==-1&&v==0{ //0:-1
-	//		n5++
-	//	}
-	//	if m2[k]==-1&&v==1{//1:-1
-	//		n6++
-	//	}
-	//
-	//}
-	////打印 1:0情况    ;
-	//mm:=0
-	//for _,v:=range arr1 {
-	//	mm++
-	//	if mm%200==0 {
-	//		log.Println(v)
-	//	}
-	//}
-	//
-	//log.Println("分割线---------------")
-	//log.Println("分割线---------------")
-	//
-	//
-	////打印 0:1情况
-	//nn:=0
-	//for _,v:=range arr2 {
-	//	nn++
-	//	if nn%200==0 {
-	//		log.Println(v)
-	//	}
-	//}
-	//
-	//log.Println("V1 0:1---",n1)
-	//log.Println("V1 1:0---",n2)
-	//log.Println("V1 0:0---",n3)
-	//log.Println("V1 1:1---",n4)
-	//log.Println("V1 0:-1---",n5)
-	//log.Println("V1 1:-1---",n6)
-//}
-
-//糅合数据
-//func Test_specifiedField(t *testing.T) {
-
-	//mgo = &mongodb.MongodbSim{
-	//	MongodbAddr: "192.168.3.207:27081",
-	//	DbName:      "qfw",
-	//	Size:        util.IntAllDef(15, 10),
-	//}
-	//mgo.InitPool()
-	//
-	//mgo_copy = &mongodb.MongodbSim{
-	//	MongodbAddr: "192.168.3.207:27092",
-	//	DbName:      "extract_kf",
-	//	Size:        util.IntAllDef(15, 10),
-	//}
-	//mgo_copy.InitPool()
-	//
-	//
-	////固定死的需要分析的字段
-	//field_map := map[string]string{
-	//	"title":"1",
-	//	"area":"1",
-	//	"city":"1",
-	//	"subtype":"1",
-	//	"buyer":"1",
-	//	"agency":"1",
-	//	"winner":"1",
-	//	"budget":"1",
-	//	"bidamount":"1",
-	//	"projectname":"1",
-	//	"projectcode":"1",
-	//	"publishtime":"1",
-	//	"comeintime":"1",
-	//	"bidopentime":"1",
-	//	"agencyaddr":"1",
-	//	"site":"1",
-	//	"href":"1",
-	//}
-	//
-	//
-	//sess := mgo.GetMgoConn()
-	//defer mgo.DestoryMongoConn(sess)
-	//
-	//sess_1 :=mgo_copy.GetMgoConn()
-	//defer mgo_copy.DestoryMongoConn(sess_1)
-	//
-	//sess_2 :=mgo_copy.GetMgoConn()
-	//defer mgo_copy.DestoryMongoConn(sess_2)
-	//
-	//
-	//it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
-	//it_1 :=sess_1.DB("extract_kf").C("zheng_test_1")
-	//it_2 :=sess_2.DB("extract_kf").C("zheng_test_2")
-	//n:=0
-	//for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
-	//	if n%10000==0 {
-	//		log.Println("当前n:",n)
-	//	}
-	//	if n>1000000 { //约半月数据
-	//		break
-	//	}
-	//	if tmp["spidercode"]=="a_zgzfcgw_zfcghtgg_new"|| tmp["spidercode"]=="gd_gdszfcgw_dscght"||
-	//		tmp["spidercode"]=="a_zgzfcgw_bid_tender_new"||tmp["spidercode"]=="a_ztxygjzbtbzxyxgs_zbxx"||
-	//		tmp["spidercode"]=="sd_zgsdzfcgw_xxgk_sxhtgk"{
-	//		jsonData := util.ObjToMap(tmp["jsondata"])
-	//		if jsonData!=nil {
-	//			for k,v :=range *jsonData  {
-	//				if fmt.Sprint(v) !=""{
-	//					if field_map[k]=="1" {
-	//						it_1.Insert(tmp)
-	//						it_2.Insert(tmp)
-	//						break
-	//					}
-	//				}
-	//			}
-	//		}
-	//	}
-	//}
-	//log.Println("总计",n,"条数据")
-
-//}
-
-
-//统计字段
-//func Test_field(t *testing.T) {
-
-	//mgo = &mongodb.MongodbSim{
-	//	MongodbAddr: "192.168.3.207:27081",
-	//	DbName:      "qfw",
-	//	Size:        util.IntAllDef(15, 10),
-	//}
-	//mgo.InitPool()
-	//
-	////调试 - 导出数据
-	////1:已抽取字段为准,统计对应爬虫字段存在个数,出个结果表格统计(前100名)
-	////2:人工抽查数据质量,用于jsondata权重评估
-	//
-	////取 固有字段 1-为存在
-	////now := int64(time.Now().Unix())
-	////date_time := int64(86400*2)
-	//
-	////field_map := make(map[string]string,0)
-	////sess_field := mgo.GetMgoConn()
-	////defer sess_field.Close()
-	////res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
-	////for dict := make(map[string]interface{}); res_field.Next(&dict); {
-	////	field_map[dict["s_field"].(string)] = "1"
-	////}
-	//
-	////固定死的需要分析的字段
-	//field_map := map[string]string{
-	//	"title":"1",
-	//	"area":"1",
-	//	"city":"1",
-	//	"subtype":"1",
-	//	"buyer":"1",
-	//	"agency":"1",
-	//	"winner":"1",
-	//	"budget":"1",
-	//	"bidamount":"1",
-	//	"projectname":"1",
-	//	"projectcode":"1",
-	//	"publishtime":"1",
-	//	"comeintime":"1",
-	//	"bidopentime":"1",
-	//	"agencyaddr":"1",
-	//	"site":"1",
-	//	"href":"1",
-	//}
-	//
-	///*	ObjectId("5da3f2c5a5cb26b9b79847fc") 0
-	//	ObjectId("5da3fd6da5cb26b9b7a8683c") 5000
-	//	ObjectId("5da40bdaa5cb26b9b7bea472") 10000
-	//	ObjectId("5da44deaa5cb26b9b75efb38") 50000
-	//	ObjectId("5da53440a5cb26b9b7d3f9aa") 100000
-	//	ObjectId("5db2735ba5cb26b9b7c99c6f") 761414
-	//*/
-	//
-	///*
-	//qfw-bidding
-	//
-	//ObjectId("5e0d4cdd0cf41612e063fc65")  -1
-	//ObjectId("5df8bfe4e9d1f601e4e87431") 一百万
-	//ObjectId("5dea080ce9d1f601e45cb838") 二百万
-	//
-	//5df834dd // 半月         大约100万条
-	//
-	//*/
-	//sess := mgo.GetMgoConn()
-	//defer mgo.DestoryMongoConn(sess)
-	////q := map[string]interface{}{
-	////	"_id": map[string]interface{}{
-	////		"$gt":  util.StringTOBsonId("5dea080ce9d1f601e45cb838"),
-	////		"$lte": util.StringTOBsonId("5e0d4cdd0cf41612e063fc65"),
-	////	},
-	////}
-	//it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
-	//
-	////爬虫组
-	//crawlerMap,n := make(map[string]map[string]interface{},0),0
-	//
-	//for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
-	//	if n%10000==0 {
-	//		log.Println("当前n:",n)
-	//	}
-	//
-	//	if n>3000000 {
-	//		break
-	//	}
-	//
-	//	if tmp["spidercode"]!="" {
-	//		//判断是否有此类别分组
-	//		dict := make(map[string]interface{},0)
-	//		if crawlerMap[tmp["spidercode"].(string)]!= nil {
-	//			dict = crawlerMap[tmp["spidercode"].(string)]
-	//		}
-	//		jsonData := util.ObjToMap(tmp["jsondata"])
-	//
-	//		if jsonData!=nil {
-	//			for k,v :=range *jsonData  {
-	//				if fmt.Sprint(v) ==""{
-	//					//无效数据
-	//				}else {
-	//					if field_map[k]=="1" {
-	//						arr := dict[k]
-	//						if arr==nil {
-	//							dict[k] = make([]string,0)
-	//							dict[k] = append(dict[k].([]string),fmt.Sprint(v))
-	//						}else {
-	//							dict[k] = append(dict[k].([]string),fmt.Sprint(v))
-	//						}
-	//					}
-	//				}
-	//			}
-	//		}
-	//		if dict!=nil {
-	//			crawlerMap[tmp["spidercode"].(string)] = dict
-	//		}
-	//	}
-	//}
-	//
-	//log.Println("总计",n,"条数据")
-	//log.Println("爬虫类别个数:",len(crawlerMap))
-	//
-	//
-	////计算每个爬虫分类的总数-并添加
-	//
-	////ObjectId("5e0d4cdd0cf41612e063fc65")
-	//arr :=make([]map[string]interface{},0)
-	//for k,v :=range crawlerMap  {
-	//	total :=0
-	//	for _,v1 :=range v {
-	//		total =total + len(v1.([]string))
-	//	}
-	//	v["total"]= total
-	//	v["key"] = k
-	//	arr = append(arr,v)
-	//}
-	//
-	//
-	////爬虫类别下-有效字段总数排列 前100
-	//start := time.Now().Unix()
-	//quickSort(0,len(arr)-1,&arr)
-	//end :=time.Now().Unix()
-	//fmt.Println("耗时:",end-start,"秒")
-	//
-	//f :=xlsx.NewFile()
-	//sheet, _ := f.AddSheet("排序")
-	//
-	////第一行先写标题
-	//row1 := sheet.AddRow()
-	//row1.AddCell().Value = "排名"
-	//row1.AddCell().Value = "爬虫类"
-	//row1.AddCell().Value = "字段有效数"
-	//
-	//mapLock := &sync.Mutex{}
-	//limit :=0
-	//for _,v :=range arr  {
-	//	limit++
-	//	row := sheet.AddRow()
-	//	row.AddCell().SetInt(limit)
-	//	row.AddCell().SetString(v["key"].(string))
-	//	row.AddCell().SetInt(v["total"].(int))
-	//
-	//	if limit <=20 {
-	//		mapLock.Lock()
-	//		sheetName := "排名"+util.ObjToString(limit)+":"+util.ObjToString(v["key"])
-	//		sheet_detail, err := f.AddSheet(sheetName)
-	//		if err==nil {
-	//			row_num,col_num :=0,0
-	//			for k1,v1 := range v {
-	//				if a,ok :=v1.([]string);ok {
-	//					for k2, v2 := range a {
-	//						if k2==0 {
-	//							sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
-	//							row_num++
-	//							sheet_detail.Cell(row_num, col_num).Value = v2
-	//						}else {
-	//							if row_num>2000 {
-	//								continue
-	//							}
-	//							sheet_detail.Cell(row_num, col_num).Value = v2
-	//						}
-	//						row_num++
-	//					}
-	//					row_num = 0
-	//					col_num++
-	//				}
-	//			}
-	//		}
-	//
-	//		mapLock.Unlock()
-	//	}
-	//
-	//
-	//
-	//	if limit >99{
-	//		break
-	//	}
-	//}
-	//
-	//
-	//err := f.Save("zheng.xlsx")
-	//if err != nil {
-	//	log.Println("保存xlsx失败:", err)
-	//	return
-	//}
-	//log.Println("xlsx保存成功")
-//}
-// quickSort sorts (*array)[left..right] in place by the "total" key (read via util.IntAll), largest first.
-// The range must be non-empty: (*array)[(left+right)/2] is read without any bounds check.
-func quickSort(left int,right int ,array *[]map[string]interface{}) {
-
-	l:=left
-	r:=right
-
-	pivot := util.IntAll((*array)[(left+right)/2]["total"])// pivot value, taken from the middle element
-	// Partition loop: entries greater than pivot end up on the left, smaller ones on the right.
-	for ;l<r;{
-		// advance l past entries that are greater than pivot (stops at one <= pivot)
-		for ;util.IntAll((*array)[l]["total"]) > pivot; {
-			l++
-		}
-		// move r back past entries that are smaller than pivot (stops at one >= pivot)
-		for ;util.IntAll((*array)[r]["total"])<pivot; {
-			r--
-		}
-		// the cursors met or crossed: this partition pass is finished
-		if l>=r {
-			break
-		}
-
-		(*array)[l],(*array)[r] = (*array)[r],(*array)[l]
-		// after the swap, step past entries equal to pivot — presumably so duplicates cannot stall the loop
-		if util.IntAll((*array)[l]["total"]) == pivot {
-			r--
-		}
-		if util.IntAll((*array)[r]["total"]) == pivot {
-			l++
-		}
-
-	}
-	if l==r {
-		l++
-		r--
-	}
-	// recurse into the left part
-	if left<r {
-		quickSort(left,r,array)
-	}
-	// recurse into the right part
-	if right>l {
-		quickSort(l,right,array)
-	}
-
-}

BIN
udpprojectset/src/zheng.xlsx


+ 1 - 9
udps/main.go

@@ -23,20 +23,12 @@ func main() {
 	//2017-06-01,2018-06-01
 	//2018-06-01,2019-02-20
 
-
-	/*
-	ObjectId("5da3f2c5a5cb26b9b79847fc")
-	ObjectId("5db2735ba5cb26b9b7c99c6f")
-
-	5da3f2c5a5cb26b9b79847fc
-	5db2735ba5cb26b9b7c99c6f
-	*/
 	flag.StringVar(&sid, "sid", "", "开始id")
 	flag.StringVar(&eid, "eid", "", "结束id")
 	flag.StringVar(&startDate, "start", "", "开始日期2006-01-02")
 	flag.StringVar(&endDate, "end", "", "结束日期2006-01-02")
 	flag.StringVar(&ip, "ip", "127.0.0.1", "ip")
-	flag.StringVar(&port, "port", "1485", "dup端口")
+	flag.StringVar(&port, "port", "1488", "dup端口")
 	flag.StringVar(&stype, "stype", "", "stype")
 	flag.Parse()
 	var startid, endid bson.ObjectId