浏览代码

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

fengweiqiang 5 年之前
父节点
当前提交
21c62241e1
共有 4 个文件被更改,包括 572 次插入和 293 次删除
  1. + 2 - 2
      udpfilterdup/src/config.json
  2. + 49 - 47
      udpfilterdup/src/datamap.go
  3. + 3 - 6
      udpfilterdup/src/main.go
  4. + 518 - 238
      udpprojectset/src/heavy_test.go

+ 2 - 2
udpfilterdup/src/config.json

@@ -4,8 +4,8 @@
     "mongodb": {
         "addr": "192.168.3.207:27092",
         "pool": 5,
-        "db": "data_Xinxihua",
-        "extract": "20200103_fupin_data",
+        "db": "extract_kf",
+        "extract": "ceshi_info",
         "site": {
             "dbname": "zhaolongyue",
             "coll": "site"

+ 49 - 47
udpfilterdup/src/datamap.go

@@ -28,7 +28,6 @@ type Info struct {
 	comeintime  int64   //采集时间
 	bidopentime int64   //开标时间
 	agencyaddr  string  //开标地点
-	detail      string  //招标内容
 	site        string  //站点
 	href        string  //正文的url
 	repeatid    string  //重复id
@@ -210,7 +209,7 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	info.publishtime = qutil.Int64All(tmp["publishtime"])
 	info.bidopentime = qutil.Int64All(tmp["bidopentime"])
 	info.agencyaddr = qutil.ObjToString(tmp["agencyaddr"])
-	info.detail = qutil.ObjToString(tmp["detail"])
+	//info.detail = qutil.ObjToString(tmp["detail"])
 	info.site = qutil.ObjToString(tmp["site"])
 	info.href = qutil.ObjToString(tmp["href"])
 	info.repeatid = qutil.ObjToString(tmp["repeatid"])
@@ -228,9 +227,8 @@ func NewInfo(tmp map[string]interface{}) *Info {
 //判重方法
 func (d *datamap) check(info *Info) (b bool, source *Info, reason string) {
 	keys := []string{}
-	//不同时间段
 	d.lock.Lock()
-	for k, _ := range d.keys {
+	for k, _ := range d.keys { //不同时间段
 		//...代码
 		keys = append(keys, fmt.Sprintf("%s_%s_%s", k, info.subtype, info.area))
 		if info.area != "全国" { //这个后续可以不要
@@ -238,7 +236,6 @@ func (d *datamap) check(info *Info) (b bool, source *Info, reason string) {
 		}
 	}
 	d.lock.Unlock()
-
 L:
 	for _, k := range keys {
 		d.lock.Lock()
@@ -307,7 +304,8 @@ L:
 					//代理机构相同-非空相等
 					if v.agency != "" && info.agency != "" && v.agency == info.agency {
 						reason = reason + "同机构-"
-						if quickHeavyMethodTwo(v, info, reason) {
+						repeat := false
+						if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
 							b = true
 							source = v
 							break
@@ -316,14 +314,16 @@ L:
 						reason = reason + "非同机构-"
 						if info.city != "" && info.city == v.city {
 							reason = reason + "同城-"
-							if quickHeavyMethodTwo(v, info, reason) {
+							repeat := false
+							if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
 								b = true
 								source = v
 								break
 							}
 						} else {
 							reason = reason + "不同城-"
-							if quickHeavyMethodOne(v, info, reason) {
+							repeat := false
+							if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
 								b = true
 								source = v
 								break
@@ -438,28 +438,28 @@ L:
 					//代理机构相同-非空相等
 					if v.agency != "" && info.agency != "" && v.agency == info.agency {
 						reason = reason + "同机构-"
-						if quickHeavyMethodTwo(v, info, reason) {
+						repeat := false
+						if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
 							b = true
 							source = v
-							reasons = reason
 							break
 						}
 					} else {
 						reason = reason + "非同机构-"
 						if info.city != "" && info.city == v.city {
 							reason = reason + "同城-"
-							if quickHeavyMethodTwo(v, info, reason) {
+							repeat := false
+							if repeat, reason = quickHeavyMethodTwo(v, info, reason); repeat {
 								b = true
 								source = v
-								reasons = reason
 								break
 							}
 						} else {
 							reason = reason + "不同城-"
-							if quickHeavyMethodOne(v, info, reason) {
+							repeat := false
+							if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
 								b = true
 								source = v
-								reasons = reason
 								break
 							}
 						}
@@ -516,6 +516,7 @@ func (d *datamap) replaceSourceData(replaceData *Info, replaceId string) {
 	ct, _ := strconv.ParseInt(replaceId[:8], 16, 64)
 	dkey := qutil.FormatDateByInt64(&ct, qutil.Date_yyyyMMdd)
 	k := fmt.Sprintf("%s_%s_%s", dkey, replaceData.subtype, replaceData.area)
+	d.lock.Lock()
 	data := d.data[k]
 	if data == nil {
 		data = []*Info{replaceData}
@@ -534,6 +535,7 @@ func (d *datamap) replaceSourceData(replaceData *Info, replaceId string) {
 		}
 		d.data[k] = data
 	}
+	d.lock.Unlock()
 }
 
 func (h *historymap) replaceSourceData(replaceData *Info, replaceId string) {
@@ -563,67 +565,66 @@ func (h *historymap) replaceSourceData(replaceData *Info, replaceId string) {
 
 //以下为判重   -   一揽子的方法
 //判重方法1
-func quickHeavyMethodOne(v *Info, info *Info, reason string) bool {
-
+func quickHeavyMethodOne(v *Info, info *Info, reason string) (bool, string) {
 	if info.subtype == "招标" || info.subtype == "邀标" || info.subtype == "询价" ||
 		info.subtype == "竞谈" || info.subtype == "单一" || info.subtype == "竞价" ||
 		info.subtype == "变更" || info.subtype == "其他" {
 		//招标结果
 		if tenderRepeat_A(v, info, reason) {
 			if tenderRepeat_C(v, info) {
-				return false
+				return false, reason
 			} else {
 				reason = reason + "---招标类"
-				return true
+				return true, reason
 			}
 		} else {
-			return false
+			return false, reason
 		}
 
 	} else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
 		//中标结果
 		if winningRepeat_A(v, info, reason) {
 			if winningRepeat_C(v, info) {
-				return false
+				return false, reason
 			} else {
 				reason = reason + "---中标类"
-				return true
+				return true, reason
 			}
 		} else {
-			return false
+			return false, reason
 		}
 
 	} else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
 		//合同
 		if contractRepeat_A(v, info, reason) {
 			if contractRepeat_C(v, info) {
-				return false
+				return false, reason
 			} else {
 				reason = reason + "---合同类"
-				return true
+				return true, reason
 			}
 		} else {
-			return false
+			return false, reason
 		}
 	} else {
 		//招标结果
 		if tenderRepeat_A(v, info, reason) {
 			if tenderRepeat_C(v, info) {
-				return false
+				return false, reason
 			} else {
 				reason = reason + "---类别空-招标类"
-				return true
+				return true, reason
 			}
 		} else {
-			return false
+			return false, reason
 		}
 	}
 
-	return false
+	return false, reason
 }
 
 //判重方法2
-func quickHeavyMethodTwo(v *Info, info *Info, reason string) bool {
+func quickHeavyMethodTwo(v *Info, info *Info, reason string) (bool, string) {
 	//相同
 	if v.agency == info.agency && v.agency != "" && info.agency != "" {
 
@@ -633,70 +634,71 @@ func quickHeavyMethodTwo(v *Info, info *Info, reason string) bool {
 			//招标结果
 			if tenderRepeat_B(v, info, reason) {
 				if tenderRepeat_C(v, info) { //有不同
-					return false
+					return false, reason
 				} else {
 					reason = reason + "---招标类"
-					return true
+					return true, reason
 				}
 			} else {
-				return false
+				return false, reason
 			}
 
 		} else if info.subtype == "中标" || info.subtype == "成交" || info.subtype == "废标" || info.subtype == "流标" {
 			//中标结果
 			if winningRepeat_B(v, info, reason) {
 				if winningRepeat_C(v, info) { //有不同
-					return false
+					return false, reason
 				} else {
 					reason = reason + "---中标类"
-					return true
+					return true, reason
 				}
 			} else {
-				return false
+				return false, reason
 			}
 
 		} else if info.subtype == "合同" || info.subtype == "验收" || info.subtype == "违规" {
 			//合同
 			if contractRepeat_B(v, info, reason) {
 				if contractRepeat_C(v, info) { //有不同
-					return false
+					return false, reason
 				} else {
 					reason = reason + "---合同类"
-					return true
+					return true, reason
 				}
 			} else {
-				return false
+				return false, reason
 			}
 		} else {
 			//招标结果
 			if tenderRepeat_B(v, info, reason) {
 				if tenderRepeat_C(v, info) { //有不同
-					return false
+					return false, reason
 				} else {
 					reason = reason + "---类别空-招标类"
-					return true
+					return true, reason
 				}
 			} else {
-				return false
+				return false, reason
 			}
 		}
 	}
 
 	//不同
 	if v.agency != info.agency && v.agency != "" && info.agency != "" {
-		return false
+		return false, reason
 	}
 	//机构最少一个为空
 	if v.agency == "" || info.agency == "" {
-		if quickHeavyMethodOne(v, info, reason) {
+		var repeat = false
+		if repeat, reason = quickHeavyMethodOne(v, info, reason); repeat {
 			reason = reason + "---机构最少一个空"
-			return true
+			return true, reason
 		} else {
-			return false
+			return false, reason
 		}
 	}
 
-	return false
+	return false, reason
 }
 
 //招标_A

+ 3 - 6
udpfilterdup/src/main.go

@@ -105,7 +105,8 @@ func main() {
 
 //测试组人员使用
 func mainT() {
-	//568551000000000000000000,5e0f65000000000000000000
+	//sid = "568551000000000000000000"
+	//eid = "5e0f65000000000000000000"
 	mapinfo := map[string]interface{}{}
 	if sid == "" || eid == "" {
 		log.Println("sid,eid参数不能为空")
@@ -117,9 +118,7 @@ func mainT() {
 	task([]byte{}, mapinfo)
 	time.Sleep(5 * time.Second)
 }
-
 func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
-
 	fmt.Println("接受的段数据")
 	switch act {
 	case mu.OP_TYPE_DATA: //上个节点的数据
@@ -160,7 +159,6 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 
 //开始判重程序
 func task(data []byte, mapInfo map[string]interface{}) {
-
 	fmt.Println("开始数据判重")
 	defer util.Catch()
 	//区间id
@@ -184,7 +182,7 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	}
 	it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
 	updateExtract := [][]map[string]interface{}{}
-	pool := make(chan bool, 16)
+	pool := make(chan bool, 4)
 	wg := &sync.WaitGroup{}
 	mapLock := &sync.Mutex{}
 	n, repeateN := 0, 0
@@ -200,7 +198,6 @@ func task(data []byte, mapInfo map[string]interface{}) {
 				wg.Done()
 			}()
 			info := NewInfo(tmp)
-
 			//是否为无效数据
 			if invalidData(info.buyer, info.projectname, info.projectcode) {
 				mapLock.Lock()

+ 518 - 238
udpprojectset/src/heavy_test.go

@@ -6,17 +6,249 @@ import (
 	"log"
 	"qfw/util"
 	"qfw/util/mongodb"
-	"sync"
 	"testing"
-	"time"
 )
 
 var (
 	mgo          *mongodb.MongodbSim    //mongodb操作对象
+	//mgo_copy          *mongodb.MongodbSim    //mongodb操作对象
 )
 
 
-func Test_heavy(t *testing.T) {
+//分类爬虫抽取统计
+func Test_crawlerExtractitCompare(t *testing.T) {
+
+	mgo = &mongodb.MongodbSim{
+		MongodbAddr: "192.168.3.207:27092",
+		DbName:      "extract_kf",
+		Size:        util.IntAllDef(15, 10),
+	}
+	mgo.InitPool()
+
+	sess := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess)
+	it :=sess.DB("extract_kf").C("zheng_test_1").Find(nil).Sort("_id").Iter()
+	n:=0
+	crawlerMap := make(map[string]string,0)
+	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		if n%10000==0 {
+			log.Println("当前n:",n)
+		}
+
+		//if n>2000 {
+		//	break
+		//}
+		crawlerMap[util.BsonIdToSId(tmp["_id"])] = util.ObjToString(tmp["spidercode"])
+	}
+
+	sess_1 := mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess_1)
+	it_1 :=sess_1.DB("extract_kf").C("zheng_test1_jd1").Find(nil).Sort("_id").Iter()
+	n1:=0
+	crawlerMap_1 := make(map[string][]map[string]interface{},0)
+
+	for tmp := make(map[string]interface{});it_1.Next(&tmp);n1++{
+		if n1%10000==0 {
+			log.Println("当前n1:",n1)
+		}
+
+		//if n1>2000 {
+		//	break
+		//}
+
+		//类别
+		dic :=map[string]interface{}{
+			"_id":util.BsonIdToSId(tmp["_id"]),
+			"href":util.ObjToString(tmp["href"]),
+			"title":util.ObjToString(tmp["title"]),
+			"buyer":util.ObjToString(tmp["buyer"]),
+			"agency":util.ObjToString(tmp["agency"]),
+			"winner":util.ObjToString(tmp["winner"]),
+			"budget":util.ObjToString(tmp["budget"]),
+			"bidamount":util.ObjToString(tmp["bidamount"]),
+			"projectname":util.ObjToString(tmp["projectname"]),
+			"projectcode":util.ObjToString(tmp["projectcode"]),
+			"publishtime":util.ObjToString(tmp["publishtime"]),
+			"bidopentime":util.ObjToString(tmp["bidopentime"]),
+			"agencyaddr":util.ObjToString(tmp["agencyaddr"]),
+		}
+		value :=crawlerMap[util.BsonIdToSId(tmp["_id"])]
+		arr := crawlerMap_1[value]
+		if arr==nil {
+			crawlerMap_1[value] = make([]map[string]interface{},0)
+			crawlerMap_1[value] = append(crawlerMap_1[value],dic)
+		}else {
+			crawlerMap_1[value] = append(crawlerMap_1[value],dic)
+		}
+
+	}
+
+	sess_2 :=mgo.GetMgoConn()
+	defer mgo.DestoryMongoConn(sess_2)
+	it_2 :=sess_2.DB("extract_kf").C("zheng_test1_jd2").Find(nil).Sort("_id").Iter()
+	n2:=0
+	crawlerMap_2 := make(map[string][]map[string]interface{})
+	for tmp := make(map[string]interface{}); it_2.Next(&tmp); n2++ {
+		if n2%10000==0 {
+			log.Println("当前n2:",n2)
+		}
+
+		//if n2>1000 {
+		//	break
+		//}
+
+		//类别
+		dic :=map[string]interface{}{
+			"_id":util.BsonIdToSId(tmp["_id"]),
+			"href":util.ObjToString(tmp["href"]),
+			"buyer":util.ObjToString(tmp["buyer"]),
+			"agency":util.ObjToString(tmp["agency"]),
+			"winner":util.ObjToString(tmp["winner"]),
+			"budget":util.ObjToString(tmp["budget"]),
+			"bidamount":util.ObjToString(tmp["bidamount"]),
+			"projectname":util.ObjToString(tmp["projectname"]),
+			"projectcode":util.ObjToString(tmp["projectcode"]),
+		}
+		value :=crawlerMap[util.BsonIdToSId(tmp["_id"])]
+		arr := crawlerMap_2[value]
+		if arr==nil {
+			crawlerMap_2[value] = make([]map[string]interface{},0)
+			crawlerMap_2[value] = append(crawlerMap_2[value],dic)
+		}else {
+			crawlerMap_2[value] = append(crawlerMap_2[value],dic)
+		}
+	}
+
+	log.Println("爬虫类个数分别为:",len(crawlerMap_1),len(crawlerMap_2))
+
+
+
+
+	if len(crawlerMap_1)!=len(crawlerMap_2)||len(crawlerMap_1)==0 {
+		return
+	}
+	var list = []string{
+		"buyer",
+		"agency",
+		"winner",
+		"budget",
+		"bidamount",
+		"projectname",
+		"projectcode",
+	}
+
+	var crawlerArr = []string{
+		"a_zgzfcgw_zfcghtgg_new",
+		"gd_gdszfcgw_dscght",
+		"a_zgzfcgw_bid_tender_new",
+		"a_ztxygjzbtbzxyxgs_zbxx",
+		"sd_zgsdzfcgw_xxgk_sxhtgk",
+	}
+
+
+	//数量统计
+	AnaNumMap :=map[string]map[string][]int{
+		"a_zgzfcgw_zfcghtgg_new": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
+		"gd_gdszfcgw_dscght": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
+		"a_zgzfcgw_bid_tender_new": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
+		"a_ztxygjzbtbzxyxgs_zbxx": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
+		"sd_zgsdzfcgw_xxgk_sxhtgk": map[string][]int{"same":[]int{0,0,0,0,0,0,0},"diff":[]int{0,0,0,0,0,0,0},"total":[]int{0}},
+	}
+
+	fmt.Println(len(AnaNumMap))
+	//-对比数据
+	for _,v:=range crawlerArr {
+		if crawlerMap_1[v]==nil||crawlerMap_2[v]==nil {
+			continue
+		}
+		//取数组数据
+		arr_1 := crawlerMap_1[v]
+		arr_2 := crawlerMap_2[v]
+
+
+		log.Println("数据总量:",len(arr_1))
+		AnaNumMap[v]["total"][0] = len(arr_1)
+		f :=xlsx.NewFile()
+		//创建7个表格
+		for i:=0;i<len(list) ;i++  {
+			isTitle :=false
+			row:=0
+			for j:=0;j<len(arr_1);j++ {
+				string_1 := fmt.Sprint(arr_1[j][list[i]])
+				string_2 := fmt.Sprint(arr_2[j][list[i]])
+				if string_1!=string_2 {
+					if !isTitle{
+						sheet, _ := f.AddSheet(list[i])
+						sheet.Cell(row, 0).Value = "_id"
+						sheet.Cell(row, 1).Value = "href"
+						sheet.Cell(row, 2).Value = fmt.Sprint(list[i])+"_V1"
+						sheet.Cell(row, 3).Value = fmt.Sprint(list[i])+"_V2"
+						isTitle = true
+						row++
+					}
+					sheet :=f.Sheet[list[i]]
+					sheet.Cell(row, 0).Value = util.BsonIdToSId(arr_1[j]["_id"])
+					sheet.Cell(row, 1).Value = util.ObjToString(arr_1[j]["href"])
+					sheet.Cell(row, 2).Value = string_1
+					sheet.Cell(row, 3).Value = string_2
+					row++
+					AnaNumMap[v]["diff"][i] = AnaNumMap[v]["diff"][i]+1
+				}else {
+					AnaNumMap[v]["same"][i] = AnaNumMap[v]["same"][i]+1
+				}
+			}
+		}
+
+		path:="zk_"+v+".xlsx"
+		error := f.Save(path)
+		if error != nil {
+			log.Println("保存xlsx失败:", error)
+			return
+		}
+	}
+
+
+
+	f :=xlsx.NewFile()
+	sheet, _ := f.AddSheet("摘要")
+	for i:=0;i<len(list) ;i++ {
+		sheet.Cell(1, i+3).Value = list[i]
+	}
+
+	for i:=0;i<len(crawlerArr) ;i++ {
+		sheet.Cell(i+2, 0).Value = crawlerArr[i]
+		total:= fmt.Sprint(AnaNumMap[crawlerArr[i]]["total"][0])
+		sheet.Cell(i+2, 1).Value = total
+		same:=AnaNumMap[crawlerArr[i]]["same"]
+		diff:=AnaNumMap[crawlerArr[i]]["diff"]
+		for j:=0;j<len(same) ;j++  {
+			sheet.Cell(i+2, j+3).Value = fmt.Sprint(same[j])+"~"+fmt.Sprint(diff[j])
+		}
+
+	}
+	sheet.Cell(1, 0).Value = "爬虫代码"
+	sheet.Cell(1, 1).Value = "数据总量"
+	sheet.Cell(1, 2).Value = "相同字段对比"
+	sheet.Cell(2, 2).Value = "相同数量~不同数量"
+
+
+
+	fmt.Println(AnaNumMap)
+
+
+
+	path:="摘要"+".xlsx"
+	error := f.Save(path)
+	if error != nil {
+		log.Println("保存xlsx失败:", error)
+	}
+
+}
+
+
+
+//对比判重区别
+//func Test_heavy(t *testing.T) {
 
 	//mapinfo := map[string]interface{}{
 	//	"gtid":  "586b6d7061a0721f15b8f264",
@@ -24,40 +256,6 @@ func Test_heavy(t *testing.T) {
 	//}
 	//task([]byte{}, mapinfo)
 
-
-	//log.Println("1")
-	//代码copy数据
-	//sessTest :=mgoTest.GetMgoConn()
-	//defer sessTest.Close()
-	//
-	//sess := mgo.GetMgoConn()
-	//defer sess.Close()
-	//
-	////var arr []map[string]interface{}
-	//
-	//res_test := sessTest.DB("qfw").C("bidding").Find(mongodb.ObjToMQ(`{"comeintime":{"$gte": 1571025600, "$lte": 1571976000}}`, true)).Iter()
-	//res :=sess.DB("extract_kf").C("a_testbidding")
-	//5
-	//
-	//
-	//
-	//
-	//i:=0
-	//for dict := make(map[string]interface{}); res_test.Next(&dict); i++{
-	//
-	//	//插入
-	//	if i%2000==0 {
-	//		log.Println("当前:",i)
-	//	}
-	//	res.Insert(dict)
-	//	//if len(arr)>=500 {
-	//	//	arr = make([]map[string]interface{},0)
-	//	//}else {
-	//	//	arr = append(arr,dict)
-	//	//}
-	//}
-	//
-
 	//extract,extract_copy:="a_testbidding_new","a_testbidding"
 	//
 	//sess := mgo.GetMgoConn()
@@ -150,215 +348,297 @@ func Test_heavy(t *testing.T) {
 	//log.Println("V1 1:1---",n4)
 	//log.Println("V1 0:-1---",n5)
 	//log.Println("V1 1:-1---",n6)
-}
-
-
-
+//}
 
-func Test_field(t *testing.T) {
+//糅合数据
+//func Test_specifiedField(t *testing.T) {
 
-	mgo = &mongodb.MongodbSim{
-		MongodbAddr: "192.168.3.207:27081",
-		DbName:      "qfw",
-		Size:        util.IntAllDef(15, 10),
-	}
-	mgo.InitPool()
-
-	//调试 - 导出数据
-	//1:已抽取字段为准,统计对应爬虫字段存在个数,出个结果表格统计(前100名)
-	//2:人工抽查数据质量,用于jsondata权重评估
-
-	//取 固有字段 1-为存在
-	//now := int64(time.Now().Unix())
-	//date_time := int64(86400*2)
-
-	//field_map := make(map[string]string,0)
-	//sess_field := mgo.GetMgoConn()
-	//defer sess_field.Close()
-	//res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
-	//for dict := make(map[string]interface{}); res_field.Next(&dict); {
-	//	field_map[dict["s_field"].(string)] = "1"
+	//mgo = &mongodb.MongodbSim{
+	//	MongodbAddr: "192.168.3.207:27081",
+	//	DbName:      "qfw",
+	//	Size:        util.IntAllDef(15, 10),
 	//}
-
-	//固定死的需要分析的字段
-	field_map := map[string]string{
-		"title":"1",
-		"area":"1",
-		"city":"1",
-		"subtype":"1",
-		"buyer":"1",
-		"agency":"1",
-		"winner":"1",
-		"budget":"1",
-		"bidamount":"1",
-		"projectname":"1",
-		"projectcode":"1",
-		"publishtime":"1",
-		"comeintime":"1",
-		"bidopentime":"1",
-		"agencyaddr":"1",
-		"site":"1",
-		"href":"1",
-	}
-
-	/*	ObjectId("5da3f2c5a5cb26b9b79847fc") 0
-		ObjectId("5da3fd6da5cb26b9b7a8683c") 5000
-		ObjectId("5da40bdaa5cb26b9b7bea472") 10000
-		ObjectId("5da44deaa5cb26b9b75efb38") 50000
-		ObjectId("5da53440a5cb26b9b7d3f9aa") 100000
-		ObjectId("5db2735ba5cb26b9b7c99c6f") 761414
-	*/
-
-	/*
-	qfw-bidding
-
-	ObjectId("5e0d4cdd0cf41612e063fc65")  -1
-	ObjectId("5df8bfe4e9d1f601e4e87431") 一百万
-	ObjectId("5dea080ce9d1f601e45cb838") 二百万
-
-
-	*/
-	sess := mgo.GetMgoConn()
-	defer mgo.DestoryMongoConn(sess)
-	//q := map[string]interface{}{
-	//	"_id": map[string]interface{}{
-	//		"$gt":  util.StringTOBsonId("5dea080ce9d1f601e45cb838"),
-	//		"$lte": util.StringTOBsonId("5e0d4cdd0cf41612e063fc65"),
-	//	},
+	//mgo.InitPool()
+	//
+	//mgo_copy = &mongodb.MongodbSim{
+	//	MongodbAddr: "192.168.3.207:27092",
+	//	DbName:      "extract_kf",
+	//	Size:        util.IntAllDef(15, 10),
 	//}
-	it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
-
-	//爬虫组
-	crawlerMap,n := make(map[string]map[string]interface{},0),0
-
-	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
-		if n%10000==0 {
-			log.Println("当前n:",n)
-		}
-
-		if n>3000000 {
-			break
-		}
-
-		if tmp["spidercode"]!="" {
-			//判断是否有此类别分组
-			dict := make(map[string]interface{},0)
-			if crawlerMap[tmp["spidercode"].(string)]!= nil {
-				dict = crawlerMap[tmp["spidercode"].(string)]
-			}
-			jsonData := util.ObjToMap(tmp["jsondata"])
-
-			if jsonData!=nil {
-				for k,v :=range *jsonData  {
-					if fmt.Sprint(v) ==""{
-						//无效数据
-					}else {
-						if field_map[k]=="1" {
-							arr := dict[k]
-							if arr==nil {
-								dict[k] = make([]string,0)
-								dict[k] = append(dict[k].([]string),fmt.Sprint(v))
-							}else {
-								dict[k] = append(dict[k].([]string),fmt.Sprint(v))
-							}
-						}
-					}
-				}
-			}
-			if dict!=nil {
-				crawlerMap[tmp["spidercode"].(string)] = dict
-			}
-		}
-	}
-
-	log.Println("总计",n,"条数据")
-	log.Println("爬虫类别个数:",len(crawlerMap))
-
-
-	//计算每个爬虫分类的总数-并添加
-
-	//ObjectId("5e0d4cdd0cf41612e063fc65")
-	arr :=make([]map[string]interface{},0)
-	for k,v :=range crawlerMap  {
-		total :=0
-		for _,v1 :=range v {
-			total =total + len(v1.([]string))
-		}
-		v["total"]= total
-		v["key"] = k
-		arr = append(arr,v)
-	}
-
-
-	//爬虫类别下-有效字段总数排列 前100
-	start := time.Now().Unix()
-	quickSort(0,len(arr)-1,&arr)
-	end :=time.Now().Unix()
-	fmt.Println("耗时:",end-start,"秒")
-
-	f :=xlsx.NewFile()
-	sheet, _ := f.AddSheet("排序")
-
-	//第一行先写标题
-	row1 := sheet.AddRow()
-	row1.AddCell().Value = "排名"
-	row1.AddCell().Value = "爬虫类"
-	row1.AddCell().Value = "字段有效数"
-
-	mapLock := &sync.Mutex{}
-	limit :=0
-	for _,v :=range arr  {
-		limit++
-		row := sheet.AddRow()
-		row.AddCell().SetInt(limit)
-		row.AddCell().SetString(v["key"].(string))
-		row.AddCell().SetInt(v["total"].(int))
-
-		if limit <=20 {
-			mapLock.Lock()
-			sheetName := "排名"+util.ObjToString(limit)+":"+util.ObjToString(v["key"])
-			sheet_detail, err := f.AddSheet(sheetName)
-			if err==nil {
-				row_num,col_num :=0,0
-				for k1,v1 := range v {
-					if a,ok :=v1.([]string);ok {
-						for k2, v2 := range a {
-							if k2==0 {
-								sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
-								row_num++
-								sheet_detail.Cell(row_num, col_num).Value = v2
-							}else {
-								if row_num>2000 {
-									continue
-								}
-								sheet_detail.Cell(row_num, col_num).Value = v2
-							}
-							row_num++
-						}
-						row_num = 0
-						col_num++
-					}
-				}
-			}
-
-			mapLock.Unlock()
-		}
-
+	//mgo_copy.InitPool()
+	//
+	//
+	////固定死的需要分析的字段
+	//field_map := map[string]string{
+	//	"title":"1",
+	//	"area":"1",
+	//	"city":"1",
+	//	"subtype":"1",
+	//	"buyer":"1",
+	//	"agency":"1",
+	//	"winner":"1",
+	//	"budget":"1",
+	//	"bidamount":"1",
+	//	"projectname":"1",
+	//	"projectcode":"1",
+	//	"publishtime":"1",
+	//	"comeintime":"1",
+	//	"bidopentime":"1",
+	//	"agencyaddr":"1",
+	//	"site":"1",
+	//	"href":"1",
+	//}
+	//
+	//
+	//sess := mgo.GetMgoConn()
+	//defer mgo.DestoryMongoConn(sess)
+	//
+	//sess_1 :=mgo_copy.GetMgoConn()
+	//defer mgo_copy.DestoryMongoConn(sess_1)
+	//
+	//sess_2 :=mgo_copy.GetMgoConn()
+	//defer mgo_copy.DestoryMongoConn(sess_2)
+	//
+	//
+	//it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
+	//it_1 :=sess_1.DB("extract_kf").C("zheng_test_1")
+	//it_2 :=sess_2.DB("extract_kf").C("zheng_test_2")
+	//n:=0
+	//for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+	//	if n%10000==0 {
+	//		log.Println("当前n:",n)
+	//	}
+	//	if n>1000000 { //约半月数据
+	//		break
+	//	}
+	//	if tmp["spidercode"]=="a_zgzfcgw_zfcghtgg_new"|| tmp["spidercode"]=="gd_gdszfcgw_dscght"||
+	//		tmp["spidercode"]=="a_zgzfcgw_bid_tender_new"||tmp["spidercode"]=="a_ztxygjzbtbzxyxgs_zbxx"||
+	//		tmp["spidercode"]=="sd_zgsdzfcgw_xxgk_sxhtgk"{
+	//		jsonData := util.ObjToMap(tmp["jsondata"])
+	//		if jsonData!=nil {
+	//			for k,v :=range *jsonData  {
+	//				if fmt.Sprint(v) !=""{
+	//					if field_map[k]=="1" {
+	//						it_1.Insert(tmp)
+	//						it_2.Insert(tmp)
+	//						break
+	//					}
+	//				}
+	//			}
+	//		}
+	//	}
+	//}
+	//log.Println("总计",n,"条数据")
 
+//}
 
-		if limit >99{
-			break
-		}
-	}
 
+//统计字段
+//func Test_field(t *testing.T) {
 
-	err := f.Save("zheng.xlsx")
-	if err != nil {
-		log.Println("保存xlsx失败:", err)
-		return
-	}
-	log.Println("xlsx保存成功")
-}
+	//mgo = &mongodb.MongodbSim{
+	//	MongodbAddr: "192.168.3.207:27081",
+	//	DbName:      "qfw",
+	//	Size:        util.IntAllDef(15, 10),
+	//}
+	//mgo.InitPool()
+	//
+	////调试 - 导出数据
+	////1:已抽取字段为准,统计对应爬虫字段存在个数,出个结果表格统计(前100名)
+	////2:人工抽查数据质量,用于jsondata权重评估
+	//
+	////取 固有字段 1-为存在
+	////now := int64(time.Now().Unix())
+	////date_time := int64(86400*2)
+	//
+	////field_map := make(map[string]string,0)
+	////sess_field := mgo.GetMgoConn()
+	////defer sess_field.Close()
+	////res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
+	////for dict := make(map[string]interface{}); res_field.Next(&dict); {
+	////	field_map[dict["s_field"].(string)] = "1"
+	////}
+	//
+	////固定死的需要分析的字段
+	//field_map := map[string]string{
+	//	"title":"1",
+	//	"area":"1",
+	//	"city":"1",
+	//	"subtype":"1",
+	//	"buyer":"1",
+	//	"agency":"1",
+	//	"winner":"1",
+	//	"budget":"1",
+	//	"bidamount":"1",
+	//	"projectname":"1",
+	//	"projectcode":"1",
+	//	"publishtime":"1",
+	//	"comeintime":"1",
+	//	"bidopentime":"1",
+	//	"agencyaddr":"1",
+	//	"site":"1",
+	//	"href":"1",
+	//}
+	//
+	///*	ObjectId("5da3f2c5a5cb26b9b79847fc") 0
+	//	ObjectId("5da3fd6da5cb26b9b7a8683c") 5000
+	//	ObjectId("5da40bdaa5cb26b9b7bea472") 10000
+	//	ObjectId("5da44deaa5cb26b9b75efb38") 50000
+	//	ObjectId("5da53440a5cb26b9b7d3f9aa") 100000
+	//	ObjectId("5db2735ba5cb26b9b7c99c6f") 761414
+	//*/
+	//
+	///*
+	//qfw-bidding
+	//
+	//ObjectId("5e0d4cdd0cf41612e063fc65")  -1
+	//ObjectId("5df8bfe4e9d1f601e4e87431") 一百万
+	//ObjectId("5dea080ce9d1f601e45cb838") 二百万
+	//
+	//5df834dd // 半月         大约100万条
+	//
+	//*/
+	//sess := mgo.GetMgoConn()
+	//defer mgo.DestoryMongoConn(sess)
+	////q := map[string]interface{}{
+	////	"_id": map[string]interface{}{
+	////		"$gt":  util.StringTOBsonId("5dea080ce9d1f601e45cb838"),
+	////		"$lte": util.StringTOBsonId("5e0d4cdd0cf41612e063fc65"),
+	////	},
+	////}
+	//it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
+	//
+	////爬虫组
+	//crawlerMap,n := make(map[string]map[string]interface{},0),0
+	//
+	//for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+	//	if n%10000==0 {
+	//		log.Println("当前n:",n)
+	//	}
+	//
+	//	if n>3000000 {
+	//		break
+	//	}
+	//
+	//	if tmp["spidercode"]!="" {
+	//		//判断是否有此类别分组
+	//		dict := make(map[string]interface{},0)
+	//		if crawlerMap[tmp["spidercode"].(string)]!= nil {
+	//			dict = crawlerMap[tmp["spidercode"].(string)]
+	//		}
+	//		jsonData := util.ObjToMap(tmp["jsondata"])
+	//
+	//		if jsonData!=nil {
+	//			for k,v :=range *jsonData  {
+	//				if fmt.Sprint(v) ==""{
+	//					//无效数据
+	//				}else {
+	//					if field_map[k]=="1" {
+	//						arr := dict[k]
+	//						if arr==nil {
+	//							dict[k] = make([]string,0)
+	//							dict[k] = append(dict[k].([]string),fmt.Sprint(v))
+	//						}else {
+	//							dict[k] = append(dict[k].([]string),fmt.Sprint(v))
+	//						}
+	//					}
+	//				}
+	//			}
+	//		}
+	//		if dict!=nil {
+	//			crawlerMap[tmp["spidercode"].(string)] = dict
+	//		}
+	//	}
+	//}
+	//
+	//log.Println("总计",n,"条数据")
+	//log.Println("爬虫类别个数:",len(crawlerMap))
+	//
+	//
+	////计算每个爬虫分类的总数-并添加
+	//
+	////ObjectId("5e0d4cdd0cf41612e063fc65")
+	//arr :=make([]map[string]interface{},0)
+	//for k,v :=range crawlerMap  {
+	//	total :=0
+	//	for _,v1 :=range v {
+	//		total =total + len(v1.([]string))
+	//	}
+	//	v["total"]= total
+	//	v["key"] = k
+	//	arr = append(arr,v)
+	//}
+	//
+	//
+	////爬虫类别下-有效字段总数排列 前100
+	//start := time.Now().Unix()
+	//quickSort(0,len(arr)-1,&arr)
+	//end :=time.Now().Unix()
+	//fmt.Println("耗时:",end-start,"秒")
+	//
+	//f :=xlsx.NewFile()
+	//sheet, _ := f.AddSheet("排序")
+	//
+	////第一行先写标题
+	//row1 := sheet.AddRow()
+	//row1.AddCell().Value = "排名"
+	//row1.AddCell().Value = "爬虫类"
+	//row1.AddCell().Value = "字段有效数"
+	//
+	//mapLock := &sync.Mutex{}
+	//limit :=0
+	//for _,v :=range arr  {
+	//	limit++
+	//	row := sheet.AddRow()
+	//	row.AddCell().SetInt(limit)
+	//	row.AddCell().SetString(v["key"].(string))
+	//	row.AddCell().SetInt(v["total"].(int))
+	//
+	//	if limit <=20 {
+	//		mapLock.Lock()
+	//		sheetName := "排名"+util.ObjToString(limit)+":"+util.ObjToString(v["key"])
+	//		sheet_detail, err := f.AddSheet(sheetName)
+	//		if err==nil {
+	//			row_num,col_num :=0,0
+	//			for k1,v1 := range v {
+	//				if a,ok :=v1.([]string);ok {
+	//					for k2, v2 := range a {
+	//						if k2==0 {
+	//							sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
+	//							row_num++
+	//							sheet_detail.Cell(row_num, col_num).Value = v2
+	//						}else {
+	//							if row_num>2000 {
+	//								continue
+	//							}
+	//							sheet_detail.Cell(row_num, col_num).Value = v2
+	//						}
+	//						row_num++
+	//					}
+	//					row_num = 0
+	//					col_num++
+	//				}
+	//			}
+	//		}
+	//
+	//		mapLock.Unlock()
+	//	}
+	//
+	//
+	//
+	//	if limit >99{
+	//		break
+	//	}
+	//}
+	//
+	//
+	//err := f.Save("zheng.xlsx")
+	//if err != nil {
+	//	log.Println("保存xlsx失败:", err)
+	//	return
+	//}
+	//log.Println("xlsx保存成功")
+//}
 
 
 func quickSort(left int,right int ,array *[]map[string]interface{}) {