Procházet zdrojové kódy

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

fengweiqiang před 5 roky
rodič
revize
ec1e31ee67

+ 2 - 2
fullproject/src_v1/config.json

@@ -2,11 +2,11 @@
     "loadStart": 0,
 	"validdays":150,
     "statusdays": 7,
-	"mongodbServers": "192.168.3.207:27082",
+	"mongodbServers": "192.168.3.207:27092",
     "mongodbPoolSize": 10,
     "mongodbName": "extract_kf",
 	"hints":"publishtime_1",
-    "extractColl": "jh_info",
+    "extractColl": "december",
     "projectColl": "jh_project",
     "backupFlag": true,
     "backupColl": "jh_project1",

+ 0 - 1
fullproject/src_v1/init.go

@@ -296,7 +296,6 @@ type ProjectInfo struct {
 	score         int
 	comStr        string
 	resVal, pjVal int
-	IdStatusInfo  map[string]map[string]interface{}
 }
 
 type Site struct {

+ 150 - 105
fullproject/src_v1/project.go

@@ -430,16 +430,6 @@ var FIELDS = []string{
 	"package",
 }
 
-var bidtype = map[string]string{
-	"招标": "招标",
-	"询价": "询价",
-	"竞谈": "竞谈",
-	"单一": "单一",
-	"竞价": "竞价",
-	"变更": "变更",
-	"邀标": "邀标",
-}
-
 var bidstatus = map[string]string{
 	"预告": "预告",
 	"中标": "中标",
@@ -492,34 +482,28 @@ func (p *ProjectTask) NewProject(tmp map[string]interface{}, thisinfo *Info) (st
 	}
 	//projecthref保存
 	if jsonData, ok := tmp["jsondata"].(map[string]interface{}); ok {
-		if jsonData != nil && jsonData["projecthref"] != "" {
+		if jsonData != nil && qu.ObjToString(jsonData["projecthref"]) != "" {
 			set["projecthref"] = jsonData["projecthref"]
 		}
 	}
 
 	//招标类型
+	bt := qu.ObjToString(tmp["toptype"])
+	set["bidtype"] = bt
+	bs, _ := tmp["subtype"].(string)
 	p.mapBidLock.Lock()
-	bt := bidtype[thisinfo.SubType]
-	p.mapBidLock.Unlock()
-	if bt == "" {
-		bt = "招标"
+	if bidstatus[bs] != "" {
+		set["bidstatus"] = thisinfo.SubType
+	} else if tmp["infoformat"] == 2 {
+		set["bidstatus"] = "拟建"
+	} else if tmp["subytpe"] == "招标" {
+		set["bidstatus"] = thisinfo.TopType
+	} else {
+		set["bidstatus"] = "其它"
 	}
-	set["bidtype"] = bt
-	set["bidstatus"] = thisinfo.SubType
+	p.mapBidLock.Unlock()
 
 	p1, pkg := p.NewCachePinfo(pId, thisinfo, bt)
-	//招标信息是中标或者成交,保存bidstatus、budget、bidamount
-	if thisinfo.SubType == "中标" || thisinfo.SubType == "成交" {
-		p1.IdStatusInfo = map[string]map[string]interface{}{
-			thisinfo.Id: {
-				"projectname": thisinfo.ProjectName,
-				"bidstatus": thisinfo.SubType,
-				"budget": thisinfo.Budget,
-				"bidamount": thisinfo.Bidamount,
-			},
-		}
-	}
-
 	if len(thisinfo.Subscopeclass) > 0 {
 		s_subscopeclass := strings.Join(thisinfo.Subscopeclass, ",")
 		set["s_subscopeclass"] = s_subscopeclass
@@ -646,29 +630,24 @@ func (p *ProjectTask) UpdateProject(tmp map[string]interface{}, thisinfo *Info,
 		}
 	}
 	//2--lasttime
-	if thisinfo.Publishtime > pInfo.LastTime {
-		pInfo.LastTime = thisinfo.Publishtime
-		set["lasttime"] = thisinfo.Publishtime
-		p.mapBidLock.Lock()
-		defer p.mapBidLock.Unlock()
-		bt := bidtype[thisinfo.SubType]
-		if bt != "" {
-			set["bidtype"] = bt
-		}
-		bs, _ := tmp["subtype"].(string)
-		if bidstatus[bs] != "" {
-			set["bidstatus"] = thisinfo.SubType
-			if bidstatus[bs] != "预告" && bidstatus[bs] != "合同" {
-				set["jgtime"] = tmp["publishtime"]
-			}
-		} else if tmp["infoformat"] == 2 {
-			set["bidstatus"] = "拟建"
-		} else if tmp["subytpe"] == "招标" {
-			set["bidstatus"] = thisinfo.TopType
-		} else {
-			set["bidstatus"] = "其它"
+	pInfo.LastTime = thisinfo.Publishtime
+	set["lasttime"] = thisinfo.Publishtime
+	set["bidtype"] = tmp["toptype"]
+	bs, _ := tmp["subtype"].(string)
+	p.mapBidLock.Lock()
+	if bidstatus[bs] != "" {
+		set["bidstatus"] = thisinfo.SubType
+		if bidstatus[bs] != "预告" && bidstatus[bs] != "合同" {
+			set["jgtime"] = tmp["publishtime"]
 		}
+	} else if tmp["infoformat"] == 2 {
+		set["bidstatus"] = "拟建"
+	} else if tmp["subytpe"] == "招标" {
+		set["bidstatus"] = thisinfo.TopType
+	} else {
+		set["bidstatus"] = "其它"
 	}
+	p.mapBidLock.Unlock()
 
 	//废标、流标   处理时间
 	if thisinfo.SubType == "流标" || thisinfo.SubType == "废标" {
@@ -849,7 +828,7 @@ func (p *ProjectTask) CompareStatus(project *ProjectInfo, info *Info) (bool, int
 			if (info.Publishtime - project.FirstTime) > p.statusTime {
 				return true, 0
 			} else {
-				return true, 0
+				return false, 0
 			}
 		} else if project.Bidstatus == "成交" && info.SubType == "中标" {
 			return true, 0
@@ -920,8 +899,6 @@ func packageEle(map1 map[string]interface{}, id string) map[string]interface{} {
 }
 
 func PackageFormat(info *Info, project *ProjectInfo) map[string]interface{} {
-	budget := 0
-	bidamount := 0
 	p1 := map[string]interface{}{}
 	if project != nil && project.Package != nil && len(project.Package) > 0 {
 		p1 = project.Package
@@ -932,15 +909,6 @@ func PackageFormat(info *Info, project *ProjectInfo) map[string]interface{} {
 				if v2["bidstatus"] == nil {
 					v2["bidstatus"] = info.SubType
 				}
-				if isCount(project, v2[""]) {
-					
-				}
-				if v2["budget"] != nil {
-					budget = budget + v2["budget"].(float64)
-				}
-				if v2["bidamount"] != nil {
-					bidamount = bidamount + v2["bidamount"].(float64)
-				}
 				addFlag := false
 				for k1, v3 := range p1 {
 					if v4, ok := v3.([]map[string]interface{}); ok {
@@ -965,72 +933,149 @@ func PackageFormat(info *Info, project *ProjectInfo) map[string]interface{} {
 			if p2["bidstatus"] == nil {
 				p2["bidstatus"] = info.SubType
 			}
-			if p2["budget"] != nil {
-				budget = p2["budget"].(float64)
-			}
-			if p2["bidamount"] != nil {
-				bidamount = p2["bidamount"].(float64)
-			}
 			p1[k] = []map[string]interface{}{p2}
 		}
 	}
-	info.Budget = budget
-	info.Bidamount = bidamount
 	return p1
 }
 
 //计算预算(budget)、中标金额(bidamount)
 func CountAmount(project *ProjectInfo, info *Info) {
-	//if project!= nil && project.Package != nil && len(project.Package) > 0 {
-	////暂时未考虑太多情况,简单处理
-	//}
-
-	if info.Budget > 0 {
-		//项目中第一条招标信息是否是包/段项目
-		key := titleGetPc.FindStringSubmatch(project.ProjectName)
-		if len(key) > 0 {
-			//判断项目中是否已经计算过该包/段的预算
-			if !isCount(project, info.ProjectName) {
-				project.Budget = project.Budget + info.Budget
+	if info.HasPackage {
+		budget := 0.0
+		for _, v := range project.Package{
+			v1, _ := v.([]map[string]interface{})
+			for _, v2 := range v1{
+				b1 := qu.Float64All(v2["budget"])
+				if b1 > 0 {
+					budget = budget + b1
+					break
+				}
+			}
+		}
+		project.Budget = budget
+	}else {
+		//招标没有多包
+		k := KeyPackage.FindStringSubmatch(project.ProjectName)
+		if len(k) > 0 {
+			//招标是单包
+			if len(project.Package) > 0 {
+				//项目有多包
+				flag := false
+				for _, v := range project.Package{
+					v1, _ := v.([]map[string]interface{})
+					if len(v1) > 0 && v1[0]["name"] == info.ProjectName {
+						flag = true
+					}
+				}
+				if !flag {
+					project.Budget = project.Budget + info.Budget
+				}
+			}else {
+				//项目没有多包
+				if info.Budget > 0 {
+					project.Budget = project.Budget + info.Budget
+				}
 			}
 		}else {
+			//招标不是单包
 			if project.Budget < info.Budget {
 				project.Budget = info.Budget
 			}
 		}
 	}
-	//中标、成交、合同     处理中标金额(bidamount)
 	if info.SubType == "中标" || info.SubType == "成交" || info.SubType == "合同" {
-		if info.Bidamount > 0 {
-			key := titleGetPc.FindStringSubmatch(project.ProjectName)
-			if len(key) > 0 {
-				if !isCount(project, info.ProjectName) {
-					project.Bidamount = project.Bidamount + info.Bidamount
+		if info.HasPackage {
+			bidamount := 0.0
+			for _, v := range project.Package{
+				v1, _ := v.([]map[string]interface{})
+				for _, v2 := range v1{
+					b1 := qu.Float64All(v2["bidamount"])
+					if b1 > 0 {
+						bidamount = bidamount + b1
+						break
+					}
+				}
+			}
+			project.Bidamount = bidamount
+		}else {
+			//招标没有多包
+			k := KeyPackage.FindStringSubmatch(project.ProjectName)
+			if len(k) > 0 {
+				//招标是单包
+				if len(project.Package) > 0 {
+					//项目有多包
+					flag := false
+					for _, v := range project.Package{
+						v1, _ := v.([]map[string]interface{})
+						if len(v1) > 0 {
+							flag = true
+						}
+					}
+					if !flag {
+						project.Bidamount = project.Bidamount + info.Bidamount
+					}
+				}else {
+					//项目没有多包
+					if info.Bidamount > 0 {
+						project.Bidamount = project.Bidamount + info.Bidamount
+					}
 				}
 			}else {
-				if project.Bidamount < project.Bidamount {
+				//招标不是单包
+				if project.Bidamount < info.Bidamount {
 					project.Bidamount = info.Bidamount
 				}
 			}
 		}
 	}
 
-	//保存信息到IdStatusInfo
-	project.IdStatusInfo[info.Id] = map[string]interface{}{
-		"projectname": info.ProjectName,
-		"bidstatus": info.SubType,
-		"budget": info.Budget,
-		"bidamount": info.Bidamount,
-	}
-}
 
-func isCount(project *ProjectInfo, infoName string) bool {
-	if project.IdStatusInfo != nil && len(project.IdStatusInfo) > 0 {
-		for _, v := range project.IdStatusInfo{
-			if v["projectname"] == infoName {
-				return true
-			}
-		}
-	}
-	return false
+	//if info.Budget > 0 {
+	//	//项目中第一条招标信息是否是包/段项目
+	//	key := KeyPackage.FindStringSubmatch(project.ProjectName)
+	//	if len(key) > 0 {
+	//		//判断项目中是否已经计算过该包/段的预算
+	//		if !isCount(project, info.ProjectName) {
+	//			project.Budget = project.Budget + info.Budget
+	//		}
+	//	}else {
+	//		if project.Budget < info.Budget {
+	//			project.Budget = info.Budget
+	//		}
+	//	}
+	//}
+	////中标、成交、合同     处理中标金额(bidamount)
+	//if info.SubType == "中标" || info.SubType == "成交" || info.SubType == "合同" {
+	//	if info.Bidamount > 0 {
+	//		key := KeyPackage.FindStringSubmatch(project.ProjectName)
+	//		if len(key) > 0 {
+	//			if !isCount(project, info.ProjectName) {
+	//				project.Bidamount = project.Bidamount + info.Bidamount
+	//			}
+	//		}else {
+	//			if project.Bidamount > info.Bidamount {
+	//				project.Bidamount = info.Bidamount
+	//			}
+	//		}
+	//	}
+	//}
+	//
+	////保存信息到IdStatusInfo
+	//if project.IdStatusInfo != nil {
+	//	project.IdStatusInfo[info.Id] = map[string]interface{}{
+	//		"projectname": info.ProjectName,
+	//		"bidstatus": info.SubType,
+	//		"budget": info.Budget,
+	//		"bidamount": info.Bidamount,
+	//	}
+	//}else {
+	//	project.IdStatusInfo = map[string]map[string]interface{}{}
+	//	project.IdStatusInfo[info.Id] = map[string]interface{}{
+	//		"projectname": info.ProjectName,
+	//		"bidstatus": info.SubType,
+	//		"budget": info.Budget,
+	//		"bidamount": info.Bidamount,
+	//	}
+	//}
 }

+ 4 - 1
fullproject/src_v1/task.go

@@ -480,7 +480,7 @@ func (p *ProjectTask) CommonMerge(tmp map[string]interface{}, info *Info) {
 					p.AllIdsMapLock.Lock()
 					comparePro := p.AllIdsMap[pid].P
 					p.AllIdsMapLock.Unlock()
-					_, ex := CompareStatus(comparePro, info)
+					_, ex := p.CompareStatus(comparePro, info)
 					p.UpdateProject(tmp, info, comparePro, -1, "AAAAAAAAAA", ex)
 				} else {
 					id, p1 := p.NewProject(tmp, info)
@@ -515,6 +515,9 @@ func ParseInfo(tmp map[string]interface{}) (info *Info) {
 	if len(thisinfo.Subscopeclass) == 0 {
 		thisinfo.Subscopeclass = []string{}
 	}
+	if thisinfo.SubType == "" {
+		thisinfo.SubType = util.ObjToString(tmp["bidstatus"])
+	}
 
 	if thisinfo.Publishtime == 0 {
 		thisinfo.Publishtime = thisinfo.Comeintime

+ 31 - 31
fullproject/src_v1/update.go

@@ -68,7 +68,7 @@ func (p *ProjectTask) mergeAndModify(pInfoId string, index int, info *Info, tmp
 				//更新其它的项目
 				pro := MongoTool.FindById(ProjectColl, mergePro.Id.Hex())
 				backupPro(pro, )
-				choose, ex := CompareStatus(mergePro, info)
+				choose, ex := p.CompareStatus(mergePro, info)
 				if !choose {
 					p.UpdateProject(tmp, info, mergePro, i, comStr, ex)
 				}else {
@@ -165,7 +165,7 @@ func (p *ProjectTask) updateMerge(index int, info *Info, pInfoId string, tmp map
 			ex := 0
 			resArr := []*ProjectInfo{}
 			for _, res := range resN{
-				choose, e := CompareStatus(resN[0], info)
+				choose, e := p.CompareStatus(resN[0], info)
 				if !choose {
 					ex = e
 					resArr = append(resArr, res)
@@ -267,28 +267,20 @@ func mergeProject(p *ProjectTask, pInfo *ProjectInfo, thisinfo *Info, set map[st
 		}
 	}
 	//2--lasttime
-	if thisinfo.Publishtime > pInfo.LastTime {
-		pInfo.LastTime = thisinfo.Publishtime
-		set["lasttime"] = thisinfo.Publishtime
-		p.mapBidLock.Lock()
-		bt := bidtype[thisinfo.SubType]
-		p.mapBidLock.Unlock()
-		if bt != "" {
-			set["bidtype"] = bt
-		}
-		if thisinfo.SubType != "" {
-			set["bidstatus"] = thisinfo.SubType
-			if thisinfo.SubType != "预告" {
-				set["jgtime"] = thisinfo.Publishtime
-			}
-		}else if thisinfo.Infoformat == 2 {
-			set["bidstatus"] = "拟建"
-		}else if thisinfo.SubType == "招标" {
-			set["bidstatus"] = thisinfo.TopType
-		}else {
-			set["bidstatus"] = thisinfo.SubType
+	pInfo.LastTime = thisinfo.Publishtime
+	set["lasttime"] = thisinfo.Publishtime
+	set["bidtype"] = thisinfo.SubType
+	if thisinfo.SubType != "" {
+		set["bidstatus"] = thisinfo.SubType
+		if thisinfo.SubType != "预告" {
+			set["jgtime"] = thisinfo.Publishtime
 		}
-
+	}else if thisinfo.Infoformat == 2 {
+		set["bidstatus"] = "拟建"
+	}else if thisinfo.SubType == "招标" {
+		set["bidstatus"] = thisinfo.TopType
+	}else {
+		set["bidstatus"] = thisinfo.SubType
 	}
 
 	//3\4\5--省、市、县
@@ -353,14 +345,17 @@ func mergeProject(p *ProjectTask, pInfo *ProjectInfo, thisinfo *Info, set map[st
 		pInfo.Bidopentime = thisinfo.Bidopentime
 		set["bidopentime"] = pInfo.Bidopentime
 	}
-	if thisinfo.Bidamount > 0 && pInfo.Bidamount < 1 {
-		pInfo.Bidamount = thisinfo.Bidamount
-		set["bidamount"] = pInfo.Bidamount
-	}
 
-	if thisinfo.Budget > 0 && pInfo.Budget < 1 {
-		pInfo.Budget = thisinfo.Budget
-		set["budget"] = pInfo.Budget
+	//废标、流标   处理时间
+	if thisinfo.SubType == "流标" || thisinfo.SubType == "废标" {
+		pInfo.FirstTime = thisinfo.Publishtime
+		pInfo.Bidopentime = int64(0)
+		pInfo.LastTime = thisinfo.Publishtime
+
+		set["firsttime"] = thisinfo.Publishtime
+		set["zbtime"] = int64(0)
+		set["publishtime"] = thisinfo.Publishtime
+		set["bidopentime"] = int64(0)
 	}
 
 	if len(thisinfo.Topscopeclass) > 0 {
@@ -399,12 +394,17 @@ func mergeProject(p *ProjectTask, pInfo *ProjectInfo, thisinfo *Info, set map[st
 	}
 
 	if thisinfo.HasPackage {
-		pkg, _, _ := PackageFormat(thisinfo, pInfo)
+		pkg := PackageFormat(thisinfo, pInfo)
 		set["multipackage"] = 1
 		pInfo.Package = pkg
 	}else {
 		set["multipackage"] = 0
 	}
+	//处理多包后,计算预算金额、中标金额
+	CountAmount(pInfo, thisinfo)
+	set["budget"] = pInfo.Budget
+	set["bidamount"] = pInfo.Bidamount
+
 
 	set["mpn"] = pInfo.MPN
 	set["mpc"] = pInfo.MPC

+ 3 - 3
src/config.json

@@ -2,19 +2,19 @@
     "port": "9090",
     "mgodb": "192.168.3.207:27092",
     "dbsize": 10,
-    "dbname": "extract_kf",
+    "dbname": "extract_dev32",
     "redis": "buyer=192.168.3.207:1679,winner=192.168.3.207:1679,agency=192.168.3.207:1679",
     "elasticsearch": "http://192.168.3.11:9800",
     "elasticPoolSize": 30,
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
-    "saveresult": true,
+    "saveresult": false,
     "qualityaudit": false,
     "saveblock": false,
     "filelength": 100000,
     "iscltlog": false,
     "brandgoods": false,
-    "udptaskid": "5cdd3025698414032c8322b1",
+    "udptaskid": "5e103206234ddc34b406c5d1",
     "udpport": "1484",
     "nextNode": [
         {

+ 9 - 9
src/jy/extract/extract.go

@@ -26,13 +26,13 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut     = ju.NewCut()                          //获取正文并清理
-	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask          //任务列表
-	ClearTaskList map[string]*ClearTask            //清理任务列表
-	saveLimit     = 100                            //抽取日志批量保存
-	PageSize      = 5000                           //查询分页
-	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
+	cut           = ju.NewCut()                          //获取正文并清理
+	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask                //任务列表
+	ClearTaskList map[string]*ClearTask                  //清理任务列表
+	saveLimit     = 100                                  //抽取日志批量保存
+	PageSize      = 5000                                 //查询分页
+	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
 
@@ -267,7 +267,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
 	if err == nil {
 		conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " "))
-		if conlen < 50 {
+		if conlen < 200 {
 			if isextFile {
 				detail += qu.ObjToString(doc["detailfile"])
 				doc["detail"] = detail
@@ -1871,7 +1871,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 { //reids未找到,执行规则匹配
+	if i == 0 {                            //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 5 - 0
src/main.go

@@ -9,6 +9,8 @@ import (
 	_ "jy/front"
 	. "jy/router"
 	"jy/util"
+	"net/http"
+	_ "net/http/pprof"
 	qu "qfw/util"
 	//"qfw/util/elastic"
 	"qfw/util/redis"
@@ -42,6 +44,9 @@ func main() {
 	go extract.Export()
 	go Router.Run(":" + qu.ObjToString(util.Config["port"]))
 	go log.Debug("启动..", qu.ObjToString(util.Config["port"]))
+	go func() {
+		http.ListenAndServe("localhost:10000", nil)
+	}()
 	lock := make(chan bool)
 	<-lock
 }

+ 10 - 26
udpfilterdup/src/config.json

@@ -3,39 +3,23 @@
     "dupdays": 5,
     "mongodb": {
         "addr": "192.168.3.207:27092",
-        "pool": 15,
-        "db": "zhaolongyue",
-        "extract": "kedaxunfei_zhengfa_gnq",
-        "extract_copy": "a_testbidding",
-        "bidding": "bidding_126"
+        "pool": 5,
+        "db": "data_Xinxihua",
+        "extract": "20200103_fupin_data",
+        "site": {
+            "dbname": "zhaolongyue",
+            "coll": "site"
+        }
     },
     "jkmail": {
-        "to": "renzheng@topnet.net.cn",
+        "to": "zhangjinkun@topnet.net.cn",
         "api": "http://10.171.112.160:19281/_send/_mail"
     },
-    "nextNode": [
-        {
-            "addr": "127.0.0.11",
-            "port": 1482,
-            "stype": "project",
-            "memo": "合并项目"
-        },
-        {
-            "addr": "127.0.0.1",
-            "port": 1483,
-            "stype": "bidding",
-            "memo": "创建招标数据索引"
-        }
-    ],
-    "isMerger":false,
+    "nextNode": [],
+    "isMerger": false,
     "specialwords": "(重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研)",
     "specialtitle_1": "[0-9a-zA-Z一二三四五六七八九十零123456789](次|包|标段|标包)",
     "specialtitle_2": "项目([0-9a-zA-Z一二三四五六七八九十零123456789])",
-
-
     "beifen": "[((]?[0-9一二三四五六七八九十零123456789再][))]?[子分]?[次批标包]|重招|重新招标|勘察|设计|施工|监理|总承包|土石方|可研"
-
-
-
 }
 

+ 1 - 11
udpfilterdup/src/datamap.go

@@ -41,8 +41,6 @@ type Info struct {
 
 var datelimit = float64(432000) //五天
 var sitelock sync.Mutex         //锁
-var reason = ""
-
 
 //判重数据
 type datamap struct {
@@ -228,8 +226,7 @@ func NewInfo(tmp map[string]interface{}) *Info {
 }
 
 //判重方法
-func (d *datamap) check(info *Info) (b bool, source *Info, reasons string) {
-	reason = ""
+func (d *datamap) check(info *Info) (b bool, source *Info, reason string) {
 	keys := []string{}
 	//不同时间段
 	d.lock.Lock()
@@ -280,7 +277,6 @@ L:
 							reason = "href相同"
 							b = true
 							source = v
-							reasons = reason
 							break L
 						}
 						if info.href != "" && info.href != v.href {
@@ -304,7 +300,6 @@ L:
 							reason = "标题关键词且包含关系"
 							b = true
 							source = v
-							reasons = reason
 							break L
 						}
 					}
@@ -315,7 +310,6 @@ L:
 						if quickHeavyMethodTwo(v, info, reason) {
 							b = true
 							source = v
-							reasons = reason
 							break
 						}
 					} else {
@@ -325,7 +319,6 @@ L:
 							if quickHeavyMethodTwo(v, info, reason) {
 								b = true
 								source = v
-								reasons = reason
 								break
 							}
 						} else {
@@ -333,7 +326,6 @@ L:
 							if quickHeavyMethodOne(v, info, reason) {
 								b = true
 								source = v
-								reasons = reason
 								break
 							}
 						}
@@ -368,8 +360,6 @@ L:
 }
 
 func (h *historymap) checkHistory(info *Info) (b bool, source *Info, reasons string) {
-	reason = ""
-
 	h.lock.Lock()
 	defer h.lock.Unlock()
 	keys := []string{}

+ 48 - 32
udpfilterdup/src/main.go

@@ -11,6 +11,7 @@ import (
 	"log"
 	mu "mfw/util"
 	"net"
+	"os"
 	"qfw/util"
 	"qfw/util/mongodb"
 	"regexp"
@@ -23,15 +24,13 @@ var (
 	mconf     map[string]interface{} //mongodb配置信息
 	mgo       *mongodb.MongodbSim    //mongodb操作对象
 	//siteMgo      *mongodb.MongodbSim
-	extract      string
-	extract_copy string
-	bidding      string
-	udpclient    mu.UdpClient             //udp对象
-	nextNode     []map[string]interface{} //下节点数组
-	dupdays      = 5                      //初始化判重范围
-	DM           *datamap                 //
-	HM           *historymap              //判重数据
-	lastid       = ""
+	extract   string
+	udpclient mu.UdpClient             //udp对象
+	nextNode  []map[string]interface{} //下节点数组
+	dupdays   = 5                      //初始化判重范围
+	DM        *datamap                 //
+	HM        *historymap              //判重数据
+	lastid    = ""
 	/*
 		5da3f2c5a5cb26b9b79847fc
 	*/
@@ -42,10 +41,15 @@ var (
 
 	isMerger bool                              //是否合并
 	SiteMap  map[string]map[string]interface{} //站点map
+
+	idtype, sid, eid string //测试人员判重使用
 )
 
 func init() {
 	flag.StringVar(&lastid, "id", "", "最后加载id") //以小于等于此id开始加载最近几天的数据
+	flag.StringVar(&sid, "sid", "", "开始id")
+	flag.StringVar(&eid, "eid", "", "结束id")
+	flag.StringVar(&idtype, "idtype", "", "id类型,默认ObjectId:0,String:1")
 	flag.Parse()
 	//172.17.145.163:27080
 	util.ReadConfig(&Sysconfig)
@@ -57,7 +61,6 @@ func init() {
 		Size:        util.IntAllDef(mconf["pool"], 10),
 	}
 	extract = mconf["extract"].(string)
-	extract_copy = mconf["extract_copy"].(string)
 	mgo.InitPool()
 
 	//测试可以临时注释
@@ -69,13 +72,13 @@ func init() {
 	FilterRegTitle_2 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_2"]))
 	isMerger = Sysconfig["isMerger"].(bool)
 
-	//配置站点Map
+	//站点配置
+	site := mconf["site"].(map[string]interface{})
 	SiteMap = make(map[string]map[string]interface{}, 0)
 	start := int(time.Now().Unix())
-	//站点配置
 	sess_site := mgo.GetMgoConn()
 	defer sess_site.Close()
-	res_site := sess_site.DB("zhaolongyue").C("site").Find(nil).Sort("_id").Iter()
+	res_site := sess_site.DB(site["dbname"].(string)).C(site["coll"].(string)).Find(nil).Sort("_id").Iter()
 	for site_dict := make(map[string]interface{}); res_site.Next(&site_dict); {
 		data_map := map[string]interface{}{
 			"area":     util.ObjToString(site_dict["area"]),
@@ -91,9 +94,7 @@ func init() {
 }
 
 func main() {
-
 	go checkMapJob()
-
 	updport := Sysconfig["udpport"].(string)
 	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}
 	udpclient.Listen(processUdpMsg)
@@ -102,6 +103,21 @@ func main() {
 	time.Sleep(99999 * time.Hour)
 }
 
+//测试组人员使用
+func mainT() {
+	//568551000000000000000000,5e0f65000000000000000000
+	mapinfo := map[string]interface{}{}
+	if sid == "" || eid == "" {
+		log.Println("sid,eid参数不能为空")
+		os.Exit(0)
+	}
+	mapinfo["gtid"] = sid
+	mapinfo["lteid"] = eid
+	mapinfo["stop"] = "true"
+	task([]byte{}, mapinfo)
+	time.Sleep(5 * time.Second)
+}
+
 func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 
 	fmt.Println("接受的段数据")
@@ -150,13 +166,22 @@ func task(data []byte, mapInfo map[string]interface{}) {
 	//区间id
 	sess := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess)
-	q := map[string]interface{}{
-		"_id": map[string]interface{}{
-			"$gt":  util.StringTOBsonId(mapInfo["gtid"].(string)),
-			"$lte": util.StringTOBsonId(mapInfo["lteid"].(string)),
-		},
+	var q map[string]interface{}
+	if idtype == "1" {
+		q = map[string]interface{}{
+			"_id": map[string]interface{}{
+				"$gt":  mapInfo["gtid"].(string),
+				"$lte": mapInfo["lteid"].(string),
+			},
+		}
+	} else {
+		q = map[string]interface{}{
+			"_id": map[string]interface{}{
+				"$gt":  util.StringTOBsonId(mapInfo["gtid"].(string)),
+				"$lte": util.StringTOBsonId(mapInfo["lteid"].(string)),
+			},
+		}
 	}
-
 	it := sess.DB(mgo.DbName).C(extract).Find(&q).Sort("publishtime").Iter()
 	updateExtract := [][]map[string]interface{}{}
 	pool := make(chan bool, 16)
@@ -552,8 +577,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 		source.area = info.area
 		source.city = info.city
 		mergeArr = append(mergeArr, 1)
-
-		//fmt.Println("合并-城市")
 	}
 	//2、项目名称
 	if source.projectname == "" && info.projectname != "" {
@@ -568,7 +591,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.projectname = info.projectname
 		mergeArr = append(mergeArr, 2)
-		//fmt.Println("合并-项目名称")
 	}
 	//3、项目编号
 	if source.projectcode == "" && info.projectcode != "" {
@@ -583,7 +605,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.projectcode = info.projectcode
 		mergeArr = append(mergeArr, 3)
-		//fmt.Println("合并-项目标号")
 	}
 	//4、采购单位
 	if source.buyer == "" && info.buyer != "" {
@@ -598,7 +619,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.buyer = info.buyer
 		mergeArr = append(mergeArr, 4)
-		//fmt.Println("合并-采购单位")
 	}
 	//5、预算
 	if source.budget == 0 && info.budget != 0 {
@@ -613,7 +633,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.budget = info.budget
 		mergeArr = append(mergeArr, 5)
-		//fmt.Println("合并-预算")
 	}
 	//6、中标单位
 	if source.winner == "" && info.winner != "" {
@@ -628,7 +647,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.winner = info.winner
 		mergeArr = append(mergeArr, 6)
-		//fmt.Println("合并-中标单位")
 	}
 	//7、中标金额
 	if source.bidamount == 0 && info.bidamount != 0 {
@@ -643,7 +661,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.bidamount = info.bidamount
 		mergeArr = append(mergeArr, 7)
-		//fmt.Println("合并-中标金额")
 	}
 	//8、开天时间-地点
 	if source.bidopentime == 0 && info.bidopentime != 0 {
@@ -658,7 +675,6 @@ func mergeDataFields(source *Info, info *Info) (*Info, []int64) {
 
 		source.bidopentime = info.bidopentime
 		mergeArr = append(mergeArr, 8)
-		//fmt.Println("合并-开标时间")
 	}
 
 	//以上合并过于简单,待进一步优化
@@ -817,10 +833,10 @@ func basicDataScore(v *Info, info *Info) bool {
 		n++
 	}
 	if info.agency != "" {
-		n = m + 2
+		n = n + 2
 	}
 	if info.city != "" {
-		n = m + 2
+		n = n + 2
 	}
 
 	if m > n {

+ 93 - 50
udpprojectset/src/heavy_test.go

@@ -158,8 +158,8 @@ func Test_heavy(t *testing.T) {
 func Test_field(t *testing.T) {
 
 	mgo = &mongodb.MongodbSim{
-		MongodbAddr: "192.168.3.207:27092",
-		DbName:      "extract_kf",
+		MongodbAddr: "192.168.3.207:27081",
+		DbName:      "qfw",
 		Size:        util.IntAllDef(15, 10),
 	}
 	mgo.InitPool()
@@ -172,39 +172,76 @@ func Test_field(t *testing.T) {
 	//now := int64(time.Now().Unix())
 	//date_time := int64(86400*2)
 
-	field_map := make(map[string]string,0)
-	sess_field := mgo.GetMgoConn()
-	defer sess_field.Close()
-	res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
-	for dict := make(map[string]interface{}); res_field.Next(&dict); {
-		field_map[dict["s_field"].(string)] = "1"
-	}
+	//field_map := make(map[string]string,0)
+	//sess_field := mgo.GetMgoConn()
+	//defer sess_field.Close()
+	//res_field := sess_field.DB("extract_kf").C("fields").Find(nil).Sort("_id").Iter()
+	//for dict := make(map[string]interface{}); res_field.Next(&dict); {
+	//	field_map[dict["s_field"].(string)] = "1"
+	//}
 
 	//固定死的需要分析的字段
+	field_map := map[string]string{
+		"title":"1",
+		"area":"1",
+		"city":"1",
+		"subtype":"1",
+		"buyer":"1",
+		"agency":"1",
+		"winner":"1",
+		"budget":"1",
+		"bidamount":"1",
+		"projectname":"1",
+		"projectcode":"1",
+		"publishtime":"1",
+		"comeintime":"1",
+		"bidopentime":"1",
+		"agencyaddr":"1",
+		"site":"1",
+		"href":"1",
+	}
+
+	/*	ObjectId("5da3f2c5a5cb26b9b79847fc") 0
+		ObjectId("5da3fd6da5cb26b9b7a8683c") 5000
+		ObjectId("5da40bdaa5cb26b9b7bea472") 10000
+		ObjectId("5da44deaa5cb26b9b75efb38") 50000
+		ObjectId("5da53440a5cb26b9b7d3f9aa") 100000
+		ObjectId("5db2735ba5cb26b9b7c99c6f") 761414
+	*/
 
+	/*
+	qfw-bidding
 
+	ObjectId("5e0d4cdd0cf41612e063fc65")  -1
+	ObjectId("5df8bfe4e9d1f601e4e87431") 一百万
+	ObjectId("5dea080ce9d1f601e45cb838") 二百万
 
 
-	/*	ObjectId("5da3f2c5a5cb26b9b79847fc")
-		ObjectId("5da3fd6da5cb26b9b7a8683c")
-		ObjectId("5da40bdaa5cb26b9b7bea472")
 	*/
 	sess := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess)
-	q := map[string]interface{}{
-		"_id": map[string]interface{}{
-			"$gt":  util.StringTOBsonId("5da3f2c5a5cb26b9b79847fc"),
-			"$lte": util.StringTOBsonId("5da3fd6da5cb26b9b7a8683c"),
-		},
-	}
-	it := sess.DB(mgo.DbName).C("a_testbidding").Find(&q).Sort("_id").Iter()
+	//q := map[string]interface{}{
+	//	"_id": map[string]interface{}{
+	//		"$gt":  util.StringTOBsonId("5dea080ce9d1f601e45cb838"),
+	//		"$lte": util.StringTOBsonId("5e0d4cdd0cf41612e063fc65"),
+	//	},
+	//}
+	it := sess.DB(mgo.DbName).C("bidding").Find(nil).Sort("-_id").Iter()
 
 	//爬虫组
 	crawlerMap,n := make(map[string]map[string]interface{},0),0
 
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
+		if n%10000==0 {
+			log.Println("当前n:",n)
+		}
+
+		if n>3000000 {
+			break
+		}
+
 		if tmp["spidercode"]!="" {
-			//判断是否有次类别分组
+			//判断是否有类别分组
 			dict := make(map[string]interface{},0)
 			if crawlerMap[tmp["spidercode"].(string)]!= nil {
 				dict = crawlerMap[tmp["spidercode"].(string)]
@@ -213,18 +250,17 @@ func Test_field(t *testing.T) {
 
 			if jsonData!=nil {
 				for k,v :=range *jsonData  {
-					if fmt.Sprint(v) =="" {
+					if fmt.Sprint(v) ==""{
 						//无效数据
 					}else {
-						arr := dict[k]
-						if arr==nil {
-							dict[k] = make([]string,0)
-							dict[k] = append(dict[k].([]string),fmt.Sprint(v))
-						}else {
-							//if a,ok :=arr.([]string);ok{
-							//	a = append(a,fmt.Sprint(v))
-							//}
-							dict[k] = append(dict[k].([]string),fmt.Sprint(v))
+						if field_map[k]=="1" {
+							arr := dict[k]
+							if arr==nil {
+								dict[k] = make([]string,0)
+								dict[k] = append(dict[k].([]string),fmt.Sprint(v))
+							}else {
+								dict[k] = append(dict[k].([]string),fmt.Sprint(v))
+							}
 						}
 					}
 				}
@@ -236,12 +272,12 @@ func Test_field(t *testing.T) {
 	}
 
 	log.Println("总计",n,"条数据")
-	log.Println("判重类别个数:",len(crawlerMap))
+	log.Println("爬虫类别个数:",len(crawlerMap))
 
 
 	//计算每个爬虫分类的总数-并添加
 
-	//
+	//ObjectId("5e0d4cdd0cf41612e063fc65")
 	arr :=make([]map[string]interface{},0)
 	for k,v :=range crawlerMap  {
 		total :=0
@@ -278,32 +314,39 @@ func Test_field(t *testing.T) {
 		row.AddCell().SetString(v["key"].(string))
 		row.AddCell().SetInt(v["total"].(int))
 
-		mapLock.Lock()
-		sheetName := "排名:"+util.ObjToString(v["key"])
-		sheet_detail, err := f.AddSheet(sheetName)
-		if err==nil {
-			row_num,col_num :=0,0
-			for k1,v1 := range v {
-				if a,ok :=v1.([]string);ok {
-					for k2, v2 := range a {
-						if k2==0 {
-							sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
+		if limit <=20 {
+			mapLock.Lock()
+			sheetName := "排名"+util.ObjToString(limit)+":"+util.ObjToString(v["key"])
+			sheet_detail, err := f.AddSheet(sheetName)
+			if err==nil {
+				row_num,col_num :=0,0
+				for k1,v1 := range v {
+					if a,ok :=v1.([]string);ok {
+						for k2, v2 := range a {
+							if k2==0 {
+								sheet_detail.Cell(row_num, col_num).Value = util.ObjToString(k1)
+								row_num++
+								sheet_detail.Cell(row_num, col_num).Value = v2
+							}else {
+								if row_num>2000 {
+									continue
+								}
+								sheet_detail.Cell(row_num, col_num).Value = v2
+							}
 							row_num++
-							sheet_detail.Cell(row_num, col_num).Value = v2
-						}else {
-							sheet_detail.Cell(row_num, col_num).Value = v2
 						}
-						row_num++
+						row_num = 0
+						col_num++
 					}
-					row_num = 0
-					col_num++
 				}
 			}
+
+			mapLock.Unlock()
 		}
 
-		mapLock.Unlock()
 
-		if limit >10{
+
+		if limit >99{
 			break
 		}
 	}

binární
udpprojectset/src/zheng.xlsx