zhengkun 3 rokov pred
rodič
commit
11d827de3d

+ 4 - 0
src/config.json

@@ -23,6 +23,10 @@
         "api": "http://172.17.145.179:19281/_send/_mail"
     },
     "nextNode": [
+
+    ],
+    "jyfb_data": [
+        "a_jyxxfbpt_gg"
     ],
     "userName": "",
     "password": "",

+ 10 - 0
src/dataMethodHeavy.go

@@ -453,6 +453,15 @@ func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
 				reason = reason + "---低质量-有且一个要素组合"
 				return true, reason
 			}
+		}else if isValue==2{
+			if info.subtype == "采购意向" { //特殊
+				if info.projectname!="" && info.projectname == v.projectname &&
+					info.budget != 0 && info.budget == v.budget &&
+					info.city != "" && info.city == v.city{
+					reason = reason + "---采购意向~同城~预算~名称均一致"
+					return true,reason
+				}
+			}
 		}else {
 
 		}
@@ -461,6 +470,7 @@ func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
 }
 
 
+
 //类别细节原因记录
 func judgeLowQualityData(v *Info, info *Info, reason string) (bool, string) {
 	if info.projectname!="" && info.projectname == v.projectname{//项目名称

+ 0 - 5
src/dataMethodMerge.go

@@ -115,11 +115,6 @@ func mergeDataFields(source *Info, info *Info) (*Info,map[string]interface{} ,bo
 }
 
 
-
-
-
-
-
 //合并字段-并更新merge字段的值-
 func mergeDataFieldsArr(source *Info, info *Info) (*Info, []int64, bool) {
 

+ 10 - 12
src/datamap.go

@@ -14,7 +14,7 @@ import (
 type Info struct {
 	id    string //id
 	title string //标题
-
+	spidercode		string //爬虫代码
 	area           string  //省份
 	city           string  //城市
 	subtype        string  //信息类型
@@ -212,6 +212,7 @@ func NewInfo(tmp map[string]interface{}) *Info {
 	info.title = qutil.ObjToString(tmp["title"])
 	info.area = area
 	info.subtype = subtype
+	info.spidercode = qutil.ObjToString(tmp["spidercode"])
 	info.buyer = qutil.ObjToString(tmp["buyer"])
 	info.projectname = qutil.ObjToString(tmp["projectname"])
 	info.contractnumber = qutil.ObjToString(tmp["contractnumber"])
@@ -299,6 +300,14 @@ L:
 					}
 				}
 
+				//前置条件-五要素均相等
+				if leadingElementSame(v,info) {
+					reason = "五要素-相同-满足"
+					b = true
+					source = v
+					reasons = reason
+					break L
+				}
 
 				//前置条件 - 站点相关
 				if info.site != "" && info.site == v.site {
@@ -374,17 +383,6 @@ L:
 				}
 
 
-				//前置条件-五要素均相等
-				if leadingElementSame(v,info) {
-					reason = "五要素-相同-满足"
-					b = true
-					source = v
-					reasons = reason
-					break L
-				}
-
-
-
 				//新增快速数据过少判重
 				if LowHeavy {
 					repeat := false

+ 4 - 2
src/historyRepeat.go

@@ -57,8 +57,10 @@ func historyRepeat() {
 			time.Sleep(5 * time.Minute)
 			continue
 		}
-
 		log.Println("查询完毕-找到有标记的lteid-先睡眠5分钟",gtid,lteid)
+		if isUpdateSite{
+			initSite()
+		}
 		time.Sleep(5 * time.Minute)
 
 		sess := data_mgo.GetMgoConn()//连接器
@@ -172,7 +174,7 @@ func historyRepeat() {
 
 						updatelock.Lock()
 						//替换数据池-更新
-						DM.replacePoolData(source)
+						curTM.replacePoolData(source)
 						//更新数据源
 						//判断是否在当前段落
 						if judgeIsCurIds(gtid,lteid,source.id) {

+ 42 - 27
src/increaseRepeat.go

@@ -30,7 +30,7 @@ func increaseRepeat(mapInfo map[string]interface{}) {
 	dataAllDict := make(map[string][]map[string]interface{},0)
 	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
 		if total%1000 == 0 {
-			log.Println("index: ", total, isok)
+			log.Println("current index : ", total, isok)
 		}
 		if util.IntAll(tmp["repeat"]) == 1 {
 			repeatN++
@@ -60,6 +60,7 @@ func increaseRepeat(mapInfo map[string]interface{}) {
 	pool := make(chan bool, threadNum)
 	wg := &sync.WaitGroup{}
 	for _,dataArr := range dataAllDict {
+		log.Println("处理中...","当前重复量~", repeatN)
 		pool <- true
 		wg.Add(1)
 		go func(dataArr []map[string]interface{}) {
@@ -72,36 +73,50 @@ func increaseRepeat(mapInfo map[string]interface{}) {
 				info := NewInfo(tmp)
 				b,source,reason := DM.check(info)
 				if b {
-					num++
-					var updateID = map[string]interface{}{} //记录更新判重的
-					updateID["_id"] = StringTOBsonId(info.id)
-					repeat_ids:=source.repeat_ids
-					repeat_ids =  append(repeat_ids,info.id)
-					source.repeat_ids = repeat_ids
-					DM.replacePoolData(source)//替换数据池-更新
+					//判断信息是否为-指定剑鱼发布数据
+					if jyfb_data[info.spidercode]!="" { //伪判重标记
+						Update.updatePool <- []map[string]interface{}{//原始数据打标签
+							map[string]interface{}{
+								"_id": StringTOBsonId(info.id),
+							},
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"repeat_jyfb": 1,
+								},
+							},
+						}
+					} else { //真实重复~~~
+						num++
+						var updateID = map[string]interface{}{} //记录更新判重的
+						updateID["_id"] = StringTOBsonId(info.id)
+						repeat_ids:=source.repeat_ids
+						repeat_ids =  append(repeat_ids,info.id)
+						source.repeat_ids = repeat_ids
+						DM.replacePoolData(source)//替换数据池-更新
 
-					Update.updatePool <- []map[string]interface{}{//原始数据打标签
-						map[string]interface{}{
-							"_id": StringTOBsonId(source.id),
-						},
-						map[string]interface{}{
-							"$set": map[string]interface{}{
-								"repeat_ids": repeat_ids,
+						Update.updatePool <- []map[string]interface{}{//原始数据打标签
+							map[string]interface{}{
+								"_id": StringTOBsonId(source.id),
 							},
-						},
-					}
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"repeat_ids": repeat_ids,
+								},
+							},
+						}
 
-					Update.updatePool <- []map[string]interface{}{//重复数据打标签
-						updateID,
-						map[string]interface{}{
-							"$set": map[string]interface{}{
-								"repeat":        1,
-								"repeat_reason": reason,
-								"repeat_id":     source.id,
-								"dataging":		 0,
-								"updatetime_repeat" :util.Int64All(time.Now().Unix()),
+						Update.updatePool <- []map[string]interface{}{//重复数据打标签
+							updateID,
+							map[string]interface{}{
+								"$set": map[string]interface{}{
+									"repeat":        1,
+									"repeat_reason": reason,
+									"repeat_id":     source.id,
+									"dataging":		 0,
+									"updatetime_repeat" :util.Int64All(time.Now().Unix()),
+								},
 							},
-						},
+						}
 					}
 				}
 			}

+ 47 - 23
src/main.go

@@ -7,10 +7,10 @@ package main
 import (
 	"encoding/json"
 	"flag"
+	"github.com/cron"
 	"log"
 	mu "mfw/util"
 	"net"
-	"qfw/common/src/qfw/util"
 	qu "qfw/util"
 	"regexp"
 	"sync"
@@ -47,7 +47,9 @@ var (
 	updatelock 		sync.Mutex         				 //锁4
 	numberlock 		sync.Mutex         				 //锁4
 	userName,passWord 	string						 //mongo -用户密码
+	jyfb_data		map[string]string		 		//任务池
 	taskList		[]map[string]interface{}		 //任务池
+	isUpdateSite	bool
 )
 
 func init() {
@@ -65,46 +67,64 @@ func init() {
 
 	log.Println("集群用户密码:",userName,passWord)
 
+	jyfb_arr := qu.ObjArrToStringArr(Sysconfig["jyfb_data"].([]interface{}))
+	jyfb_data = make(map[string]string,0)
+	for _,v := range jyfb_arr{
+		jyfb_data[v] = v
+	}
+
+	log.Println("伪判重~",jyfb_data)
+
 	task_mconf := Sysconfig["task_mongodb"].(map[string]interface{})
 	task_mgo = &MongodbSim{
 		MongodbAddr: task_mconf["task_addrName"].(string),
 		DbName:      task_mconf["task_dbName"].(string),
-		Size:        util.IntAllDef(task_mconf["task_pool"], 10),
+		Size:        qu.IntAllDef(task_mconf["task_pool"], 10),
 		UserName:	 userName,
 		Password:	 passWord,
 	}
 	task_mgo.InitPool()
 	task_collName = task_mconf["task_collName"].(string)
 
-	nextNode = util.ObjArrToMapArr(Sysconfig["nextNode"].([]interface{}))
+	nextNode = qu.ObjArrToMapArr(Sysconfig["nextNode"].([]interface{}))
 	mconf = Sysconfig["mongodb"].(map[string]interface{})
 	data_mgo = &MongodbSim{
 		MongodbAddr: mconf["addr"].(string),
 		DbName:      mconf["db"].(string),
-		Size:        util.IntAllDef(mconf["pool"], 10),
+		Size:        qu.IntAllDef(mconf["pool"], 10),
 	}
 	data_mgo.InitPool()
 
 	extract = mconf["extract"].(string)
 	extract_back = mconf["extract_back"].(string)
 
-	dupdays = util.IntAllDef(Sysconfig["dupdays"], 3)
+	dupdays = qu.IntAllDef(Sysconfig["dupdays"], 3)
 	//加载数据
 	DM = NewDatamap(dupdays, lastid)
 	//更新池
 	Update = newUpdatePool()
 	go Update.updateData()
 
-	FilterRegTitle = regexp.MustCompile(util.ObjToString(Sysconfig["specialwords"]))
-	FilterRegTitle_0 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_0"]))
-	FilterRegTitle_1 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_1"]))
-	FilterRegTitle_2 = regexp.MustCompile(util.ObjToString(Sysconfig["specialtitle_2"]))
-	threadNum = util.IntAllDef(Sysconfig["threads"], 1)
+	FilterRegTitle = regexp.MustCompile(qu.ObjToString(Sysconfig["specialwords"]))
+	FilterRegTitle_0 = regexp.MustCompile(qu.ObjToString(Sysconfig["specialtitle_0"]))
+	FilterRegTitle_1 = regexp.MustCompile(qu.ObjToString(Sysconfig["specialtitle_1"]))
+	FilterRegTitle_2 = regexp.MustCompile(qu.ObjToString(Sysconfig["specialtitle_2"]))
+	threadNum = qu.IntAllDef(Sysconfig["threads"], 1)
 	LowHeavy = Sysconfig["lowHeavy"].(bool)
 	TimingTask = Sysconfig["timingTask"].(bool)
-	timingSpanDay = util.Int64All(Sysconfig["timingSpanDay"])
-	timingPubScope = util.Int64All(Sysconfig["timingPubScope"])
+	timingSpanDay = qu.Int64All(Sysconfig["timingSpanDay"])
+	timingPubScope = qu.Int64All(Sysconfig["timingPubScope"])
+
+	c := cron.New()
+	c.AddFunc("0 0 1 ? * WED", func() {
+		isUpdateSite = true
+	})
+	c.Start()
 	//站点配置
+	initSite()
+}
+
+func initSite(){
 	site := mconf["site"].(map[string]interface{})
 	SiteMap = make(map[string]map[string]interface{}, 0)
 	start := int(time.Now().Unix())
@@ -113,16 +133,15 @@ func init() {
 	res_site := sess_site.DB(site["dbname"].(string)).C(site["coll"].(string)).Find(map[string]interface{}{}).Sort("_id").Iter()
 	for site_dict := make(map[string]interface{}); res_site.Next(&site_dict); {
 		data_map := map[string]interface{}{
-			"area":     util.ObjToString(site_dict["area"]),
-			"city":     util.ObjToString(site_dict["city"]),
-			"district": util.ObjToString(site_dict["district"]),
-			"sitetype": util.ObjToString(site_dict["sitetype"]),
-			"level":    util.ObjToString(site_dict["level"]),
-			"weight":   util.ObjToString(site_dict["weight"]),
+			"area":     qu.ObjToString(site_dict["area"]),
+			"city":     qu.ObjToString(site_dict["city"]),
+			"district": qu.ObjToString(site_dict["district"]),
 		}
-		SiteMap[util.ObjToString(site_dict["site"])] = data_map
+		SiteMap[qu.ObjToString(site_dict["site"])] = data_map
 	}
+	isUpdateSite = false
 	log.Printf("new站点加载用时:%d秒,%d个\n", int(time.Now().Unix())-start, len(SiteMap))
+
 }
 
 //udp接收
@@ -139,6 +158,10 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 				key = "udpok"
 			}
 			udpclient.WriteUdp([]byte(key), mu.OP_NOOP, ra)
+			//计算是否需要加载站点~每天加载一次
+			if isUpdateSite {
+				initSite()
+			}
 			//插入任务-判断任务-是否存在
 			updatelock.Lock()
 			taskList = append(taskList,mapInfo)
@@ -173,10 +196,11 @@ func getRepeatTask()  {
 }
 
 
-func mainT() {
+func main() {
+	//log.Println("模拟增量判重...")
 	IsFull = true
-	sid := "1fffffffffffffffffffffff"
-	eid := "9fffffffffffffffffffffff"
+	sid := "124ed2324f7bde5444f1e973"
+	eid := "924ed35e4f7bde5444f1ec1d"
 	increaseRepeat(map[string]interface{}{
 		"gtid":sid,
 		"lteid":eid,
@@ -184,7 +208,7 @@ func mainT() {
 }
 
 //主函数
-func main() {
+func mainT() {
 	go checkMapJob()
 	updport := Sysconfig["udpport"].(string)
 	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}

+ 6 - 17
src/udptaskmap.go

@@ -4,7 +4,6 @@ import (
 	"fmt"
 	"io/ioutil"
 	"log"
-	mu "mfw/util"
 	"net"
 	"net/http"
 	"sync"
@@ -23,7 +22,6 @@ type udpNode struct {
 }
 
 func checkMapJob() {
-
 	//阿里云内网无法发送邮件
 	jkmail, _ := Sysconfig["jkmail"].(map[string]interface{})
 	if jkmail != nil {
@@ -36,22 +34,13 @@ func checkMapJob() {
 			now := time.Now().Unix()
 			node, _ := v.(*udpNode)
 			if now-node.timestamp > 120 {
-				node.retry++
-				if node.retry > 5 {
-					log.Println("udp重试失败", k)
-					udptaskmap.Delete(k)
-					res, err := http.Get(fmt.Sprintf("%s?to=%s&title=%s&body=%s", api, tomail, "repeat-send-fail", k.(string)))
-					if err == nil {
-						defer res.Body.Close()
-						read, err := ioutil.ReadAll(res.Body)
-						log.Println("邮件发发送:", string(read), err)
-					}
-				} else {
-					log.Println("udp重发", k)
-					udpclient.WriteUdp(node.data, mu.OP_TYPE_DATA, node.addr)
+				udptaskmap.Delete(k)
+				res, err := http.Get(fmt.Sprintf("%s?to=%s&title=%s&body=%s", api, tomail, "增量判重程序~严重警告", fmt.Sprintf("下节点索引~未响应~相关人员检查~%s",k.(string))))
+				if err == nil {
+					defer res.Body.Close()
+					read, err := ioutil.ReadAll(res.Body)
+					log.Println("邮件发送:", string(read), err)
 				}
-			} else if now-node.timestamp > 10 {
-				log.Println("udp任务超时中..", k)
 			}
 			return true
 		})

+ 34 - 26
work_repeat/src/main.go

@@ -117,25 +117,26 @@ func main() {
 	//var filterReg = regexp.MustCompile("[`~!@#$^&*()=|{}':;,\\[\\].<>/?!¥…()—【】‘;:”“。,、?%+_--]")
 	index := 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
+		info :=tmp["v_baseinfo"].(map[string]interface{})
 		d := &dataSource{
 			_id:          BsonTOStringId(tmp["_id"]),
 			id:           qu.ObjToString(tmp["id"]),
-			title:        filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["title"])), ""),
-			projectname:  filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["projectname"])), ""),
-			projectcode:  filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["projectcode"])), ""),
-			contractcode: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["contractcode"])), ""),
-			buyer:        filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["buyer"])), ""),
-			agency:       filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["agency"])), ""),
-			s_winner:     filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(tmp["s_winner"])), ""),
-			budget:       qu.Float64All(tmp["budget"]),
-			bidamount:    qu.Float64All(tmp["bidamount"]),
-			publishtime:  qu.Int64All(tmp["publishtime"]),
+			title:        filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(info["title"])), ""),
+			projectname:  filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(info["projectname"])), ""),
+			projectcode:  filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(info["projectcode"])), ""),
+			contractcode: filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(info["contractcode"])), ""),
+			buyer:        filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(info["buyer"])), ""),
+			agency:       filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(info["agency"])), ""),
+			s_winner:     filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(info["s_winner"])), ""),
+			budget:       qu.Float64All(info["budget"]),
+			bidamount:    qu.Float64All(info["bidamount"]),
+			publishtime:  qu.Int64All(info["publishtime"]),
 			repeat_id:    map[string]string{},
 		}
-		if tmp["budget"]==nil{
+		if info["budget"]==nil{
 			d.budget_isnull=true
 		}
-		if tmp["bidamount"]==nil{
+		if info["bidamount"]==nil{
 			d.bidamount_isnull=true
 		}
 		//log.Println(tmp["_id"],tmp["title"],tmp["projectname"])
@@ -146,6 +147,13 @@ func main() {
 		tmp = map[string]interface{}{}
 	}
 	log.Println("数据加载完成",len(listSource))
+	for i:=0;i<len(listSource);i++ {
+		for j:=0;j<len(listSource)-i-1;j++ {
+			if listSource[j].publishtime > listSource[j+1].publishtime {
+				listSource[j], listSource[j+1] = listSource[j+1], listSource[j]
+			}
+		}
+	}
 	dataItem()
 	dd := 0
 	for i := 0; i < len(listSource); i++ {
@@ -168,15 +176,15 @@ func main() {
 				for k,_:=range a.repeat_id{
 					arr = append(arr,k)
 				}
-			Mgo.UpdateById(table, a._id,
-				map[string]interface{}{"$set": map[string]interface{}{
-					//原始数据看repeatid_ids_str
-					"repeatid":     a.repeat_id_source, //和哪条数据重复id
-					"repeat":       a.isrepeat,         //本条数据是否重复数据
-					//"repeatid_ids": a.repeat_id,        //和我重复的数据都有哪些
-					"repeatid_ids_str": strings.Join(arr,","),
-					"repeattext":   a.repeatText,       //本数据被判重的原因
-				}})}
+				Mgo.UpdateById(table, a._id,
+					map[string]interface{}{"$set": map[string]interface{}{
+						//原始数据看repeatid_ids_str
+						"repeatid":     a.repeat_id_source, //和哪条数据重复id
+						"repeat":       a.isrepeat,         //本条数据是否重复数据
+						//"repeatid_ids": a.repeat_id,        //和我重复的数据都有哪些
+						"repeatid_ids_str": strings.Join(arr,","),
+						"repeattext":   a.repeatText,       //本数据被判重的原因
+					}})}
 		}
 		if i%1000 == 0 {
 			log.Println("已更新:", i)
@@ -404,11 +412,11 @@ func panchong(a, b dataSource) (c, d *dataSource) {
 							//	b.isrepeat = true
 							//	b.repeatText = "标题不相等-->无分包 &&  projectname && projectcode && budget"
 							//}
-						} else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
-							b.repeat_id_source = a.id
-							a.repeat_id[b.id] = ""
-							b.isrepeat = true
-							b.repeatText = "标题不相等-->无分包 && projectname &&  s_winner"
+							//} else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+							//	b.repeat_id_source = a.id
+							//	a.repeat_id[b.id] = ""
+							//	b.isrepeat = true
+							//	b.repeatText = "标题不相等-->无分包 && projectname &&  s_winner"
 							//r := key_list(a, b)
 							//if r {
 							//	b.repeat_id_source = a.id

+ 554 - 0
work_repeat/src/mark

@@ -0,0 +1,554 @@
+// main
+package main
+
+import (
+	"fmt"
+	"gopkg.in/mgo.v2"
+	"log"
+	"os"
+	qu "qfw/util"
+	"regexp"
+	"strings"
+	"time"
+)
+var zhb_key_list = []string{"budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"} // field names; NOTE(review): presumably consumed by key_list — confirm
+var packreg *regexp.Regexp // compiled in init; matches a lot marker: one digit/letter/numeral followed by 包, 标 or 段
+var Mgo *MongodbSim // Mongo pool, created in init
+var listSource []*dataSource // every loaded record; filled by main, rewritten by dataItem
+
+type dataSource struct {
+	_id, id, title                         string // _id: Mongo "_id" as a string; id: the document's "id" field; title: normalized title
+	projectname, projectcode, contractcode string // lower-cased with symbols stripped when loaded in main
+	buyer, agency, s_winner                string // normalized the same way
+	budget, bidamount                      float64
+	budget_isnull,bidamount_isnull 		   bool // true when the source document has no "budget" / "bidamount" value
+	isrepeat                               bool // set by panchong when this record is judged a duplicate
+	repeat_id_source                       string // id of the record this one duplicates
+	repeat_id                              map[string]string // ids of records that duplicate this one (keys only; values are "")
+	repeatText                             string // the rule that produced the duplicate verdict
+	publishtime							   int64 // "publishtime" field of the document
+}
+
+//var addr, dbname, table, startTime, endTime, sortType *string
+var addr, dbname, table,  sortType string // Mongo address, database, collection and sort direction; set by initConfig
+var cycle int64 // "cycle_month" from the configuration (6 when absent)
+var sysconfig    map[string]interface{} 	// parsed configuration file
+
+// initConfig reads the configuration file into sysconfig and copies the Mongo
+// settings into the package-level variables. A missing or non-string
+// "mgo_*" key panics on the type assertion.
+func initConfig() {
+	qu.ReadConfig(&sysconfig)
+
+	// mustString fetches a setting that has to be a string.
+	mustString := func(key string) string {
+		return sysconfig[key].(string)
+	}
+	addr = mustString("mgo_addr")
+	dbname = mustString("mgo_db")
+	table = mustString("mgo_table")
+	sortType = mustString("mgo_sort")
+
+	// "cycle_month" falls back to 6 when it is not configured.
+	cycle = qu.Int64All(qu.IntAllDef(sysconfig["cycle_month"], 6))
+}
+// createMgoIndex makes sure the configured collection has an index on
+// "publishtime".
+//
+// It dials Mongo with a dedicated session (60s timeout), terminates the process
+// through log.Fatalf when the dial fails, and prints the result of
+// EnsureIndexKey to stdout.
+func createMgoIndex() {
+	dialInfo := &mgo.DialInfo{
+		Addrs:    []string{addr},
+		Timeout:  60 * time.Second,
+		Database: dbname,
+	}
+	session, err := mgo.DialWithInfo(dialInfo)
+	if err != nil {
+		// %v is required here: a bare "%" followed by a newline is not a valid
+		// verb and the dial error would not be rendered.
+		log.Fatalf("CreateSession failed:%v\n", err)
+	}
+	// The session is only needed for the index call; release it afterwards so
+	// the connection is not leaked.
+	defer session.Close()
+
+	coll := session.DB(dbname).C(table)
+	err = coll.EnsureIndexKey("publishtime")
+	fmt.Println("创建索引~publishtime",err)
+}
+//初始化
+func init() {
+	initConfig()
+	createMgoIndex()
+	Mgo = &MongodbSim{
+		MongodbAddr: addr,
+		Size:        3,
+		DbName:      dbname,
+	}
+	Mgo.InitPool()
+
+	packreg, _ = regexp.Compile(`([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十][包标段])`)
+	//packreg, _ = regexp.Compile(`([包标段][::]?[a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十]|[a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十][包标段]){1,}`)
+	//packreg, _ = regexp.MustCompile("([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十](包|标|段)[::]?)")
+}
+// main loads every document of the configured collection into listSource,
+// runs the pairwise duplicate detection (dataItem) and writes the verdict
+// back to each document.
+//
+// Fields written per document: "repeatid" (id of the record it duplicates),
+// "repeat" (whether it is a duplicate), "repeattext" (the rule that matched)
+// and, only when other records duplicate it, "repeatid_ids_str" (their ids,
+// comma separated, in map iteration order).
+func main() {
+	if addr == "" || dbname == "" || table == "" || sortType == "" {
+		log.Println("参数配置有误")
+		fmt.Printf("数据库地址:%s\n数据库名称:%s\n表名:%s\n排序方式:%s\n", addr, dbname, table, sortType)
+		// A configuration error is a failure: exit non-zero so callers can detect it.
+		os.Exit(1)
+	}
+
+	// sortType "-1" loads by descending publishtime, anything else ascending.
+	sort := "publishtime"
+	if sortType == "-1" {
+		sort = "-publishtime"
+	}
+
+	sess := Mgo.GetMgoConn()
+	defer Mgo.DestoryMongoConn(sess)
+
+	// No filter is applied: the whole collection is loaded. (A publishtime
+	// $gte/$lte range query used to be built here and is currently disabled.)
+	it := sess.DB(Mgo.DbName).C(table).Find(map[string]interface{}{}).Sort(sort).Iter()
+
+	// Symbol characters removed from the text fields before they are compared.
+	var filterReg = regexp.MustCompile("[`~!@#$^&*()=|{}':;,\\[\\].<>/?!¥…()—【】‘;:”“。,、?%+_-]")
+	// normalize lower-cases a field and strips the symbols above.
+	normalize := func(v interface{}) string {
+		return filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(v)), "")
+	}
+
+	index := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
+		d := &dataSource{
+			_id:          BsonTOStringId(tmp["_id"]),
+			id:           qu.ObjToString(tmp["id"]),
+			title:        normalize(tmp["title"]),
+			projectname:  normalize(tmp["projectname"]),
+			projectcode:  normalize(tmp["projectcode"]),
+			contractcode: normalize(tmp["contractcode"]),
+			buyer:        normalize(tmp["buyer"]),
+			agency:       normalize(tmp["agency"]),
+			s_winner:     normalize(tmp["s_winner"]),
+			budget:       qu.Float64All(tmp["budget"]),
+			bidamount:    qu.Float64All(tmp["bidamount"]),
+			publishtime:  qu.Int64All(tmp["publishtime"]),
+			repeat_id:    map[string]string{},
+			// Remember absence separately: a missing amount must not compare
+			// equal to a real 0.
+			budget_isnull:    tmp["budget"] == nil,
+			bidamount_isnull: tmp["bidamount"] == nil,
+		}
+		if index%10000 == 0 {
+			log.Println("加载数据:", index)
+		}
+		listSource = append(listSource, d)
+		// Fresh map per document so keys of the previous one cannot leak into the next.
+		tmp = map[string]interface{}{}
+	}
+	// Next returns false both at the end of the data and on failure; Close
+	// reports which one happened, so a truncated load does not go unnoticed.
+	if err := it.Close(); err != nil {
+		log.Println("数据加载出错:", err)
+	}
+	log.Println("数据加载完成",len(listSource))
+
+	dataItem()
+
+	dd := 0
+	for i, a := range listSource {
+		if a.isrepeat {
+			dd++
+		}
+		set := map[string]interface{}{
+			"repeatid":   a.repeat_id_source, // id of the record this one duplicates
+			"repeat":     a.isrepeat,         // whether this record is a duplicate
+			"repeattext": a.repeatText,       // why it was judged a duplicate
+		}
+		// Only records that have duplicates pointing at them get the id list.
+		if len(a.repeat_id) > 0 {
+			ids := make([]string, 0, len(a.repeat_id))
+			for k := range a.repeat_id {
+				ids = append(ids, k)
+			}
+			set["repeatid_ids_str"] = strings.Join(ids, ",")
+		}
+		Mgo.UpdateById(table, a._id, map[string]interface{}{"$set": set})
+		if i%1000 == 0 {
+			log.Println("已更新:", i)
+		}
+	}
+	log.Println("重复数据量:",dd)
+}
+
+var listSize = 20000 // NOTE(review): not referenced in the visible part of this file — confirm it is still needed
+
+func dataItem() {
+	for i := 0; i < len(listSource); i++ {
+		a := listSource[i]
+		// if a.isrepeat {
+		// 	continue
+		// }
+		b := &dataSource{}
+		for j := i + 1; j < len(listSource); j++ {
+			b = listSource[j]
+			if sortType == "1" {
+				if publishtime_b_a(*a,*b){
+					// if b.isrepeat {
+					// 	continue
+					// }
+					a, b = panchong(*a, *b)
+					listSource[j] = b
+					listSource[i] = a
+					// if b.isrepeat {
+					// 	log.Println("sss", a.id, b.isrepeat, b.repeat_id)
+					// }
+				}
+			}else{
+				if publishtime_a_b(*a,*b){
+					// if b.isrepeat {
+					// 	continue
+					// }
+					a, b = panchong(*a, *b)
+					listSource[j] = b
+					listSource[i] = a
+					// if b.isrepeat {
+					// 	log.Println("sss", a.id, b.isrepeat, b.repeat_id)
+					// }
+				}
+			}
+		}
+		if i%500 == 0 {
+			log.Println("已处理:", i)
+		}
+	}
+}
+
+func panchong(a, b dataSource) (c, d *dataSource) {
+	switch {
+	case a.title == b.title: //标题相等
+		if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+
+		} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+			if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer)  && pankong(a.buyer) && pankong(b.buyer) {
+				if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+					b.repeat_id_source = a.id
+					a.repeat_id[b.id] = ""
+					b.isrepeat = true
+					b.repeatText = "标题相等 && bidamount && buyer && s_winner"
+				}else{
+					r := key_list(a, b)
+					if r {
+						b.repeat_id_source = a.id
+						a.repeat_id[b.id] = ""
+						b.isrepeat = true
+						b.repeatText = "标题相等 && bidamount && buyer && key_list"
+					}
+				}
+			} else if  pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+				b.repeat_id_source = a.id
+				a.repeat_id[b.id] = ""
+				b.isrepeat = true
+				b.repeatText = "标题相等 && bidamount && s_winner"
+			}else {
+				r := key_list(a, b)
+				if r {
+					b.repeat_id_source = a.id
+					a.repeat_id[b.id] = ""
+					b.isrepeat = true
+					b.repeatText = "标题相等 && bidamount && key_list"
+				}
+			}
+		}else if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
+			r := key_list(a, b)
+			if r {
+				b.repeat_id_source = a.id
+				a.repeat_id[b.id] = ""
+				b.isrepeat = true
+				b.repeatText = "标题相等 && projectcode && key_list"
+			}
+		}else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+			if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer)  && pankong(a.buyer) && pankong(b.buyer) {
+				if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+					if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+						b.repeat_id_source = a.id
+						a.repeat_id[b.id] = ""
+						b.isrepeat = true
+						b.repeatText = "标题相等 && budget && buyer && s_winner && bidamount"
+						//log.Println("1111", a.id, b.id, b.isrepeat)
+					}
+				}
+			} else {
+				r := key_list(a, b)
+				if r {
+					b.repeat_id_source = a.id
+					a.repeat_id[b.id] = ""
+					b.isrepeat = true
+					b.repeatText = "标题相等 && budget && key_list"
+				}
+			}
+		}   else {
+			//
+		}
+	case a.title != b.title: //标题不相等
+		//项目名称包含及相等
+		if strings.Contains(a.projectname, b.projectname) || strings.Contains(b.projectname, a.projectname) {
+			isp := packreg.MatchString(a.title)
+			//有分包
+			if isp {
+				//项目名称相等
+				if a.projectname == b.projectname {
+					if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+						//
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+						if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner != b.s_winner{
+
+						}else{
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->有分包 && projectname && bidamount"
+						}
+						//b.repeat_id_source = a.id
+						//a.repeat_id[b.id] = ""
+						//b.isrepeat = true
+						//b.repeatText = "标题不相等-->有分包 && projectname && bidamount"
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount  {
+						//
+					} else {
+						if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget && (a.budget >=0 || b.budget >= 0) {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->有分包 && projectname && s_winner && budget"
+						}
+					}
+				} else { //项目名称包含
+					if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+						//
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+						if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && projectcode"
+						} else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && s_winner"
+						} else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+							if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer)  && pankong(a.buyer) && pankong(b.buyer) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && buyer"
+							} else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency)  && pankong(a.agency) && pankong(b.agency) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && agency"
+							} else {
+								//
+							}
+						}
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount {
+						//
+					} else {
+						if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->有分包 && projectname包含 && s_winner && budget"
+						} else {
+							//
+						}
+					}
+				}
+			} else { //无分包
+				//项目名称相等
+				if a.projectname == b.projectname {
+					if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+						//
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+						b.repeat_id_source = a.id
+						a.repeat_id[b.id] = ""
+						b.isrepeat = true
+						b.repeatText = "标题不相等-->无分包 && projectname && bidamount"
+					} else if !a.bidamount_isnull && !b.bidamount_isnull &&  a.bidamount != b.bidamount  {
+						//
+					} else {
+						if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
+							if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner != b.s_winner {
+
+							}else if !a.budget_isnull && !b.budget_isnull && a.budget != b.budget{
+
+							}else{
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->无分包 && projectname && projectcode"
+							}
+							//if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+							//	b.repeat_id_source = a.id
+							//	a.repeat_id[b.id] = ""
+							//	b.isrepeat = true
+							//	b.repeatText = "标题不相等-->无分包 && projectname && projectcode && s_winner"
+							//} else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget {
+							//	b.repeat_id_source = a.id
+							//	a.repeat_id[b.id] = ""
+							//	b.isrepeat = true
+							//	b.repeatText = "标题不相等-->无分包 &&  projectname && projectcode && budget"
+							//}
+							//} else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+							//	b.repeat_id_source = a.id
+							//	a.repeat_id[b.id] = ""
+							//	b.isrepeat = true
+							//	b.repeatText = "标题不相等-->无分包 && projectname &&  s_winner"
+							//r := key_list(a, b)
+							//if r {
+							//	b.repeat_id_source = a.id
+							//	a.repeat_id[b.id] = ""
+							//	b.isrepeat = true
+							//	b.repeatText = "标题不相等-->无分包 && projectname && s_winner && key_list"
+							//}
+						} else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+							if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer)  && pankong(a.buyer) && pankong(b.buyer) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->无分包 && projectname && budget && buyer"
+							} else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency)  && pankong(a.agency) && pankong(b.agency) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->无分包 && projectname && budget && agency"
+							} else {
+								//
+							}
+						}
+					}
+				} else { //项目名称包含
+					if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+						//
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+						if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount && projectcode"
+						} else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount && s_winner"
+						} else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+							if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer)  && pankong(a.buyer) && pankong(b.buyer) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && buyer"
+							} else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency)  && pankong(a.agency) && pankong(b.agency) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && agency"
+							} else {
+								//
+							}
+						} else {
+							//
+						}
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount  {
+						//
+					} else {
+						if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner && budget"
+						}
+					}
+				}
+			}
+		}
+	default:
+	}
+	return &a, &b
+}
+
+// key_list reports whether records a and b are compatible on every field
+// named in zhb_key_list (the original note lists "budget", "buyer", "agency",
+// "s_winner", "bidamount", "projectcode", "contractcode"). It returns false
+// as soon as one listed field rules the pair out, and true when none does.
+//
+// Amounts, winner and the two codes only veto when both sides carry a value
+// and the values differ. buyer and agency instead require the substring test
+// below to hold.
+//
+// NOTE(review): in the buyer/agency tests && binds tighter than ||, so the
+// non-empty guards cover only the second Contains call. The parentheses below
+// spell out that grouping; confirm that this is the intended rule.
+func key_list(a, b dataSource) bool {
+	for _, field := range zhb_key_list {
+		switch field {
+		case "budget":
+			if !a.budget_isnull && !b.budget_isnull && a.budget != b.budget {
+				return false
+			}
+		case "buyer":
+			related := strings.Contains(a.buyer, b.buyer) ||
+				(strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer))
+			if !related {
+				return false
+			}
+		case "agency":
+			related := strings.Contains(a.agency, b.agency) ||
+				(strings.Contains(b.agency, a.agency) && pankong(a.agency) && pankong(b.agency))
+			if !related {
+				return false
+			}
+		case "s_winner":
+			if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner != b.s_winner {
+				return false
+			}
+		case "bidamount":
+			if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount {
+				return false
+			}
+		case "projectcode":
+			if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode != b.projectcode {
+				return false
+			}
+		case "contractcode":
+			if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+				return false
+			}
+		}
+	}
+	return true
+}
+// Publish-time window checks.
+
+// publishtime_b_a is the check used for ascending order: it reports whether
+// b.publishtime minus a.publishtime is below 86400*31*cycle. The difference
+// is signed, so a pair in which b is not later than a always passes.
+// NOTE(review): presumably publishtime is in Unix seconds and cycle counts
+// 31-day months - confirm against the loader.
+func publishtime_b_a(a, b dataSource) bool {
+	window := 86400 * 31 * cycle
+	gap := b.publishtime - a.publishtime
+	return gap < window
+}
+// publishtime_a_b is the check used for descending order: it reports whether
+// a.publishtime minus b.publishtime is below 86400*31*cycle. The difference
+// is signed, so a pair in which a is not later than b always passes.
+func publishtime_a_b(a, b dataSource) bool {
+	window := 86400 * 31 * cycle
+	gap := a.publishtime - b.publishtime
+	return gap < window
+}
+
+
+// pankong reports whether a is a non-empty string. No trimming is applied, so
+// a string made only of whitespace counts as non-empty.
+func pankong(a string) bool {
+	return a != ""
+}

+ 562 - 0
work_repeat/src/mark1

@@ -0,0 +1,562 @@
+// main
+package main
+
+import (
+	"fmt"
+	"gopkg.in/mgo.v2"
+	"log"
+	"os"
+	qu "qfw/util"
+	"regexp"
+	"strings"
+	"time"
+)
+// zhb_key_list names the fields that key_list checks, in the order they are checked.
+var zhb_key_list = []string{"budget", "buyer", "agency", "s_winner", "bidamount", "projectcode", "contractcode"}
+// packreg is compiled in init and matched against a.title in panchong.
+var packreg *regexp.Regexp
+// Mgo is the MongoDB helper created in init and used by main for reads and updates.
+var Mgo *MongodbSim
+// listSource holds every loaded record; main fills and sorts it, dataItem compares its entries pairwise.
+var listSource []*dataSource
+
+// dataSource is the in-memory form of one document used for duplicate
+// detection.
+//
+// main fills _id from the document's "_id" and id from its "id"; the text
+// fields, amounts and publishtime come from the "v_baseinfo" sub-document,
+// with the text fields lower-cased and stripped of punctuation.
+// budget_isnull / bidamount_isnull are set when the corresponding source
+// value is nil.
+//
+// The remaining fields carry the verdict written by panchong: isrepeat,
+// repeat_id_source (id of the record this one duplicates) and repeatText (the
+// rule that matched) describe this record as a duplicate; the keys of
+// repeat_id are the ids of records that were marked as duplicates of it.
+type dataSource struct {
+	_id, id, title                         string
+	projectname, projectcode, contractcode string
+	buyer, agency, s_winner                string
+	budget, bidamount                      float64
+	budget_isnull,bidamount_isnull 		   bool
+	isrepeat                               bool
+	repeat_id_source                       string
+	repeat_id                              map[string]string
+	repeatText                             string
+	publishtime							   int64
+}
+
+//var addr, dbname, table, startTime, endTime, sortType *string
+// addr, dbname, table and sortType are read from sysconfig by initConfig
+// (keys mgo_addr, mgo_db, mgo_table, mgo_sort).
+var addr, dbname, table,  sortType string
+// cycle is set by initConfig from cycle_month (default 6) and scales the
+// publish-time window used by publishtime_b_a / publishtime_a_b.
+var cycle int64
+var sysconfig    map[string]interface{} 	// configuration file contents
+
+// initConfig reads the configuration into sysconfig and copies the MongoDB
+// settings (mgo_addr, mgo_db, mgo_table, mgo_sort) and the comparison window
+// length (cycle_month, default 6) into the package-level variables.
+//
+// Values are converted with qu.ObjToString instead of a bare type assertion,
+// so a missing or non-string key no longer panics here; it leaves the
+// variable empty, which is the case main's configuration check reports.
+func initConfig() {
+	qu.ReadConfig(&sysconfig)
+	addr = qu.ObjToString(sysconfig["mgo_addr"])
+	dbname = qu.ObjToString(sysconfig["mgo_db"])
+	table = qu.ObjToString(sysconfig["mgo_table"])
+	sortType = qu.ObjToString(sysconfig["mgo_sort"])
+	month := qu.IntAllDef(sysconfig["cycle_month"], 6)
+	cycle = qu.Int64All(month)
+}
+// createMgoIndex dials MongoDB at addr and ensures an index on "publishtime"
+// exists for the configured collection. A dial failure terminates the
+// program; the outcome of the index call is only printed.
+func createMgoIndex() {
+	dialInfo := &mgo.DialInfo{
+		Addrs:    []string{addr},
+		Timeout:  60 * time.Second,
+		Database: dbname,
+	}
+	session, err := mgo.DialWithInfo(dialInfo)
+	if err != nil {
+		// The original format string was "%\n", which has no valid verb and
+		// so never printed the error properly.
+		log.Fatalf("CreateSession failed:%v\n", err)
+	}
+	// This session is only needed for the index call; release it on return.
+	defer session.Close()
+
+	coll := session.DB(dbname).C(table)
+	err = coll.EnsureIndexKey("publishtime")
+	fmt.Println("创建索引~publishtime", err)
+
+	// To list the indexes that already exist:
+	//indexs, err := coll.Indexes()
+	//fmt.Println("indexs--------------:", indexs)
+}
+// init loads the configuration, ensures the publishtime index exists, opens
+// the MongoDB connection pool and compiles the pattern stored in packreg.
+func init() {
+	initConfig()
+	createMgoIndex()
+	Mgo = &MongodbSim{
+		MongodbAddr: addr,
+		Size:        3,
+		DbName:      dbname,
+	}
+	Mgo.InitPool()
+
+	// The pattern is a constant, so MustCompile replaces Compile with a
+	// discarded error: a broken pattern now fails at startup instead of
+	// leaving packreg nil.
+	packreg = regexp.MustCompile(`([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十][包标段])`)
+	// Earlier pattern variants, kept for reference:
+	//packreg, _ = regexp.Compile(`([包标段][::]?[a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十]|[a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十][包标段]){1,}`)
+	//packreg, _ = regexp.MustCompile("([a-zA-Z0-9①②ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ一二三四五六七八九十](包|标|段)[::]?)")
+}
+// main loads every document of the configured collection, orders the records
+// by publishtime, runs the pairwise duplicate detection (dataItem) and writes
+// the verdict of each record back to MongoDB.
+func main() {
+	if addr == "" || dbname == "" || table == "" || sortType == "" {
+		log.Println("参数配置有误")
+		fmt.Printf("数据库地址:%s\n数据库名称:%s\n表名:%s\n排序方式:%s\n", addr, dbname, table, sortType)
+		os.Exit(0)
+	}
+
+	// Query order: ascending unless mgo_sort is "-1".
+	sortKey := "publishtime"
+	if sortType == "-1" {
+		sortKey = "-publishtime"
+	}
+	q := map[string]interface{}{}
+	sess := Mgo.GetMgoConn()
+	defer Mgo.DestoryMongoConn(sess)
+
+	it := sess.DB(Mgo.DbName).C(table).Find(q).Sort(sortKey).Iter()
+	// Strips ASCII and full-width punctuation from titles, names and codes.
+	var filterReg = regexp.MustCompile("[`~!@#$^&*()=|{}':;,\\[\\].<>/?!¥…()—【】‘;:”“。,、?%+_-]")
+	// clean lower-cases a value and removes the punctuation matched above.
+	clean := func(v interface{}) string {
+		return filterReg.ReplaceAllString(strings.ToLower(qu.ObjToString(v)), "")
+	}
+
+	index := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
+		// A document without a usable v_baseinfo used to panic on the bare
+		// type assertion; it is now logged and skipped.
+		info, ok := tmp["v_baseinfo"].(map[string]interface{})
+		if !ok {
+			log.Println("v_baseinfo缺失或类型错误,跳过:", tmp["_id"])
+			tmp = map[string]interface{}{}
+			continue
+		}
+		d := &dataSource{
+			_id:              BsonTOStringId(tmp["_id"]),
+			id:               qu.ObjToString(tmp["id"]),
+			title:            clean(info["title"]),
+			projectname:      clean(info["projectname"]),
+			projectcode:      clean(info["projectcode"]),
+			contractcode:     clean(info["contractcode"]),
+			buyer:            clean(info["buyer"]),
+			agency:           clean(info["agency"]),
+			s_winner:         clean(info["s_winner"]),
+			budget:           qu.Float64All(info["budget"]),
+			bidamount:        qu.Float64All(info["bidamount"]),
+			publishtime:      qu.Int64All(info["publishtime"]),
+			repeat_id:        map[string]string{},
+			budget_isnull:    info["budget"] == nil,
+			bidamount_isnull: info["bidamount"] == nil,
+		}
+		if index%10000 == 0 {
+			log.Println("加载数据:", index)
+		}
+		listSource = append(listSource, d)
+		// Start from a fresh map so keys of this document cannot leak into
+		// the next one.
+		tmp = map[string]interface{}{}
+	}
+	// Close releases the cursor and reports an iteration error; without this
+	// check a failed load would be processed as if it were complete.
+	if err := it.Close(); err != nil {
+		log.Println("数据加载失败:", err)
+		return
+	}
+	log.Println("数据加载完成", len(listSource))
+
+	// Stable ascending bubble sort on the publishtime taken from v_baseinfo.
+	// It always runs ascending, whatever mgo_sort says, and is quadratic.
+	for i := 0; i < len(listSource); i++ {
+		for j := 0; j < len(listSource)-i-1; j++ {
+			if listSource[j].publishtime > listSource[j+1].publishtime {
+				listSource[j], listSource[j+1] = listSource[j+1], listSource[j]
+			}
+		}
+	}
+
+	dataItem()
+
+	dd := 0
+	for i, a := range listSource {
+		if a.isrepeat {
+			dd++
+		}
+		set := map[string]interface{}{
+			"repeatid":   a.repeat_id_source, // id of the record this one duplicates
+			"repeat":     a.isrepeat,         // whether this record is a duplicate
+			"repeattext": a.repeatText,       // rule that flagged this record
+		}
+		// Records that others were matched against also get the list of
+		// those duplicates' ids.
+		if len(a.repeat_id) > 0 {
+			ids := make([]string, 0, len(a.repeat_id))
+			for k := range a.repeat_id {
+				ids = append(ids, k)
+			}
+			set["repeatid_ids_str"] = strings.Join(ids, ",")
+		}
+		Mgo.UpdateById(table, a._id, map[string]interface{}{"$set": set})
+		if i%1000 == 0 {
+			log.Println("已更新:", i)
+		}
+	}
+	log.Println("重复数据量:", dd)
+}
+
+// listSize is not referenced by any code visible in this file.
+var listSize = 20000
+
+func dataItem() {
+	for i := 0; i < len(listSource); i++ {
+		a := listSource[i]
+		// if a.isrepeat {
+		// 	continue
+		// }
+		b := &dataSource{}
+		for j := i + 1; j < len(listSource); j++ {
+			b = listSource[j]
+			if sortType == "1" {
+				if publishtime_b_a(*a,*b){
+					// if b.isrepeat {
+					// 	continue
+					// }
+					a, b = panchong(*a, *b)
+					listSource[j] = b
+					listSource[i] = a
+					// if b.isrepeat {
+					// 	log.Println("sss", a.id, b.isrepeat, b.repeat_id)
+					// }
+				}
+			}else{
+				if publishtime_a_b(*a,*b){
+					// if b.isrepeat {
+					// 	continue
+					// }
+					a, b = panchong(*a, *b)
+					listSource[j] = b
+					listSource[i] = a
+					// if b.isrepeat {
+					// 	log.Println("sss", a.id, b.isrepeat, b.repeat_id)
+					// }
+				}
+			}
+		}
+		if i%500 == 0 {
+			log.Println("已处理:", i)
+		}
+	}
+}
+
+// panchong compares record a against record b and, when one of the rules
+// below fires, marks b as a duplicate of a: b.repeat_id_source and b.isrepeat
+// are set, b.repeatText records the rule that matched, and b.id is added to
+// a.repeat_id.
+//
+// Both records are received by value and pointers to those copies are
+// returned (c is a, d is b), so the caller has to store the results to keep
+// the flags. repeat_id is a map and is therefore shared with the caller's
+// record.
+//
+// The rules branch first on whether the titles are equal; for differing
+// titles they require the project names to be equal or to contain one
+// another, and then branch on whether packreg matches a.title.
+//
+// NOTE(review): every buyer/agency test has the form X || Y && p && q; &&
+// binds tighter than ||, so the non-empty guards apply to Y only. Confirm
+// that this grouping is intended.
+func panchong(a, b dataSource) (c, d *dataSource) {
+	switch {
+	case a.title == b.title: //titles equal
+		if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+
+		} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+			if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer)  && pankong(a.buyer) && pankong(b.buyer) {
+				if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+					b.repeat_id_source = a.id
+					a.repeat_id[b.id] = ""
+					b.isrepeat = true
+					b.repeatText = "标题相等 && bidamount && buyer && s_winner"
+				}else{
+					r := key_list(a, b)
+					if r {
+						b.repeat_id_source = a.id
+						a.repeat_id[b.id] = ""
+						b.isrepeat = true
+						b.repeatText = "标题相等 && bidamount && buyer && key_list"
+					}
+				}
+			} else if  pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+				b.repeat_id_source = a.id
+				a.repeat_id[b.id] = ""
+				b.isrepeat = true
+				b.repeatText = "标题相等 && bidamount && s_winner"
+			}else {
+				r := key_list(a, b)
+				if r {
+					b.repeat_id_source = a.id
+					a.repeat_id[b.id] = ""
+					b.isrepeat = true
+					b.repeatText = "标题相等 && bidamount && key_list"
+				}
+			}
+		}else if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
+			r := key_list(a, b)
+			if r {
+				b.repeat_id_source = a.id
+				a.repeat_id[b.id] = ""
+				b.isrepeat = true
+				b.repeatText = "标题相等 && projectcode && key_list"
+			}
+		}else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+			if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer)  && pankong(a.buyer) && pankong(b.buyer) {
+				if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+					if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+						b.repeat_id_source = a.id
+						a.repeat_id[b.id] = ""
+						b.isrepeat = true
+						b.repeatText = "标题相等 && budget && buyer && s_winner && bidamount"
+						//log.Println("1111", a.id, b.id, b.isrepeat)
+					}
+				}
+			} else {
+				r := key_list(a, b)
+				if r {
+					b.repeat_id_source = a.id
+					a.repeat_id[b.id] = ""
+					b.isrepeat = true
+					b.repeatText = "标题相等 && budget && key_list"
+				}
+			}
+		}   else {
+			//
+		}
+	case a.title != b.title: //titles differ
+		//project names equal, or one contains the other
+		if strings.Contains(a.projectname, b.projectname) || strings.Contains(b.projectname, a.projectname) {
+			isp := packreg.MatchString(a.title)
+			//packreg matches a.title (package/lot marker present)
+			if isp {
+				//project names equal
+				if a.projectname == b.projectname {
+					if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+						//
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+						if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner != b.s_winner{
+
+						}else{
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->有分包 && projectname && bidamount"
+						}
+						//b.repeat_id_source = a.id
+						//a.repeat_id[b.id] = ""
+						//b.isrepeat = true
+						//b.repeatText = "标题不相等-->有分包 && projectname && bidamount"
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount  {
+						//
+					} else {
+						if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget && (a.budget >=0 || b.budget >= 0) {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->有分包 && projectname && s_winner && budget"
+						}
+					}
+				} else { //project names differ but one contains the other
+					if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+						//
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+						if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && projectcode"
+						} else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && s_winner"
+						} else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+							if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer)  && pankong(a.buyer) && pankong(b.buyer) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && buyer"
+							} else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency)  && pankong(a.agency) && pankong(b.agency) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->有分包 && projectname包含 && bidamount && budget && agency"
+							} else {
+								//
+							}
+						}
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount {
+						//
+					} else {
+						if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->有分包 && projectname包含 && s_winner && budget"
+						} else {
+							//
+						}
+					}
+				}
+			} else { //packreg does not match a.title (no package/lot marker)
+				//project names equal
+				if a.projectname == b.projectname {
+					if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+						//
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+						b.repeat_id_source = a.id
+						a.repeat_id[b.id] = ""
+						b.isrepeat = true
+						b.repeatText = "标题不相等-->无分包 && projectname && bidamount"
+					} else if !a.bidamount_isnull && !b.bidamount_isnull &&  a.bidamount != b.bidamount  {
+						//
+					} else {
+						if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
+							if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner != b.s_winner {
+
+							}else if !a.budget_isnull && !b.budget_isnull && a.budget != b.budget{
+
+							}else{
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->无分包 && projectname && projectcode"
+							}
+							//if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+							//	b.repeat_id_source = a.id
+							//	a.repeat_id[b.id] = ""
+							//	b.isrepeat = true
+							//	b.repeatText = "标题不相等-->无分包 && projectname && projectcode && s_winner"
+							//} else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget {
+							//	b.repeat_id_source = a.id
+							//	a.repeat_id[b.id] = ""
+							//	b.isrepeat = true
+							//	b.repeatText = "标题不相等-->无分包 &&  projectname && projectcode && budget"
+							//}
+						//} else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+						//	b.repeat_id_source = a.id
+						//	a.repeat_id[b.id] = ""
+						//	b.isrepeat = true
+						//	b.repeatText = "标题不相等-->无分包 && projectname &&  s_winner"
+							//r := key_list(a, b)
+							//if r {
+							//	b.repeat_id_source = a.id
+							//	a.repeat_id[b.id] = ""
+							//	b.isrepeat = true
+							//	b.repeatText = "标题不相等-->无分包 && projectname && s_winner && key_list"
+							//}
+						} else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+							if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer)  && pankong(a.buyer) && pankong(b.buyer) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->无分包 && projectname && budget && buyer"
+							} else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency)  && pankong(a.agency) && pankong(b.agency) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->无分包 && projectname && budget && agency"
+							} else {
+								//
+							}
+						}
+					}
+				} else { //project names differ but one contains the other
+					if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+						//
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount == b.bidamount  {
+						if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode == b.projectcode {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount && projectcode"
+						} else if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->无分包 && projectname包含 && bidamount && s_winner"
+						} else if !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+							if strings.Contains(a.buyer, b.buyer) || strings.Contains(b.buyer, a.buyer)  && pankong(a.buyer) && pankong(b.buyer) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && buyer"
+							} else if strings.Contains(a.agency, b.agency) || strings.Contains(b.agency, a.agency)  && pankong(a.agency) && pankong(b.agency) {
+								b.repeat_id_source = a.id
+								a.repeat_id[b.id] = ""
+								b.isrepeat = true
+								b.repeatText = "标题不相等-->无分包 && projectname包含 && budget && agency"
+							} else {
+								//
+							}
+						} else {
+							//
+						}
+					} else if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount  {
+						//
+					} else {
+						if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner == b.s_winner && !a.budget_isnull && !b.budget_isnull && a.budget == b.budget  {
+							b.repeat_id_source = a.id
+							a.repeat_id[b.id] = ""
+							b.isrepeat = true
+							b.repeatText = "标题不相等-->无分包 && projectname包含 && s_winner && budget"
+						}
+					}
+				}
+			}
+		}
+	default:
+	}
+	return &a, &b
+}
+
+// key_list reports whether records a and b are compatible on every field
+// named in zhb_key_list ("budget", "buyer", "agency", "s_winner",
+// "bidamount", "projectcode", "contractcode"). It returns false as soon as
+// one listed field rules the pair out, and true when none does.
+//
+// Amounts, winner and the two codes only veto when both sides carry a value
+// and the values differ. buyer and agency instead require the substring test
+// below to hold.
+//
+// NOTE(review): in the buyer/agency tests && binds tighter than ||, so the
+// non-empty guards cover only the second Contains call. The parentheses below
+// spell out that grouping; confirm that this is the intended rule.
+func key_list(a, b dataSource) bool {
+	for _, field := range zhb_key_list {
+		switch field {
+		case "budget":
+			if !a.budget_isnull && !b.budget_isnull && a.budget != b.budget {
+				return false
+			}
+		case "buyer":
+			related := strings.Contains(a.buyer, b.buyer) ||
+				(strings.Contains(b.buyer, a.buyer) && pankong(a.buyer) && pankong(b.buyer))
+			if !related {
+				return false
+			}
+		case "agency":
+			related := strings.Contains(a.agency, b.agency) ||
+				(strings.Contains(b.agency, a.agency) && pankong(a.agency) && pankong(b.agency))
+			if !related {
+				return false
+			}
+		case "s_winner":
+			if pankong(a.s_winner) && pankong(b.s_winner) && a.s_winner != b.s_winner {
+				return false
+			}
+		case "bidamount":
+			if !a.bidamount_isnull && !b.bidamount_isnull && a.bidamount != b.bidamount {
+				return false
+			}
+		case "projectcode":
+			if pankong(a.projectcode) && pankong(b.projectcode) && a.projectcode != b.projectcode {
+				return false
+			}
+		case "contractcode":
+			if pankong(a.contractcode) && pankong(b.contractcode) && a.contractcode != b.contractcode {
+				return false
+			}
+		}
+	}
+	return true
+}
+// Publish-time window checks.
+
+// publishtime_b_a is the check dataItem uses when sortType is "1": it reports
+// whether b.publishtime minus a.publishtime is below 86400*31*cycle. The
+// difference is signed, so a pair in which b is not later than a always
+// passes.
+// NOTE(review): presumably publishtime is in Unix seconds and cycle counts
+// 31-day months - confirm against the stored data.
+func publishtime_b_a(a, b dataSource) bool {
+	window := 86400 * 31 * cycle
+	gap := b.publishtime - a.publishtime
+	return gap < window
+}
+// publishtime_a_b is the check dataItem uses when sortType is not "1": it
+// reports whether a.publishtime minus b.publishtime is below 86400*31*cycle.
+// The difference is signed, so a pair in which a is not later than b always
+// passes.
+func publishtime_a_b(a, b dataSource) bool {
+	window := 86400 * 31 * cycle
+	gap := a.publishtime - b.publishtime
+	return gap < window
+}
+
+
+// pankong reports whether a is a non-empty string. No trimming is applied, so
+// a string made only of whitespace counts as non-empty.
+func pankong(a string) bool {
+	return a != ""
+}