Răsfoiți Sursa

Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

maxiaoshan 6 ani în urmă
părinte
comite
7cbe48b0d5

+ 1 - 1
src/jy/pretreated/analystep.go

@@ -225,7 +225,7 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 //ration==1 遍历所有tabs,ration!=1 tabs只有一个
 func tableDivideBlock(con string, ration float32, tabs []*goquery.Selection) string {
 	if len(tabs) != 1 {
-		//return ""
+		return ""//5c2aca5ea5cb26b9b7a8229b
 	}
 	for _, tab := range tabs {
 		content := ""

+ 266 - 228
src/jy/pretreated/analytable.go

@@ -207,21 +207,20 @@ func (table *Table) KVFilter() {
 	if !winnertag {
 		winnertag = iswinnertabletag.MatchString(table.Tag) && !nswinnertabletag.MatchString(table.TableResult.BlockTag) //块标签
 	}
-	table.analyTdKV() //1.td元素有内嵌kv,遍历放入table的Kv中2.td有子表格的处理,中标候选人排序
+	table.analyTdKV() //1.遍历每行每列td的sortkv添加到table.SortKV中;2.td有子表格的处理
 	as := NewSortMap()
-	//表格描述处理,对成交结果的处理
+
+	//遍历table.sortkv,进行过滤处理,并放入标准化KV中,如果值是数组跳到下一步处理
 	for _, k := range table.SortKV.Keys {
+		//表格描述处理,对成交结果的处理
 		if regexp.MustCompile("(成交|中标|候选|排名|名次|供应商排序)").MatchString(k) {
 			table.Desc += "成交结果,"
 		}
-	}
-	//遍历table.sortkv,进行过滤处理,并放入标准化KV中,如果值是数组跳到下一步处理
-	for _, k := range table.SortKV.Keys {
 		if regexp.MustCompile("^单价").MatchString(k) {
 			continue
 		}
 		v := table.SortKV.Map[k]
-		if _, ok := v.(string); ok {
+		if _, ok := v.(string); ok { //table.SortKV.Value为字符串,匹配抽取关键词table.SortKV.Key,匹配到添加k,v到table.StandKV,table.StandKVWeight
 			k = regSpliteSegment.ReplaceAllString(regReplAllSpace.ReplaceAllString(k, ""), "")
 			k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v) //对key标准化处理,没有找到会走中标
 			//qutil.Debug(k, v, k1, w1, v1, tag, b)
@@ -235,7 +234,7 @@ func (table *Table) KVFilter() {
 				if tag != "" && table.Tag == "" {
 					table.Tag = tag
 				}
-				for pos, k2 := range k1 {
+				for pos, k2 := range k1 { //根据关键词,过滤table.SortKV到table.StandKV和table.StandKVWeight
 					if table.StandKV[k2] == "" || w1[pos] > table.StandKVWeight[k2] {
 						table.StandKV[k2] = v1 //本节点
 						table.StandKVWeight[k2] = w1[pos]
@@ -254,9 +253,84 @@ func (table *Table) KVFilter() {
 		}
 	}
 
-	//处理值是数组的kv放入标准化kv中
+	//处理table.SortKV.value为数组的情况:将值是数组的kv放入标准化kv中
+	table.sortKVArr(as, winnertag)
+	//
+	if filterTableWror.MatchString(table.Tag) {
+		table.WinnerOrder = nil
+	}
+	//
+	if len(table.WinnerOrder) > 0 || !table.BPackage {
+		winnerOrder := []map[string]interface{}{}
+		maxSort := 0
+		//调整顺序
+		for i := 0; i < 2; i++ {
+			for _, v := range table.WinnerOrder {
+				sortstr, _ := v["sortstr"].(string)
+				if (i == 0 && sortstr == "") || (i == 1 && sortstr != "") {
+					continue
+				}
+				sort, _ := v["sort"].(int)
+				if i == 0 {
+					if maxSort == 0 || sort > maxSort {
+						maxSort = sort
+					}
+				} else {
+					maxSort++
+					v["sort"] = maxSort
+				}
+				winnerOrder = append(winnerOrder, v)
+			}
+			if len(winnerOrder) == len(table.WinnerOrder) {
+				break
+			}
+		}
+		table.WinnerOrder = winnerOrder
+		winnerOrder = []map[string]interface{}{}
+	L: //遍历每个td,查询中标人
+		for _, tr := range table.TRs {
+			for _, td := range tr.TDs {
+				winnerOrder = winnerOrderEntity.Find(td.Val, true, 3)
+				if len(winnerOrder) > 0 {
+					break L
+				}
+			}
+		}
+		if len(table.WinnerOrder) > 0 {
+			//中标候选人合并
+			winnerOrderEntity.Merge(table.WinnerOrder, winnerOrder)
+			if table.StandKV["中标单位"] == "" {
+				ent := table.WinnerOrder[0]["entname"]
+				if ent != nil {
+					table.StandKV["中标单位"], _ = ent.(string)
+					table.StandKVWeight["中标单位"] = -25
+				}
+			}
+		} else if !table.BPackage { //没有table.WinnerOrder也没有分包 将td中的WinnerOrder赋值给table.WinnerOrder
+			if len(winnerOrder) > 1 {
+				table.WinnerOrder = winnerOrder
+			}
+		}
+	}
+	//对中标候选人进行排序
+	winnerOrderEntity.Order(table.WinnerOrder)
+	//该表格有一个分包,并且有中标候选人排序的情况下,把中标候选人放到包里面
+	if table.BlockPackage != nil && table.BlockPackage.Keys != nil && len(table.BlockPackage.Keys) == 1 {
+		if table.BlockPackage.Map != nil {
+			onePkgKey := table.BlockPackage.Keys[0]
+			onePkg, _ := table.BlockPackage.Map[onePkgKey].(*u.BlockPackage)
+			if onePkg != nil && onePkg.WinnerOrder != nil && len(onePkg.WinnerOrder) == 0 {
+				onePkg.WinnerOrder = table.WinnerOrder
+				table.BlockPackage.Map[onePkgKey] = onePkg
+			}
+		}
+	}
+}
+
+//处理table.SortKV.value为数组的情况
+func (table *Table) sortKVArr(as *SortMap, winnertag bool) {
 	checkKey := map[int]bool{}
-	for kn, k := range as.Keys {
+	for kn, k := range as.Keys { //遍历table.SortKV.value为数组的key
 		v := as.Map[k]
 		if vm, ok := v.([]map[string]interface{}); ok && k == NullTxtBid {
 			if table.WinnerOrder == nil {
@@ -408,79 +482,9 @@ func (table *Table) KVFilter() {
 			}
 		}
 	}
-	//
-	if filterTableWror.MatchString(table.Tag) {
-		table.WinnerOrder = nil
-	}
-	//
-	if len(table.WinnerOrder) > 0 || !table.BPackage {
-		winnerOrder := []map[string]interface{}{}
-		maxSort := 0
-		//调整顺序
-		for i := 0; i < 2; i++ {
-			for _, v := range table.WinnerOrder {
-				sortstr, _ := v["sortstr"].(string)
-				if (i == 0 && sortstr == "") || (i == 1 && sortstr != "") {
-					continue
-				}
-				sort, _ := v["sort"].(int)
-				if i == 0 {
-					if maxSort == 0 || sort > maxSort {
-						maxSort = sort
-					}
-				} else {
-					maxSort++
-					v["sort"] = maxSort
-				}
-				winnerOrder = append(winnerOrder, v)
-			}
-			if len(winnerOrder) == len(table.WinnerOrder) {
-				break
-			}
-		}
-		table.WinnerOrder = winnerOrder
-		winnerOrder = []map[string]interface{}{}
-	L:
-		for _, tr := range table.TRs {
-			for _, td := range tr.TDs {
-				winnerOrder = winnerOrderEntity.Find(td.Val, true, 3)
-				if len(winnerOrder) > 0 {
-					break L
-				}
-			}
-		}
-		if len(table.WinnerOrder) > 0 {
-			//中标候选人合并
-			winnerOrderEntity.Merge(table.WinnerOrder, winnerOrder)
-			if table.StandKV["中标单位"] == "" {
-				ent := table.WinnerOrder[0]["entname"]
-				if ent != nil {
-					table.StandKV["中标单位"], _ = ent.(string)
-					table.StandKVWeight["中标单位"] = -25
-				}
-			}
-		} else if !table.BPackage { //没有table.WinnerOrder也没有分包 将td中的WinnerOrder赋值给table.WinnerOrder
-			if len(winnerOrder) > 1 {
-				table.WinnerOrder = winnerOrder
-			}
-		}
-	}
-	//对中标候选人进行排序
-	winnerOrderEntity.Order(table.WinnerOrder)
-	//该表格有一个分包,并且有中标候选人排序的情况下,把中标候选人放到包里面
-	if table.BlockPackage != nil && table.BlockPackage.Keys != nil && len(table.BlockPackage.Keys) == 1 {
-		if table.BlockPackage.Map != nil {
-			onePkgKey := table.BlockPackage.Keys[0]
-			onePkg, _ := table.BlockPackage.Map[onePkgKey].(*u.BlockPackage)
-			if onePkg != nil && onePkg.WinnerOrder != nil && len(onePkg.WinnerOrder) == 0 {
-				onePkg.WinnerOrder = table.WinnerOrder
-				table.BlockPackage.Map[onePkgKey] = onePkg
-			}
-		}
-	}
 }
 
-//1.td元素有内嵌kv,遍历放入table的Kv中2.td有子表格的处理,中标候选人排序
+//1.遍历每行每列td的sortkv添加到table.SortKV中;2.td有子表格的处理
 func (table *Table) analyTdKV() {
 	//遍历每一行
 	for _, tr := range table.TRs {
@@ -502,7 +506,7 @@ func (table *Table) analyTdKV() {
 						if td.HeadTd != nil && len([]rune(k3)) < 4 {
 							k3 = td.HeadTd.Val + k3
 						}
-						if table.SortKV.Map[k3] == nil {
+						if table.SortKV.Map[k3] == nil && _val != nil && _val != "" {
 							//u.Debug(k3, _val)
 							//if !thisFlag || (thisFlag && table.SortKV.Map[k3] == nil) {
 							table.SortKV.AddKey(k3, _val)
@@ -768,8 +772,8 @@ func (table *Table) createTabe(trs *goquery.Selection) {
 }
 
 //对table进行整体解析处理
-func (table *Table) AnalyTables(contactFormat *u.ContactFormat) []*Table {
-	ts := table.tableSubDemolitionTable() //分包,拆表
+func (tn *Table) AnalyTables(contactFormat *u.ContactFormat) []*Table {
+	ts := tn.tableSubDemolitionTable() //分包,拆表
 	for n, table := range ts {
 		//处理每个table
 		if len(table.TRs) > 0 {
@@ -1436,6 +1440,9 @@ func (table *Table) FindKV() {
 							}
 							if len(td.SortKV.Map) > 0 {
 								for tdk, tdv := range td.SortKV.Map {
+									if tdv == nil || tdv == "" {//value为空或者null不再添加到table.SortKV
+										continue
+									}
 									table.SortKV.AddKey(tdk, tdv)
 								}
 							}
@@ -1759,11 +1766,22 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 				if bvalfind {
 					vals[varrpos] = td.Val // += "__" + td.Val
 				} else {
-					vals = append(vals, td.Val)
-					val = vals
+					//添加时候去除空值和nil
+					newVals := []string{}
+					for _, isval := range vals {
+						if isval == "" {
+							continue
+						}
+						newVals = append(newVals, isval)
+					}
+					//vals = append(vals, td.Val)
+					if td.Val != "" {
+						newVals = append(newVals, td.Val)
+					}
+					val = newVals
 					varrpos = len(vals) - 1
 				}
-			} else if vals, ok := val.(string); ok {
+			} else if vals, ok := val.(string); ok && vals != "" && td.Val != "" {
 				if bvalfind {
 					val = td.Val //vals + "__" + td.Val
 				} else {
@@ -1786,6 +1804,9 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 			tkey := fmtkey("k", near.TR.RowPos, near.ColPos)
 			table.SortKV.ReplaceKey(key, val, tkey)
 		} else {
+			if val == nil || val == "" ||key=="采购项目预算金额"{
+				return
+			}
 			table.SortKV.AddKey(key, val)
 			//if table.SortKV.Map[key] != nil {
 			pos := table.SortKV.Index[key]
@@ -1871,10 +1892,10 @@ func (tn *Table) GetTdByRCNo(row, col int) *TD {
 
 //判断表格是否是分包
 func (tn *Table) CheckMultiPackageByTable() (b bool, index []string) {
-	pac := 0
-	val := 0
-	index = []string{} //初始化返回值index
-	index_pos := []int{}
+	pac := 0             //包的数量
+	val := 0             //分值
+	index = []string{}   //存储分包,使用table.SortKV的key和value使用正则等处理对值进行判断
+	index_pos := []int{} //下标
 	//是数组且能找到标段之类的提示
 	//arr_count := 0 //计数table.SortKV的value是数组的数量,后面没用
 	key_index := -1
@@ -1896,13 +1917,13 @@ func (tn *Table) CheckMultiPackageByTable() (b bool, index []string) {
 			val += 4
 		}
 		//根据table.SortKV的key判断是否分包,如果没有再根据value判断
-		val, index, index_pos = foundPacBySortKV(tn, val, index, index_pos, pac, &keyExistsCount, &commonKeyVals, key_index, hasPkgTd)
+		val, index, index_pos = foundPacBySortKV(tn, val, index, index_pos, &keyExistsCount, &commonKeyVals, key_index, hasPkgTd)
 	}
 	//	u.Debug(index)
 	//过滤重复及标准化!
 	standIndex := []string{}
 	standIndex_pos := []int{}
-	oldIndex := []string{}
+	oldIndex := []string{} //存放包的原始值
 	brepeat := map[string]bool{}
 	for k, v := range index {
 		v = u.PackageNumberConvert(v)
@@ -1910,7 +1931,7 @@ func (tn *Table) CheckMultiPackageByTable() (b bool, index []string) {
 			brepeat[v] = true
 			standIndex = append(standIndex, v)
 			standIndex_pos = append(standIndex_pos, index_pos[k])
-			oldIndex = append(oldIndex, v)
+			oldIndex = append(oldIndex, index[k])
 		}
 	}
 	index = standIndex
@@ -1930,163 +1951,175 @@ func (tn *Table) CheckMultiPackageByTable() (b bool, index []string) {
 		//多包解析
 		if b {
 			tn.BPackage = true
+			//根据数组index分包长度添加table.BlockPackage子包数组
 			for nk, v := range index {
 				if tn.BlockPackage.Map[v] == nil {
 					bp := &u.BlockPackage{}
-					bp.Index = v
-					bp.Origin = oldIndex[nk]
-					bp.TableKV = u.NewJobKv()
+					bp.Index = v                  //序号 (转换后编号,只有数字或字母)
+					bp.Origin = oldIndex[nk]      //包的原始值
+					bp.TableKV = u.NewJobKv()     //table kv (分出的对应的KV值)
 					tn.BlockPackage.AddKey(v, bp) //table子包数组
 				}
 			}
-			if len(index) == 1 { //是一个的情况
-				if len(tn.SortKV.Keys) < 10 && tn.ColNum < 10 && tn.RowNum < 4 {
-					beq := true
-					for _, v2 := range tn.SortKV.Map {
-						if _, ok := v2.(string); !ok {
-							beq = false
-							break
-						}
-					}
-					if beq { //统一处理为数组
-						td := tn.GetTdByRCNo(tn.RowNum-1, 0)
-						if !td.BH && FindVal2_1.MatchString(td.Val) {
-							for k2, v2 := range tn.SortKV.Map {
-								tn.SortKV.Map[k2] = []string{v2.(string)}
-							}
-						} else {
-							//没有处理成数组的情况下,继续调用正文查找分包的方法
-							isGoonNext = true
-						}
-					}
-				}
+			isGoonNext = tn.manyPackageProcessByIndex(index, standIndex_pos) //多包处理,处理不同情况下的分包
+		}
+	} else {
+		isGoonNext = true
+	}
+	if isGoonNext { //没有处理成数组的情况下,继续调用正文查找分包的方法
+		tn.isGoonNext()
+	}
+	//查找分包中的中标人排序
+	if tn.BlockPackage != nil && tn.BlockPackage.Map != nil && len(tn.BlockPackage.Map) > 0 {
+		for _, v := range tn.BlockPackage.Map {
+			vv := v.(*u.BlockPackage)
+			if vv.WinnerOrder == nil || len(vv.WinnerOrder) == 0 {
+				vv.WinnerOrder = winnerOrderEntity.Find(vv.Text, true, 2)
 			}
-			for _, k1 := range tn.SortKV.Keys {
-				v1 := tn.SortKV.Map[k1]
-				if _, bvs := v1.(string); bvs && len(index) > 1 && !strings.HasSuffix(k1, "_") {
-					v1_array := []string{v1.(string)}
-					underline := ""
-					for {
-						underline += "_"
-						if tn.SortKV.Map[k1+underline] == nil {
-							break
-						} else if v3, v2_ok := tn.SortKV.Map[k1+underline].(string); v2_ok && v3 != "" {
-							v1_array = append(v1_array, v3)
-						}
-					}
-					v1 = v1_array
+		}
+	}
+	return
+}
+
+//多包处理,处理不同情况下的分包
+func (tn *Table) manyPackageProcessByIndex(index []string, standIndex_pos []int, ) (isGoonNext bool) {
+	if len(index) == 1 { //是一个的情况
+		if len(tn.SortKV.Keys) < 10 && tn.ColNum < 10 && tn.RowNum < 4 { //table带排序的KV值小于10并且小于10列和小于4行
+			beq := true
+			for _, v2 := range tn.SortKV.Map {
+				if _, ok := v2.(string); !ok {
+					beq = false
+					break
 				}
-				if val, bvs := v1.([]string); bvs {
-					if len(val) <= len(index) {
-						for k, v := range val {
-							tn.assemblePackage(k1, v, index[k])
-						}
-					} else {
-						for sk1, sv2 := range index {
-							v := val[sk1]
-							//处理http://www.hljcg.gov.cn/xwzs!queryOneXwxxqx.action?xwbh=8145b599-a11e-45cb-a76a-12157a715570
-							if v == "" && strings.Index(k1, "供应商") > -1 {
-								if sk1 != len(index)-1 {
-									//u.Debug(val[sk1+1], val[sk1+2])
-									if standIndex_pos[sk1+1]-standIndex_pos[sk1] > 1 {
-										v = val[standIndex_pos[sk1]+1]
-									}
-								} else {
-									if standIndex_pos[sk1] < len(val)-1 {
-										v = val[standIndex_pos[sk1]+1]
-									}
-								}
-							}
-							tn.assemblePackage(k1, v, sv2)
-						}
-					}
-					//删除子包的kv
-					//u.Debug("----==1==-------", k1)
-					k1tags := u.GetTags(k1)
-					//if !(len(k1tags) > 0 && k1tags[0].Value == "采购单位") {
-					//	tn.SortKV.RemoveKey(k1)
-					//}
-					for _, vcgdw := range k1tags {
-						if vcgdw.Value == "采购单位" {
-							tn.SortKV.RemoveKey(k1)
-						}
-					}
-				} else if val, bvs := v1.(string); bvs && len(index) == 1 {
-					//删除子包的kv
-					k1tags, _, _, _, _ := CommonDataAnaly(k1, "", "", val)
-					if len(k1tags) > 0 && regexp.MustCompile("^(项目|开标|采购单位|招标机构)").MatchString(k1tags[0]) { //(k1tags[0].Value == "采购单位" || k1tags[0].Value == "项目编号")) {
-						//log.Println("remove", k1, val)
-						tn.assemblePackage(k1, val, index[0])
-						tn.SortKV.RemoveKey(k1)
+			}
+			if beq { //统一处理为数组
+				td := tn.GetTdByRCNo(tn.RowNum-1, 0)
+				if !td.BH && FindVal2_1.MatchString(td.Val) {
+					for k2, v2 := range tn.SortKV.Map {
+						tn.SortKV.Map[k2] = []string{v2.(string)}
 					}
-					//u.Debug("----==2==-------", k1)
+				} else {
+					//没有处理成数组的情况下,继续调用正文查找分包的方法
+					isGoonNext = true
 				}
-
 			}
 		}
-	} else {
-		isGoonNext = true
 	}
-	if isGoonNext {
-		blockPackage := map[string]*u.BlockPackage{}
-		for _, k := range tn.SortKV.Keys {
-			if excludeKey.MatchString(k) {
-				continue
+	for _, k1 := range tn.SortKV.Keys {
+		v1 := tn.SortKV.Map[k1]
+		if _, bvs := v1.(string); bvs && len(index) > 1 && !strings.HasSuffix(k1, "_") { //table.SortKV.Map.value为字符串并且index有分包而且table.SortKV.Map.key没有_
+			v1_array := []string{v1.(string)}
+			underline := ""
+			for {
+				underline += "_"
+				if tn.SortKV.Map[k1+underline] == nil {
+					break
+				} else if v3, v2_ok := tn.SortKV.Map[k1+underline].(string); v2_ok && v3 != "" {
+					v1_array = append(v1_array, v3)
+				}
 			}
-			str := ""
-			v := tn.SortKV.Map[k]
-			nk := regReplAllSpace.ReplaceAllString(k, "")
-			if vs, ok := v.([]string); ok {
-				str += fmt.Sprintf("%s:%s\n", nk, strings.Join(vs, " "))
+			v1 = v1_array
+		}
+		if val, bvs := v1.([]string); bvs {
+			if len(val) <= len(index) { //table.SortKV.Map.value数组小于等于分包index
+				for k, v := range val {
+					tn.assemblePackage(k1, v, index[k]) //组装解析到的分包
+				}
 			} else {
-				str += fmt.Sprintf("%s:%s\n", nk, v)
-			}
-			b, _ := divisionPackageChild(&blockPackage, str, tn.Tag, false, false)
-			if b && len(blockPackage) > 0 {
-				tn.BPackage = true
-				for mk, mv := range blockPackage {
-					if tn.BlockPackage.Map[mk] == nil {
-						tn.BlockPackage.AddKey(mk, mv)
-					} else {
-						bp := tn.BlockPackage.Map[mk].(*u.BlockPackage)
-						if bp.TableKV == nil {
-							bp.TableKV = u.NewJobKv()
-						}
-						for k2, v2 := range mv.ColonKV.Kv {
-							if bp.TableKV.Kv[k2] == "" {
-								bp.TableKV.Kv[k2] = v2
-								bp.TableKV.KvTag[k2] = mv.ColonKV.KvTag[k2]
-								bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
+				for sk1, sv2 := range index {
+					v := val[sk1]
+					//处理http://www.hljcg.gov.cn/xwzs!queryOneXwxxqx.action?xwbh=8145b599-a11e-45cb-a76a-12157a715570
+					if v == "" && strings.Index(k1, "供应商") > -1 {
+						if sk1 != len(index)-1 {
+							//u.Debug(val[sk1+1], val[sk1+2])
+							if standIndex_pos[sk1+1]-standIndex_pos[sk1] > 1 {
+								v = val[standIndex_pos[sk1]+1]
 							}
-						}
-						for k2, v2 := range mv.SpaceKV.Kv {
-							if bp.TableKV.Kv[k2] == "" {
-								bp.TableKV.Kv[k2] = v2
-								bp.TableKV.KvTag[k2] = mv.SpaceKV.KvTag[k2]
-								bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
+						} else {
+							if standIndex_pos[sk1] < len(val)-1 {
+								v = val[standIndex_pos[sk1]+1]
 							}
 						}
 					}
+					tn.assemblePackage(k1, v, sv2)
 				}
-				tn.BPackage = true
-				tn.SortKV.RemoveKey(k)
 			}
+			//删除子包的kv
+			//u.Debug("----==1==-------", k1)
+			k1tags := u.GetTags(k1) //取得匹配
+			//if !(len(k1tags) > 0 && k1tags[0].Value == "采购单位") {
+			//	tn.SortKV.RemoveKey(k1)
+			//}
+			for _, vcgdw := range k1tags {
+				if vcgdw.Value == "采购单位" {
+					tn.SortKV.RemoveKey(k1)
+				}
+			}
+		} else if val, bvs := v1.(string); bvs && len(index) == 1 {
+			//删除子包的kv
+			k1tags, _, _, _, _ := CommonDataAnaly(k1, "", "", val)
+			if len(k1tags) > 0 && regexp.MustCompile("^(项目|开标|采购单位|招标机构)").MatchString(k1tags[0]) { //(k1tags[0].Value == "采购单位" || k1tags[0].Value == "项目编号")) {
+				//log.Println("remove", k1, val)
+				tn.SortKV.RemoveKey(k1)
+				tn.assemblePackage(k1, val, index[0])
+			}
+			//u.Debug("----==2==-------", k1)
 		}
+
 	}
-	//查找分包中的中标人排序
-	if tn.BlockPackage != nil && tn.BlockPackage.Map != nil && len(tn.BlockPackage.Map) > 0 {
-		for _, v := range tn.BlockPackage.Map {
-			vv := v.(*u.BlockPackage)
-			if vv.WinnerOrder == nil || len(vv.WinnerOrder) == 0 {
-				vv.WinnerOrder = winnerOrderEntity.Find(vv.Text, true, 2)
+	return isGoonNext
+}
+
+//没有处理成数组的情况下,继续调用正文查找分包的方法
+func (tn *Table) isGoonNext() {
+	blockPackage := map[string]*u.BlockPackage{}
+	for _, k := range tn.SortKV.Keys {
+		if excludeKey.MatchString(k) {
+			continue
+		}
+		str := "" //拼装为冒号kv
+		v := tn.SortKV.Map[k]
+		nk := regReplAllSpace.ReplaceAllString(k, "")
+		if vs, ok := v.([]string); ok {
+			str += fmt.Sprintf("%s:%s\n", nk, strings.Join(vs, " "))
+		} else {
+			str += fmt.Sprintf("%s:%s\n", nk, v)
+		}
+		b, _ := divisionPackageChild(&blockPackage, str, tn.Tag, false, false) //分块之后分包
+		if b && len(blockPackage) > 0 {
+			tn.BPackage = true
+			for mk, mv := range blockPackage {
+				if tn.BlockPackage.Map[mk] == nil {
+					tn.BlockPackage.AddKey(mk, mv)
+				} else {
+					bp := tn.BlockPackage.Map[mk].(*u.BlockPackage)
+					if bp.TableKV == nil {
+						bp.TableKV = u.NewJobKv()
+					}
+					for k2, v2 := range mv.ColonKV.Kv {
+						if bp.TableKV.Kv[k2] == "" {
+							bp.TableKV.Kv[k2] = v2
+							bp.TableKV.KvTag[k2] = mv.ColonKV.KvTag[k2]
+							bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
+						}
+					}
+					for k2, v2 := range mv.SpaceKV.Kv {
+						if bp.TableKV.Kv[k2] == "" {
+							bp.TableKV.Kv[k2] = v2
+							bp.TableKV.KvTag[k2] = mv.SpaceKV.KvTag[k2]
+							bp.Text += fmt.Sprintf("%v:%v\n", k2, v2)
+						}
+					}
+				}
 			}
+			tn.BPackage = true
+			tn.SortKV.RemoveKey(k)
 		}
 	}
-	return
 }
 
 //根据table.SortKV的key判断是否分包,如果没有再根据value判断
-func foundPacBySortKV(tn *Table, val int, index []string, index_pos []int, pac int, keyExistsCount *map[string]int, commonKeyVals *map[string][]string, key_index int, hasPkgTd map[string]bool) (rval int, rindex []string, rindex_pos []int) {
+func foundPacBySortKV(tn *Table, val int, index []string, index_pos []int, keyExistsCount *map[string]int, commonKeyVals *map[string][]string, key_index int, hasPkgTd map[string]bool) (rval int, rindex []string, rindex_pos []int) {
 	keyIsPkg := false
 	for in, k := range tn.SortKV.Keys {
 		if excludeKey.MatchString(BracketsTextReg.ReplaceAllString(k, "")) { //判断分包前排除
@@ -2103,7 +2136,7 @@ func foundPacBySortKV(tn *Table, val int, index []string, index_pos []int, pac i
 				index = append(index, pkgFlag)
 				index_pos = append(index_pos, len(index))
 				val += 1
-				pac++
+				//pac++
 			} else {
 				k = strings.TrimRight(k, "_")
 			}
@@ -2129,7 +2162,7 @@ func foundPacBySortKV(tn *Table, val int, index []string, index_pos []int, pac i
 						index = append(index, v1)
 						index_pos = append(index_pos, in2)
 						val += 1
-						pac++
+						//pac++
 					}
 				}
 			} else if v1, ok := v.(string); ok && !hasPkgTd[k] {
@@ -2141,7 +2174,7 @@ func foundPacBySortKV(tn *Table, val int, index []string, index_pos []int, pac i
 						index = append(index, v1)
 						index_pos = append(index_pos, 0)
 						val += 1
-						pac++
+						//pac++
 						underline := ""
 						for {
 							underline += "_"
@@ -2221,7 +2254,7 @@ func initCheckMultiPackageByTable(tn *Table, key_index int, index []string, inde
 				index_pos = append(index_pos, 0)
 				val += 1
 				pac++
-			} else if getTd := tn.GetTdByRCNo(0, tn.SortKV.Index[k]); getTd != nil && getTd.KVDirect == 2 {
+			} else if getTd := tn.GetTdByRCNo(0, tn.SortKV.Index[k]); getTd != nil && getTd.KVDirect == 2 { //纵向
 				/*处理这种情况:
 				<tr><td>包一:xxxxxxxxx</td></tr>
 				*/
@@ -2239,19 +2272,24 @@ func initCheckMultiPackageByTable(tn *Table, key_index int, index []string, inde
 	return key_index, index, index_pos, val, pac, hasPkgTd
 }
 
-//组装解析到的分包
+//组装解析到的分包;key如果匹配到抽取关键词就添加到table.SortKV
 func (tn *Table) assemblePackage(k1, v1, key string) {
 	bp := tn.BlockPackage.Map[key].(*u.BlockPackage)
 	if bp.TableKV == nil {
 		bp.TableKV = u.NewJobKv()
 	}
 	if v1 != "" {
-		k2, w1, v2, _, bf := CommonDataAnaly(k1, "中标情况", "", v1)
+		k2, w1, v2, _, bf := CommonDataAnaly(k1, "中标情况", "", v1) //匹配抽取关键词
 		if bf {
 			for pos, k3 := range k2 {
 				if bp.TableKV.Kv != nil && bp.TableKV.KvTag[k3] != nil && (bp.TableKV.Kv[k3] == "" || w1[pos] > bp.TableKV.KvTag[k3].Weight) {
 					bp.TableKV.Kv[k3] = v2
 					bp.TableKV.KvTag[k3] = &u.Tag{Value: v2, Weight: w1[pos]}
+				} else {
+					bp.TableKV.Kv[k1] = qutil.ObjToString(v1)
+					if tn.SortKV.Map[k3] == nil {
+						tn.SortKV.AddKey(k3, v2) //添加匹配到抽取关键词的key,value
+					}
 				}
 			}
 		} else {
@@ -3109,7 +3147,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 	for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
-		key = strings.Replace(key, "", "", -1)    //处理一个特殊的采购量 经上层处理空格后未处理掉
+		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*
 				{

+ 4 - 2
src/jy/pretreated/tablev2.go

@@ -331,7 +331,9 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) {
 		}
 		_, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3) //td冒号kv
 		for k, v := range resm {
-			td.SortKV.AddKey(k, v) //存放kv值
+			if k != "" && v != "" {
+				td.SortKV.AddKey(k, v) //存放kv值
+			}
 		}
 		//u.Debug(td.SortKV.Keys, "-------2--------------------------------")
 		//		td.SortKV = FindKv(text, "") //GetKvFromtxt(text, "")
@@ -345,7 +347,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) {
 		} else if !bsontable {
 			txt := repSpace.ReplaceAllString(td.Val, "")
 			btw, must, _, _, repl := CheckHeader(txt)
-			if lenval > 15 {
+			if lenval > 15 && !strings.Contains(txt, "采购代理机构名称、地址和联系方式") {
 				btw = false
 			}
 			if strings.Contains(td.Val, "个项目") {

+ 11 - 0
src/res/ext_v3_dump.sh

@@ -0,0 +1,11 @@
+#!/bin/bash
+dbhost="127.0.0.1:27082" # host:port handed to mongodump -h
+dbname="extract_v3" # database whose collections are exported
+datapath="/opt/soft/mongodb/mongodb3.4/bin" # output root handed to mongodump -o
+tables=(audit citys classify cleanup fields province rc_calss rc_field rc.order rc_rule rule_back rule_code rule_logic rule_logicback rule_logicore rule_logicpre rule_pre tag tagdetailinfo version versioninfo) # collections to export
+# Export every collection listed in tables, one mongodump invocation per collection.
+for i in "${!tables[@]}"; # iterate over indices so the progress line can print the position
+do
+    ./mongodump -h "$dbhost" -d "$dbname" -c "${tables[$i]}" -o "$datapath" || { printf "export failed:%s\t%s\n" "$i" "${tables[$i]}" >&2; continue; } # ./mongodump is resolved against the current directory; a failed collection is reported on stderr and skipped instead of being logged as exported
+    printf "export:%s\t%s\n" "$i" "${tables[$i]}" # reached only when mongodump exited 0
+done

+ 7 - 0
src/res/ext_v3_import.sh

@@ -0,0 +1,7 @@
+#!/bin/bash
+dbhost="127.0.0.1:27082" # host:port handed to mongorestore -h
+dbname="extract_v3" # target database; --drop below drops each collection before restoring it
+datapath="/opt/soft/mongodb/mongodb3.4/bin/extract_v3" # directory holding the dump to restore (presumably written by ext_v3_dump.sh - confirm)
+./mongorestore -h "$dbhost" -d "$dbname" "$datapath" --drop || { printf "import failed\n" >&2; exit 1; } # ./mongorestore is resolved against the current directory; abort with a non-zero status instead of claiming success
+printf "import ok\n" # reached only when mongorestore exited 0
+

+ 1 - 0
udpprojectset/src/compare.go

@@ -23,6 +23,7 @@ import (
 **/
 type ProjectInfo struct {
 	Id            string                 `json:"id"`
+	IdInc         string                 `json:"_id"`        //补漏使用
 	Publistime    []int64                `json:"publistime"` //多条信息的发布时间、跨度
 	InfoType      [][]string             `json:"infotype"`   //多条信息内的 toptype、subtype
 	Ids           []string               `json:"ids"`

+ 8 - 5
udpprojectset/src/config.json

@@ -8,10 +8,10 @@
     },
     "thread": 1,
     "extractColl": "bidding20190521",
-    "projectColl": "projectset",
+    "projectColl": "projectset_inc",
     "lenprojectname": 18,
     "redisPoolSize": 60,
-    "redisaddrs": "ids=192.168.3.18:3379,keys=192.168.3.18:3379,info=192.168.3.18:3379",
+    "redisaddrs": "ids=127.0.0.1:6379,keys=127.0.0.1:6379,info=127.0.0.1:6379",
     "clearedis": {
         "open": true,
         "clearcron": "0 10 15 ? * 4",
@@ -23,9 +23,12 @@
     },
     "taskstock": {
         "open": true,
-		"startTime":1325347200,
-        "startdate": "2015-11-01",
-        "endate": "2019-06-30"
+        "startTime": 1446543672
+    },
+    "insertmeger": {
+        "omitmax": 10000,
+        "deviationday": 90,
+        "hourinterval": 240
     },
     "udpport": ":1482",
     "nextNode": [

+ 12 - 114
udpprojectset/src/main.go

@@ -42,6 +42,7 @@ var (
 
 	currentMegerTime  int64 //合并项目的时间位置,用来清理几个月之前的项目
 	currentMegerCount int   //合并项目的计数,用来定时清理
+
 )
 
 type MegerFields struct {
@@ -78,6 +79,13 @@ func init() {
 		ProjectNamelen: util.IntAllDef(megerfields["projectlen"], 5),
 		ProjectCodelen: util.IntAllDef(megerfields["projectcodelen"], 8),
 	}
+	//插入合并参数
+	if insertmeger, ok := Sysconfig["insertmeger"].(map[string]interface{}); ok {
+		OmitNumMax = util.Int64All(insertmeger["omitmax"])
+		DeviationDay = util.Int64All(insertmeger["deviationday"])
+		HourInterval = util.Int64All(insertmeger["hourinterval"])
+	}
+
 	redis.InitRedisBySize(Sysconfig["redisaddrs"].(string), util.IntAllDef(Sysconfig["redisPoolSize"], 100), 30, 300)
 	MQFW = mongodb.MongodbSim{
 		MongodbAddr: Sysconfig["mongodbServers"].(string),
@@ -144,17 +152,11 @@ func main() {
 		}
 	}
 	log.Println("load data from redis finished.", n)
-	//清理redis
-	//clearedis()
 
 	if taskstock, ok := Sysconfig["taskstock"].(map[string]interface{}); ok { //跑存量数据
 		if b, _ := taskstock["open"].(bool); b {
 			RunFullData(util.Int64All(taskstock["startTime"]))
-			//			startdate, _ := taskstock["startdate"].(string)
-			//			endate, _ := taskstock["endate"].(string)
-			//			taskStock(startdate, endate)
 		}
-
 	}
 	updport := Sysconfig["udpport"].(string)
 	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}
@@ -206,7 +208,7 @@ func taskInc(mapInfo map[string]interface{}) {
 	sess := MQFW.GetMgoConn()
 	defer MQFW.DestoryMongoConn(sess)
 	//数据正序处理
-	it := sess.DB(MQFW.DbName).C(extractColl).Find(map[string]interface{}{}).Sort("publishtime").Iter()
+	it := sess.DB(MQFW.DbName).C(extractColl).Find(q).Sort("publishtime").Iter()
 	count, index := 0, 0
 	pici := time.Now().Unix()
 	wg := &sync.WaitGroup{}
@@ -221,6 +223,9 @@ func taskInc(mapInfo map[string]interface{}) {
 			continue
 		}
 		pt := util.Int64All(tmp["publishtime"])
+		if time.Now().Unix()-DeviationDay*86400 > pt { //DeviationDay前的数据不处理,走插入合并
+			continue
+		}
 		if pt > currentMegerTime {
 			currentMegerTime = pt
 		}
@@ -293,113 +298,6 @@ func taskInc(mapInfo map[string]interface{}) {
 	}
 }
 
-func taskStock(startDate, endDate string) {
-	defer func() {
-		<-SingleThread
-	}()
-	defer util.Catch()
-	publishtimes := []map[string]interface{}{}
-	start, _ := time.ParseInLocation(util.Date_Short_Layout, startDate, time.Local)
-	end, _ := time.ParseInLocation(util.Date_Short_Layout, endDate, time.Local)
-	for {
-		publishtime := map[string]interface{}{
-			"date":  start.Format(util.Date_Short_Layout),
-			"stime": start.Unix(),
-			"etime": start.Add(24 * time.Hour).Unix(),
-		}
-		publishtimes = append(publishtimes, publishtime)
-		start = start.Add(24 * time.Hour)
-		if start.Unix() > end.Unix() {
-			break
-		}
-	}
-	sess := MQFW.GetMgoConn()
-	defer MQFW.DestoryMongoConn(sess)
-	wg := &sync.WaitGroup{}
-	idmap := &sync.Map{}
-	count, index := 0, 0
-	for _, v := range publishtimes {
-		q := map[string]interface{}{
-			"publishtime": map[string]interface{}{
-				"$gt":  util.Int64All(v["stime"]),
-				"$lte": util.Int64All(v["etime"]),
-			},
-		}
-		log.Println(q)
-		//数据正序处理
-		it := sess.DB(MQFW.DbName).C(extractColl).Find(&q).Sort("publishtime").Iter()
-		datenum := 0
-		for tmp := make(map[string]interface{}); it.Next(tmp); {
-			if index%10000 == 0 {
-				log.Println(index, tmp["_id"])
-			}
-			index++
-			datenum++
-			if util.IntAll(tmp["repeat"]) == 1 {
-				tmp = make(map[string]interface{})
-				continue
-			}
-			pt := util.Int64All(tmp["publishtime"])
-			if pt > currentMegerTime {
-				currentMegerTime = pt
-			}
-			count++
-			currentMegerCount++
-			if currentMegerCount > 300000 {
-				log.Println("执行清理", currentMegerTime)
-				time.Sleep(1 * time.Second)
-				clearPKey()
-				currentMegerCount = 0
-			}
-			thisid := util.BsonIdToSId(tmp["_id"])
-			b, err := redis.Exists(INFOID, thisid)
-			if err != nil {
-				log.Println("checkid err", err.Error())
-			}
-			if !b {
-				wg.Add(1)
-				idmap.Store(tmp["_id"], true) //增加判重逻辑,重复id不再生成
-				MultiThread <- true
-				go func(tmp map[string]interface{}, thisid string) {
-					defer func() {
-						<-MultiThread
-						wg.Done()
-						idmap.Delete(tmp["_id"])
-					}()
-					info := PreThisInfo(tmp)
-					if info != nil {
-						lockPNCBMap(info)
-						storeLock(info)
-						startProjectMerge(info, tmp)
-						redis.Put(INFOID, thisid, 1, INFOTIMEOUT)
-						currentMegerTime = info.Publishtime
-						unlockPNCBMap(info)
-					}
-				}(tmp, thisid)
-			}
-			if count%1000 == 0 {
-				log.Println("count:", count)
-			}
-			tmp = make(map[string]interface{})
-		}
-		log.Println(v["date"], datenum)
-	}
-	for {
-		time.Sleep(5 * time.Second)
-		n := 0
-		idmap.Range(func(key interface{}, v interface{}) bool {
-			n++
-			log.Println(key, v)
-			return true
-		})
-		if n < 1 {
-			break
-		}
-	}
-	wg.Wait()
-	log.Println("taskStock over...", index, count)
-}
-
 func NewPushInfo(tmp map[string]interface{}) bson.M {
 	return bson.M{
 		"comeintime":  tmp["comeintime"],

+ 20 - 10
udpprojectset/src/projectmeger.go

@@ -16,6 +16,7 @@ import (
 
 //有效值三选一、三选二
 var ThreeToTow, ThreeToOne map[string]bool
+var ThreeToLock = &sync.Mutex{}
 
 func init() {
 	ThreeToTow = map[string]bool{}
@@ -139,10 +140,19 @@ func noBuyer(p PCBV, thisinfo *Info, tmp map[string]interface{}) {
 			sflag = mergeProject(tmp, thisinfo, scores, pncb)
 		} else { //无项目编号
 			if p.PnameLen > MegerFieldsLen.ProjectNamelen {
-				//三选一打分
-				scores := score3Select1(p, thisinfo, tmp, res, pncb)
-				//项目合并
-				sflag = mergeProject(tmp, thisinfo, scores, pncb)
+				if p.Area && p.City && p.Agency {
+					//三选一打分
+					scores := score3Select1(p, thisinfo, tmp, res, pncb)
+					//项目合并
+					sflag = mergeProject(tmp, thisinfo, scores, pncb)
+				} else {
+					sflag = "alone"
+					mess := map[string]interface{}{
+						"meger_mess":  "无采购单位,不满足三选一",
+						"meger_sflag": sflag,
+					}
+					newProject(tmp, mess, pici, thisinfo)
+				}
 			} else {
 				//生成项目,不参与后续对比
 				sflag = "alone"
@@ -252,9 +262,11 @@ func score3Select2(p PCBV, thisinfo *Info, tmp map[string]interface{}, res []int
 						}
 						skey := fmt.Sprintf("%s%s%s", cone.BuyerType, cone.ProjectNameType, cone.ProjectCodeType)
 						cone.Cresult = skey
+						ThreeToLock.Lock()
 						if ThreeToTow[skey] {
 							scores = append(scores, cone)
 						}
+						ThreeToLock.Unlock()
 					}
 				}
 			}
@@ -344,11 +356,13 @@ func score3Select1(p PCBV, thisinfo *Info, tmp map[string]interface{}, res []int
 						} else { //D不计分
 							cone.AgencyType = "D"
 						}
-						skey := fmt.Sprintf("%s%s%s", cone.BuyerType, cone.ProjectNameType, cone.ProjectCodeType)
+						skey := fmt.Sprintf("%s%s%s%s", cone.ProjectNameType, cone.ProjectCodeType, cone.AreaType, cone.AgencyType)
 						cone.Cresult = skey
+						ThreeToLock.Lock()
 						if ThreeToOne[skey] {
 							scores = append(scores, cone)
 						}
+						ThreeToLock.Unlock()
 					}
 				}
 			}
@@ -405,10 +419,6 @@ func getComeperProjects(p PCBV, thisinfo *Info) (res []interface{}, pncb []*Comp
 //合并项目
 func mergeProject(tmp map[string]interface{}, thisinfo *Info, scores []*CompareOne, pncb []*CompareInfo) (sflag string) {
 	var id = ""
-	//分值排序
-	sort.Slice(scores, func(i, j int) bool {
-		return scores[i].Score > scores[j].Score
-	})
 	if len(scores) > 0 {
 		//分值排序
 		sort.Slice(scores, func(i, j int) bool {
@@ -416,7 +426,7 @@ func mergeProject(tmp map[string]interface{}, thisinfo *Info, scores []*CompareO
 		})
 		max := scores[0]
 		if max.Score > 0 {
-			sflag = "repeat"
+			sflag = "normal"
 			max.Parent.Bfind = true
 			tmp["cresult"] = max.Cresult
 			tmp["score"] = max.Score

+ 601 - 0
udpprojectset/src/projectmegerinsert.go

@@ -0,0 +1,601 @@
+// projectmegerinsert
+package main
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	du "jy/util"
+	"log"
+	qu "qfw/util"
+	"sort"
+	"strings"
+	"sync"
+	"time"
+
+	"gopkg.in/mgo.v2/bson"
+)
+
+// Merge strategies assigned to a record by isMegerProjectAndProcess.
+const (
+	Select3To2   = iota // merge with the "two out of three" rule
+	Select3To1          // merge with the "one out of three" rule
+	AloneProject        // stand-alone project that needs checking
+	InvalidInfo         // invalid record
+)
+
+var (
+	// OmitNumMax: maximum number of missed records pulled per segment.
+	// DeviationDay: correction window, in days, for the project time query.
+	// HourInterval: polling interval, in hours.
+	OmitNumMax, DeviationDay, HourInterval int64 // max missed records, query time correction window, polling interval
+	// InfoRScore collects the scoring results per record, keyed by record id.
+	InfoRScore                             = map[string][]*RScoreInfo{}
+	// PnAll / PcAll / PbAll map "pn_"+project name, "pc_"+project code and
+	// "pb_"+buyer to the ids of the projects carrying that value.
+	PnAll, PcAll, PbAll                    = map[string][]string{}, map[string][]string{}, map[string][]string{}
+)
+
+// RScoreInfo is one comparison result between a record and a candidate project.
+type RScoreInfo struct { // comparison result set
+	Id              string // record id
+	Pid             string // project id
+	Pkey            string // which index produced the candidate: "pn", "pc" or "pb"
+	Score           int    // accumulated score of the comparison
+	ProjectNameType string // match grade of the project name
+	ProjectCodeType string // match grade of the project code
+	BuyerType       string // match grade of the buyer
+	AreaType        string // match grade of area/city
+	AgencyType      string // match grade of the agency
+	Cresult         string // concatenated grades, looked up in ThreeToTow / ThreeToOne
+	Info            *Info  // the record being merged
+	Pinfo           *ProjectInfo // the candidate project
+}
+
+// MegerInfo is one segment of records waiting to be merged.
+type MegerInfo struct { // segment of data to merge
+	StartPublishtime int64 // publish time of the first record of the segment
+	EndPublishtime   int64 // publish time of the most recently read record
+	Num              int   // number of records in Minfo
+	Minfo            []*Info
+	Lock             *sync.Mutex
+}
+
+// StartId is the hex _id given with the -StartId flag; only records with a
+// greater _id are loaded by getOmitData.
+var StartId string
+
+// main_inc is the entry point of the incremental merge: it parses the
+// -StartId flag and starts the merge loop only when a start id was supplied.
+func main_inc() {
+	flag.StringVar(&StartId, "StartId", "", "开始_id")
+	flag.Parse()
+	log.Println("StartId", StartId)
+	if StartId != "" {
+		startInsertMeger(StartId)
+	}
+}
+
+// startInsertMeger runs one round of the insert-merge: it loads the missed
+// records in segments, merges every segment that is large enough, and then
+// schedules itself again after HourInterval hours.
+func startInsertMeger(startId string) {
+	for _, segment := range getOmitData(startId) {
+		if OmitNumMax > int64(segment.Num) {
+			log.Println("分段信息量太小,不执行", segment.Num)
+			continue
+		}
+		getPncbKey(segment)
+		compareMeger(segment)
+		// Scoring results belong to one segment only; start the next one clean.
+		InfoRScore = map[string][]*RScoreInfo{}
+	}
+	rerun := func() {
+		startInsertMeger(startId)
+	}
+	time.AfterFunc(time.Duration(HourInterval)*time.Hour, rerun)
+}
+
+// getOmitData loads the records with _id greater than startId, ordered by
+// publishtime, and groups the ones that take part in merging into a segment.
+//
+// Records flagged as repeats or already carrying meger_sflag are skipped.
+// Records that do not take part in merging are handled immediately: an
+// AloneProject record becomes a stand-alone project, an InvalidInfo record is
+// tagged "invalid". Reading stops as soon as the current segment spans more
+// than 2*DeviationDay days or holds OmitNumMax records, so at most one
+// segment is returned per call. An invalid startId yields an empty result.
+func getOmitData(startId string) (list []*MegerInfo) {
+	log.Println("加载分段信息")
+	// bson.ObjectIdHex panics on malformed input; startId comes from the command line.
+	if !bson.IsObjectIdHex(startId) {
+		log.Println("getOmitData invalid startId", startId)
+		return
+	}
+	q := map[string]interface{}{
+		"_id": map[string]interface{}{
+			"$gt": bson.ObjectIdHex(startId),
+		},
+	}
+	log.Println(MQFW.DbName, extractColl, q)
+	sess := MQFW.GetMgoConn()
+	defer MQFW.DestoryMongoConn(sess)
+	// Process the data in ascending publish order.
+	it := sess.DB(MQFW.DbName).C(extractColl).Find(q).Sort("publishtime").Iter()
+	// The loop below stops early, so the cursor must be closed explicitly;
+	// Close also reports any error hit while iterating.
+	defer func() {
+		if err := it.Close(); err != nil {
+			log.Println("getOmitData iter err", err.Error())
+		}
+	}()
+	minfo := &MegerInfo{Lock: &sync.Mutex{}, Minfo: []*Info{}}
+	var lastId string
+	for tmp := make(map[string]interface{}); it.Next(tmp); {
+		if qu.IntAll(tmp["repeat"]) == 1 {
+			continue
+		}
+		if tmp["meger_sflag"] != nil {
+			continue
+		}
+		b, mv := isMegerProjectAndProcess(tmp)
+		if !b { // does not take part in merging
+			switch mv {
+			case AloneProject:
+				// Create a stand-alone project. PreThisInfo may return nil
+				// (see the check further down), which must not be dereferenced.
+				if thisinfo := PreThisInfo(tmp); thisinfo != nil {
+					newProjectInc(tmp, map[string]interface{}{"meger_sflag": "alone"}, time.Now().Unix(), thisinfo)
+					// The document was handed over; do not let the next
+					// it.Next overwrite it in place.
+					tmp = make(map[string]interface{})
+				}
+			case InvalidInfo:
+				// Invalid record: only tag it.
+				extInfoTag("invalid", qu.BsonIdToSId(tmp["_id"]))
+			}
+			continue
+		}
+		this := PreThisInfo(tmp)
+		if this == nil {
+			continue
+		}
+		this.MererInc = mv
+		this.Data = tmp
+		tmp = make(map[string]interface{})
+		if minfo.StartPublishtime == 0 {
+			minfo.StartPublishtime = this.Publishtime
+		}
+		// Always track the latest publish time so that a one-record segment
+		// does not keep EndPublishtime at zero.
+		minfo.EndPublishtime = this.Publishtime
+		// Segment boundary.
+		if (minfo.EndPublishtime-minfo.StartPublishtime)/int64(86400) > DeviationDay*int64(2) || OmitNumMax <= int64(minfo.Num) {
+			log.Println(len(list), "段信息加载完成,信息量", minfo.Num)
+			list = append(list, minfo)
+			minfo = &MegerInfo{}
+			break
+		}
+		lastId = this.Id
+		minfo.Minfo = append(minfo.Minfo, this)
+		minfo.Num += 1
+	}
+	if minfo.Num > 0 {
+		list = append(list, minfo)
+	}
+	log.Println("getOmitData lastId", lastId)
+	return
+}
+
+// getPncbKey loads into PnAll, PcAll and PbAll the ids of the projects whose
+// lastpublishtime lies around the end of the given segment.
+//
+// The window is [End-DeviationDay, End+DeviationDay] days; when that would
+// reach into the future it is moved back so that it ends now and keeps the
+// same width. Only projects with meger_sflag == "normal" are indexed, each
+// under "pn_"+projectname, "pc_"+projectcode and "pb_"+buyer when the
+// respective value is non-empty.
+func getPncbKey(minfo *MegerInfo) {
+	log.Println("pncb key 开始加载")
+	// Compute the time window.
+	var startTime, endTime int64
+	now := time.Now().Unix()
+	window := DeviationDay * 86400
+	if minfo.EndPublishtime+window <= now {
+		startTime = minfo.EndPublishtime - window
+		endTime = minfo.EndPublishtime + window
+	} else {
+		startTime = minfo.EndPublishtime - window - (minfo.EndPublishtime + window - now)
+		endTime = now
+	}
+	q := map[string]interface{}{
+		"lastpublishtime": map[string]interface{}{
+			"$gte": startTime,
+			"$lte": endTime,
+		},
+	}
+	log.Println("getPncbKey", q, startTime, endTime)
+	// Load pn, pc, pb into memory.
+	sess := MQFW.GetMgoConn()
+	defer MQFW.DestoryMongoConn(sess)
+	it := sess.DB(MQFW.DbName).C(projectColl).Find(q).Sort("lastpublishtime").Iter()
+	defer func() {
+		if err := it.Close(); err != nil {
+			log.Println("getPncbKey iter err", err.Error())
+		}
+	}()
+	for tmp := make(map[string]interface{}); it.Next(tmp); {
+		if qu.ObjToString(tmp["meger_sflag"]) != "normal" {
+			continue
+		}
+		pn := "pn_" + qu.ObjToString(tmp["projectname"])
+		pc := "pc_" + qu.ObjToString(tmp["projectcode"])
+		pb := "pb_" + qu.ObjToString(tmp["buyer"])
+		pid := qu.BsonIdToSId(tmp["_id"])
+		// A key longer than its 3-byte prefix means the field is non-empty.
+		if len(pn) > 3 {
+			PnAll[pn] = append(PnAll[pn], pid)
+		}
+		if len(pc) > 3 {
+			// Append to the code index itself (was built from PnAll).
+			PcAll[pc] = append(PcAll[pc], pid)
+		}
+		if len(pb) > 3 {
+			// Append to the buyer index itself (was built from PnAll).
+			PbAll[pb] = append(PbAll[pb], pid)
+		}
+	}
+	log.Println("pncb key 加载完成", "pn:", len(PnAll), "pb:", len(PbAll), "pc:", len(PcAll))
+}
+
+// compareMeger scores every record of the segment against the projects found
+// through the name, code and buyer indexes, then either merges the record
+// into the best-scoring project or creates a new project for it.
+//
+// After a merge the project id is kept registered under the index key that
+// produced the match; after creating a project its id is registered under
+// every non-empty key of the record so later records can find it.
+func compareMeger(minfo *MegerInfo) {
+	// addID appends id to index[key] unless it is already listed there.
+	addID := func(index map[string][]string, key, id string) {
+		for _, have := range index[key] {
+			if have == id {
+				return
+			}
+		}
+		index[key] = append(index[key], id)
+	}
+	for _, info := range minfo.Minfo {
+		compareScoreMeger("pn", PnAll["pn_"+info.ProjectName], info)
+		compareScoreMeger("pc", PcAll["pc_"+info.ProjectCode], info)
+		compareScoreMeger("pb", PbAll["pb_"+info.Buyer], info)
+
+		infors := InfoRScore[info.Id]
+		sort.Slice(infors, func(i, j int) bool {
+			return infors[i].Score > infors[j].Score
+		})
+		if len(infors) > 0 { // merge into the best candidate
+			best := infors[0]
+			best.Info.Data["cresult"] = best.Cresult
+			best.Info.Data["score"] = best.Score
+			id := updateinfoInc(best.Info, best.Info.Data, best.Pinfo)
+			log.Println("合并项目", best.Info.ProjectName, best.Info.ProjectCode, best.Info.Buyer)
+			// Index keys are "<pkey>_<value>"; the separator was missing
+			// here, so the lookup could never hit an existing entry.
+			switch best.Pkey {
+			case "pn":
+				if key := "pn_" + best.Info.ProjectName; len(PnAll[key]) > 0 {
+					addID(PnAll, key, id)
+				}
+			case "pc":
+				if key := "pc_" + best.Info.ProjectCode; len(PcAll[key]) > 0 {
+					addID(PcAll, key, id)
+				}
+			case "pb":
+				if key := "pb_" + best.Info.Buyer; len(PbAll[key]) > 0 {
+					addID(PbAll, key, id)
+				}
+			}
+			continue
+		}
+		// No candidate: create a new project.
+		id := newProjectInc(
+			info.Data,
+			map[string]interface{}{
+				"meger_sflag": "normal",
+			},
+			time.Now().Unix(),
+			info,
+		)
+		log.Println("新增项目", info.ProjectName, info.ProjectCode, info.Buyer)
+		if len(info.ProjectName) > 0 {
+			addID(PnAll, "pn_"+info.ProjectName, id)
+		}
+		if len(info.ProjectCode) > 0 {
+			addID(PcAll, "pc_"+info.ProjectCode, id)
+		}
+		if len(info.Buyer) > 0 {
+			addID(PbAll, "pb_"+info.Buyer, id)
+		}
+	}
+}
+// compareScoreMeger loads the projects with the given ids and scores info
+// against each of them with the rule selected by info.MererInc.
+// ktype names the index ("pn", "pc" or "pb") the ids came from and is passed
+// through to the scoring functions.
+func compareScoreMeger(ktype string, pids []string, info *Info) {
+	var projects []*ProjectInfo
+	for _, id := range pids {
+		pinfo, found := MQFW.FindById(projectColl, id, nil)
+		if !found {
+			continue
+		}
+		// Convert the stored document into a ProjectInfo via JSON.
+		bys, err := json.Marshal(pinfo)
+		if err != nil {
+			log.Println("compareScoreMeger marshal err", id, err.Error())
+			continue
+		}
+		var projectInfo *ProjectInfo
+		if err := json.Unmarshal(bys, &projectInfo); err != nil {
+			// Decoding is best effort: fields that did decode are still
+			// used, the error is only reported.
+			log.Println("compareScoreMeger unmarshal err", id, err.Error())
+		}
+		if projectInfo == nil {
+			continue
+		}
+		projectInfo.Id = projectInfo.IdInc
+		projects = append(projects, projectInfo)
+	}
+	switch info.MererInc {
+	case Select3To1:
+		for _, project := range projects {
+			score3Select1Inc(ktype, project, info)
+		}
+	case Select3To2:
+		for _, project := range projects {
+			score3Select2Inc(ktype, project, info)
+		}
+	}
+}
+
+// score3Select2Inc scores thisinfo against project pinfo with the
+// "two out of three" rule and, when the resulting grade combination is
+// listed in ThreeToTow, records the result in InfoRScore[thisinfo.Id].
+//
+// Buyer, project name and project code are graded by fieldPCBScore; without a
+// buyer, name and code are graded only when longer than the configured
+// minimum and otherwise get grade "D". Area/city and agency only adjust the
+// score. ktype is stored as the result's Pkey.
+func score3Select2Inc(ktype string, pinfo *ProjectInfo, thisinfo *Info) {
+	defer qu.Catch()
+	rsinfo := &RScoreInfo{Id: thisinfo.Id, Info: thisinfo, Pid: pinfo.Id, Pinfo: pinfo}
+	rsinfo.BuyerType, rsinfo.Score = fieldPCBScore(thisinfo.Buyer, pinfo.Buyer, rsinfo.BuyerType, rsinfo.Score)
+	if len(thisinfo.Buyer) > 0 {
+		rsinfo.ProjectNameType, rsinfo.Score = fieldPCBScore(thisinfo.ProjectName, pinfo.ProjectName, rsinfo.ProjectNameType, rsinfo.Score)
+		rsinfo.ProjectCodeType, rsinfo.Score = fieldPCBScore(thisinfo.ProjectCode, pinfo.ProjectCode, rsinfo.ProjectCodeType, rsinfo.Score)
+	} else { // no buyer: take the field length into account
+		if len([]rune(thisinfo.ProjectName)) > MegerFieldsLen.ProjectNamelen {
+			rsinfo.ProjectNameType, rsinfo.Score = fieldPCBScore(thisinfo.ProjectName, pinfo.ProjectName, rsinfo.ProjectNameType, rsinfo.Score)
+		} else {
+			rsinfo.ProjectNameType = "D"
+		}
+		if len(thisinfo.ProjectCode) > MegerFieldsLen.ProjectCodelen {
+			rsinfo.ProjectCodeType, rsinfo.Score = fieldPCBScore(thisinfo.ProjectCode, pinfo.ProjectCode, rsinfo.ProjectCodeType, rsinfo.Score)
+		} else {
+			rsinfo.ProjectCodeType = "D"
+		}
+	}
+	// Area / city.
+	if thisinfo.Area != "A" && thisinfo.Area != "全国" && pinfo.Area != "A" && pinfo.Area != "全国" {
+		if thisinfo.Area == pinfo.Area && thisinfo.City == pinfo.City {
+			rsinfo.Score += 2
+		} else {
+			rsinfo.Score -= 1
+		}
+	} else {
+		rsinfo.Score += 1
+	}
+	// Agency. A project without agency (D) is not scored.
+	if pinfo.Agency != "" {
+		switch {
+		case thisinfo.Agency == pinfo.Agency: // A
+			rsinfo.Score += 2
+		case thisinfo.Agency == "": // E
+			// Must be tested before the containment check: every string
+			// contains "", which used to make this case unreachable.
+			rsinfo.Score -= 1
+		case strings.Contains(pinfo.Agency, thisinfo.Agency) || strings.Contains(thisinfo.Agency, pinfo.Agency): // B
+			rsinfo.Score += 1
+		default: // C
+			rsinfo.Score -= 2
+		}
+	}
+	rsinfo.Pkey = ktype
+	rsinfo.Cresult = fmt.Sprintf("%s%s%s", rsinfo.BuyerType, rsinfo.ProjectNameType, rsinfo.ProjectCodeType)
+	ThreeToLock.Lock()
+	if ThreeToTow[rsinfo.Cresult] {
+		InfoRScore[thisinfo.Id] = append(InfoRScore[thisinfo.Id], rsinfo)
+	}
+	ThreeToLock.Unlock()
+}
+
+// score3Select1Inc scores thisinfo against project pinfo with the
+// "one out of three" rule and, when the resulting grade combination is
+// listed in ThreeToOne, records the result in InfoRScore[thisinfo.Id].
+//
+// Only the field named by ktype ("pn": project name, "pc": project code) is
+// graded; area/city and agency are always graded. Grades: A equal, B one
+// value contains the other, C different, D project has no value, and for the
+// agency E when the record itself has none.
+func score3Select1Inc(ktype string, pinfo *ProjectInfo, thisinfo *Info) {
+	defer qu.Catch()
+	rsinfo := &RScoreInfo{Id: thisinfo.Id, Info: thisinfo, Pid: pinfo.Id, Pinfo: pinfo}
+	if ktype == "pn" { // compare the project name
+		switch {
+		case len(pinfo.ProjectName) == 0: // D, not scored
+			rsinfo.ProjectNameType = "D"
+		case thisinfo.ProjectName == pinfo.ProjectName: // A
+			rsinfo.Score += 2
+			rsinfo.ProjectNameType = "A"
+		case strings.Contains(pinfo.ProjectName, thisinfo.ProjectName) || strings.Contains(thisinfo.ProjectName, pinfo.ProjectName): // B
+			rsinfo.Score += 1
+			rsinfo.ProjectNameType = "B"
+		default: // C
+			rsinfo.Score -= 2
+			rsinfo.ProjectNameType = "C"
+		}
+	}
+	if ktype == "pc" { // compare the project code
+		switch {
+		case len(pinfo.ProjectCode) == 0: // D, not scored
+			rsinfo.ProjectCodeType = "D"
+		case thisinfo.ProjectCode == pinfo.ProjectCode: // A
+			rsinfo.Score += 2
+			rsinfo.ProjectCodeType = "A"
+		case strings.Contains(pinfo.ProjectCode, thisinfo.ProjectCode) || strings.Contains(thisinfo.ProjectCode, pinfo.ProjectCode): // B
+			rsinfo.Score += 1
+			rsinfo.ProjectCodeType = "B"
+		default: // C
+			rsinfo.Score -= 2
+			rsinfo.ProjectCodeType = "C"
+		}
+	}
+	// Area / city.
+	if thisinfo.Area != "A" && thisinfo.Area != "全国" && pinfo.Area != "A" && pinfo.Area != "全国" {
+		if thisinfo.Area == pinfo.Area && thisinfo.City == pinfo.City {
+			rsinfo.Score += 2
+			rsinfo.AreaType = "A"
+		} else {
+			rsinfo.Score -= 1
+			rsinfo.AreaType = "C"
+		}
+	} else {
+		rsinfo.Score += 1
+		rsinfo.AreaType = "B"
+	}
+	// Agency.
+	switch {
+	case pinfo.Agency == "": // D, not scored
+		rsinfo.AgencyType = "D"
+	case thisinfo.Agency == pinfo.Agency: // A
+		rsinfo.Score += 2
+		rsinfo.AgencyType = "A"
+	case thisinfo.Agency == "": // E
+		// Must be tested before the containment check: every string
+		// contains "", which used to make this case unreachable.
+		rsinfo.Score -= 1
+		rsinfo.AgencyType = "E"
+	case strings.Contains(pinfo.Agency, thisinfo.Agency) || strings.Contains(thisinfo.Agency, pinfo.Agency): // B
+		rsinfo.Score += 1
+		rsinfo.AgencyType = "B"
+	default: // C
+		rsinfo.Score -= 2
+		rsinfo.AgencyType = "C"
+	}
+	rsinfo.Pkey = ktype
+	rsinfo.Cresult = fmt.Sprintf("%s%s%s%s", rsinfo.ProjectNameType, rsinfo.ProjectCodeType, rsinfo.AreaType, rsinfo.AgencyType)
+	ThreeToLock.Lock()
+	if ThreeToOne[rsinfo.Cresult] {
+		InfoRScore[thisinfo.Id] = append(InfoRScore[thisinfo.Id], rsinfo)
+	}
+	ThreeToLock.Unlock()
+}
+
+// isMegerProjectAndProcess decides whether a record takes part in merging
+// (b) and which strategy applies to it (res: Select3To2, Select3To1,
+// AloneProject or InvalidInfo).
+//
+//   - checkInfoAlter reports true and Val < 1    -> InvalidInfo
+//   - buyer present, name or code present        -> Select3To2, else AloneProject
+//   - no buyer, name and code present            -> Select3To2
+//   - no buyer, only name / only code            -> Select3To1 when the field is
+//     longer than the configured minimum and agency and area are present,
+//     else AloneProject
+//   - no buyer, no name, no code                 -> InvalidInfo
+//
+// b is true only for Select3To2 and Select3To1.
+func isMegerProjectAndProcess(tmp map[string]interface{}) (b bool, res int) {
+	pcbv := PCBVal(tmp)
+	if checkInfoAlter(tmp) && pcbv.Val < 1 {
+		return false, InvalidInfo // invalid record, gets tagged
+	}
+	hasName, hasCode := pcbv.PnameLen > 0, pcbv.PcodeLen > 0
+	switch {
+	case pcbv.Buyer:
+		if hasName || hasCode {
+			return true, Select3To2
+		}
+		return false, AloneProject
+	case hasName && hasCode:
+		return true, Select3To2
+	case hasName:
+		if pcbv.PnameLen > MegerFieldsLen.ProjectNamelen && pcbv.Agency && pcbv.Area {
+			return true, Select3To1
+		}
+		return false, AloneProject
+	case !pcbv.Pcode:
+		return false, InvalidInfo // invalid record, gets tagged
+	default: // neither buyer nor name, but a project code
+		if pcbv.PcodeLen > MegerFieldsLen.ProjectCodelen && pcbv.Agency && pcbv.Area {
+			return true, Select3To1
+		}
+		return false, AloneProject
+	}
+}
+
+// newProjectInc inserts a new project for the record and returns its id.
+// A debug line is written when mess marks the project as "alone".
+func newProjectInc(tmp, mess map[string]interface{}, pipc int64, thisinfo *Info) (id string) {
+	id = InsertProject(thisinfo.NewPNKey, tmp, mess, pipc, thisinfo)
+	if qu.ObjToString(mess["meger_sflag"]) == "alone" {
+		du.Debug("新增项目,不参与对比", id)
+	}
+	return id
+}
+
+// updateinfoInc merges record thisinfo (raw document tmp) into the existing
+// project pInfo and returns the project id.
+//
+// The project is re-read without its "list" field; when it cannot be found
+// nothing is written. Otherwise the summary fields taken from pInfo, the
+// field priorities and, for multi-package records, the package map are
+// updated with $set, and the record itself is appended to "list" with $push.
+func updateinfoInc(thisinfo *Info, tmp map[string]interface{}, pInfo *ProjectInfo) string {
+	updateid := pInfo.Id
+	set := map[string]interface{}{}
+	res, bres := MQFW.FindById(projectColl, pInfo.Id, `{"list":0}`)
+	EqInfoUpdate(thisinfo, pInfo)
+	if !bres || res == nil || *res == nil {
+		return updateid
+	}
+	set["topscopeclass"] = pInfo.Topscopeclass
+	set["subscopeclass"] = pInfo.Subscopeclass
+	s_subscopeclass := strings.Join(pInfo.Subscopeclass, ",")
+	if len(s_subscopeclass) > 0 {
+		s_subscopeclass = "," + s_subscopeclass + ","
+	}
+	set["s_subscopeclass"] = s_subscopeclass
+	s_winner := strings.Join(pInfo.Winners, ",")
+	if len(s_winner) > 0 {
+		s_winner = "," + s_winner + ","
+	}
+	set["s_winner"] = s_winner
+	if pInfo.Buyerperson != "" && pInfo.Buyertel != "" {
+		set["buyerperson"] = pInfo.Buyerperson
+		set["buyertel"] = pInfo.Buyertel
+	}
+	if pInfo.Buyerclass != "" {
+		set["buyerclass"] = pInfo.Buyerclass
+	}
+	if pInfo.District != "" {
+		set["district"] = pInfo.District
+	}
+	if pInfo.Bidopentime > 0 {
+		set["bidopentime"] = pInfo.Bidopentime
+	}
+	if len(pInfo.Winnerorder) > 0 {
+		set["winnerorder"] = pInfo.Winnerorder
+	}
+	if thisinfo.HasPackage {
+		set["multipackage"] = 1
+	} else {
+		set["multipackage"] = 0
+	}
+	e := InitEL(qu.ObjToString((*res)["extractpos"]))
+	if thisinfo.dealtype == 1 {
+		// Lift the sub-package values to the top level of the record.
+		// The checked assertion keeps a missing or malformed "package" from
+		// panicking; ranging over / reading a nil map is a no-op.
+		// NOTE(review): with several packages the one used depends on map
+		// iteration order.
+		var sonpackage map[string]interface{}
+		pkgs, _ := tmp["package"].(map[string]interface{})
+		for _, obj := range pkgs {
+			sonpackage, _ = obj.(map[string]interface{})
+		}
+		for _, v2 := range []string{"budget", "budget_w", "winner", "winner_w", "bidstatus", "bidstatus_w"} {
+			if sonpackage[v2] != nil {
+				tmp[v2] = sonpackage[v2]
+			}
+		}
+	}
+	e.fieldpriority(&tmp, res, &set)
+	set["extractpos"] = e.GetVal()
+	if thisinfo.HasPackage { // multi-package handling
+		p1, _ := (*res)["package"].(map[string]interface{})
+		p2, _ := tmp["package"].(map[string]interface{})
+		if p2 != nil {
+			if p1 != nil {
+				for pk2, pv2 := range p2 {
+					if p1[pk2] == nil {
+						p1[pk2] = pv2
+						continue
+					}
+					// Package exists on both sides: fill in only the
+					// entries the project does not have yet.
+					item1, _ := p1[pk2].(map[string]interface{})
+					item2, _ := pv2.(map[string]interface{})
+					if item1 != nil && item2 != nil {
+						for ik1, iv1 := range item2 {
+							if item1[ik1] == nil {
+								item1[ik1] = iv1
+							}
+						}
+					}
+				}
+			} else {
+				p1 = p2
+			}
+		}
+		set["package"] = p1
+	}
+
+	update := map[string]interface{}{}
+	if len(set) > 0 {
+		update["$set"] = set
+	}
+	// Keep the original record in the project's list.
+	push := NewPushInfo(tmp)
+	for tkey := range extractpos {
+		if tmp[tkey] != nil {
+			push[tkey] = tmp[tkey]
+		}
+	}
+	update["$push"] = map[string]interface{}{
+		"list": push,
+	}
+	// update always carries at least the $push, so it is sent unconditionally.
+	MQFW.Update(projectColl, map[string]interface{}{
+		"_id": qu.StringTOBsonId(pInfo.Id),
+	}, &update, false, false)
+	return updateid
+}

+ 3 - 0
udpprojectset/src/thisinfo.go

@@ -46,6 +46,9 @@ type Info struct {
 	PNKey    string
 	PCKey    string
 	PBKey    string
+
+	MererInc int //增量合并标记 0三选二合并 1三选一合并 2孤立核查项目 3无效信息
+	Data     map[string]interface{}
 }
 
 //pcb三选值