Explorar el Código

price和number抽取

maxiaoshan hace 5 años
padre
commit
27056cad16

+ 2 - 1
fullproject/src_v1/init.go

@@ -294,7 +294,8 @@ type ProjectInfo struct {
 	Buyerclass  string                 `json:"buyerclass"`            //采购单位分类
 	Bidopentime int64                  `json:"bidopentime,omitempty"` //开标时间
 	//	Zbtime        int64                  `json:"zbtime"`        //招标时间
-	Jgtime    int64   `json:"jgtime"`              //结果中标时间
+	Jgtime		int64   `json:"jgtime"`              //结果中标时间
+	Zbtime		int64	`json:"zbtime"`				//招标时间
 	Bidamount float64 `json:"bidamount,omitempty"` //中标金额
 	Budget    float64 `json:"budget,omitempty"`    //预算
 	//Winnerorder []string `json:"winnerorder"` //中标候选人

+ 26 - 12
fullproject/src_v1/project.go

@@ -515,12 +515,11 @@ func (p *ProjectTask) NewProject(tmp map[string]interface{}, thisinfo *Info) (st
 	if thisinfo.TopType == "招标" {
 		if thisinfo.SubType != "变更" && thisinfo.SubType != "其它" {
 			set["zbtime"] = tmp["publishtime"]
+			p1.Zbtime = tmp["publishtime"].(int64)
 		}
 	} else if thisinfo.TopType == "结果" || thisinfo.SubType == "合同" {
-		if thisinfo.Infoformat != 2 {
 			set["jgtime"] = tmp["publishtime"]
 			p1.Jgtime = thisinfo.Publishtime
-		}
 	}
 
 	if len(thisinfo.Subscopeclass) > 0 {
@@ -690,7 +689,7 @@ func (p *ProjectTask) UpdateProject(tmp map[string]interface{}, thisinfo *Info,
 	pInfo.LastTime = thisinfo.Publishtime
 	set["lasttime"] = thisinfo.Publishtime
 	if thisinfo.TopType == "招标" {
-		if thisinfo.SubType != "变更" && thisinfo.SubType != "其它" && tmp["zbtime"] == nil {
+		if thisinfo.SubType != "变更" && thisinfo.SubType != "其它" && pInfo.Zbtime <= 0 {
 			set["zbtime"] = tmp["publishtime"]
 		}
 		if pInfo.Jgtime > 0 {
@@ -698,15 +697,20 @@ func (p *ProjectTask) UpdateProject(tmp map[string]interface{}, thisinfo *Info,
 			set["jgtime"] = int64(0)
 		}
 	} else if thisinfo.TopType == "结果" {
-		pInfo.Jgtime = thisinfo.Publishtime
-		set["jgtime"] = thisinfo.Publishtime
-	} else if thisinfo.SubType == "合同" {
-		if pInfo.Jgtime <= 0 {
+		if thisinfo.SubType == "中标" || thisinfo.SubType == "成交" || thisinfo.SubType == "流标" || thisinfo.SubType == "废标" {
+			jg1 := int64(math.Abs(float64(pInfo.Jgtime - thisinfo.Publishtime)))
+			if pInfo.Jgtime <= 0 {
+				set["jgtime"] = tmp["publishtime"]
+				pInfo.Jgtime = thisinfo.Publishtime
+			}else if jg1 > p.jgTime {
+				set["jgtime"] = tmp["publishtime"]
+				pInfo.Jgtime = thisinfo.Publishtime
+			}
+		}else if thisinfo.SubType == "合同" {
 			set["jgtime"] = tmp["publishtime"]
 			pInfo.Jgtime = thisinfo.Publishtime
 		}
 	}
-
 	if thisinfo.Bidopentime > pInfo.Bidopentime {
 		pInfo.Bidopentime = thisinfo.Bidopentime
 		set["bidopentime"] = pInfo.Bidopentime
@@ -828,7 +832,7 @@ func (p *ProjectTask) UpdateProject(tmp map[string]interface{}, thisinfo *Info,
 		set["subscopeclass"] = pInfo.Subscopeclass
 		set["s_subscopeclass"] = strings.Join(pInfo.Subscopeclass, ",")
 	}
-	//winner
+
 	if len(thisinfo.Winners) > 0 {
 		if len(pInfo.Winners) <= 0 {
 			set["winner"] = qu.ObjToString(tmp["winner"])
@@ -836,13 +840,23 @@ func (p *ProjectTask) UpdateProject(tmp map[string]interface{}, thisinfo *Info,
 
 		sort.Strings(pInfo.Winners)
 		for _, k := range thisinfo.Winners {
-			if BinarySearch(pInfo.Winners, k) == -1 {
-				pInfo.Winners = append(pInfo.Winners, k)
-				sort.Strings(pInfo.Winners)
+			if thisinfo.SubType == "流标" || thisinfo.SubType == "废标" {
+				if BinarySearch(pInfo.Winners, k) != -1 {
+					arr := strings.Split(pInfo.Winners, ",")
+					deleteSlice(arr, k, "")
+					pInfo.Winners = strings.Join(pInfo.Winners, ",")
+					sort.Strings(pInfo.Winners)
+				}
+			}else {
+				if BinarySearch(pInfo.Winners, k) == -1 {
+					pInfo.Winners = append(pInfo.Winners, k)
+					sort.Strings(pInfo.Winners)
+				}
 			}
 		}
 		set["s_winner"] = strings.Join(pInfo.Winners, ",")
 	}
+
 	if thisinfo.HasPackage { //多包处理
 		set["multipackage"] = 1
 		pkg := PackageFormat(thisinfo, pInfo)

+ 15 - 12
fullproject/src_v1/task.go

@@ -58,10 +58,12 @@ type ProjectTask struct {
 	//当前时间
 	currentTime int64
 	//保存长度
-	saveSize   int
-	pici       int64
-	validTime  int64
-	statusTime int64
+	saveSize   	int
+	pici       	int64
+	validTime  	int64
+	statusTime 	int64
+	//结果时间的更新		最近两天的公告不再更新jgtime
+	jgTime		int64
 	//	LockPool     chan *sync.Mutex
 	//	LockPoolLock sync.Mutex
 	//	m1, m23, m4  map[int]int
@@ -90,6 +92,7 @@ func NewPT() *ProjectTask {
 		coll:       ProjectColl,
 		validTime:  int64(util.IntAllDef(Sysconfig["validdays"], 150) * 86400),
 		statusTime: int64(util.IntAllDef(Sysconfig["statusdays"], 7) * 86400),
+		jgTime:		int64(util.IntAllDef("", 2) * 86400),
 	}
 	return p
 }
@@ -617,16 +620,16 @@ func (p *ProjectTask) updateJudge(tmp map[string]interface{}, info *Info) {
 	index := -1
 	pInfoId := ""
 	p.AllIdsMapLock.Lock()
-F:
-	for k, ID := range p.AllIdsMap {
-		for i, id := range ID.P.Ids {
-			if info.Id == id {
-				pInfoId = k
-				index = i
-				break F
+	F:
+		for k, ID := range p.AllIdsMap {
+			for i, id := range ID.P.Ids {
+				if info.Id == id {
+					pInfoId = k
+					index = i
+					break F
+				}
 			}
 		}
-	}
 	p.AllIdsMapLock.Unlock()
 	//未找到招标信息
 	if index == -1 {

+ 1 - 0
src/jy/clear/clear.go

@@ -29,6 +29,7 @@ func init() {
 	BindFn("clearBuyerPerson", ClearBuyerPerson) //处理较长采购联系人
 	BindFn("clearNumber", ClearNumber)           //一般用于处理抽取联系人后带有电话的情况
 	BindFn("clearEndSymblo", ClearEndSymblo)     //去除尾部特殊符号
+	BindFn("chiToInt", ChiToFloat)			      //中文转数字(费率、折扣率)
 }
 
 //绑定清理方法

+ 23 - 0
src/jy/clear/tonumber.go

@@ -81,6 +81,29 @@ func ObjToFloat(data []interface{}) []interface{} {
 	}
 }
 
+// ChiToFloat converts a Chinese-numeral percentage such as "百分之五" into a
+// decimal fraction; it is bound as the "chiToInt" clear function and is meant
+// for rate / discount style quotes.
+//
+// data[0] is the raw value. data[1], when present, is passed through unchanged
+// as the second element of the result. Input that does not contain "百分之",
+// or whose converted numerals cannot be parsed as a float, is returned as is.
+// An empty slice is returned as is instead of panicking on data[0], and a
+// one-element slice no longer panics on data[1].
+//
+// NOTE(review): numerals are concatenated match by match, so positional words
+// (e.g. 十) are presumably not evaluated by place value — confirm against the
+// contents of moneyChar.
+func ChiToFloat(data []interface{}) []interface{} {
+	if len(data) == 0 {
+		return data
+	}
+	str := fmt.Sprint(data[0])
+	if !strings.Contains(str, "百分之") {
+		return data
+	}
+	str = strings.ReplaceAll(str, "百分之", "")
+
+	// ReplaceAllStringFunc is used only to visit every moneyRegChar match in
+	// order; the replaced string itself is discarded, so each match is
+	// returned untouched.
+	var digits strings.Builder
+	moneyRegChar.ReplaceAllStringFunc(str, func(key string) string {
+		switch v := moneyChar[key].(type) {
+		case float64:
+			digits.WriteString(strconv.FormatFloat(v, 'f', 0, 64))
+		case string:
+			digits.WriteString(v)
+		}
+		return key
+	})
+
+	rate, err := strconv.ParseFloat(digits.String(), 64)
+	if err != nil {
+		return data
+	}
+	out := []interface{}{rate / 100}
+	if len(data) > 1 {
+		out = append(out, data[1])
+	}
+	return out
+}
+
 //金额转换
 func ObjToMoney(data []interface{}) []interface{} {
 	//isfindUnit := true

+ 4 - 1
src/jy/extract/extpackage.go

@@ -148,6 +148,9 @@ func PackageDetail(j *ju.Job, e *ExtractTask, isSite bool, codeSite string) {
 					sonJobResult["origin"] = pkg.Origin
 					sonJobResult["text"] = pkg.Text
 					sonJobResult["name"] = pkg.Name
+					if pkg.Winner!= ""{
+						sonJobResult["winner"] = pkg.Winner
+					}
 					if pkg.WinnerPerson != "" {
 						sonJobResult["winnertel"] = pkg.WinnerTel
 						sonJobResult["winnerperson"] = pkg.WinnerPerson
@@ -177,7 +180,7 @@ func PackageDetail(j *ju.Job, e *ExtractTask, isSite bool, codeSite string) {
 							}
 						}
 					} else {
-						if len(j.Winnerorder) > 0 {
+						if sonJobResult["winner"] == "" && len(j.Winnerorder) > 0 {
 							if j.Winnerorder[0]["price"] != nil {
 								sonJobResult["bidamount"] = qu.Float64All(j.Winnerorder[0]["price"])
 							}

+ 20 - 0
src/jy/extract/extract.go

@@ -938,6 +938,9 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 				if in.Field == "projectname" && vbpkg.Name != "" {
 					continue
 				}
+				if in.Field == "winner" && vbpkg.Winner != "" {
+					continue
+				}
 				if in.Field == "winnerperson" {
 					if vbpkg.Winner == "" || len(vbpkg.Winner) < 4 {
 						continue
@@ -1777,6 +1780,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			if (savewinner == nil || len(savewinner) == 0) && tmp["winner"] != nil {
 				tmp["s_winner"] = tmp["winner"]
 			} else if savewinner != nil {
+				savewinner = RemoveReplicaSliceString(savewinner)
 				tmp["s_winner"] = strings.Join(savewinner, ",")
 			}
 
@@ -1785,6 +1789,11 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			tmp["s_winner"] = tmp["winner"]
 		}
 		if len(j.Winnerorder) > 0 { //候选人信息
+			for i,v := range j.Winnerorder{
+				if v["price"]!= nil{
+					j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"],""})[0]
+				}
+			}
 			tmp["winnerorder"] = j.Winnerorder
 		}
 		//处理附件
@@ -2302,3 +2311,14 @@ func resetWinnerorder(j *ju.Job) {
 	//j.Result["bidamount"] = bidamounts
 
 }
+// RemoveReplicaSliceString returns the distinct strings of slc, each kept at
+// the position of its first occurrence. The result is always a non-nil slice,
+// even when slc is nil or empty.
+func RemoveReplicaSliceString(slc []string) []string {
+	seen := make(map[string]bool, len(slc))
+	unique := []string{}
+	for i := 0; i < len(slc); i++ {
+		item := slc[i]
+		if seen[item] {
+			// Already emitted once; skip the repeat.
+			continue
+		}
+		seen[item] = true
+		unique = append(unique, item)
+	}
+	return unique
+}

+ 2 - 1
src/jy/pretreated/analystep.go

@@ -53,6 +53,7 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 		//log.Println(con)
 		bl := &util.Block{}
 		newCon := con
+		//log.Println(con)
 		if len(tabs) > 0 { //解析表格逻辑
 			job.HasTable = 1 //添加标识:文本中有table
 			newCon = TextAfterRemoveTable(con)
@@ -105,7 +106,7 @@ func AnalyStart(job *util.Job, isSite bool, codeSite string) {
 										job.BlockPackage[k].WinnerOrder = append(job.BlockPackage[k].WinnerOrder, map[string]interface{}{
 											"type":    0,
 											"price":   0.0,
-											"entname": vv.Value,
+											"entname": strings.TrimSpace(vv.Value),
 											"sort":    tmpw,
 										})
 										tmpw++

+ 2 - 2
src/jy/pretreated/analytable.go

@@ -51,7 +51,7 @@ var (
 	FindVal_1  = regexp.MustCompile("[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)")
 	FindVal2_1 = regexp.MustCompile("([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)|^(设计|施工|监理|验收)[分子]?[标包]?[段号]?$")
 	//判断分包前排除
-	excludeKey  = regexp.MustCompile("(标段代码|涉及包号|分包数量|包件号?|项目标号|规格|型号|招标范围|业绩|废标)|(^编号$)|([^包段标]编号)") //编号|划分
+	excludeKey  = regexp.MustCompile("(标段代码|涉及包号|分包数量|包件号?|项目标号|规格|型号|招标范围|业绩|废标|标段选择要求)|(^编号$)|([^包段标]编号)") //编号|划分
 	excludeKey2 = regexp.MustCompile("包/[0-9]{0,4}[|箱|纸|张]")
 	//-------------
 
@@ -2339,7 +2339,7 @@ func (tn *Table) isGoonNext(isSite bool, codeSite string) {
 func foundPacBySortKV(tn *Table, val int, index []string, index_pos []int, keyExistsCount *map[string]int, commonKeyVals *map[string][]string, key_index int, hasPkgTd map[string]bool) (rval int, rindex []string, rindex_pos []int) {
 	keyIsPkg := false
 	for in, k := range tn.SortKV.Keys {
-		if excludeKey.MatchString(BracketsTextReg.ReplaceAllString(k, "")) { //判断分包前排除
+		if excludeKey.MatchString(BracketsTextReg.ReplaceAllString(k, "")) ||regPDFWarap.MatchString(k)||regAZWarap.MatchString(k){ //判断分包前排除
 			continue
 		}
 		v := tn.SortKV.Map[k]

+ 82 - 103
src/jy/pretreated/division.go

@@ -57,10 +57,15 @@ var (
 	regMoreWrap        = regexp.MustCompile("[\r\n]{2,}")
 	regStrWrap         = regexp.MustCompile("分包名称[::]")
 	regBZJWarap        = regexp.MustCompile("保证金.*")
+	regPDFWarap        = regexp.MustCompile("[a-zA-Z](包|标段).(pdf|PDF)")
+	regAZWarap         = regexp.MustCompile("(标[a-zA-Z]取值|标段划分)")
 	replSerial         = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
 	moreColonReg       = regexp.MustCompile("[::]+")
 	regFilter          = regexp.MustCompile("等$")
 	pkgFilter          = regexp.MustCompile("第[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](子|合同|分|施工|监理)?(标段?|包|合同段|标包)|(子|合同|分|施工|监理)?(标|包)(段|号)?")
+	indexTile          = regexp.MustCompile("[0-9.]{2,3}[\\s\u4e00-\u9fa5]{2,8}[::]+") //小标题
+	indexTile2         = regexp.MustCompile("[\\s\u4e00-\u9fa5]{2,8}")
+	regReplAllSpace2   = regexp.MustCompile("[\u3000\u2003\u00a0\\s0-9.::、\\(\\)]+")
 	confusion          = map[string]string{
 		"参与": "canyu",
 	}
@@ -566,63 +571,71 @@ func FindPackageFromBlocks(blocks *[]*util.Block, isSite bool, codeSite string)
 	//orderwinner := winnerOrderEntity.Find(content, true, 2, isSite, codeSite)
 
 	for k, v := range blockPackage {
-		if v.ColonKV != nil && v.ColonKV.KvTags != nil {
-			for kc, cv := range v.ColonKV.KvTags {
-				if kc == "预算" && v.Budget <= 0 {
-					moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
-					if len(moneys) > 0 {
-						if vf, ok := moneys[0].(float64); ok {
-							blockPackage[k].Budget = vf
-							blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
-						} else if vi, ok := moneys[0].(int); ok {
-							blockPackage[k].Budget = float64(vi)
-							blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
-						}
+		findWinnerBugetBidmountByKv(v, blockPackage, k)
+	}
+	return
+}
+
+func findWinnerBugetBidmountByKv(v *util.BlockPackage, blockPackage map[string]*util.BlockPackage, k string) {
+	if v.ColonKV != nil && v.ColonKV.KvTags != nil {
+		for kc, cv := range v.ColonKV.KvTags {
+			if kc == "预算" && v.Budget <= 0 {
+				moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
+				if len(moneys) > 0 {
+					if vf, ok := moneys[0].(float64); ok {
+						blockPackage[k].Budget = vf
+						blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
+					} else if vi, ok := moneys[0].(int); ok {
+						blockPackage[k].Budget = float64(vi)
+						blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
 					}
-				} else if kc == "中标金额" && v.Bidamount <= 0 {
-					moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
-					if len(moneys) > 0 {
-						if vf, ok := moneys[0].(float64); ok {
-							blockPackage[k].Bidamount = vf
-							blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
-						} else if vi, ok := moneys[0].(int); ok {
-							blockPackage[k].Bidamount = float64(vi)
-							blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
-						}
+				}
+			} else if kc == "中标金额" && v.Bidamount <= 0 {
+				moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
+				if len(moneys) > 0 {
+					if vf, ok := moneys[0].(float64); ok {
+						blockPackage[k].Bidamount = vf
+						blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
+					} else if vi, ok := moneys[0].(int); ok {
+						blockPackage[k].Bidamount = float64(vi)
+						blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
 					}
 				}
+			} else if kc == "中标单位" && v.Winner == "" {
+				blockPackage[k].Winner = cv[0].Value
 			}
 		}
-		if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
-			for kc, cv := range v.SpaceKV.KvTags {
-				if kc == "预算" && v.Budget <= 0 {
-					moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
-					if len(moneys) > 0 {
-						if vf, ok := moneys[0].(float64); ok {
-							blockPackage[k].Budget = vf
-							blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
-						} else if vi, ok := moneys[0].(int); ok {
-							blockPackage[k].Budget = float64(vi)
-							blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
-						}
+	}
+	if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
+		for kc, cv := range v.SpaceKV.KvTags {
+			if kc == "预算" && v.Budget <= 0 {
+				moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
+				if len(moneys) > 0 {
+					if vf, ok := moneys[0].(float64); ok {
+						blockPackage[k].Budget = vf
+						blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
+					} else if vi, ok := moneys[0].(int); ok {
+						blockPackage[k].Budget = float64(vi)
+						blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
 					}
+				}
 
-				} else if kc == "中标金额" && v.Bidamount <= 0 {
-					moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
-					if len(moneys) > 0 {
-						if vf, ok := moneys[0].(float64); ok {
-							blockPackage[k].Bidamount = vf
-							blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
-						} else if vi, ok := moneys[0].(int); ok {
-							blockPackage[k].Bidamount = float64(vi)
-							blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
-						}
+			} else if kc == "中标金额" && v.Bidamount <= 0 {
+				moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
+				if len(moneys) > 0 {
+					if vf, ok := moneys[0].(float64); ok {
+						blockPackage[k].Bidamount = vf
+						blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
+					} else if vi, ok := moneys[0].(int); ok {
+						blockPackage[k].Bidamount = float64(vi)
+						blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
 					}
 				}
+			} else if kc == "中标单位" && v.Winner == "" {
+				blockPackage[k].Winner = cv[0].Value
 			}
 		}
 	}
-	return
 }
 
 //从正文里面找分包
@@ -631,61 +644,7 @@ func FindPackageFromText(title string, content string, isSite bool, codeSite str
 	//从正文里面找分包
 	divisionPackageChild(&blockPackage, content, title, true, false, isSite, codeSite)
 	for k, v := range blockPackage {
-		if v.ColonKV != nil && v.ColonKV.KvTags != nil {
-			for kc, cv := range v.ColonKV.KvTags {
-				if kc == "预算" && v.Budget <= 0 {
-					moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
-					if len(moneys) > 0 {
-						if vf, ok := moneys[0].(float64); ok {
-							blockPackage[k].Budget = vf
-							blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
-						} else if vi, ok := moneys[0].(int); ok {
-							blockPackage[k].Budget = float64(vi)
-							blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
-						}
-					}
-				} else if kc == "中标金额" && v.Bidamount <= 0 {
-					moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
-					if len(moneys) > 0 {
-						if vf, ok := moneys[0].(float64); ok {
-							blockPackage[k].Bidamount = vf
-							blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
-						} else if vi, ok := moneys[0].(int); ok {
-							blockPackage[k].Bidamount = float64(vi)
-							blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
-						}
-					}
-				}
-			}
-		}
-		if v.SpaceKV != nil && v.SpaceKV.KvTags != nil {
-			for kc, cv := range v.SpaceKV.KvTags {
-				if kc == "预算" && v.Budget <= 0 {
-					moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
-					if len(moneys) > 0 {
-						if vf, ok := moneys[0].(float64); ok {
-							blockPackage[k].Budget = vf
-							blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
-						} else if vi, ok := moneys[0].(int); ok {
-							blockPackage[k].Budget = float64(vi)
-							blockPackage[k].IsTrueBudget = moneys[len(moneys)-1].(bool)
-						}
-					}
-
-				} else if kc == "中标金额" && v.Bidamount <= 0 {
-					moneys := clear.ObjToMoney([]interface{}{cv[0].Value, ""})
-					if len(moneys) > 0 {
-						if vf, ok := moneys[0].(float64); ok {
-							blockPackage[k].Bidamount = vf
-							blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
-						} else if vi, ok := moneys[0].(int); ok {
-							blockPackage[k].Bidamount = float64(vi)
-							blockPackage[k].IsTrueBidamount = moneys[len(moneys)-1].(bool)
-						}
-					}
-				}
-			}
-		}
+		findWinnerBugetBidmountByKv(v, blockPackage, k)
 	}
 	//winnerOrderEntity.Find(content, true, 2, isSite, codeSite)
 	return
@@ -694,6 +653,8 @@ func FindPackageFromText(title string, content string, isSite bool, codeSite str
 //分块之后分包
 func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content, title string, isFindWinnerOrder, accuracy bool, isSite bool, codeSite string) (bool, string) {
 	//查找知否有分包
+	content = regPDFWarap.ReplaceAllString(content, "\n")
+	content = regAZWarap.ReplaceAllString(content, "\n")
 	content = regStrWrap.ReplaceAllString(content, "\n")
 	content = regMoreWrap.ReplaceAllString(content, "\n")
 	content = regEndWrap.ReplaceAllString(content, "")
@@ -731,7 +692,6 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 	con = conTemp
 	con = replSerial.ReplaceAllString(con, "\n")
 	con = regMoreWrap.ReplaceAllString(con, "\n")
-	//log.Println(con)
 	//根据分包,找索引位置
 	indexMap := map[int]int{}
 	indexKeyStringMap := map[int]string{}
@@ -740,6 +700,11 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 	startEndMap := map[int]int{}
 	pkgIndexMap := map[string][]int{}
 	indexPkgMap := map[int]string{}
+	//小标题
+	titleindexs := indexTile.FindAllStringIndex(con, -1)
+	if len(titleindexs) == 0 {
+		titleindexs = indexTile2.FindAllStringIndex(con, -1)
+	}
 	//遍历分包,把kv在包前面的移动到包后面
 	for _, v := range pkg {
 		pgflag := v[0] + "[::]*"
@@ -788,12 +753,12 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			indexKeyStringMap[iv] = indexKeyStringMap[indexs[ik-1]]
 		}
 	}
-	//
 	//获取截取标识
 	surplusText, maxWarpCount, indexTextMap, indexWarpMap := interceptText(indexs, indexPkgMap, pkgIndexMap, startEndMap, con)
 	//查找分包内容,分kv
 	for _, iv := range indexs {
 		text := indexTextMap[iv]
+		tmptext := text
 		//
 		warpIndex := regSpliteSegment.FindAllStringIndex(text, -1)
 		if len(indexWarpMap) > 0 {
@@ -812,6 +777,20 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			index := util.PackageNumberConvert(bk)
 			//去掉前缀,空格必须要加,分kv的时候要用
 			text = regexp.MustCompile(bv[0] + "[::]*").ReplaceAllString(text, "")
+			if strings.TrimLeft(tmptext, bv[0]) == text || strings.TrimLeft(tmptext, bv[0]+":") == text {
+				var tagtitle string
+				for i, v := range titleindexs {
+					if i == 0 {
+						continue
+					}
+					if v[0] > iv {
+						tagtitle = con[titleindexs[i-1][0]:titleindexs[i-1][1]]
+						break
+					}
+				}
+				tagtitle = regReplAllSpace2.ReplaceAllString(tagtitle, "")
+				text = tagtitle + ":" + text
+			}
 			headKey := ""
 			if indexKeyStringMap[iv] != "" {
 				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
@@ -835,7 +814,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 				}
 				MergeKvTags((*blockPackage)[index].ColonKV.KvTags, colonJobKv.KvTags)
 				//合并空格kv
-				spaceJobKv := SspacekvEntity.Entrance(text, "", nil, isSite, codeSite)
+				spaceJobKv := SspacekvEntity.Entrance(text, headKey, nil, isSite, codeSite)
 				MergeKvTags((*blockPackage)[index].SpaceKV.KvTags, spaceJobKv.KvTags)
 			} else {
 				newBpkg := &util.BlockPackage{

+ 6 - 34
src/jy/pretreated/tablev2.go

@@ -144,30 +144,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table, isSite bool, codeSi
 				td.SortKV.AddKey(bl_sk, bl_sv)
 			}
 		}
-	} else {
-		//for _, v := range GetKVAll(txt, "", nil, 2).KvTags {
-		//for _, vv := range v {
-		//td.SortKV.AddKey(vv.Key, vv.Value)
-		//}
-		//}
-	}
-	////抽取不到走正则抽
-	//proCode := projectcodeReg.FindString(text)
-	//if proCode != "" {
-	//	ckv := GetKVAll(proCode, "", nil, 1)
-	//	for _, v := range ckv.KvTags {
-	//		for _, vv := range v {
-	//			td.SortKV.AddKey(vv.Key, vv.Value)
-	//		}
-	//	}
-	//} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
-	//	ckv := GetKVAll(proCode, "", nil, 1)
-	//	for _, v := range ckv.KvTags {
-	//		for _, vv := range v {
-	//			td.SortKV.AddKey(vv.Key, vv.Value)
-	//		}
-	//	}
-	//}
+	}
 	if proCode := jsonReg.FindString(text); proCode != "" {
 		jsonMap := make(map[string]string)
 		json.Unmarshal([]byte(proCode), &jsonMap)
@@ -188,7 +165,6 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table, isSite bool, codeSi
 		td.KeyDirect = 1 //k方向,k纵值横,k横值纵 1横 2纵
 		td.KVDirect = 2  //键-值方向,0未知,1横 2纵//指值和k的方向
 	}
-	//u.Debug(td.BH, td.Val)
 	return td
 }
 
@@ -406,13 +382,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable, isSite bool, codeSite stri
 		//u.Debug(td.SortKV.Keys, "-------2--------------------------------")
 		//		td.SortKV = FindKv(text, "") //GetKvFromtxt(text, "")
 		//resm := GetKVAll(text, "")
-		if len(td.SortKV.Keys) > 0 {
-			//td.KVDirect = 3 //不当头也不当值,忽略
-			if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) {
-				td.Val, _ = td.SortKV.Map[td.SortKV.Keys[0]].(string)
-				td.BH = true
-			}
-		} else if !bsontable {
+		if !bsontable {
 			txt := repSpace.ReplaceAllString(td.Val, "")
 			btw, must, _, _, repl := CheckHeader(txt)
 			if lenval > 15 && !strings.Contains(txt, "采购代理机构名称、地址和联系方式") {
@@ -425,8 +395,10 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable, isSite bool, codeSite stri
 			td.Valtype = repl
 			td.MustBH = must
 			td.BH = btw
-			if strings.Contains(txt, "年估算额年(万元)") {
-				td.MustBH = true
+		} else if len(td.SortKV.Keys) > 0 {
+			//td.KVDirect = 3 //不当头也不当值,忽略
+			if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) {
+				td.Val, _ = td.SortKV.Map[td.SortKV.Keys[0]].(string)
 				td.BH = true
 			}
 		}

+ 1 - 1
src/jy/pretreated/winnerorder.go

@@ -235,7 +235,7 @@ func (wo *WinnerOrderEntity) findByReg(content string, blocks []string, reg_2 *r
 				val := wo.clear("中标单位", v)
 				if val != nil {
 					count++
-					object["entname"] = val
+					object["entname"] = strings.TrimSpace(qutil.ObjToString(val))
 					object["sort"] = wo.toNumber(k, count)
 					object["sortstr"] = thisNumberReg.FindString(k)
 					object["type"] = i

+ 1 - 2
src/res/ext_v3_dump.sh

@@ -2,8 +2,7 @@
 dbhost="127.0.0.1:27082"
 dbname="extract_v3"
 datapath="/opt/soft/mongodb/mongodb3.4/bin"
-tables=(audit areacode citys classify cleanup memu menusecond fields infoclass infotype province postcode rc_calss rc_field rc.order rc_rule rule_back rule_code rule_logic rule_logicback rule_logicore rule_logicpre rule_pre tag tagdetailinfo version versioninfo block_info block_classify block_classify_info block_classify_tag)
-
+tables=(address areacode audit block_classify block_classify_info block_classify_tag block_info citys classify cleanup fields infoclass infotype memu menusecond pkg_info pkg_logicore postcode province rc_calss rc_field rc_rule rule_back rule_code rule_logic rule_logicback rule_logicore rule_logicpre rule_pre site site_management site_rule_code site_rule_logic site_rule_logicback site_rule_logickv site_rule_logicore site_versioninfo tag tagdetailinfo user version versioninfo)
 
 for i in "${!tables[@]}"; 
 do

+ 4 - 3
src/res/tablev1.json

@@ -1,7 +1,7 @@
 {
 	"normalhead":[
-		"^((.{2,6}(名称|编号|代码|时间|类型|性质|行政区域|原因|项目|意见|须知|程度))|标段(编号)?|招标金额|规模|统一社会信用代码|拟?中标供应商|质量|(质量)?承诺|地址|招标代理|序号|材料|结构|结构层数|评委|单位|数量|排名|标的|标项|开户银行|邮编|账号|电话|传真|网址|得分|名次|包件?号|职务|(建设|招标|采购|中标|成交|甲|乙)(单位|人|供应商|方|规模).{0,2}|.{0,5}(价格?|额|资金|[预概]算|投资|费用|报价|投标价)(万?元?([大小]写)?))$__M",
-		"^.{0,7}(((单位)?名称|总监|经理|负责人|信息|率|费|期|人|号|码|(价格?|额|资金)(万?元?([大小]写)?)|员|品目|标包|代表|区域|方式|因素|合价|合计|小计|地点|条件|(资质|类别和)等级|类别|状态)|得分|注册专业|方法|家数|全称|简称|邮件|执业或职业资格|证书|部门|事项|来源|划分|长度|规模|保证金|目标|描述)$__",
+		"^((.{2,6}(描述|名称|编号|代码|时间|类型|性质|行政区域|原因|意见|须知|程度))|标段(编号)?|招标金额|规模|统一社会信用代码|拟?中标供应商|质量|(质量)?承诺|地址|招标代理|序号|材料|结构|结构层数|评委|单位|数量|排名|标的|标项|开户银行|邮编|账号|电话|传真|网址|得分|名次|包件?号|职务|(建设|招标|采购|中标|成交|甲|乙)(单位|人|供应商|方|规模).{0,2}|.{0,5}(价格?|额|资金|[预概]算|投资|费用|报价|投标价)(万?元?([大小]写)?))$__M",
+		"^.{0,7}(((单位)?名称|总监|经理|负责人|信息|率|费|期|人|号|码|(价格?|额|资金)(万?元?([大小]写)?)|员|品目|标包|代表|区域|方式|因素|合价|合计|小计|地点|条件|(资质|类别和)等级|类别|状态)|得分|注册专业|方法|家数|全称|简称|邮件|执业或职业资格|证书|部门|事项|来源|划分|长度|规模|保证金|目标)$__",
 		"(名单|证号|名称|要求|时间|日期|地点|单位|条款|机构|范围|情况|概况|品名|规格|参数|标准|指标|型号|限价|数量|方式|等级|依据|明细|概况|内容|次数|产品|性质|地区|地址|币种|主题|详情|说明|代理(公司|机构)|节支率|名单|结果|结果公示)$|^(职称|姓名|级别|职称专业|证书名称|证书编号)$__",
 		"^(联系|评标|单位|公告|采购|商品|附件|质保|用途|公示|机构|评审|品名|规格|参数|指标|型号|数量|证书).{0,10}$__",
 		"(专家|评委|打分)$__",
@@ -59,7 +59,8 @@
 		"落标供应商及落标原因",
 		"被废标供应商名称",
 		"主要人员",
-		"其他投标人"
+		"其他投标人",
+		"年估算额年(万元)"
 	],
 	"bidorder":[
 		"(.{0,8}排[序名]$|名次|^序号$)__sort",

+ 4 - 1
src/web/templates/admin/clear.html

@@ -102,7 +102,10 @@ menuActive("version")
 var field = {{.field}};
 var _id = "";
 //var clearArr = ["cutspace","cutallspace","cutSymbol","cutNotPrs","clearAllWord","clearMaxAmount","clearProjectName","toint","tofloat","totimestamp","tomoney","getcurrency","getrate","getPhone","rateToFloat"]; 
-var clearMap = {"中文符号转英文":"chiToEng","去除首尾空格":"cutspace","去除所有空格":"cutallspace","清理符号":"cutSymbol","清理不成对符号后面的内容":"cutNotPrs","清理全部是汉字或者特殊符号的情况":"clearAllWord","过滤大于1万亿":"clearMaxAmount","清理项目名称":"clearProjectName","转int":"toint","转float":"tofloat","转时间戳":"totimestamp","转换金额":"tomoney","获取币种":"getcurrency","获取汇率":"getrate","取手机号":"getPhone","清理数字":"clearNumber","费率转小数":"rateToFloat","处理较长采购联系人":"clearBuyerPerson","去除尾部特殊符号":"clearEndSymblo"}
+var clearMap = {"中文符号转英文":"chiToEng","去除首尾空格":"cutspace","去除所有空格":"cutallspace","清理符号":"cutSymbol","清理不成对符号后面的内容":"cutNotPrs",
+	"清理全部是汉字或者特殊符号的情况":"clearAllWord","过滤大于1万亿":"clearMaxAmount","清理项目名称":"clearProjectName","转int":"toint","转float":"tofloat",
+	"转时间戳":"totimestamp", "转换金额":"tomoney","获取币种":"getcurrency","获取汇率":"getrate","取手机号":"getPhone", "清理数字":"clearNumber",
+	"费率转小数":"rateToFloat","处理较长采购联系人":"clearBuyerPerson","去除尾部特殊符号":"clearEndSymblo","费率/折扣类报价中文转float":"chiToInt"}
 $(function () {
 	ttableclear=$('#clearTable').DataTable({
 		"lengthChange": false,

+ 5 - 6
standardata/src/config.json

@@ -1,5 +1,4 @@
 {
-  "port": "1235",
   "mgofrom": "172.17.4.187:27083",
   "mgofromsize":5,
   "mgofromdb":"qfw",
@@ -12,18 +11,18 @@
   "extractcoll":"result_20200116",
   "standardata":{
 	"winner":{
-		"standarent":"winner_ent",
-		"standarerr":"winner_err",
+		"standarent":"winner_enterprisenew",
+		"standarerr":"winner_errnew",
 		"redisdb":1
 	},
     "buyer":{
-      "standarent":"buyer_ent",
+      "standarent":"buyer_agency_enterprise",
       "standarerr":"buyer_err",
       "redisdb":2
     },
     "agency":{
-      "standarent":"agency_data_ent",
-      "standarerr":"agency_data_err",
+      "standarent":"agency_enterprise",
+      "standarerr":"agency_err",
       "redisdb":3
     }
   },

+ 67 - 0
standardata/src/historyrepair.go

@@ -0,0 +1,67 @@
+// historyrepair 处理多线程重复数据问题
+package main
+
+import (
+	"dbutil/mongo"
+	"dbutil/redis"
+	"log"
+	qu "qfw/util"
+
+	"go.mongodb.org/mongo-driver/bson"
+)
+
+// historyrepair de-duplicates the rows of collection coll in database db.
+//
+// Rows are keyed by the name field that belongs to datatype ("winner" ->
+// company_name, "buyer" -> buyer_name, "agency" -> agency_name). The first row
+// seen for a name is recorded in redis (database dbnum) under that name with
+// its _id; every later row whose name is already recorded is deleted from
+// coll. Progress (rows scanned, rows deleted) is logged every 100 rows and
+// once more at the end.
+//
+// An unsupported datatype is rejected up front: with no name field every row
+// would resolve to the empty redis key, and all rows after the first would be
+// deleted as "duplicates".
+func historyrepair(db, coll, datatype string, dbnum int) {
+	var field string
+	switch datatype {
+	case "winner":
+		field = "company_name"
+	case "buyer":
+		field = "buyer_name"
+	case "agency":
+		field = "agency_name"
+	default:
+		log.Println("historyrepair: unsupported datatype", datatype)
+		return
+	}
+	sess := MongoTo.GetMgoConn()
+	defer MongoTo.Close()
+	it := sess.DB(db).C(coll).Find(bson.M{}).Select(bson.M{field: 1}).Iter()
+	index := 0
+	delnum := 0
+	for tmp := map[string]interface{}{}; it.Next(&tmp); index++ {
+		name := qu.ObjToString(tmp[field])
+		id := mongo.BsonTOStringId(tmp["_id"])
+		// NOTE(review): the lookup error is ignored, so a failed lookup is
+		// handled the same way as a missing key — confirm this is intended.
+		str, _ := redis.GetRedisStr(datatype, dbnum, name)
+		if str != "" {
+			// Name already recorded by an earlier row: drop this duplicate.
+			MongoTo.DeleteById(coll, id)
+			delnum++
+		} else {
+			redis.PutRedis(datatype, dbnum, name, id, -1)
+		}
+		// Decode the next row into a fresh map so no keys carry over.
+		tmp = map[string]interface{}{}
+		if index%100 == 0 {
+			log.Println(index, delnum)
+		}
+	}
+	log.Println(index, delnum)
+}
+
+// historyrepairErr de-duplicates the rows of collection coll in database db by
+// their "name" field. The first row seen for a name is recorded in redis
+// (database dbnum, keyed by datatype and name) with its _id; any later row
+// whose name is already recorded is deleted from coll. Progress (rows scanned,
+// rows deleted) is logged every 100 rows and once more at the end.
+func historyrepairErr(db, coll, datatype string, dbnum int) {
+	sess := MongoTo.GetMgoConn()
+	defer MongoTo.Close()
+	iter := sess.DB(db).C(coll).Find(bson.M{}).Select(bson.M{"name": 1}).Iter()
+	scanned, removed := 0, 0
+	for {
+		// Each row is decoded into its own fresh map.
+		row := map[string]interface{}{}
+		if !iter.Next(&row) {
+			break
+		}
+		name := qu.ObjToString(row["name"])
+		id := mongo.BsonTOStringId(row["_id"])
+		if cached, _ := redis.GetRedisStr(datatype, dbnum, name); cached == "" {
+			redis.PutRedis(datatype, dbnum, name, id, -1)
+		} else {
+			MongoTo.DeleteById(coll, id)
+			removed++
+		}
+		if scanned%100 == 0 {
+			log.Println(scanned, removed)
+		}
+		scanned++
+	}
+	log.Println(scanned, removed)
+}

+ 14 - 17
standardata/src/main.go

@@ -15,18 +15,18 @@ import (
 
 var (
 	MongoFrom /*抽取原*/, MongoTo /*保存库*/, MongoEnt/*企业库*/ *mongo.MongodbSim
-	sysconfig            map[string]interface{}
-	extractcoll          string
-	winnerent, winnererr string
-	buyerent, buyererr   string
-	agencyent, agencyerr   string
-	winnerbd, buyerbd, agencybd    int
-	Addrs                = make(map[string]interface{}, 0) //省市县
-	winchanbool          = make(chan bool, 3)
-	buyerchanbool        = make(chan bool, 3)
-	agencychanbool        = make(chan bool, 3)
-	gochan               = make(chan bool, 3)
-	udpclient            mu.UdpClient
+	sysconfig                   map[string]interface{}
+	extractcoll                 string
+	winnerent, winnererr        string
+	buyerent, buyererr          string
+	agencyent, agencyerr        string
+	winnerbd, buyerbd, agencybd int
+	Addrs                       = make(map[string]interface{}, 0) //省市县
+	winchanbool                 = make(chan bool, 3)
+	buyerchanbool               = make(chan bool, 3)
+	agencychanbool              = make(chan bool, 3)
+	gochan                      = make(chan bool, 3)
+	udpclient                   mu.UdpClient
 	//异常表正则匹配处理
 	WinnerRegOk, WinnerRegErr, AgencyRegOk, AgencyRegErr, BuyerRegOk, BuyerRegErr []regexp.Regexp
 )
@@ -129,12 +129,9 @@ func initReg() {
 func main() {
 	//go historywinner(qu.ObjToString(sysconfig["mgofromdb"]), extractcoll)
 	//go historybuyer(qu.ObjToString(sysconfig["mgofromdb"]), extractcoll)
-	go historyagency(qu.ObjToString(sysconfig["mgofromdb"]), extractcoll)
-	//go winStandarHistory(qu.ObjToString(sysconfig["mgotodb"]))
-	//go buyerStandarHistory(qu.ObjToString(sysconfig["mgotodb"]))
+	//go historyagency(qu.ObjToString(sysconfig["mgofromdb"]), extractcoll)
 
-
-	//go task_standarData()
+	go task_standarData()
 	c := make(chan int, 1)
 	<-c
 }

+ 0 - 32
standardata/src/standarwinner.go

@@ -466,35 +466,3 @@ func comRepTopscopeclass(tops []interface{}) []interface{} {
 	}
 	return data
 }
-
-//
-func comUpdateErr(coll, name string, tclass []interface{}) {
-	if len(tclass) < 1 {
-		return
-	}
-	tmp := MongoTo.FindOne(coll, map[string]interface{}{"name": name})
-	topscopeclass := tmp["topscopeclass"].(primitive.A)
-	tmpclass := map[string]bool{}
-	for _, tc := range topscopeclass {
-		tmpclass[qu.ObjToString(tc)] = true
-	}
-	oldlen := len(tmpclass)
-	for _, tc := range tclass {
-		tmpclass[qu.ObjToString(tc)] = true
-	}
-	newlen := len(tmpclass)
-	if oldlen == newlen {
-		return
-	}
-	newclass := []interface{}{}
-	for _, v := range tmpclass {
-		newclass = append(newclass, v)
-	}
-	MongoTo.Update(coll, map[string]interface{}{"name": name}, map[string]interface{}{
-		"$set": map[string]interface{}{
-			"name":          name,
-			"topscopeclass": newclass,
-			"updatetime":    time.Now().Unix(),
-		},
-	})
-}

+ 6 - 4
standardata/src/task.go

@@ -12,19 +12,21 @@ import (
 func task_standarData() {
 	mgofromdb := qu.ObjToString(sysconfig["mgofromdb"])
 	c := cron.New()
-	_ = c.AddFunc("0/5 * * * * *", func() {
+	_ = c.AddFunc("0 30 4 * * *", func() {
 		t := time.Now()
 		pici := time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, time.Local).Unix()
 		query := map[string]interface{}{
 			"comeintime": map[string]interface{}{
-				"$gt":  -pici - 86400,
+				"$gt":  pici - 86400,
 				"$lte": pici,
 			},
 		}
 		log.Println(mgofromdb, query)
-		//go winnerStandarData(mgofromdb, query)
+		go winnerStandarData(mgofromdb, query)
+		time.Sleep(1 * time.Minute)
 		go buyerStandarData(mgofromdb, query)
-		//go agencyStandarData(mgofromdb, query)
+		time.Sleep(1 * time.Minute)
+		go agencyStandarData(mgofromdb, query)
 	})
 	c.Start()
 }

+ 1 - 1
util/src/dbutil/mongo/mgo.go

@@ -146,7 +146,7 @@ func (m *MongodbSim) InitPool() {
 	opts := options.Client()
 	opts.SetConnectTimeout(3 * time.Second)
 	opts.ApplyURI("mongodb://" + m.MongodbAddr)
-	opts.SetMaxPoolSize(uint16(m.Size))
+	opts.SetMaxPoolSize(uint64(m.Size))
 	m.pool = make(chan bool, m.Size)
 	opts.SetMaxConnIdleTime(2 * time.Hour)
 	m.Ctx, _ = context.WithTimeout(context.Background(), 99999*time.Hour)