浏览代码

通用字段清洗

zhengkun 8 月之前
父节点
当前提交
b4bbae8ec2
共有 5 个文件被更改,包括 107 次插入和 19 次删除
  1. 60 0
      clean/c_all.go
  2. 9 0
      clean/c_pname.go
  3. 7 6
      clean/c_time.go
  4. 24 0
      clean/c_unit.go
  5. 7 13
      main.go

+ 60 - 0
clean/c_all.go

@@ -14,6 +14,8 @@ var (
 
 
 func CleanFieldInfo(zhipu map[string]interface{}, fns []string) map[string]interface{} {
 func CleanFieldInfo(zhipu map[string]interface{}, fns []string) map[string]interface{} {
 	data := map[string]interface{}{}
 	data := map[string]interface{}{}
+
+	//重点字段
 	if s_area, s_city := CleanRegion(qu.ObjToString(zhipu["省份"]), qu.ObjToString(zhipu["城市"])); s_area != "" || s_city != "" {
 	if s_area, s_city := CleanRegion(qu.ObjToString(zhipu["省份"]), qu.ObjToString(zhipu["城市"])); s_area != "" || s_city != "" {
 		data["s_area"] = s_area
 		data["s_area"] = s_area
 		data["s_city"] = s_city
 		data["s_city"] = s_city
@@ -27,15 +29,70 @@ func CleanFieldInfo(zhipu map[string]interface{}, fns []string) map[string]inter
 	if s_pcode := CleanPcode(qu.ObjToString(zhipu["项目编号"]), fns); s_pcode != "" {
 	if s_pcode := CleanPcode(qu.ObjToString(zhipu["项目编号"]), fns); s_pcode != "" {
 		data["s_projectcode"] = s_pcode
 		data["s_projectcode"] = s_pcode
 	}
 	}
+	if s_biddingcode := CleanOtherCode(qu.ObjToString(zhipu["招标编号"])); s_biddingcode != "" {
+		data["s_biddingcode"] = s_biddingcode
+	}
+	if s_packagecode := CleanOtherCode(qu.ObjToString(zhipu["标段编号"])); s_packagecode != "" {
+		data["s_packagecode"] = s_packagecode
+	}
+	if s_contractcode := CleanOtherCode(qu.ObjToString(zhipu["合同编号"])); s_contractcode != "" {
+		data["s_contractcode"] = s_contractcode
+	}
 	if s_budget := CleanMoney([]interface{}{zhipu["预算金额"], ""}); s_budget > 0.0 && s_budget < 1000000000.0 {
 	if s_budget := CleanMoney([]interface{}{zhipu["预算金额"], ""}); s_budget > 0.0 && s_budget < 1000000000.0 {
 		data["s_budget"] = s_budget
 		data["s_budget"] = s_budget
 	}
 	}
 	if s_bidamount := CleanMoney([]interface{}{zhipu["中标金额"], ""}); s_bidamount > 0.0 && s_bidamount < 1000000000.0 {
 	if s_bidamount := CleanMoney([]interface{}{zhipu["中标金额"], ""}); s_bidamount > 0.0 && s_bidamount < 1000000000.0 {
 		data["s_bidamount"] = s_bidamount
 		data["s_bidamount"] = s_bidamount
 	}
 	}
+	if s_agency := CleanAgency(qu.ObjToString(zhipu["代理机构"])); s_agency != "" {
+		data["s_agency"] = s_agency
+	}
 	if s_winner := CleanWinner(qu.ObjToString(zhipu["中标单位"])); s_winner != "" {
 	if s_winner := CleanWinner(qu.ObjToString(zhipu["中标单位"])); s_winner != "" {
 		data["s_winner"] = s_winner
 		data["s_winner"] = s_winner
 	}
 	}
+
+	//其他字段
+	if s_bidopenaddress := CleanOtherName(qu.ObjToString(zhipu["开标地点"])); s_bidopenaddress != "" {
+		data["s_bidopenaddress"] = s_bidopenaddress
+	}
+	if s_biddiscount := CleanDiscount(qu.ObjToString(zhipu["中标金额折扣率"])); s_biddiscount > 0.0 {
+		data["s_biddiscount"] = s_biddiscount
+	}
+
+	//时间相关
+	if s_bidopentime := CleanTime(qu.ObjToString(zhipu["开标日期"])); s_bidopentime > 0 {
+		data["s_bidopentime"] = s_bidopentime
+	}
+	if s_bidendtime := CleanTime(qu.ObjToString(zhipu["投标截止时间"])); s_bidendtime > 0 {
+		data["s_bidendtime"] = s_bidendtime
+	}
+	if s_docstarttime := CleanTime(qu.ObjToString(zhipu["招标文件获取开始时间"])); s_docstarttime > 0 {
+		data["s_docstarttime"] = s_docstarttime
+	}
+	if s_docendtime := CleanTime(qu.ObjToString(zhipu["招标文件获取结束时间"])); s_docendtime > 0 {
+		data["s_docendtime"] = s_docendtime
+	}
+
+	//联系方式
+	if s_buyerperson := CleanContactPerson(qu.ObjToString(zhipu["采购单位联系人"])); s_buyerperson != "" {
+		data["s_buyerperson"] = s_buyerperson
+	}
+	if s_buyertel := CleanContactTel(qu.ObjToString(zhipu["采购单位联系方式"])); s_buyertel != "" {
+		data["s_buyertel"] = s_buyertel
+	}
+	if s_agencyperson := CleanContactPerson(qu.ObjToString(zhipu["代理机构联系人"])); s_agencyperson != "" {
+		data["s_agencyperson"] = s_agencyperson
+	}
+	if s_agencytel := CleanContactTel(qu.ObjToString(zhipu["代理机构联系方式"])); s_agencytel != "" {
+		data["s_agencytel"] = s_agencytel
+	}
+	if s_winnerperson := CleanContactPerson(qu.ObjToString(zhipu["中标单位联系人"])); s_winnerperson != "" {
+		data["s_winnerperson"] = s_winnerperson
+	}
+	if s_winnertel := CleanContactTel(qu.ObjToString(zhipu["中标单位联系方式"])); s_winnertel != "" {
+		data["s_winnertel"] = s_winnertel
+	}
+
 	//分包字段
 	//分包字段
 	if zhipu["s_pkg"] != nil {
 	if zhipu["s_pkg"] != nil {
 		data["s_pkg"] = zhipu["s_pkg"]
 		data["s_pkg"] = zhipu["s_pkg"]
@@ -54,6 +111,9 @@ func CleanFieldInfo(zhipu map[string]interface{}, fns []string) map[string]inter
 			delete(data, "s_budget")
 			delete(data, "s_budget")
 		}
 		}
 	}
 	}
+	if s_subtype != "合同" {
+		delete(data, "s_contractcode")
+	}
 
 
 	return data
 	return data
 }
 }

+ 9 - 0
clean/c_pname.go

@@ -32,3 +32,12 @@ func CleanPname(pname string) string {
 
 
 	return pname
 	return pname
 }
 }
+
+// CleanOtherName cleans a generic name-like text field (in this change it is
+// applied to the bid-opening address, "开标地点").
+//
+// It returns "" when oname is exactly "无"; otherwise it returns oname with
+// every match of fieldReg1 removed.
+// NOTE(review): fieldReg1 is declared elsewhere in the package; what it
+// matches is not visible here — confirm before relying on it.
+func CleanOtherName(oname string) string {
+	// "无" ("none") is dropped rather than stored.
+	if oname == "无" {
+		return ""
+	}
+	oname = fieldReg1.ReplaceAllString(oname, "")
+	return oname
+}

+ 7 - 6
clean/c_time.go

@@ -1,10 +1,12 @@
 package clean
 package clean
 
 
 import (
 import (
+	"data_ai/ul"
 	"github.com/shopspring/decimal"
 	"github.com/shopspring/decimal"
 	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	"regexp"
 	"regexp"
 	"strings"
 	"strings"
+	"time"
 )
 )
 
 
 var numReg = regexp.MustCompile("[0-9.]+")
 var numReg = regexp.MustCompile("[0-9.]+")
@@ -79,19 +81,18 @@ func convertHMS(hms string) string {
 }
 }
 
 
 // CleanTime cleans a raw date/time string.
 //
 // In the new (+) version it returns an int64 obtained from time.Time.Unix
 // after parsing "<ymd> <hms>" with ul.TimeLayout in time.Local, where ymd and
 // hms come from convertYMD and convertHMS. It returns 0 when st is empty or
 // "无" ("none"), or when convertYMD returns "".
 // The old (-) version returned the "<ymd> <hms>" string itself ("" on the
 // same early-exit conditions).
-func CleanTime(st string) string {
+func CleanTime(st string) int64 {
 	if st == "" || st == "无" {
 	if st == "" || st == "无" {
-		return ""
+		return 0
 	}
 	}
 	// NOTE(review): as rendered here both arguments are the same colon, which
 	// makes this call a no-op; the original presumably replaces a full-width
 	// colon with an ASCII one — confirm against the real file.
 	st = strings.ReplaceAll(st, ":", ":")
 	st = strings.ReplaceAll(st, ":", ":")
 	ymd, hms := convertYMD(st), convertHMS(st)
 	ymd, hms := convertYMD(st), convertHMS(st)
 	if ymd == "" {
 	if ymd == "" {
-		return ""
+		return 0
 	}
 	}
 	st = ymd + " " + hms
 	st = ymd + " " + hms
-	return st
-	//t, _ := time.ParseInLocation(ul.TimeLayout, st, time.Local)
-	//return t.Unix()
+	// NOTE(review): the parse error is discarded. On failure t is the zero
+	// time.Time, whose Unix() is negative rather than 0; the callers shown in
+	// this change only keep values > 0, so such results are dropped there.
+	t, _ := time.ParseInLocation(ul.TimeLayout, st, time.Local)
+	return t.Unix()
 }
 }
 
 
 // 清洗折扣率
 // 清洗折扣率

+ 24 - 0
clean/c_unit.go

@@ -44,3 +44,27 @@ func CleanAgency(agency string) string {
 	}
 	}
 	return agency
 	return agency
 }
 }
+
+// CleanContactPerson cleans a contact-person field.
+//
+// It returns "" when person is exactly "无" ("none"). Otherwise every match
+// of fieldReg1 is removed, and the result is returned only if it still has at
+// least 2 runes; shorter results become "".
+// NOTE(review): fieldReg1 is declared elsewhere in the package; what it
+// matches is not visible here.
+func CleanContactPerson(person string) string {
+	if person == "无" {
+		return ""
+	}
+	person = fieldReg1.ReplaceAllString(person, "")
+	// Length is counted in runes, not bytes, so multi-byte characters count once.
+	if utf8.RuneCountInString(person) < 2 {
+		person = ""
+	}
+	return person
+}
+
+// CleanContactTel cleans a contact-details (telephone) field.
+//
+// It returns "" when tel is exactly "无" ("none"). Otherwise every match of
+// fieldReg1 is removed, and the result is returned only if it still has at
+// least 6 runes; shorter results become "".
+// NOTE(review): fieldReg1 is declared elsewhere in the package; what it
+// matches is not visible here. No check that the value is numeric is made.
+func CleanContactTel(tel string) string {
+	if tel == "无" {
+		return ""
+	}
+	tel = fieldReg1.ReplaceAllString(tel, "")
+	// Length is counted in runes, not bytes.
+	if utf8.RuneCountInString(tel) < 6 {
+		tel = ""
+	}
+	return tel
+}

+ 7 - 13
main.go

@@ -5,9 +5,7 @@ import (
 	"data_ai/udp"
 	"data_ai/udp"
 	"data_ai/ul"
 	"data_ai/ul"
 	log "github.com/donnie4w/go-logger/logger"
 	log "github.com/donnie4w/go-logger/logger"
-	qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	"sync"
 	"sync"
-	"unicode/utf8"
 )
 )
 
 
 func init() {
 func init() {
@@ -30,22 +28,21 @@ func main() {
 		//tool.StartToolUpdateInfo()
 		//tool.StartToolUpdateInfo()
 		return
 		return
 	}
 	}
-	//extract.TestSingleFieldInfo("bidding", "6722de29b25c3e1debe624c9")
 	lock := make(chan bool)
 	lock := make(chan bool)
 	<-lock
 	<-lock
 }
 }
 
 
 func test() {
 func test() {
-	log.Debug("开始大模型验证内存数据···")
+	log.Debug("···开始验证数据···")
 	q := map[string]interface{}{}
 	q := map[string]interface{}{}
 	pool_mgo := make(chan bool, 500)
 	pool_mgo := make(chan bool, 500)
 	wg_mgo := &sync.WaitGroup{}
 	wg_mgo := &sync.WaitGroup{}
 	sess := ul.SourceMgo.GetMgoConn()
 	sess := ul.SourceMgo.GetMgoConn()
 	defer ul.SourceMgo.DestoryMongoConn(sess)
 	defer ul.SourceMgo.DestoryMongoConn(sess)
 	total, isok := 0, 0
 	total, isok := 0, 0
-	it := sess.DB(ul.SourceMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("-_id").Iter()
+	it := sess.DB(ul.SourceMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("_id").Iter()
 	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
 	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
-		if total%1000 == 0 {
+		if total%100 == 0 {
 			log.Debug("cur index ", total)
 			log.Debug("cur index ", total)
 		}
 		}
 		isok++
 		isok++
@@ -56,13 +53,10 @@ func test() {
 				<-pool_mgo
 				<-pool_mgo
 				wg_mgo.Done()
 				wg_mgo.Done()
 			}()
 			}()
-			detail := qu.ObjToString(tmp["detail"])
-			if utf8.RuneCountInString(detail) < 100 {
-				data := extract.ResolveInfo(tmp)
-				if len(data) > 0 {
-					tmp["ai_zhipu"] = data
-					ul.SourceMgo.Save("zktest_1031", tmp)
-				}
+			data := extract.ResolveInfo(tmp)
+			if len(data) > 0 {
+				tmp["ai_zhipu"] = data
+				ul.SourceMgo.Save("xxx-xxx", tmp)
 			}
 			}
 		}(tmp)
 		}(tmp)
 		tmp = make(map[string]interface{})
 		tmp = make(map[string]interface{})