Răsfoiți Sursa

信息处理~等等

zhengkun 3 ani în urmă
părinte
comite
ff8c16308d

+ 34 - 13
filedproject_medical/data_preparation/src/class/initdata.go

@@ -1,6 +1,9 @@
 package class
 
-import log "github.com/donnie4w/go-logger/logger"
+import (
+	log "github.com/donnie4w/go-logger/logger"
+	qu "qfw/util"
+)
 
 var (
 	Save_Mgo, Spi_Mgo           *MongodbSim
@@ -9,30 +12,24 @@ var (
 	MysqlTool                   *Mysql
 	Medical_Type, Medical_Level map[string]string
 
-	isLocal bool
+	isLocal      bool
+	YXBK_DATA    = map[string]map[string]interface{}{} //医学百科相关数据
+	select_field = map[string]interface{}{"area": 1, "city": 1, "district": 1, "name": 1, "website": 1}
 )
 
 func InitClass() {
 	isLocal = true //本地
 	initMgo()
-	initMysql()
+	//initMysql()
 	initClassCode() //加载相关代码表
+	//initLocalData()
 }
 
 //初始化mgo
 func initMgo() {
 	if isLocal {
-		//Save_Mgo = &MongodbSim{
-		//	MongodbAddr: "127.0.0.1:27017",
-		//	DbName:      "zhengkun",
-		//	Size:        10,
-		//	UserName:    "",
-		//	Password:    "",
-		//}
-		//Save_Mgo.InitPool()
-
 		Save_Mgo = &MongodbSim{
-			MongodbAddr: "192.168.3.207:27092",
+			MongodbAddr: "127.0.0.1:27017",
 			DbName:      "zhengkun",
 			Size:        10,
 			UserName:    "",
@@ -40,6 +37,15 @@ func initMgo() {
 		}
 		Save_Mgo.InitPool()
 
+		//Save_Mgo = &MongodbSim{
+		//	MongodbAddr: "192.168.3.207:27092",
+		//	DbName:      "majiajia",
+		//	Size:        10,
+		//	UserName:    "",
+		//	Password:    "",
+		//}
+		//Save_Mgo.InitPool()
+
 		Spi_Mgo = &MongodbSim{
 			MongodbAddr: "127.0.0.1:27017",
 			DbName:      "zhengkun",
@@ -123,6 +129,21 @@ func initClassCode() {
 	}
 }
 
+// initLocalData loads every document of the "data_info" collection, projected
+// through select_field, into the package-level YXBK_DATA cache keyed by the
+// document's "name" field. The "_id" field is removed from each cached record.
+// Documents that share a name overwrite one another, so the one visited last
+// is the one that stays in the cache.
+func initLocalData() {
+	conn := Save_Mgo.GetMgoConn()
+	defer Save_Mgo.DestoryMongoConn(conn)
+
+	filter := map[string]interface{}{}
+	cursor := conn.DB(Save_Mgo.DbName).
+		C("data_info").
+		Find(&filter).
+		Sort("_id").
+		Select(select_field).
+		Iter()
+	for {
+		// A fresh map per document: the cache keeps a reference to it.
+		record := make(map[string]interface{})
+		if !cursor.Next(&record) {
+			break
+		}
+		key := qu.ObjToString(record["name"])
+		delete(record, "_id")
+		YXBK_DATA[key] = record
+	}
+	log.Debug("医学百科数据准备~", len(YXBK_DATA))
+}
+
 //插入数据
 func InsertMysqlData(name string, data map[string]interface{}, mark string) {
 	inb := MysqlTool.Insert(name, data)

+ 10 - 4
filedproject_medical/data_preparation/src/hospital/hospital.go

@@ -1,12 +1,18 @@
 package hospital
 
 //开始执行医院数据
-func RunHospitalInfo() {
-
-	//医院信息处理
+func RunBuildHospitalInfo() {
+	//整合整体~医院信息
 	dealWithHospitalBaseInfo("f_hospital_39jk", "hospital_39jk_ain_depart", "39健康")
 	dealWithHospitalBaseInfo("f_hospital_hdf", "hospital_hdf_ain_depart", "好大夫")
 	dealWithHospitalBaseInfo("f_hospital_yydq", "hospital_yydq_ain_depart", "医院大全")
 	dealWithHospitalBaseInfo("f_hospital_yyyc", "", "医药英才网")
-	confirmHospitalData("zktest_hospital_info")
+}
+
+// RunCleanHospitalInfo is the entry point for the hospital-data cleaning stage.
+// Both steps below are currently commented out, so calling it does nothing.
+func RunCleanHospitalInfo() {
+	// Clean the hospital records (name normalisation).
+	//cleanHospitalInfoData()
+
+	//resetRepeatHospital()
+}

+ 149 - 0
filedproject_medical/data_preparation/src/hospital/hospital_clean.go

@@ -0,0 +1,149 @@
+package hospital
+
+import (
+	"class"
+	log "github.com/donnie4w/go-logger/logger"
+	qu "qfw/util"
+	"regexp"
+	"strings"
+	"unicode/utf8"
+)
+
+// Package-level state and patterns used while cleaning hospital names.
+//
+// NOTE(review): several character classes below list the same bracket or colon
+// twice (e.g. "[((]"); presumably one of each pair was meant to be the
+// full-width form. Confirm against the original file before relying on them.
+var (
+	// data_hospitals maps a hospital name to the id of the first record seen
+	// with that name; resetRepeatHospital rebuilds it on every run.
+	data_hospitals = map[string]string{}
+
+	// cleanReg_1 matches a parenthesised ownership tag (民营, 国营 or 自立).
+	cleanReg_1 = regexp.MustCompile("[((](民营|国营|自立)[))]")
+
+	// suffixReg_1 matches a name that contains 院 and ends in 科.
+	suffixReg_1 = regexp.MustCompile("(院)(.*科)$")
+
+	// suffixReg_2 matches 院 with an optional closing parenthesis after it.
+	suffixReg_2 = regexp.MustCompile("院[))]?")
+
+	// suffixReg_3 matches a parenthesised run that reaches the end of the name.
+	suffixReg_3 = regexp.MustCompile("([((].*[))])$")
+
+	// suffixReg_4 matches a trailing "(原…)" note and captures the text after
+	// 原 and the optional colon.
+	suffixReg_4 = regexp.MustCompile("[((]原[::]?(.*)[))]$")
+
+	// suffixReg_5 matches a trailing parenthesised text whose last character
+	// is 院, 区 or 部.
+	suffixReg_5 = regexp.MustCompile("(院|区|部)[))]$")
+)
+
+// cleanHospitalInfoData walks every document of save_coll (only "name" and
+// "alias" are selected) and normalises the name with cleanHospitalName.
+// When the name changed it stores the cleaned value in "name" and keeps the
+// previous one in "old_name". If a former name was extracted it is written to
+// "alias", appended with a comma unless the alias already contains it.
+// After the scan it calls resetRepeatHospital to rebuild the duplicate markers.
+func cleanHospitalInfoData() {
+	log.Debug("清洗医院信息~~开始~~")
+	data_hospitals = map[string]string{}
+	conn := class.Save_Mgo.GetMgoConn()
+	defer class.Save_Mgo.DestoryMongoConn(conn)
+
+	filter := map[string]interface{}{}
+	projection := map[string]interface{}{
+		"name":  1,
+		"alias": 1,
+	}
+	cursor := conn.DB(class.Save_Mgo.DbName).C(save_coll).Find(&filter).Sort("_id").Select(projection).Iter()
+	scanned, cleaned := 0, 0
+	for {
+		doc := make(map[string]interface{})
+		if !cursor.Next(&doc) {
+			break
+		}
+		// Progress trace every 10000 documents.
+		if scanned%10000 == 0 {
+			log.Debug("cur index ", scanned, "~", cleaned)
+		}
+		origName := qu.ObjToString(doc["name"])
+		origAlias := qu.ObjToString(doc["alias"])
+		changed, cleanName, formerName := cleanHospitalName(origName)
+		docID := class.BsonTOStringId(doc["_id"])
+		if changed {
+			cleaned++
+			fields := map[string]interface{}{
+				"name":     cleanName,
+				"old_name": origName,
+			}
+			if formerName != "" {
+				mergedAlias := origAlias
+				switch {
+				case mergedAlias == "":
+					mergedAlias = formerName
+				case !strings.Contains(mergedAlias, formerName):
+					mergedAlias = mergedAlias + "," + formerName
+				}
+				fields["alias"] = mergedAlias
+			}
+			class.Save_Mgo.UpdateById(save_coll, docID, map[string]interface{}{
+				"$set": fields,
+			})
+		}
+		scanned++
+	}
+
+	log.Debug("清洗医院信息~~over~~", scanned, "~", cleaned)
+
+	resetRepeatHospital()
+}
+
+// cleanHospitalName normalises a raw hospital name.
+//
+// It returns, in order:
+//   - whether any rule changed the name,
+//   - the cleaned name (equal to the input when nothing applied),
+//   - the former name taken from a trailing "(原…)" note, or "" if none.
+//
+// Rules, applied in this order:
+//  1. remove ownership tags matched by cleanReg_1;
+//  2. when the name ends in a department (suffixReg_1), cut it right after the
+//     last 院 (including a closing parenthesis that directly follows it);
+//  3. when the name ends in a parenthesised part (suffixReg_3), either pull a
+//     former name out of it (suffixReg_4) or, when that part is exactly four
+//     runes long and does not end in 院/区/部 (suffixReg_5), drop it.
+func cleanHospitalName(name string) (bool, string, string) {
+	is_clean := false
+	new_name := name
+	his_name := ""
+
+	// Rule 1: ownership tags. Later rules work on the stripped name.
+	if cleanReg_1.MatchString(name) {
+		new_name = cleanReg_1.ReplaceAllString(name, "")
+		name = new_name
+		is_clean = true
+	}
+
+	// Rule 2: trailing department. suffixReg_1 guarantees at least one 院, so
+	// the match list is never empty here.
+	if suffixReg_1.MatchString(name) {
+		index_arr := suffixReg_2.FindAllStringIndex(name, -1)
+		last := index_arr[len(index_arr)-1]
+		new_name = name[:last[1]]
+		is_clean = true
+	}
+
+	// Rule 3: trailing parenthesised part. The pattern is anchored at the end
+	// of the name, so there is at most one match.
+	if loc := suffixReg_3.FindStringIndex(name); loc != nil {
+		suffix_name := name[loc[0]:loc[1]]
+		if m := suffixReg_4.FindStringSubmatch(suffix_name); m != nil {
+			// Take the capture group itself. Substituting "${1}" into
+			// suffix_name would keep any text that precedes the "(原…)" note,
+			// e.g. an earlier "(东区)", inside the former name.
+			his_name = m[1]
+			new_name = suffixReg_4.ReplaceAllString(name, "")
+			is_clean = true
+		}
+		if utf8.RuneCountInString(suffix_name) == 4 && !suffixReg_5.MatchString(suffix_name) {
+			new_name = strings.ReplaceAll(name, suffix_name, "")
+			is_clean = true
+		}
+	}
+
+	return is_clean, new_name, his_name
+}
+
+//重置重复标记~
+func resetRepeatHospital() {
+	log.Debug("重置~~重复标记~~")
+	data_hospitals = map[string]string{}
+	sess := class.Save_Mgo.GetMgoConn()
+	defer class.Save_Mgo.DestoryMongoConn(sess)
+	q := map[string]interface{}{}
+	it := sess.DB(class.Save_Mgo.DbName).C(save_coll).Find(&q).Sort("_id").Select(map[string]interface{}{
+		"name": 1,
+	}).Iter()
+	total := 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
+		if total%10000 == 0 {
+			log.Debug("cur index ", total)
+		}
+		name := qu.ObjToString(tmp["name"])
+		tmpid := class.BsonTOStringId(tmp["_id"])
+		update := map[string]interface{}{}
+		if data_hospitals[name] == "" {
+			data_hospitals[name] = tmpid
+			update = map[string]interface{}{
+				"repeat":    0,
+				"repeat_id": "",
+			}
+		} else {
+			update = map[string]interface{}{
+				"repeat":    1,
+				"repeat_id": data_hospitals[name],
+			}
+		}
+		class.Save_Mgo.UpdateById(save_coll, tmpid, map[string]interface{}{
+			"$set": update,
+		})
+		tmp = make(map[string]interface{})
+	}
+	log.Debug("重置~~完毕~~over~~")
+
+}

+ 138 - 0
filedproject_medical/data_preparation/src/hospital/hospital_info.go

@@ -0,0 +1,138 @@
+package hospital
+
+import (
+	"class"
+	log "github.com/donnie4w/go-logger/logger"
+	qu "qfw/util"
+	"strings"
+	"sync"
+	"unicode/utf8"
+)
+
+var (
+	save_coll = "zktest_hospital_info"
+)
+
+// dealWithHospitalBaseInfo builds one save_coll record for every document of
+// hos_coll whose "hospitalname" is longer than three runes.
+//
+// hos_coll is the source hospital collection, depart_coll the collection with
+// its departments ("" when there is none) and sourceweb the label stored in the
+// record's "sourceweb" field. At most five documents are processed
+// concurrently; the function returns once all of them are done.
+func dealWithHospitalBaseInfo(hos_coll string, depart_coll string, sourceweb string) {
+	conn := class.Save_Mgo.GetMgoConn()
+	defer class.Save_Mgo.DestoryMongoConn(conn)
+
+	filter := map[string]interface{}{}
+	cursor := conn.DB(class.Save_Mgo.DbName).C(hos_coll).Find(&filter).Sort("_id").Iter()
+	slots := make(chan bool, 5) // bounds the number of in-flight goroutines
+	wg := &sync.WaitGroup{}
+	scanned := 0
+	for {
+		doc := make(map[string]interface{})
+		if !cursor.Next(&doc) {
+			break
+		}
+		slots <- true
+		wg.Add(1)
+		go func(rec map[string]interface{}) {
+			defer func() {
+				<-slots
+				wg.Done()
+			}()
+			rawName := qu.ObjToString(rec["hospitalname"])
+			if rawName == "" || utf8.RuneCountInString(rawName) <= 3 {
+				return
+			}
+			// Basic hospital fields.
+			info := treatHospitalInfo(rec)
+			// The normalised name is read back here but not used afterwards.
+			rawName = qu.ObjToString(info["name"])
+			// Departments of this hospital.
+			// NOTE(review): other functions in this package turn "_id" into a
+			// string with class.BsonTOStringId; confirm that qu.ObjToString
+			// yields the id the department collection actually stores.
+			hospitalID := qu.ObjToString(rec["_id"])
+			info["departs"] = treatHospitalDepartment(hospitalID, depart_coll)
+			// Alias.
+			info["alias"] = qu.ObjToString(rec["alias"])
+			// Source web site.
+			info["sourceweb"] = sourceweb
+
+			class.Save_Mgo.Save(save_coll, info)
+		}(doc)
+		scanned++
+	}
+	wg.Wait()
+	log.Debug("构建信息~~over~~ ", scanned)
+}
+
+// treatHospitalInfo builds the basic-information record of one hospital from
+// a raw source document.
+//
+// tmp is the raw document. The returned map holds the name with spaces
+// removed, the descriptive and contact fields copied as strings, the three
+// counters converted with qu.IntAll, the region under "area", "city" and
+// "district", and an empty "company_id".
+func treatHospitalInfo(tmp map[string]interface{}) map[string]interface{} {
+	data := map[string]interface{}{}
+	// The name still needs further cleaning; only spaces are removed here.
+	hospitalname := qu.ObjToString(tmp["hospitalname"])
+	hospitalname = strings.ReplaceAll(hospitalname, " ", "")
+
+	// Hospital name.
+	data["name"] = hospitalname
+	// Introduction.
+	data["introduce"] = qu.ObjToString(tmp["introduce"])
+	// Address.
+	data["address"] = qu.ObjToString(tmp["address"])
+	// Equipment.
+	data["equipment"] = qu.ObjToString(tmp["equipment"])
+	// Number of beds.
+	data["beds"] = qu.IntAll(tmp["beds"])
+	// Outpatient visits per day.
+	data["visit_perday"] = qu.IntAll(tmp["visit_perday"])
+	// Number of doctors.
+	data["doctorsnum"] = qu.IntAll(tmp["doctorsnum"])
+
+	// Telephone.
+	data["tel"] = qu.ObjToString(tmp["tel"])
+	// Fax number.
+	data["fax_number"] = qu.ObjToString(tmp["fax_number"])
+
+	// Web site.
+	data["website"] = qu.ObjToString(tmp["website"])
+
+	data["level"] = qu.ObjToString(tmp["level"])
+	data["type"] = qu.ObjToString(tmp["type"])
+	data["business_type"] = qu.ObjToString(tmp["business_type"])
+
+	// Province / city / district. Each value goes under its own key; writing
+	// all three to "area" would leave only the district there and never set
+	// "city" or "district".
+	area := qu.ObjToString(tmp["area"])
+	city := qu.ObjToString(tmp["city"])
+	district := qu.ObjToString(tmp["district"])
+	// Region completion for records without a city is currently disabled:
+	//supplementRegionally(&area, &city, &district, hospitalname, qu.ObjToString(tmp["address"]))
+	data["area"] = area
+	data["city"] = city
+	data["district"] = district
+
+	company_id := ""
+	// Linking to a company record is currently disabled; the intended lookup
+	// order is company, special, self-generated.
+	//company_id = inquirBaseInfoid(hospitalname)
+	//if company_id == "" {
+	//	company_id = uuid.New().String()
+	//	// add a new information source
+	//}
+	data["company_id"] = company_id
+
+	return data
+}
+
+// treatHospitalDepartment collects the departments stored in depart_coll for
+// the hospital whose "hospital_id" equals hos_id.
+//
+// Each returned entry holds "departclass1", "departclass2" and
+// "depart_introduce". Only the first row of every class1/class2 pair is kept.
+// The result is an empty, non-nil slice when depart_coll is "" or nothing is
+// found.
+func treatHospitalDepartment(hos_id string, depart_coll string) []map[string]interface{} {
+	departs := []map[string]interface{}{}
+	if depart_coll == "" {
+		return departs
+	}
+	rows, _ := class.Save_Mgo.Find(depart_coll, map[string]interface{}{"hospital_id": hos_id}, nil, nil)
+	// Pairs already emitted, used to drop repeated departments.
+	seen := map[string]struct{}{}
+	for _, row := range rows {
+		class1 := qu.ObjToString(row["main_departclass1"])
+		class2 := qu.ObjToString(row["main_departclass2"])
+		intro := qu.ObjToString(row["depart_introduce"])
+
+		pair := class1 + "~" + class2
+		if _, dup := seen[pair]; dup {
+			continue
+		}
+		seen[pair] = struct{}{}
+		departs = append(departs, map[string]interface{}{
+			"departclass1":     class1,
+			"departclass2":     class2,
+			"depart_introduce": intro,
+		})
+	}
+	return departs
+}

+ 102 - 0
filedproject_medical/data_preparation/src/hospital/hospital_merge.go

@@ -0,0 +1,102 @@
+package hospital
+
+import (
+	"class"
+	log "github.com/donnie4w/go-logger/logger"
+	qu "qfw/util"
+)
+
+// mergeRepeatHospital writes the merged hospital records. For every record of
+// save_coll with repeat=0 it fetches the records whose repeat_id points at it,
+// merges the group with mergeHospitalInfo (the repeat=0 record first) and
+// saves the result into the collection named save_coll+"_new".
+func mergeRepeatHospital() {
+	log.Debug("开始合并重复后的数据~~~")
+	conn := class.Save_Mgo.GetMgoConn()
+	defer class.Save_Mgo.DestoryMongoConn(conn)
+
+	filter := map[string]interface{}{
+		"repeat": 0,
+	}
+	cursor := conn.DB(class.Save_Mgo.DbName).C(save_coll).Find(&filter).Sort("_id").Iter()
+	primaries, merged := 0, 0
+	for {
+		primary := make(map[string]interface{})
+		if !cursor.Next(&primary) {
+			break
+		}
+		// Progress trace every 1000 primary records.
+		if primaries%1000 == 0 {
+			log.Debug("cur index ", primaries, "~", merged)
+		}
+		dups, _ := class.Save_Mgo.Find(save_coll, map[string]interface{}{
+			"repeat_id": class.BsonTOStringId(primary["_id"]),
+		}, nil, nil)
+		merged += len(dups) + 1
+
+		// The primary record comes first; mergeHospitalInfo treats element 0
+		// as the base of the merge.
+		group := []map[string]interface{}{primary}
+		for _, dup := range dups {
+			group = append(group, dup)
+		}
+		class.Save_Mgo.Save(save_coll+"_new", mergeHospitalInfo(group))
+		primaries++
+	}
+	log.Debug("is over ", primaries, "~", merged)
+}
+
+//合并医院信息~
+func mergeHospitalInfo(dataArr []map[string]interface{}) map[string]interface{} {
+	data := map[string]interface{}{}
+	if len(dataArr) == 1 {
+		data = dataArr[0]
+		delete(data, "repeat")
+		delete(data, "repeat_id")
+		return data
+	} else if len(dataArr) > 1 {
+		//合并重复信息
+		temp := dataArr[0]
+		name := qu.ObjToString(temp["name"])
+		delete(temp, "repeat")
+		delete(temp, "repeat_id")
+		for k, v := range temp {
+			isMerge := false
+			if k == "alias" { //合并别名~
+				new_alias := mergeAliasData(dataArr, name)
+				data[k] = new_alias
+				continue
+			}
+			if k == "departs" { //合并科室~
+				new_departs := mergeDepartData(dataArr)
+				data[k] = new_departs
+				continue
+			}
+			if k == "area" || k == "city" || k == "district" {
+				data[k] = v
+				continue
+			}
+			//选取最优值
+			if new_v, ok := v.(string); ok {
+				if new_v == "" { //非空补充
+					for ks, vs := range dataArr {
+						if ks == 0 {
+							continue
+						}
+						if qu.ObjToString(vs[k]) != "" {
+							data[k] = vs[k]
+							isMerge = true
+							break
+						}
+					}
+				}
+			}
+			if new_v, ok := v.(int); ok {
+				for ks, vs := range dataArr {
+					if ks == 0 {
+						continue
+					}
+					if qu.IntAll(vs[k]) > new_v {
+						data[k] = vs[k]
+						isMerge = true
+					}
+				}
+			}
+			if !isMerge {
+				data[k] = v
+			}
+		}
+	} else {
+
+	}
+	return data
+}

+ 10 - 9
filedproject_medical/data_preparation/src/hospital/hospital_method.go

@@ -95,7 +95,8 @@ func relevanceBusType(med_bus_type string) string {
 	if strings.Contains(med_bus_type, "公立") {
 		new_bus_type = "1"
 	}
-	if strings.Contains(med_bus_type, "民营") {
+	if strings.Contains(med_bus_type, "民营") ||
+		strings.Contains(med_bus_type, "个体户") {
 		new_bus_type = "2"
 	}
 	return new_bus_type
@@ -127,9 +128,7 @@ func supplementRegionally(area *string, city *string, district *string, name str
 		*city = ""
 		*district = ""
 	}
-	data := class.Save_Mgo.FindOne("data_info", map[string]interface{}{
-		"name": name,
-	})
+	data := class.YXBK_DATA[name]
 	if data != nil && len(data) > 2 {
 		data_area := qu.ObjToString(data["area"])
 		data_city := qu.ObjToString(data["city"])
@@ -181,11 +180,6 @@ func supplementRegionally(area *string, city *string, district *string, name str
 		}
 	}
 }
-func regionallyCode(area string, city string, district string) string {
-	region_code := "00"
-
-	return region_code
-}
 
 //是否相同地域
 func repeatRegion(region map[string]string, data map[string]string) bool {
@@ -266,3 +260,10 @@ func confirmDepartCode(class1 string, class2 string) string {
 
 	return new_code
 }
+
+//区域代码
+func regionallyCode(area string, city string, district string) string {
+	region_code := "00"
+
+	return region_code
+}

+ 0 - 346
filedproject_medical/data_preparation/src/hospital/hospital_work.go

@@ -1,346 +0,0 @@
-package hospital
-
-import (
-	"class"
-	log "github.com/donnie4w/go-logger/logger"
-	"github.com/uuid"
-	qu "qfw/util"
-	"strings"
-	"sync"
-	"time"
-	"unicode/utf8"
-)
-
-var hospitallock sync.Mutex
-var data_hospitals = map[string][]map[string]interface{}{}
-
-//整合医院数据
-func confirmHospitalData(save_coll string) {
-	//处理~医院信息
-	log.Debug("整合医院信息~", len(data_hospitals))
-	for _, v := range data_hospitals {
-		data := mergeHospitalInfo(v)
-		class.Save_Mgo.Save(save_coll, data)
-	}
-	log.Debug("待判重数据存储完毕~")
-	time.Sleep(10 * time.Second)
-	repeatHospital(save_coll)
-}
-
-//判重~医院数据
-func repeatHospital(save_coll string) {
-	log.Debug("开始判重医院数据...")
-	sess := class.Save_Mgo.GetMgoConn()
-	defer class.Save_Mgo.DestoryMongoConn(sess)
-	data_hospitals_alias := make(map[string][]string, 0)
-	data_hospitals_ids := make(map[string]string, 0)
-	data_hospitals_region := make(map[string]map[string]string, 0)
-
-	q := map[string]interface{}{}
-	it := sess.DB(class.Save_Mgo.DbName).C(save_coll).Find(&q).Sort("_id").Select(map[string]interface{}{
-		"name":  1,
-		"alias": 1,
-	}).Iter()
-	total, isok := 0, 0
-	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
-		if total%1000 == 0 {
-			log.Debug("cur index ", total, "~", isok)
-		}
-		region := map[string]string{
-			"area":     qu.ObjToString(tmp["area"]),
-			"city":     qu.ObjToString(tmp["city"]),
-			"district": qu.ObjToString(tmp["district"]),
-		}
-		alias := qu.ObjToString(tmp["alias"])
-		name := qu.ObjToString(tmp["name"])
-		alias_arr := strings.Split(alias, ",")
-		alias_arr = append(alias_arr, name)
-		//减少重复对比...待
-		tmpid := class.BsonTOStringId(tmp["_id"])
-		temp_id := tmpid
-		//记录所有关联的id
-		if data_hospitals_alias[name] == nil {
-			//对比名称~
-			is_r := false
-			is_r_n := ""
-			for _, name_new := range alias_arr {
-				if name_new == "" {
-					continue
-				}
-				for k, v := range data_hospitals_alias {
-					//地域不一致~过滤
-					if !repeatRegion(data_hospitals_region[k], region) {
-						continue
-					}
-					for _, v1 := range v {
-						if v1 == "" {
-							continue
-						}
-						if strings.Contains(v1, name_new) || strings.Contains(name_new, v1) {
-							is_r = true
-							is_r_n = k
-							temp_id = data_hospitals_ids[is_r_n]
-							break
-						}
-					}
-					if is_r {
-						break
-					}
-				}
-				if is_r {
-					break
-				}
-			}
-			if is_r {
-				isok++
-				class.Save_Mgo.UpdateById(save_coll, tmpid, map[string]interface{}{
-					"$set": map[string]interface{}{
-						"repeat":    1,
-						"repeat_id": temp_id,
-					},
-				})
-			}
-			//名称指向的id需要变更
-			data_hospitals_ids[name] = temp_id
-			data_hospitals_region[name] = region
-			data_hospitals_alias[name] = alias_arr
-		}
-		tmp = make(map[string]interface{})
-	}
-	log.Debug("is over ", total, "~", isok)
-
-	//合并判重后的数据
-	mergeRepeatHospital(save_coll)
-}
-
-//最终合并判重后的医院数据
-func mergeRepeatHospital(save_coll string) {
-	log.Debug("开始合并重复后的数据~待")
-	sess := class.Save_Mgo.GetMgoConn()
-	defer class.Save_Mgo.DestoryMongoConn(sess)
-	q := map[string]interface{}{
-		"repeat": 0,
-	}
-	it := sess.DB(class.Save_Mgo.DbName).C(save_coll).Find(&q).Sort("_id").Iter()
-	total, isok := 0, 0
-	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
-		if total%1000 == 0 {
-			log.Debug("cur index ", total, "~", isok)
-		}
-		dataArr, _ := class.Save_Mgo.Find(save_coll, map[string]interface{}{
-			"repeat_id": class.BsonTOStringId(tmp["_id"]),
-		}, nil, nil)
-		isok = isok + len(dataArr) + 1
-		new_arr := []map[string]interface{}{tmp}
-		for _, v := range dataArr {
-			new_arr = append(new_arr, v)
-		}
-		new_data := mergeHospitalInfo(new_arr)
-		class.Save_Mgo.Save("zktest_hospital_info_new", new_data)
-		tmp = make(map[string]interface{})
-	}
-	log.Debug("is over ", total, "~", isok)
-}
-
-//处理医院基本信息~涉及多张表
-func dealWithHospitalBaseInfo(hos_coll string, depart_coll string, sourceweb string) {
-	log.Debug("处理医疗~医院基本信息")
-	sess := class.Save_Mgo.GetMgoConn()
-	defer class.Save_Mgo.DestoryMongoConn(sess)
-	q := map[string]interface{}{}
-	it := sess.DB(class.Save_Mgo.DbName).C(hos_coll).Find(&q).Sort("_id").Iter()
-	pool := make(chan bool, 6)
-	wg := &sync.WaitGroup{}
-	total, isok := 0, 0
-	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
-		if total%10000 == 0 {
-			log.Debug("cur index ", total)
-		}
-		pool <- true
-		wg.Add(1)
-		go func(tmp map[string]interface{}) {
-			defer func() {
-				<-pool
-				wg.Done()
-			}()
-			name := qu.ObjToString(tmp["hospitalname"])
-			if name != "" && utf8.RuneCountInString(name) > 3 {
-				//医院基本信息
-				data_info := treatHospitalInfo(tmp)
-				name = qu.ObjToString(data_info["name"])
-				//科室相关信息
-				hos_id := qu.ObjToString(tmp["_id"])
-				data_depart := treatHospitalDepartment(hos_id, depart_coll)
-				data_info["departs"] = data_depart
-				//别名信息
-				data_info["alias"] = qu.ObjToString(tmp["alias"])
-				//网站来源
-				data_info["sourceweb"] = sourceweb
-
-				hospitallock.Lock()
-				if data_hospitals[name] == nil {
-					arr := []map[string]interface{}{data_info}
-					data_hospitals[name] = arr
-					isok++
-				} else {
-					arr := data_hospitals[name]
-					arr = append(arr, data_info)
-					data_hospitals[name] = arr
-				}
-				hospitallock.Unlock()
-			}
-		}(tmp)
-		tmp = make(map[string]interface{})
-	}
-	wg.Wait()
-	log.Debug("is cur over ", len(data_hospitals), "~", total, "~", isok)
-}
-
-//数据构建~医院基本信息
-func treatHospitalInfo(tmp map[string]interface{}) map[string]interface{} {
-	data := map[string]interface{}{}
-	hospitalname := qu.ObjToString(tmp["hospitalname"])
-	hospitalname = strings.ReplaceAll(hospitalname, " ", "")
-	//医院名称~
-	data["name"] = hospitalname
-	//医院简介
-	data["introduce"] = qu.ObjToString(tmp["introduce"])
-	//医院地址
-	data["address"] = qu.ObjToString(tmp["address"])
-	//医院设备
-	data["equipment"] = qu.ObjToString(tmp["equipment"])
-	//床位数~
-	data["beds"] = qu.IntAll(tmp["beds"])
-	//门诊量/日
-	data["visit_perday"] = qu.IntAll(tmp["visit_perday"])
-	//医生人数
-	data["doctorsnum"] = qu.IntAll(tmp["doctorsnum"])
-
-	//联系电话~
-	data["tel"] = qu.ObjToString(tmp["tel"])
-	//传真号码
-	data["fax_number"] = qu.ObjToString(tmp["fax_number"])
-	//医院网站
-	data["website"] = qu.ObjToString(tmp["website"])
-
-	data["level"] = qu.ObjToString(tmp["level"])
-	data["type"] = qu.ObjToString(tmp["type"])
-	data["business_type"] = qu.ObjToString(tmp["business_type"])
-
-	//省市区
-	area := qu.ObjToString(tmp["area"])
-	city := qu.ObjToString(tmp["city"])
-	district := qu.ObjToString(tmp["district"])
-	if city == "" { //补充~省份城市信息
-		supplementRegionally(&area, &city, &district, hospitalname, qu.ObjToString(tmp["address"]))
-	}
-	data["area"] = area
-	data["area"] = city
-	data["area"] = district
-
-	company_id := ""
-	//关联企业信息~查询顺序~企业~特殊~自生
-	company_id = inquirBaseInfoid(hospitalname)
-	if company_id == "" {
-		company_id = uuid.New().String()
-		//新增一个信息来源
-		//需要完善
-
-	}
-	data["company_id"] = company_id
-
-	return data
-}
-
-//数据构建~医院科室信息
-func treatHospitalDepartment(hos_id string, depart_coll string) []map[string]interface{} {
-	new_arr := []map[string]interface{}{}
-	if depart_coll == "" {
-		return new_arr
-	}
-	dataArr, _ := class.Save_Mgo.Find(depart_coll, map[string]interface{}{"hospital_id": hos_id}, nil, nil)
-	//考虑去重~相同科室~
-	keys := map[string]string{}
-	for _, v := range dataArr {
-		main_departclass1 := qu.ObjToString(v["main_departclass1"])
-		main_departclass2 := qu.ObjToString(v["main_departclass2"])
-		depart_introduce := qu.ObjToString(v["depart_introduce"])
-
-		key := main_departclass1 + "~" + main_departclass2
-		if keys[key] == "" {
-			new_arr = append(new_arr, map[string]interface{}{
-				"departclass1":     main_departclass1,
-				"departclass2":     main_departclass2,
-				"depart_introduce": depart_introduce,
-			})
-			keys[key] = key
-		}
-	}
-	return new_arr
-}
-
-//合并医院信息~
-func mergeHospitalInfo(dataArr []map[string]interface{}) map[string]interface{} {
-	data := map[string]interface{}{}
-	if len(dataArr) == 1 {
-		data = dataArr[0]
-		delete(data, "repeat")
-		delete(data, "repeat_id")
-		return data
-	} else if len(dataArr) > 1 {
-		//合并重复信息
-		temp := dataArr[0]
-		name := qu.ObjToString(temp["name"])
-		delete(temp, "repeat")
-		delete(temp, "repeat_id")
-		for k, v := range temp {
-			isMerge := false
-			if k == "alias" { //合并别名~
-				new_alias := mergeAliasData(dataArr, name)
-				data[k] = new_alias
-				continue
-			}
-			if k == "departs" { //合并科室~
-				new_departs := mergeDepartData(dataArr)
-				data[k] = new_departs
-				continue
-			}
-			if k == "area" || k == "city" || k == "district" {
-				data[k] = v
-				continue
-			}
-			//选取最优值
-			if new_v, ok := v.(string); ok {
-				if new_v == "" { //非空补充
-					for ks, vs := range dataArr {
-						if ks == 0 {
-							continue
-						}
-						if qu.ObjToString(vs[k]) != "" {
-							data[k] = vs[k]
-							isMerge = true
-							break
-						}
-					}
-				}
-			}
-			if new_v, ok := v.(int); ok {
-				for ks, vs := range dataArr {
-					if ks == 0 {
-						continue
-					}
-					if qu.IntAll(vs[k]) > new_v {
-						data[k] = vs[k]
-						isMerge = true
-					}
-				}
-			}
-			if !isMerge {
-				data[k] = v
-			}
-		}
-	} else {
-
-	}
-	return data
-}

+ 82 - 0
filedproject_medical/data_preparation/src/hospital/mark

@@ -0,0 +1,82 @@
+// repeatHospital marks duplicate hospital records in save_coll.
+//
+// It scans the collection once. For a record whose name has not been recorded
+// yet, each non-empty alias (the comma-separated "alias" field plus the name
+// itself) is compared with the aliases of every previously recorded name that
+// repeatRegion accepts for the record's region. Two aliases match when either
+// contains the other. On a match the record is updated with repeat=1 and a
+// repeat_id equal to the id recorded for the matched name. The record's name,
+// region and aliases are then recorded, with the matched id when it is a
+// duplicate and its own id otherwise. Records whose name is already recorded
+// are skipped without any update. Finally mergeRepeatHospital is called.
+//
+// The recorded names are held in a map, whose iteration order is unspecified
+// in Go, so when several recorded names match, the one chosen may differ
+// between runs.
+//
+// NOTE(review): this function never writes repeat=0, and mergeRepeatHospital
+// in hospital_merge.go takes no parameters while it is called with one here;
+// confirm both before moving this code back into a compiled file.
+func repeatHospital(save_coll string) {
+	log.Debug("开始判重医院数据...")
+	sess := class.Save_Mgo.GetMgoConn()
+	defer class.Save_Mgo.DestoryMongoConn(sess)
+	data_hospitals_alias := make(map[string][]string, 0)
+	data_hospitals_ids := make(map[string]string, 0)
+	data_hospitals_region := make(map[string]map[string]string, 0)
+
+	q := map[string]interface{}{}
+	it := sess.DB(class.Save_Mgo.DbName).C(save_coll).Find(&q).Sort("_id").Iter()
+	total, isok := 0, 0
+	for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
+		if total%1000 == 0 {
+			log.Debug("cur index ", total, "~", isok)
+		}
+		region := map[string]string{
+			"area":     qu.ObjToString(tmp["area"]),
+			"city":     qu.ObjToString(tmp["city"]),
+			"district": qu.ObjToString(tmp["district"]),
+		}
+		alias := qu.ObjToString(tmp["alias"])
+		name := qu.ObjToString(tmp["name"])
+		alias_arr := strings.Split(alias, ",")
+		alias_arr = append(alias_arr, name)
+		// TODO: reduce the number of repeated comparisons.
+		tmpid := class.BsonTOStringId(tmp["_id"])
+		temp_id := tmpid
+		// Record the id every name is associated with.
+		if data_hospitals_alias[name] == nil {
+			// Compare the names.
+			is_r := false
+			is_r_n := ""
+			for _, name_new := range alias_arr {
+				if name_new == "" {
+					continue
+				}
+				for k, v := range data_hospitals_alias {
+					// Skip candidates from a different region.
+					if !repeatRegion(data_hospitals_region[k], region) {
+						continue
+					}
+					for _, v1 := range v {
+						if v1 == "" {
+							continue
+						}
+						if strings.Contains(v1, name_new) || strings.Contains(name_new, v1) {
+							is_r = true
+							is_r_n = k
+							temp_id = data_hospitals_ids[is_r_n]
+							break
+						}
+					}
+					if is_r {
+						break
+					}
+				}
+				if is_r {
+					break
+				}
+			}
+			if is_r {
+				isok++
+				class.Save_Mgo.UpdateById(save_coll, tmpid, map[string]interface{}{
+					"$set": map[string]interface{}{
+						"repeat":    1,
+						"repeat_id": temp_id,
+					},
+				})
+			}
+			// The id this name points to changes when the record is a duplicate.
+			data_hospitals_ids[name] = temp_id
+			data_hospitals_region[name] = region
+			data_hospitals_alias[name] = alias_arr
+		}
+		tmp = make(map[string]interface{})
+	}
+	log.Debug("is over ", total, "~", isok)
+
+	// Merge the records that were marked as duplicates.
+	mergeRepeatHospital(save_coll)
+}

+ 3 - 3
filedproject_medical/data_preparation/src/main.go

@@ -13,9 +13,9 @@ func init() {
 	class.InitClass()
 }
 func main() {
-	log.Debug("领域化产品准备......")
-
-	hospital.RunHospitalInfo()
+	log.Debug("医疗信息相关~~准备~~")
+	//hospital.RunBuildHospitalInfo()
+	hospital.RunCleanHospitalInfo()
 
 	time.Sleep(999 * time.Hour)
 }