unknown 6 éve
szülő
commit
481743404a

+ 8 - 0
src/extractcity.json

@@ -0,0 +1,8 @@
+[
+	"city",
+	"buyer",
+	"buyeraddr",
+	"projectname",
+	"title"
+]
+	

+ 4 - 4
src/jy/admin/clear.go

@@ -24,7 +24,7 @@ func init() {
 //加载数据
 func ClearData(c *gin.Context) {
 	version, _ := c.GetPostForm("version")
-	data, _ := Mgo.Find("cleanup", `{"s_version":"`+version+`","delete":true}`, `{"_id":-1}`, nil, false, -1, -1)
+	data, _ := Mgo.Find("cleanup", `{"s_version":"`+version+`","delete":false}`, `{"_id":-1}`, nil, false, -1, -1)
 	for _, d := range *data {
 		timeStr := time.Unix(d["l_intime"].(int64), 0).Format(Date_Short_Layout)
 		d["l_intime"] = timeStr
@@ -56,7 +56,7 @@ func ClearSave(c *gin.Context) {
 		username, _ := c.GetPostForm("username")
 		fieldname, _ := c.GetPostForm("fieldname")
 		fieldArr := strings.Split(fieldname, "+")
-		data, _ := Mgo.FindOne("cleanup", `{"s_version":"`+version+`","s_field":"`+fieldArr[1]+`","delete":true}`)
+		data, _ := Mgo.FindOne("cleanup", `{"s_version":"`+version+`","s_field":"`+fieldArr[1]+`","delete":false}`)
 		if len(*data) > 0 { //创建判重
 			c.JSON(200, gin.H{"rep": false})
 			return
@@ -69,7 +69,7 @@ func ClearSave(c *gin.Context) {
 			"s_name":    fieldArr[0],
 			"clear":     clear,
 			"clearInfo": clearInfo,
-			"delete":    true,
+			"delete":    false,
 		}
 		b := Mgo.Save("cleanup", save)
 		if b != "" {
@@ -96,7 +96,7 @@ func ClearSave(c *gin.Context) {
 func ClearDel(c *gin.Context) {
 	_id, _ := c.GetPostForm("_id")
 	b := Mgo.Update("cleanup", `{"_id":"`+_id+`"}`, map[string]interface{}{
-		"$set": map[string]interface{}{"delete": false},
+		"$set": map[string]interface{}{"delete": true},
 	}, false, false)
 	if b {
 		c.JSON(200, gin.H{"rep": true})

+ 36 - 210
src/jy/extract/extract.go

@@ -25,21 +25,9 @@ var (
 	TaskList  map[string]*ExtractTask                //任务列表
 	saveLimit = 200                                  //抽取日志批量保存
 
-	AreaGet         DFA //敏感词
-	AreaProvinceGet DFA //敏感词
-	AreaSimGet      DFA //敏感词
-
 	Fields = `{"title":1,"detail":1,"contenthtml":1,"href":1,"site":1,"spidercode":1,"toptype":1,"area":1,"city":1}`
 )
 
-var CitySimConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市简称
-var CityAllConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市全称
-var ProviceConfig map[string]interface{} = make(map[string]interface{})                       //省份
-var ProvinceMap map[string]string = make(map[string]string)
-var CityBrief map[string]*City = make(map[string]*City)             //只加载一次即可
-var ProvinceBrief map[string]*Province = make(map[string]*Province) //只加载一次
-var AreaToCity map[string][]*City = make(map[string][]*City)        //两个文件共用
-
 //启动测试抽取
 func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
 	defer qu.Catch()
@@ -53,6 +41,12 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
 	ext.InitRuleCore()
 	ext.InitTag()
 	ext.InitClearFn()
+
+	ext.InitProvince()
+	ext.InitCityAll()
+	ext.InitCitySim()
+	InitDFA()
+
 	return RunExtractTestTask(ext, startId, num)
 }
 
@@ -186,7 +180,7 @@ func PreInfo(doc map[string]interface{}) *ju.Job {
 		City:       qu.ObjToString(doc["city"]),
 		Province:   qu.ObjToString(doc["area"]),
 		Result:     map[string][]*ju.ExtField{},
-		//BuyerAddr:  qu.ObjToString(doc["buyeraddr"]),
+		BuyerAddr:  qu.ObjToString(doc["buyeraddr"]),
 	}
 	pretreated.AnalyStart(j)
 	return j
@@ -242,12 +236,6 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 		}
 		//bs, _ := json.Marshal(j.Result)
 		//log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
-
-		//抽取省份城市县
-
-		//fmt.Println("-----------", j.Province, j.City, j.BuyerAddr, j.Title) //j.Address
-		//ExtractPC(j.Result, j.Province, j.City, j.Title, j.BuyerAddr, j.SourceMid) //j.Address
-		ExtractPC2(j.Result, "Province", "City", "Title", "Addr", j.SourceMid)
 		//分析抽取结果并保存 todo
 		AnalysisSaveResult(j.Data, j.Result, e.TaskInfo)
 
@@ -736,7 +724,8 @@ func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.Ext
 		values[key] = ju.ExtSort(objects)
 	}
 	//从排序结果中取值
-	tmp := map[string]interface{}{}
+	tmp := map[string]interface{}{} //抽取值
+	resulttmp := tmp                //保存结果
 	for key, val := range values {
 		for _, v := range val { //取第一个
 			if v.Key != "" {
@@ -745,202 +734,39 @@ func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.Ext
 			}
 		}
 	}
+	resulttmp["result"] = result
+	for k, v := range *doc {
+		if resulttmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
+			resulttmp[k] = v
+		}
+	}
+	b, p, c, d := TransmitData(resulttmp, _id) //抽取省份城市
+	//log.Println("抽取省份,城市,县结果=====", b, p, c, d)
+	resulttmp["district"] = d
+	if b {
+		resulttmp["city"] = c
+		resulttmp["area"] = p
+	}
 	if task.TestColl == "" {
 		if len(tmp) > 0 { //保存抽取结果
 			task.DB.Update(task.SaveColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
 		}
 		//保存抽取详情
-		tmp["result"] = result
-		for k, v := range *doc {
-			if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
-				tmp[k] = v
-			}
-		}
-		db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
+		//		tmp["result"] = result
+		//		for k, v := range *doc {
+		//			if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
+		//				tmp[k] = v
+		//			}
+		//		}
+		db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": resulttmp}, true, false)
 	} else { //测试结果
 		//保存抽取详情
-		tmp["result"] = result
-		for k, v := range *doc {
-			if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
-				tmp[k] = v
-			}
-		}
-		db.Mgo.Update(task.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
-	}
-}
-
-//抽取城市、省份
-func ExtractPC2(result map[string][]*ju.ExtField, province, city, title, addr, sourcemid string) (bres bool, c, p string) {
-	var pjnarr, buyerarr []string
-	var pb []interface{}
-	for n, val := range result["projectname"] {
-		pjnarr[n] = fmt.Sprint(val.Value)
-	}
-	for n, val := range result["buyer"] {
-		buyerarr[n] = fmt.Sprint(val.Value)
-	}
-	pl := len(pjnarr)
-	bl := len(buyerarr)
-	max := 0
-	if pl > bl {
-		max = pl
-	} else {
-		max = bl
-	}
-	//city, buyer, addr, projectname, title
-	if max == 0 { //没有projectname和buyer结果集
-		tmp1 := []string{city, "", addr, "", title}
-		pb = append(pb, tmp1)
-	} else { //至少有一个结果集
-		if max == pl {
-			for i := 0; i < max; i++ {
-				p := pjnarr[i]
-				b := ""
-				if i < bl {
-					b = buyerarr[i]
-				}
-				tmp2 := []string{city, b, addr, p, title}
-				pb = append(pb, tmp2)
-			}
-		} else {
-			for i := 0; i < max; i++ {
-				b := buyerarr[i]
-				p := ""
-				if i < pl {
-					p = pjnarr[i]
-				}
-				tmp3 := []string{city, b, addr, p, title}
-				pb = append(pb, tmp3)
-			}
-		}
-
-	}
-	log.Println(pb)
-	return
-}
-func ExtractPC(buyer, projectname, title, city, province, addr string, id interface{}) (bres bool, c, p string) {
-	defer qu.Catch()
-	bc := true //是否继续抽取
-	if city != "" {
-		if CityBrief[city] == nil { //简称不存在
-			//log.Println("city err:", city, id)
-		} else { //简称存在
-			if province != CityBrief[city].P.Brief { //省份不对
-				log.Println("province err:", city, province, id)
-			} else {
-				bc = false
-				//原值正确,不用抽取
-			}
-		}
-	}
-	//有省份
-	bp := false
-	if ProvinceBrief[province] != nil {
-		bp = true
-	} else { //没有省份,先识别省份
-		for _, str := range []string{city, buyer, addr, projectname, title} {
-			word := AreaProvinceGet.CheckSensitiveWord(str) //省全称
-			if word != "" {
-				province = ProvinceMap[word] //省简称
-				bp = true
-				break
-			}
-		}
-	}
-	//匹配城市
-	if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不对,继续抽取
-		//目前是全匹配模式,如果再加上精简匹配,加一层循环
-		for pos, GET := range []DFA{AreaGet, AreaSimGet} {
-			ws := make([]string, 5)
-			for n, str := range []string{city, buyer, addr, projectname, title} {
-				if str != "" {
-					word := GET.CheckSensitiveWord(str)
-					if pos == 1 { //用简称 后辍为路、集团替换
-						str1 := strings.Replace(str, word+"路", "", 1)
-						if str1 != str {
-							word = GET.CheckSensitiveWord(str1)
-						}
-					}
-					ws[n] = word
-					if word != "" {
-						res := AreaToCity[word]
-						if len(res) == 1 {
-							//判断省份
-							if !bp || province == res[0].P.Brief { //省份不存在或一致直接返回
-								bres = true
-								c = res[0].Brief
-								p = res[0].P.Brief
-								break
-							} else { //不一致时。。暂时不处理
-							}
-						} else { //多个时
-						}
-					}
-				}
-			}
-			if !bres {
-				mc := map[string]int{}
-				for _, w := range ws {
-					res := AreaToCity[w]
-					for _, ct := range res {
-						if ct == nil {
-							continue
-						}
-						if bp { //有省份
-							if ct.P != nil && ct.P.Brief == province {
-								mc[ct.Brief]++
-							}
-						} else { //没有省份
-							mc[ct.Brief]++
-						}
-					}
-				}
-				//计算mc中最大值且大于1
-				max := 1
-				v := ""
-				for mk, mv := range mc {
-					if mv > max {
-						v = mk
-					}
-				}
-				if v != "" {
-					bres = true
-					c = CityBrief[v].Brief
-					p = CityBrief[v].P.Brief
-				} else if len(mc) > 0 {
-					//取级别更大的
-					v := ""
-					for mk, _ := range mc {
-						if CityBrief[mk].P.Cap == mk {
-							bres = true
-							c = CityBrief[mk].Brief
-							p = CityBrief[mk].P.Brief
-							break
-						} else {
-							v = mk
-						}
-					}
-					if !bres {
-						bres = true
-						c = CityBrief[v].Brief
-						p = CityBrief[v].P.Brief
-					}
-				}
-			}
-			if bres {
-				break
-			}
-		}
-	} else {
-		return
-	}
-	if !bres {
-		//取默认省会
-		if ProvinceBrief[province] != nil {
-			bres = true
-			c = ProvinceBrief[province].Cap
-			p = province
-		}
+		//		tmp["result"] = result
+		//		for k, v := range *doc {
+		//			if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
+		//				tmp[k] = v
+		//			}
+		//		}
+		db.Mgo.Update(task.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": resulttmp}, true, false)
 	}
-	return
 }

+ 35 - 116
src/jy/extract/extractInit.go

@@ -46,19 +46,6 @@ type Tag struct {
 	Reg  *regexp.Regexp //
 }
 
-type City struct {
-	Name  string
-	Brief string
-	P     *Province
-}
-
-type Province struct {
-	Name    string
-	Brief   string
-	Cap     string
-	Captial *City
-}
-
 type ExtractTask struct {
 	Id        string              //任务id
 	IsRun     bool                //是否启动
@@ -71,11 +58,6 @@ type ExtractTask struct {
 	ClearFn   map[string][]string //清理函数
 }
 
-//敏感词
-type DFA struct {
-	Link map[string]interface{}
-}
-
 func init() {
 	TaskList = make(map[string]*ExtractTask)
 	go SaveExtLog()
@@ -422,28 +404,26 @@ func (e *ExtractTask) InitCityAll() {
 func InitDFA() {
 	AreaGet = DFA{}
 	AreaProvinceGet = DFA{}
+	AreaStreet = DFA{}
 	for k, v := range ProviceConfig {
-		log.Println(k, "----------", v)
 		for _, p := range v.([]interface{}) {
-			log.Println("ppppp", p)
 			p1, _ := p.(string)
 			AreaProvinceGet.AddWord(p1)
 			ProvinceMap[p1] = k
 		}
 	}
-	log.Println("ProvinceMap11----", ProvinceMap)
+	//	ProvinceMap["新疆省"] = "新疆"
+	//	ProvinceMap["新疆兵团"] = "新疆"
+	//	provinceMap["广西省"] = "广西"
 	for k, v := range CityAllConfig {
 		AreaProvinceGet.AddWord(k) //省全称
 		p := &Province{}
 		p.Name = k
 		p.Brief = v["brief"].(string)
 		ProvinceMap[k] = p.Brief
-		log.Println("ProvinceMap22----", ProvinceMap)
 		ProvinceBrief[p.Brief] = p
 		p.Cap = v["captial"].(string)
-		log.Println("ProvinceBrief11====", p.Brief, ProvinceBrief[p.Brief].Name, ProvinceBrief[p.Brief].Brief, "==", ProvinceBrief[p.Brief].Cap)
 		city, _ := v["city"].(map[string]interface{})
-		log.Println("======================================================")
 		for k1, v1 := range city {
 			v1m, _ := v1.(map[string]interface{})
 			c := &City{}
@@ -452,13 +432,11 @@ func InitDFA() {
 				log.Println(k, k1)
 			}
 			c.Brief = v1m["brief"].(string)
-			//cityAll[k1] = c
 			CityBrief[c.Brief] = c
 			c.P = p
 			if c.Brief == p.Cap {
 				p.Captial = c
 			}
-			log.Println("CityBrief11+++", k1, "---", CityBrief[c.Brief].Name, CityBrief[c.Brief].Brief, "===", CityBrief[c.Brief].P.Captial, "===", CityBrief[c.Brief].P.Name)
 			//加入到城市map中
 			cs := AreaToCity[k1]
 			AreaGet.AddWord(k1) //市全称
@@ -468,44 +446,41 @@ func InitDFA() {
 				cs = []*City{c}
 			}
 			AreaToCity[k1] = cs
-			log.Println("市---", k1, AreaToCity[k1][0].Brief, AreaToCity[k1][0].Name, AreaToCity[k1][0].P.Name)
 
-			/*
-				AreaToCity["衢州市"] = []interface{}{
-					&City{
-						c.Name = 衢州市,
-						c.Brief = 衢州,
-						c.P = xxx
-					},
+			//区县
+			districtmap := v1m["area"].(map[string]interface{}) //区或县
+			for district, streetarr := range districtmap {
+				d := &District{}
+				d.Name = district
+				d.C = c
+				AreaDistrict.AddWord(district) //加入区或县敏感词
+				ctmp := DistrictCityMap[district]
+				if ctmp == nil {
+					DistrictCityMap[district] = c
 				}
-			*/
 
-			arr := v1m["area"].([]interface{})
-			for _, k2 := range arr {
-				s := k2.(string)
-				cs := AreaToCity[s]
-				AreaGet.AddWord(s) //街道全称
-				if cs != nil {
-					cs = append(cs, c)
-				} else {
-					cs = []*City{c}
+				//街道
+				for _, s := range qu.ObjArrToStringArr(streetarr.([]interface{})) {
+					AreaStreet.AddWord(s) //加入街道敏感词
+					dtmp := StreetDistrictMap[s]
+					if dtmp == nil {
+						StreetDistrictMap[s] = d
+					}
 				}
-				AreaToCity[s] = cs
-				log.Println("街道===", k2, AreaToCity)
 			}
 		}
 	}
-	log.Println("======================================================")
+	//	log.Println("ProvinceMap---", ProvinceMap)
+	//	log.Println("ProvinceBrief---", ProvinceBrief)
+	//	log.Println("CityBrief---", CityBrief)
+	//	log.Println("AreaToCity---", AreaToCity)
+	//	log.Println("DistrictCityMap---", DistrictCityMap)
+	//	log.Println("StreetDistrictMap---", StreetDistrictMap)
 	//加载简称
 	AreaSimGet = DFA{}
-	//util.ReadConfig("./city_sim.json", &CitySimConfig)
-	//	if len(CitySimConfig) != 34 {
-	//		log.Println("加载简称配置文件出错", len(CitySimConfig))
-	//	}
 	for k, v := range CitySimConfig {
 		pb := v["brief"].(string)
 		p := ProvinceBrief[pb]
-		log.Println("++++++++++++++++++", p)
 		//加载
 		for _, ss := range []string{k, pb} {
 			cs := AreaToCity[ss]
@@ -515,7 +490,6 @@ func InitDFA() {
 				cs = []*City{p.Captial}
 			}
 			AreaToCity[ss] = cs
-			log.Println("+++", ss, AreaToCity)
 			AreaSimGet.AddWord(ss) //省全称和省简称
 		}
 		city, _ := v["city"].(map[string]interface{})
@@ -536,7 +510,6 @@ func InitDFA() {
 					cs = []*City{c}
 				}
 				AreaToCity[ss] = cs
-				log.Println("+-+-", ss, AreaToCity)
 			}
 			arr := v1m["area"].([]interface{})
 			for _, k2 := range arr {
@@ -550,71 +523,17 @@ func InitDFA() {
 						cs = []*City{c}
 					}
 					AreaToCity[ss] = cs
-					log.Println("-+-+", ss, AreaToCity)
-				}
-			}
-		}
-	}
-	log.Println(AreaToCity)
-}
-
-func (d *DFA) AddWord(keys ...string) {
-	d.AddWordAll(true, keys...)
-}
-
-func (d *DFA) AddWordAll(haskey bool, keys ...string) {
-	if d.Link == nil {
-		d.Link = make(map[string]interface{})
-	}
-	for _, key := range keys {
-		nowMap := &d.Link
-		for i := 0; i < len(key); i++ {
-			kc := key[i : i+1]
-			if v, ok := (*nowMap)[kc]; ok {
-				nowMap, _ = v.(*map[string]interface{})
-			} else {
-				newMap := map[string]interface{}{}
-				newMap["YN"] = "0"
-				(*nowMap)[kc] = &newMap
-				nowMap = &newMap
-			}
-			if i == len(key)-1 {
-				(*nowMap)["YN"] = "1"
-				if haskey {
-					(*nowMap)["K"] = key
-				}
-			}
-		}
-	}
-}
 
-func (d *DFA) CheckSensitiveWord(src string) string {
-	pos := 0
-	nowMap := &d.Link
-	res := ""
-	for i := 0; i < len(src); i++ {
-		word := src[i : i+1]
-		nowMap, _ = (*nowMap)[word].(*map[string]interface{})
-		if nowMap != nil { // 存在,则判断是否为最后一个
-			if pos == 0 {
-				pos = i
-			}
-			if "1" == qu.ObjToString((*nowMap)["YN"]) { // 如果为最后一个匹配规则,结束循环,返回匹配标识数
-				res = qu.ObjToString((*nowMap)["K"])
-				//pos = 0
-				//break
-			}
-		} else {
-			if res != "" {
-				break
-			} else {
-				nowMap = &d.Link
-				if pos > 0 {
-					i = pos
-					pos = 0
+					d := &District{}
+					d.Name = ss
+					d.C = c
+					AreaDistrict.AddWord(ss) //加入区或县敏感词
+					ctmp := DistrictCityMap[ss]
+					if ctmp == nil {
+						DistrictCityMap[ss] = c
+					}
 				}
 			}
 		}
 	}
-	return res
 }

+ 316 - 0
src/jy/extract/extractcity.go

@@ -0,0 +1,316 @@
+package extract
+
+import (
+	"fmt"
+	//ju "jy/util"
+	"log"
+	qu "qfw/util"
+	"strings"
+)
+
+// Province describes a province-level region.
+type Province struct {
+	Name    string // full name (InitDFA sets it from the CityAllConfig key)
+	Brief   string // short name; key used in ProvinceBrief
+	Cap     string // short name of the capital city (compared against City.Brief)
+	Captial *City  // capital city object, set when a city's Brief equals Cap
+}
+
+// City describes a city and links it to its province.
+type City struct {
+	Name  string
+	Brief string    // short name; key used in CityBrief
+	P     *Province // province the city belongs to
+}
+
+// District describes a district or county and links it to its city.
+type District struct {
+	Name string
+	C    *City // city the district belongs to
+}
+
+// Street describes a street and links it to its district.
+type Street struct {
+	Name string
+	D    *District // district the street belongs to
+}
+
+// DFA is a word matcher ("sensitive words" in the original comment) backed by
+// a trie of nested maps keyed by single bytes of the registered words.
+type DFA struct {
+	Link map[string]interface{} // trie root; created lazily by AddWordAll
+}
+
+// SortField lists the record field names whose text is searched for place
+// names; it is loaded from ./extractcity.json in init and read in order by
+// TransmitData.
+var SortField []string
+var (
+	AreaGet         DFA // full city names
+	AreaDistrict    DFA // districts or counties
+	AreaProvinceGet DFA // provinces
+	AreaSimGet      DFA // city short names
+	AreaStreet      DFA // streets
+)
+var CitySimConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) // city short names
+var CityAllConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) // city full names
+var ProviceConfig map[string]interface{} = make(map[string]interface{})                       // provinces
+// ProvinceMap maps a province name (or configured alias) to the province short name.
+var ProvinceMap map[string]string = make(map[string]string)
+var CityBrief map[string]*City = make(map[string]*City)             // only needs to be loaded once
+var ProvinceBrief map[string]*Province = make(map[string]*Province) // only loaded once
+var AreaToCity map[string][]*City = make(map[string][]*City)        // shared by both files
+// DistrictCityMap maps a district/county name to its city; InitDFA keeps the
+// first city registered for a name.
+var DistrictCityMap map[string]*City = make(map[string]*City)
+// StreetDistrictMap maps a street name to its district; InitDFA keeps the
+// first district registered for a name.
+var StreetDistrictMap map[string]*District = make(map[string]*District)
+
+// init fills SortField from ./extractcity.json via qu.ReadConfig.
+// NOTE(review): the path is relative, so it presumably resolves against the
+// process working directory — confirm the file is deployed next to the binary.
+func init() {
+	qu.ReadConfig("./extractcity.json", &SortField)
+}
+// TransmitData extracts province, city and district for one record.
+//
+// resulttmp is the merged record: its "area" and "city" entries are taken as
+// the currently known province and city, and the entries named by SortField
+// (in that order) are the texts searched for place names. id is only passed
+// on to the extractors for logging.
+//
+// It returns bres (whether a province/city was determined), p (province),
+// c (city) and d (district or county, "" when none matched).
+func TransmitData(resulttmp map[string]interface{}, id string) (bres bool, p, c, d string) {
+	// fmt.Sprint(nil) yields the text "<nil>", which would then be treated as a
+	// real (unknown) city/province name; map absent values to "" instead.
+	text := func(key string) string {
+		if v := resulttmp[key]; v != nil {
+			return fmt.Sprint(v)
+		}
+		return ""
+	}
+	province := text("area")
+	city := text("city")
+	field := make([]string, 0, len(SortField))
+	for _, f := range SortField {
+		field = append(field, text(f))
+	}
+	bres, c, p = ExtractProvinceCity(province, city, id, field) // province and city
+	bres, p, c, d = ExtractDistrict(field, bres, c, p, id)      // district or county
+	return
+}
+
+// ExtractDistrict looks for a district/county in the configured field texts.
+//
+// Each text is checked against the district matcher first and the street
+// matcher second. A district hit is resolved through DistrictCityMap, a street
+// hit through StreetDistrictMap. Every resolved hit overwrites the district
+// result, so the last hit wins. When no province/city is known yet (bres is
+// false) the hit also supplies c and p; otherwise a disagreement with the
+// known c/p is only logged.
+//
+// It returns bres, province, city and district, in that order.
+func ExtractDistrict(field []string, bres bool, c, p, id string) (bool, string, string, string) {
+	d := ""
+	matchers := []DFA{AreaDistrict, AreaStreet} // district/county first, then street
+	for _, text := range field {
+		for kind := range matchers {
+			word := matchers[kind].CheckSensitiveWord(text)
+			if word == "" {
+				continue
+			}
+			var owner *City
+			name, found := "", false
+			if kind == 0 { // district or county hit
+				if ct := DistrictCityMap[word]; ct != nil {
+					owner, name, found = ct, word, true
+				}
+			} else if dist := StreetDistrictMap[word]; dist != nil { // street hit
+				owner, name, found = dist.C, dist.Name, true
+			}
+			if !found {
+				continue
+			}
+			d = name
+			ctmp, ptmp := owner.Brief, owner.P.Brief
+			switch {
+			case !bres: // nothing extracted so far: locate city and province through the district
+				c, p, bres = ctmp, ptmp, true
+			case c != ctmp || p != ptmp: // compare with what was extracted before
+				log.Println("City And Province, Inconsistent Before And After,Id:", id)
+			}
+		}
+	}
+	return bres, p, c, d
+}
+
+// ExtractProvinceCity determines the city and province for one record.
+//
+// province and city are the values already on the record (short names), id is
+// used for logging only, and field holds the texts to search.
+//
+// When the supplied city is known and its province matches, nothing is
+// extracted and the function returns bres=false with empty c and p. Otherwise
+// it searches the texts with the full-name matcher and then the short-name
+// matcher, and finally falls back to the capital of the province when one is
+// known. bres reports whether c (city) and p (province) were set.
+func ExtractProvinceCity(province, city, id string, field []string) (bres bool, c, p string) {
+	defer qu.Catch()
+	bc := true // whether the city still has to be extracted
+	if city != "" {
+		if CityBrief[city] == nil { // unknown city short name
+			log.Println("city err:", city, id)
+		} else if province != CityBrief[city].P.Brief { // province does not belong to the city
+			log.Println("province err:", city, province, id)
+		} else {
+			bc = false // city and province are both correct
+		}
+	}
+	// Establish the province: either the supplied short name is valid, or it is
+	// looked up in the texts.
+	bp := false
+	if ProvinceBrief[province] != nil {
+		bp = true
+	} else {
+		for _, str := range field {
+			word := AreaProvinceGet.CheckSensitiveWord(str)
+			if word != "" {
+				province = ProvinceMap[word]
+				bp = true
+				break
+			}
+		}
+	}
+	if !bc {
+		return
+	}
+	// Match the city: full names first, then short names.
+	for pos, GET := range []DFA{AreaGet, AreaSimGet} {
+		// One slot per text; sized from field so a longer field list cannot
+		// index out of range.
+		ws := make([]string, len(field))
+		for n, str := range field {
+			if str == "" {
+				continue
+			}
+			word := GET.CheckSensitiveWord(str)
+			if pos == 1 { // short names: drop a "<word>路" road name and match again
+				str1 := strings.Replace(str, word+"路", "", 1)
+				if str1 != str {
+					word = GET.CheckSensitiveWord(str1)
+				}
+			}
+			ws[n] = word
+			if word == "" {
+				continue
+			}
+			res := AreaToCity[word]
+			// A unique city is accepted when no province is known or the
+			// provinces agree; ambiguous or conflicting hits are left to the
+			// counting step below.
+			if len(res) == 1 && (!bp || province == res[0].P.Brief) {
+				bres = true
+				c = res[0].Brief
+				p = res[0].P.Brief
+				break
+			}
+		}
+		if !bres { // no direct hit: count candidate cities over all texts
+			mc := map[string]int{}
+			for _, w := range ws {
+				for _, ct := range AreaToCity[w] {
+					if ct == nil {
+						continue
+					}
+					if !bp || (ct.P != nil && ct.P.Brief == province) {
+						mc[ct.Brief]++
+					}
+				}
+			}
+			// Pick the city with the highest count, provided it is above 1.
+			best := 1
+			v := ""
+			for mk, mv := range mc {
+				if mv > best {
+					best = mv
+					v = mk
+				}
+			}
+			if v != "" {
+				bres = true
+				c = CityBrief[v].Brief
+				p = CityBrief[v].P.Brief
+			} else if len(mc) > 0 {
+				// No city stands out: prefer a provincial capital, otherwise
+				// take the last candidate visited.
+				v := ""
+				for mk := range mc {
+					if CityBrief[mk].P.Cap == mk {
+						bres = true
+						c = CityBrief[mk].Brief
+						p = CityBrief[mk].P.Brief
+						break
+					}
+					v = mk
+				}
+				if !bres {
+					bres = true
+					c = CityBrief[v].Brief
+					p = CityBrief[v].P.Brief
+				}
+			}
+		}
+		if bres {
+			break
+		}
+	}
+	if !bres {
+		// Fall back to the capital of the province.
+		if ProvinceBrief[province] != nil {
+			bres = true
+			c = ProvinceBrief[province].Cap
+			p = province
+		}
+	}
+	return
+}
+
+// AddWord registers keys in the matcher; it calls AddWordAll with haskey=true
+// so the full word is stored at the end of each word's path.
+func (d *DFA) AddWord(keys ...string) {
+	d.AddWordAll(true, keys...)
+}
+
+// AddWordAll inserts every key into the trie, one byte per level.
+//
+// Each node is a map holding its child nodes under one-byte keys plus the
+// marker "YN" ("1" when a word ends at the node, "0" otherwise). When haskey
+// is true the node ending a word also stores that word under "K".
+//
+// NOTE(review): "K" is itself a one-byte key, so a word whose path reaches
+// the byte 'K' on a node that already stores a word looks like it would hit
+// the stored string instead of a child node — confirm no registered word
+// contains an ASCII 'K'.
+func (d *DFA) AddWordAll(haskey bool, keys ...string) {
+	if d.Link == nil {
+		d.Link = make(map[string]interface{})
+	}
+	for _, word := range keys {
+		node := &d.Link
+		last := len(word) - 1
+		for i := 0; i <= last; i++ {
+			b := word[i : i+1]
+			child, exists := (*node)[b]
+			if exists {
+				node, _ = child.(*map[string]interface{})
+			} else {
+				fresh := map[string]interface{}{"YN": "0"}
+				(*node)[b] = &fresh
+				node = &fresh
+			}
+			if i != last {
+				continue
+			}
+			// End of the word: mark the node as terminal.
+			(*node)["YN"] = "1"
+			if haskey {
+				(*node)["K"] = word
+			}
+		}
+	}
+}
+
+// CheckSensitiveWord scans src byte by byte and returns the first registered
+// word found in it, or "" when there is none.
+//
+// Once a word has matched, the scan keeps following the same trie path so a
+// longer word sharing the prefix replaces the shorter one; it stops at the
+// first byte that leaves the path. The returned value is what was stored
+// under "K", so words added with haskey=false yield "".
+func (d *DFA) CheckSensitiveWord(src string) string {
+	// start is the index where the current partial match began, -1 when there
+	// is none. A separate sentinel is needed because a partial match may begin
+	// at index 0 and must be rewound like any other.
+	start := -1
+	node := &d.Link
+	res := ""
+	for i := 0; i < len(src); i++ {
+		next, _ := (*node)[src[i:i+1]].(*map[string]interface{})
+		if next != nil { // still on a trie path
+			node = next
+			if start < 0 {
+				start = i
+			}
+			if "1" == qu.ObjToString((*node)["YN"]) { // a word ends here; remember it
+				res = qu.ObjToString((*node)["K"])
+			}
+			continue
+		}
+		if res != "" {
+			break
+		}
+		// Dead end without a match: restart from the root at the byte after
+		// the one where the partial match began (the loop's i++ moves past it).
+		node = &d.Link
+		if start >= 0 {
+			i = start
+			start = -1
+		}
+	}
+	return res
+}

+ 1 - 1
src/jy/util/article.go

@@ -14,7 +14,7 @@ type Job struct {
 	Data       *map[string]interface{} //数据库源数据
 	Block      []*Block                //分块
 	Result     map[string][]*ExtField  //结果
-	//BuyerAddr  string                  //采购单位地址
+	BuyerAddr  string                  //采购单位地址
 }
 
 type ExtField struct {

+ 7 - 2
src/main_test.go

@@ -12,8 +12,8 @@ import (
 
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
-	extract.StartExtractTaskId("5b8f804025e29a290415aee1")
-	//extract.StartExtractTestTask("5b8f804025e29a290415aee1", "5b8dcc45a5cb26b9b7f68469", "10", "result_v3", "track_v3")
+	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")
+	extract.StartExtractTestTask("5b8f804025e29a290415aee1", "5b8dd21ca5cb26b9b7fa4afa", "1", "result_v3", "track_v3")
 	time.Sleep(300 * time.Second)
 }
 func Test_reg(t *testing.T) {
@@ -50,3 +50,8 @@ func Test_city(t *testing.T) {
 	extract.InitDFA()
 	time.Sleep(300 * time.Second)
 }
+
+// Test_arr logs the value read for a missing key of an empty map.
+func Test_arr(t *testing.T) {
+	lookup := map[string]interface{}{}
+	log.Println(lookup["a"])
+}