Explorar el Código

抽取省份城市预加载

unknown hace 6 años
padre
commit
f2ee6940ba

+ 12 - 0
src/jy/admin/admin.go

@@ -49,6 +49,18 @@ func init() {
 		data = append(data, *list...)
 		c.JSON(200, gin.H{"data": data})
 	})
+	// List all provinces, preceded by one empty placeholder entry.
+	Admin.POST("/getprovinces", func(c *gin.Context) {
+		list, b := Mgo.Find("province", `{}`, `{"_id":-1}`, nil, false, -1, -1)
+		data := []map[string]interface{}{map[string]interface{}{"_id": "", "s_name": ""}}
+		// Find may hand back a nil list; dereferencing it would panic inside
+		// the handler, so the placeholder alone is returned in that case.
+		if list != nil {
+			if b {
+				for _, v := range *list {
+					v["s_field"] = ""
+				}
+			}
+			data = append(data, *list...)
+		}
+		c.JSON(200, gin.H{"data": data})
+	})
 	//获取字段列表
 	Admin.POST("/getversions", func(c *gin.Context) {
 		list, b := Mgo.Find("version", `{}`, `{"_id":-1}`, `{"version":1}`, false, -1, -1)

+ 8 - 1
src/jy/admin/resulttrack.go

@@ -99,6 +99,13 @@ func GetVersionInfo(c *gin.Context) {
 		"vid":   vid,
 		"isuse": true,
 	}
-	versioninfo, _ := Mgo.Find("versioninfo", query, nil, `{"s_field":1,"s_descrip":1}`, false, -1, -1)
+	versioninfo, _ := Mgo.Find("versioninfo", query, nil, `{"s_field":1}`, false, -1, -1)
+	if len(*versioninfo) > 0 {
+		for _, val := range *versioninfo {
+			s_field := val["s_field"].(string)
+			f, _ := Mgo.FindOne("fields", `{"s_field":"`+s_field+`"}`)
+			val["s_name"] = (*f)["s_name"]
+		}
+	}
 	c.JSON(200, gin.H{"versioninfo": versioninfo})
 }

+ 144 - 0
src/jy/extract/extract.go

@@ -25,6 +25,17 @@ var (
 	TaskList  map[string]*ExtractTask                //任务列表
 	saveLimit = 200                                  //抽取日志批量保存
 
+	CitySimConfig   map[string]map[string]interface{} //城市简称
+	CityAllConfig   map[string]map[string]interface{} //城市全称
+	ProviceConfig   map[string]interface{}            //省份
+	ProvinceMap     map[string]string
+	CityBrief       map[string]*City     //只加载一次即可
+	ProvinceBrief   map[string]*Province //只加载一次
+	AreaToCity      map[string][]*City   //两个文件共用
+	AreaGet         DFA                  //敏感词
+	AreaProvinceGet DFA                  //敏感词
+	AreaSimGet      DFA                  //敏感词
+
 	Fields = `{"title":1,"detail":1,"contenthtml":1,"href":1,"site":1,"spidercode":1,"toptype":1,"area":1,"city":1}`
 )
 
@@ -86,6 +97,7 @@ func StartExtractTaskId(taskId string) bool {
 	ext.InitRuleCore()
 	ext.InitTag()
 	ext.InitClearFn()
+	return true
 	ext.IsRun = true
 	if isgo {
 		go RunExtractTask(taskId)
@@ -224,6 +236,10 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 		}
 		//bs, _ := json.Marshal(j.Result)
 		//log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
+
+		//抽取省份城市县
+		fmt.Println("-----------", j.Province, j.City, j.Title)
+		//ExtractPC()
 		//分析抽取结果并保存 todo
 		AnalysisSaveResult(j.Data, j.Result, e.TaskInfo)
 
@@ -744,3 +760,131 @@ func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.Ext
 		db.Mgo.Update(task.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
 	}
 }
+
+// ExtractPC resolves the city and province of one record.
+//
+// city and province are the values already stored on the record; buyer,
+// projectname, title and addr are free-text fields that are searched for
+// place names, in the order city, buyer, addr, projectname, title. id is only
+// used in log output.
+//
+// It returns bres=true together with the city brief c and the province brief
+// p when a location was determined. When the incoming city/province pair is
+// already consistent nothing is extracted and the zero values are returned.
+// qu.Catch is deferred (presumably it recovers panics; confirm), so the
+// results stay named.
+func ExtractPC(buyer, projectname, title, city, province, addr string, id interface{}) (bres bool, c, p string) {
+	defer qu.Catch()
+	// The incoming pair is trusted when the city is a known brief name whose
+	// province matches the given one.
+	if city != "" {
+		if known := CityBrief[city]; known != nil {
+			if known.P != nil && province == known.P.Brief {
+				return false, "", ""
+			}
+			log.Println("province err:", city, province, id)
+		}
+	}
+	texts := []string{city, buyer, addr, projectname, title}
+	// bp reports whether a province is known, either given or found in the text.
+	bp := ProvinceBrief[province] != nil
+	if !bp {
+		for _, str := range texts {
+			if word := AreaProvinceGet.CheckSensitiveWord(str); word != "" {
+				province = ProvinceMap[word]
+				bp = true
+				break
+			}
+		}
+	}
+	// Full names are tried first, then the short-name matcher.
+	for pos, matcher := range []DFA{AreaGet, AreaSimGet} {
+		ws := make([]string, len(texts))
+		for n, str := range texts {
+			if str == "" {
+				continue
+			}
+			word := matcher.CheckSensitiveWord(str)
+			// With short names a hit followed by "路" is dropped and the text
+			// is searched again. Nothing is stripped when there was no hit.
+			if pos == 1 && word != "" {
+				if str1 := strings.Replace(str, word+"路", "", 1); str1 != str {
+					word = matcher.CheckSensitiveWord(str1)
+				}
+			}
+			ws[n] = word
+			if word == "" {
+				continue
+			}
+			res := AreaToCity[word]
+			if len(res) != 1 || res[0] == nil || res[0].P == nil {
+				continue
+			}
+			// A single candidate is accepted when no province is known or the
+			// provinces agree; a mismatch is left for the vote below.
+			if !bp || province == res[0].P.Brief {
+				bres = true
+				c = res[0].Brief
+				p = res[0].P.Brief
+				break
+			}
+		}
+		if bres {
+			break
+		}
+		// Count how often each candidate city was hit across all fields.
+		mc := map[string]int{}
+		for _, w := range ws {
+			for _, ct := range AreaToCity[w] {
+				if ct == nil {
+					continue
+				}
+				if bp && (ct.P == nil || ct.P.Brief != province) {
+					continue
+				}
+				mc[ct.Brief]++
+			}
+		}
+		// Prefer the city hit most often, provided it was hit more than once.
+		// The running maximum must be updated, otherwise any city with two
+		// hits could win; ties go to the smaller name so the result does not
+		// depend on map iteration order.
+		best, bestN := "", 1
+		for mk, mv := range mc {
+			if mv > bestN || (best != "" && mv == bestN && mk < best) {
+				best, bestN = mk, mv
+			}
+		}
+		if best == "" && len(mc) > 0 {
+			// No city was hit twice: prefer a provincial capital, otherwise
+			// take any candidate (smallest name for determinism).
+			capital, other := "", ""
+			for mk := range mc {
+				ct := CityBrief[mk]
+				if ct == nil || ct.P == nil {
+					continue
+				}
+				if ct.P.Cap == mk {
+					if capital == "" || mk < capital {
+						capital = mk
+					}
+				} else if other == "" || mk < other {
+					other = mk
+				}
+			}
+			best = capital
+			if best == "" {
+				best = other
+			}
+		}
+		if ct := CityBrief[best]; best != "" && ct != nil && ct.P != nil {
+			bres = true
+			c = ct.Brief
+			p = ct.P.Brief
+			break
+		}
+	}
+	// Fall back to the capital of the province when no city was found.
+	if !bres {
+		if pv := ProvinceBrief[province]; pv != nil {
+			bres = true
+			c = pv.Cap
+			p = province
+		}
+	}
+	return bres, c, p
+}

+ 250 - 2
src/jy/extract/extractInit.go

@@ -2,6 +2,7 @@
 package extract
 
 import (
+	"fmt"
 	db "jy/mongodbutil"
 	"log"
 	qu "qfw/util"
@@ -45,6 +46,20 @@ type Tag struct {
 	Key  string         //
 	Reg  *regexp.Regexp //
 }
+
+type City struct { // City is one city entry created by InitDfa from CityAllConfig.
+	Name  string    // key of the city inside the province's "city" map
+	Brief string    // the city's "brief" value; key into CityBrief
+	P     *Province // province the city was loaded under
+}
+
+type Province struct { // Province is one province entry created by InitDfa from CityAllConfig.
+	Name    string // key of the province in CityAllConfig
+	Brief   string // the province's "brief" value; key into ProvinceBrief
+	Cap     string // the province's "captial" value, compared against City.Brief
+	Captial *City  // the city whose Brief equals Cap; stays nil when no such city was loaded
+}
+
 type ExtractTask struct {
 	Id        string              //任务id
 	IsRun     bool                //是否启动
@@ -57,6 +72,11 @@ type ExtractTask struct {
 	ClearFn   map[string][]string //清理函数
 }
 
+// DFA is a word matcher; AddWordAll builds it and CheckSensitiveWord queries it.
+type DFA struct {
+	Link map[string]interface{} // root node; filled by AddWordAll with one-byte keys pointing at child nodes
+}
+
 func init() {
 	TaskList = make(map[string]*ExtractTask)
 	go SaveExtLog()
@@ -302,7 +322,7 @@ func (e *ExtractTask) InitTag() {
 	defer qu.Catch()
 	e.Tag = map[string][]*Tag{}
 	//字符串标签库
-	list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"字符串","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
+	list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"string","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
 	for _, v := range *list {
 		field := qu.ObjToString(v["s_field"])
 		if tmp, ok := v["content"].([]interface{}); ok {
@@ -313,7 +333,7 @@ func (e *ExtractTask) InitTag() {
 		}
 	}
 	//正则标签库
-	list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"正则","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
+	list, _ = db.Mgo.Find("tagdetailinfo", `{"s_type":"reg","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
 	for _, v := range *list {
 		field := qu.ObjToString(v["s_field"])
 		if tmp, ok := v["content"].([]interface{}); ok {
@@ -351,3 +371,231 @@ func (e *ExtractTask) InitClearFn() {
 	}
 	e.ClearFn = fn
 }
+
+// InitProvince loads the "province" tag entries of the task's version into
+// ProviceConfig, keyed by each entry's s_name.
+//
+// ProviceConfig is left untouched when the query yields no list. An entry
+// whose content is not an array is skipped instead of aborting the whole load.
+func (e *ExtractTask) InitProvince() {
+	defer qu.Catch()
+	list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"province","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
+	if list == nil {
+		log.Println("InitProvince: query returned no list, ProviceConfig left unchanged")
+		return
+	}
+	fn := make(map[string]interface{}, len(*list))
+	for _, v := range *list {
+		name := qu.ObjToString(v["s_name"])
+		content, ok := v["content"].([]interface{})
+		if !ok {
+			log.Println("InitProvince: skipping entry whose content is not an array:", name)
+			continue
+		}
+		fn[name] = qu.ObjArrToStringArr(content)
+	}
+	ProviceConfig = fn
+}
+
+// InitCitySim loads the "citysim" (city short name) tag entries of the task's
+// version into CitySimConfig, keyed by each entry's s_name.
+//
+// CitySimConfig is left untouched when the query yields no list. An entry
+// whose content is not an object is skipped instead of aborting the whole load.
+func (e *ExtractTask) InitCitySim() {
+	defer qu.Catch()
+	list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"citysim","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
+	if list == nil {
+		log.Println("InitCitySim: query returned no list, CitySimConfig left unchanged")
+		return
+	}
+	fn := make(map[string]map[string]interface{}, len(*list))
+	for _, v := range *list {
+		name := qu.ObjToString(v["s_name"])
+		content, ok := v["content"].(map[string]interface{})
+		if !ok {
+			log.Println("InitCitySim: skipping entry whose content is not an object:", name)
+			continue
+		}
+		fn[name] = content
+	}
+	CitySimConfig = fn
+}
+
+// InitCityAll loads the "cityall" (city full name) tag entries of the task's
+// version into CityAllConfig, keyed by each entry's s_name.
+//
+// A warning is printed when the number of entries differs from 34.
+// CityAllConfig is left untouched when the query yields no list. An entry
+// whose content is not an object is skipped instead of aborting the whole load.
+func (e *ExtractTask) InitCityAll() {
+	defer qu.Catch()
+	list, _ := db.Mgo.Find("tagdetailinfo", `{"s_type":"cityall","s_version":"`+e.TaskInfo.Version+`"}`, nil, nil, false, -1, -1)
+	if list == nil {
+		log.Println("InitCityAll: query returned no list, CityAllConfig left unchanged")
+		return
+	}
+	if len(*list) != 34 {
+		fmt.Println("加载城市配置文件出错", len(*list))
+	}
+	fn := make(map[string]map[string]interface{}, len(*list))
+	for _, v := range *list {
+		name := qu.ObjToString(v["s_name"])
+		content, ok := v["content"].(map[string]interface{})
+		if !ok {
+			log.Println("InitCityAll: skipping entry whose content is not an object:", name)
+			continue
+		}
+		fn[name] = content
+	}
+	CityAllConfig = fn
+}
+
+//初始化城市省份敏感词
+func InitDfa() {
+	AreaGet = DFA{}
+	AreaProvinceGet = DFA{}
+	for k, v := range ProviceConfig {
+		for _, p := range v.([]interface{}) {
+			p1, _ := p.(string)
+			AreaProvinceGet.AddWord(p1)
+			ProvinceMap[p1] = k
+		}
+	}
+	for k, v := range CityAllConfig {
+		AreaProvinceGet.AddWord(k)
+		p := &Province{}
+		p.Name = k
+		p.Brief = v["brief"].(string)
+		ProvinceMap[k] = p.Brief
+		ProvinceBrief[p.Brief] = p
+		p.Cap = v["captial"].(string)
+		city, _ := v["city"].(map[string]interface{})
+		for k1, v1 := range city {
+			v1m, _ := v1.(map[string]interface{})
+			c := &City{}
+			c.Name = k1
+			if v1m["brief"] == nil {
+				log.Println(k, k1)
+			}
+			c.Brief = v1m["brief"].(string)
+			//cityAll[k1] = c
+			CityBrief[c.Brief] = c
+			c.P = p
+			if c.Brief == p.Cap {
+				p.Captial = c
+			}
+			//加入到城市map中
+			cs := AreaToCity[k1]
+			AreaGet.AddWord(k1)
+			if cs != nil {
+				cs = append(cs, c)
+			} else {
+				cs = []*City{c}
+			}
+			AreaToCity[k1] = cs
+
+			/*
+				AreaToCity["衢州市"] = []interface{}{
+					&City{
+						c.Name = 衢州市,
+						c.Brief = 衢州,
+						c.P = xxx
+					},
+				}
+			*/
+
+			arr := v1m["area"].([]interface{})
+			for _, k2 := range arr {
+				s := k2.(string)
+				cs := AreaToCity[s]
+				AreaGet.AddWord(s)
+				if cs != nil {
+					cs = append(cs, c)
+				} else {
+					cs = []*City{c}
+				}
+				AreaToCity[s] = cs
+			}
+		}
+	}
+	//加载简称
+	AreaSimGet = DFA{}
+	//util.ReadConfig("./city_sim.json", &CitySimConfig)
+	if len(CitySimConfig) != 34 {
+		log.Println("加载简称配置文件出错", len(CitySimConfig))
+	}
+	for k, v := range CitySimConfig {
+		pb := v["brief"].(string)
+		p := ProvinceBrief[pb]
+		//加载
+		for _, ss := range []string{k, pb} {
+			cs := AreaToCity[ss]
+			if cs != nil {
+				cs = append(cs, p.Captial)
+			} else {
+				cs = []*City{p.Captial}
+			}
+			AreaToCity[ss] = cs
+			AreaSimGet.AddWord(ss)
+		}
+		city, _ := v["city"].(map[string]interface{})
+		for k1, v1 := range city {
+			v1m, _ := v1.(map[string]interface{})
+			if v1m["brief"] == nil {
+				log.Println(k, k1)
+			}
+			cb := v1m["brief"].(string)
+			c := AreaToCity[k1][0]
+			//加入到城市map中
+			for _, ss := range []string{cb, k + cb, pb + cb} {
+				AreaSimGet.AddWord(ss)
+				cs := AreaToCity[ss]
+				if cs != nil {
+					cs = append(cs, c)
+				} else {
+					cs = []*City{c}
+				}
+				AreaToCity[ss] = cs
+			}
+			arr := v1m["area"].([]interface{})
+			for _, k2 := range arr {
+				s := k2.(string)
+				for _, ss := range []string{s, cb + s, pb + s, k + s} {
+					cs := AreaToCity[ss]
+					AreaSimGet.AddWord(ss)
+					if cs != nil {
+						cs = append(cs, c)
+					} else {
+						cs = []*City{c}
+					}
+					AreaToCity[ss] = cs
+				}
+			}
+		}
+	}
+}
+
+func (d *DFA) AddWord(keys ...string) {
+	d.AddWordAll(true, keys...)
+}
+
+func (d *DFA) AddWordAll(haskey bool, keys ...string) {
+	if d.Link == nil {
+		d.Link = make(map[string]interface{})
+	}
+	for _, key := range keys {
+		nowMap := &d.Link
+		for i := 0; i < len(key); i++ {
+			kc := key[i : i+1]
+			if v, ok := (*nowMap)[kc]; ok {
+				nowMap, _ = v.(*map[string]interface{})
+			} else {
+				newMap := map[string]interface{}{}
+				newMap["YN"] = "0"
+				(*nowMap)[kc] = &newMap
+				nowMap = &newMap
+			}
+			if i == len(key)-1 {
+				(*nowMap)["YN"] = "1"
+				if haskey {
+					(*nowMap)["K"] = key
+				}
+			}
+		}
+	}
+}
+
+//匹配最长
+func (d *DFA) CheckSensitiveWord(src string) string {
+	pos := 0
+	nowMap := &d.Link
+	res := ""
+	for i := 0; i < len(src); i++ {
+		word := src[i : i+1]
+		nowMap, _ = (*nowMap)[word].(*map[string]interface{})
+		if nowMap != nil { // 存在,则判断是否为最后一个
+			if pos == 0 {
+				pos = i
+			}
+			if "1" == qu.ObjToString((*nowMap)["YN"]) { // 如果为最后一个匹配规则,结束循环,返回匹配标识数
+				res = qu.ObjToString((*nowMap)["K"])
+				//pos = 0
+				//break
+			}
+		} else {
+			if res != "" {
+				break
+			} else {
+				nowMap = &d.Link
+				if pos > 0 {
+					i = pos
+					pos = 0
+				}
+			}
+		}
+	}
+	return res
+}

+ 1 - 1
src/main_test.go

@@ -11,7 +11,7 @@ import (
 )
 
 func Test_task(t *testing.T) {
-	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_v3")
+	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
 	extract.StartExtractTaskId("5b8f804025e29a290415aee1")
 	//extract.StartExtractTestTask("5b8f804025e29a290415aee1", "5b8dcc45a5cb26b9b7f68469", "10", "result_v3", "track_v3")
 	time.Sleep(300 * time.Second)

+ 8 - 6
src/web/templates/admin/onetag.html

@@ -157,7 +157,7 @@ $(function () {
 		"language": {
             "url": "../res/dist/js/dataTables.chinese.lang"
         },
-		{{if eq .tp "地区"}}
+		{{if eq .tp "citysim" "cityall"}}
 		"columnDefs": [
 				{ "targets": 4 ,"bVisible": false} //隐藏列
 			],
@@ -262,9 +262,12 @@ function createOneTag(){
 	$("#jsonadd").html('');
 	var arr = [];
 	var url = "/admin/getfields";
-	if(tp == "地区"){
+	if(tp == "citysim" || tp == "cityall"){
 		$("#labelname").html("省/直辖市");
 		url = "/admin/getcitys";
+	}else if(tp == "province"){
+		$("#labelname").html("省份");
+		url = "/admin/getprovinces";
 	}
 	$.ajax({
 		url:url,
@@ -294,15 +297,14 @@ function saveaddone(){
 	namearr = name.split("+");
 	content = editor.getText();
 	var contentrep = content.replace(/\s+/g,"");
-	if(contentrep == "" || contentrep == "[]" || contentrep == "{}" || name == "" || (tp != "地区" && content[0] == "{")){
+	if(contentrep == "" || contentrep == "[]" || contentrep == "{}" || name == "" || (tp != "citysim" && tp != "cityall" && content[0] == "{")){
 		alert("表单填写不完整或格式不正确!");
-			return false;
+		return false;
 	}
 	if(namearr.length != 2){
 		alert("名称或省/直辖市数据错误!");
-			return false;
+		return false;
 	}
-	console.log("field---",namearr[1])
 	$.ajax({
 		url:"/admin/onetag/create",
 		type:"post",

+ 8 - 3
src/web/templates/admin/result_list.html

@@ -160,6 +160,7 @@ $(function () {
       	}
 	});
 	ttable.on('init.dt', function () {
+		console.log("每次页面刷新")
 		/*查询所有结果表和对应的日志表*/
 		gettasktestlog();
 		/*抽取测试*/
@@ -201,7 +202,8 @@ $(function () {
 							}else{
 								num = "5";
 							}
-							if(bcon){								
+							if(bcon){
+								//抽取测试								
 								$.post("/admin/task/test",{"startid":id,"num":num,"taskid":taskid,"resultcoll":resultcoll,"trackcoll":trackcoll},function(data){
 									if(data&&data.rep){
 										sessionStorage.setItem("resultinfo",resultcoll+"+"+data.version);
@@ -232,6 +234,7 @@ $(function () {
 				break;
 			}
 		});
+		console.log("测试完成后标签展示")
 		var trackcoll = sessionStorage.getItem("trackcoll");
 		var resultinfo = sessionStorage.getItem("resultinfo");
 		if(trackcoll != null && resultinfo != null && trackcoll != "undefined"){
@@ -270,7 +273,7 @@ function trackShow(_id){
 					val = ""
 				}
 				var tmp ="<div class='form-group'>"
-						   +"<label for='code' class='col-sm-2 control-label'>"+datainfo[i]["s_descrip"]+":</label>"
+						   +"<label for='code' class='col-sm-2 control-label'>"+datainfo[i]["s_name"]+":</label>"
 						    +"<div class='col-sm-10 styleclass'>"
 							   	+"<input type='text'id='"+key+"' class='form-control' value='"+val+"'>"
 								+"<button type='button' class='btn btn-primary track'>追踪</button>"
@@ -315,8 +318,9 @@ function tablereload(){
 
 //获取所有对应的结果表和日志表
 function gettasktestlog(){
+	console.log("铺标签")
 	$("#dataTable_filter div").remove();
-	clearcoll();
+	clearcoll();//清理标签
 	var opt2="<option value='-1'>全部</option>";
 	var select2="<div class='form-group'><label for='name'>日志表:</label>"+
 		"<select disabled id='tracklog' class='form-control input-sm'>"+
@@ -370,6 +374,7 @@ function cleardata(){
 					sessionStorage.setItem("trackcoll","");
 					tablereload();
 				}else{
+					sessionStorage.setItem("trackcoll","");
 					showTip("删除失败", 1000, function() {});
 				}
 			}

+ 13 - 7
src/web/templates/admin/taglist.html

@@ -59,8 +59,9 @@
 				      	<input type="radio" name="name" id="add-str" value="正则类">正则类</input>&nbsp;&nbsp;&nbsp;&nbsp;
 					  	<input type="radio" name="name" id="add-reg" value="字符串类">字符串类</input>&nbsp;&nbsp;&nbsp;&nbsp;
 					   	<input type="radio" name="name" id="add-simarea" value="城市简称">城市简称</input>&nbsp;&nbsp;&nbsp;&nbsp;
-						<input type="radio" name="name" id="add-allarea" value="城市全称">城市全称</input>
-				      </div>
+						<input type="radio" name="name" id="add-allarea" value="城市全称">城市全称</input>&nbsp;&nbsp;&nbsp;&nbsp;
+				      	<input type="radio" name="name" id="add-province" value="省份">省份
+					</div>
 					<!--<input id="add-name" type="text" class="form-control" placeholder="请输名称">-->
 				</div>
 				<div class="form-group">
@@ -145,12 +146,17 @@ function addsave(){
 	}
 	var tp = "";
 	if(name == "字符串类"){
-		tp = "字符串";
+		tp = "string";
 	}else if(name == "正则类"){
-		tp = "正则";
-	}else{
-		tp = "地区";
-	}
+		tp = "reg";
+	}else if(name == "城市简称"){
+		tp = "citysim";
+	}else if(name == "城市全称"){
+		tp = "cityall";
+	}else if(name == "省份"){
+		tp = "province";
+	}	
+	
 	$.ajax({
 		url:"/admin/tag/save",
 		type:"post",