Browse Source

抽取优化

zhangjinkun 6 years ago
parent
commit
1da90de1ba

+ 11 - 67
src/jy/extract/extract.go

@@ -190,12 +190,6 @@ func PreInfo(doc map[string]interface{}) *ju.Job {
 
 //抽取
 func (e *ExtractTask) ExtractProcess(j *ju.Job) {
-	if 1 == 2 {
-		for k, v := range j.Block {
-			bs, _ := json.Marshal(v.TableKV)
-			log.Println("Block TableKV", k, string(bs))
-		}
-	}
 	qu.Catch()
 	qu.Try(func() {
 		doc := *j.Data
@@ -242,8 +236,8 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 				v.Value = data[0]
 			}
 		}
-		//		bs, _ := json.Marshal(j.Result)
-		//		log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
+		bs, _ := json.Marshal(j.Result)
+		log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
 		//分析抽取结果并保存 todo
 		AnalysisSaveResult(j.Data, j.Result, e.TaskInfo)
 
@@ -286,13 +280,15 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 		lua.Block = j.Block
 		extinfo := lua.RunScript("core")
 		for k, v := range extinfo {
-			if j.Result[k] == nil {
-				j.Result[k] = [](*ju.ExtField){}
-			}
-			if tmps, ok := v.([]map[string]interface{}); ok {
-				for _, tmp := range tmps {
-					j.Result[k] = append(j.Result[k],
-						&ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"]})
+			if k == in.Field {
+				if j.Result[k] == nil {
+					j.Result[k] = [](*ju.ExtField){}
+				}
+				if tmps, ok := v.([]map[string]interface{}); ok {
+					for _, tmp := range tmps {
+						j.Result[k] = append(j.Result[k],
+							&ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"]})
+					}
 				}
 			}
 		}
@@ -459,44 +455,6 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 						}
 					}
 				}
-				//				kv := bl.TableKV.Kv
-				//				for _, tag := range tags {
-				//					for k, val := range kv {
-				//						if tag.Type == "string" {
-				//							if k == tag.Key {
-				//								text := ju.TrimLRSpace(val, "")
-				//								if text != "" {
-				//									kvmap[field] = append(kvmap[field], map[string]interface{}{
-				//										"field":     field,
-				//										"code":      in.Code,
-				//										"ruletext":  tag.Key,
-				//										"extfrom":   extfrom,
-				//										"value":     text,
-				//										"type":      "table",
-				//										"matchtype": "tag_string",
-				//									})
-				//								}
-				//								break
-				//							}
-				//						} else if tag.Type == "regexp" {
-				//							if tag.Reg.MatchString(k) {
-				//								text := ju.TrimLRSpace(val, "")
-				//								if text != "" {
-				//									kvmap[field] = append(kvmap[field], map[string]interface{}{
-				//										"field":     field,
-				//										"code":      in.Code,
-				//										"ruletext":  tag.Key,
-				//										"extfrom":   extfrom,
-				//										"value":     text,
-				//										"type":      "table",
-				//										"matchtype": "tag_regexp",
-				//									})
-				//								}
-				//								break
-				//							}
-				//						}
-				//					}
-				//				}
 			}
 		}
 	}
@@ -776,22 +734,8 @@ func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.Ext
 		if len(tmp) > 0 { //保存抽取结果
 			task.DB.Update(task.SaveColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
 		}
-		//保存抽取详情
-		//		tmp["result"] = result
-		//		for k, v := range *doc {
-		//			if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
-		//				tmp[k] = v
-		//			}
-		//		}
 		db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": resulttmp}, true, false)
 	} else { //测试结果
-		//保存抽取详情
-		//		tmp["result"] = result
-		//		for k, v := range *doc {
-		//			if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
-		//				tmp[k] = v
-		//			}
-		//		}
 		db.Mgo.Update(task.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": resulttmp}, true, false)
 	}
 }

+ 3 - 3
src/jy/extract/extractInit.go

@@ -193,6 +193,7 @@ func (e *ExtractTask) InitRuleCore() {
 			plist, _ := db.Mgo.Find("rule_logicpre", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
 			for _, v := range *plist {
 				rinfo := &RegLuaInfo{
+					Field: qu.ObjToString(v["s_field"]),
 					Code:  v["s_code"].(string),
 					Name:  v["s_name"].(string),
 					IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
@@ -202,7 +203,6 @@ func (e *ExtractTask) InitRuleCore() {
 				} else {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
-						rinfo.Field = v["s_field"].(string)
 						tmp := strings.Split(rinfo.RuleText, "__")
 						if len(tmp) == 2 {
 							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
@@ -222,6 +222,7 @@ func (e *ExtractTask) InitRuleCore() {
 			blist, _ := db.Mgo.Find("rule_logicback", `{"sid":"`+qu.BsonIdToSId(vv["_id"])+`","delete":false}`, nil, nil, false, -1, -1)
 			for _, v := range *blist {
 				rinfo := &RegLuaInfo{
+					Field: qu.ObjToString(v["s_field"]),
 					Code:  v["s_code"].(string),
 					Name:  v["s_name"].(string),
 					IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
@@ -231,7 +232,6 @@ func (e *ExtractTask) InitRuleCore() {
 				} else {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
-						rinfo.Field = v["s_field"].(string)
 						tmp := strings.Split(rinfo.RuleText, "__")
 						if len(tmp) == 2 {
 							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
@@ -254,6 +254,7 @@ func (e *ExtractTask) InitRuleCore() {
 					continue
 				}
 				rinfo := &RegLuaInfo{
+					Field: qu.ObjToString(v["s_field"]),
 					Code:  v["s_code"].(string),
 					Name:  v["s_name"].(string),
 					IsLua: qu.If(v["s_type"].(string) == "1", true, false).(bool),
@@ -265,7 +266,6 @@ func (e *ExtractTask) InitRuleCore() {
 				} else {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
-						rinfo.Field = v["s_field"].(string)
 						tmp := strings.Split(rinfo.RuleText, "__")
 						if len(tmp) == 2 {
 							epos := strings.Split(tmp[1], ",")

+ 15 - 0
src/jy/pretreated/analystep.go

@@ -31,6 +31,18 @@ func AnalyStart(job *util.Job) {
 	blockArrays, _ := DivideBlock(con, 1)
 	if len(blockArrays) > 0 { //有分块
 		for _, bl := range blockArrays {
+			if len([]rune(bl.Text)) > 80 {
+				ba1, _ := DivideBlock(bl.Text, 1)
+				if len(ba1) > 0 {
+					t := ""
+					for _, t1 := range ba1 {
+						t += t1.Text
+					}
+					bl.Text = t
+					bl.ColonKV = GetKVAll(t, bl.Title, 1)
+					bl.SpaceKV = spacekvEntity.entrance(t, bl.Title)
+				}
+			}
 			//块中再查找表格(块,处理完把值赋到块)
 			t1, _ := ComputeConRatio(bl.Text, 2)
 			if len(t1) > 0 {
@@ -50,6 +62,9 @@ func AnalyStart(job *util.Job) {
 			//table中kv覆盖全文正则的kv
 			tabres := AnalyTableV2(tabs, job.Category, "", con, 1, job.SourceMid)
 			processTableResult(tabres, bl)
+			//			for k, v := range bl.TableKV.Kv {
+			//				log.Println("bl.TableKV.Kv", k, v)
+			//			}
 		}
 		//调用kv解析
 		bl.ColonKV = GetKVAll(newCon, "", 1)

+ 12 - 16
src/jy/pretreated/analytable.go

@@ -128,10 +128,10 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (k1 []string,
 	//u.Debug(2, k)
 	//取标准key
 	res := u.GetTags(k)
-
 	if len(res) == 0 && tk != k {
 		res = u.GetTags(tk)
 	}
+	//log.Println(k, res)
 	//	if len(res) == 0 {
 	//		go u.AddtoNoMatchMap(tk)
 	//	}
@@ -254,7 +254,6 @@ func (table *Table) KVFilter() {
 		v := table.SortKV.Map[k]
 		//u.Debug(k, v)
 		if _, ok := v.(string); ok {
-
 			k = regSpliteSegment.ReplaceAllString(regReplAllSpace.ReplaceAllString(k, ""), "")
 			k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v)
 			//u.Debug(k, v, k1, w1, v1, tag, b)
@@ -284,7 +283,6 @@ func (table *Table) KVFilter() {
 		} else {
 			//u.Debug(k, v, "---------")
 			as.AddKey(k, v)
-
 		}
 	}
 	//处理值是数组的kv放入标准化kv中
@@ -542,6 +540,7 @@ func (ts *TableResult) Analy() {
 		ts := tn.Analy()
 		for _, tab := range ts {
 			tabs = append(tabs, tab)
+			//log.Println("tab.SortKV.Map", tab.SortKV.Map)
 		}
 		//tn.SonTables = append(tn.SonTables, tn)
 	}
@@ -615,6 +614,9 @@ func (ts *TableResult) Analy() {
 	}
 	for _, table := range tabs {
 		table.MergerToTableresult()
+		//		for k, v := range table.TableResult.SortKV.Map {
+		//			log.Println(k, v)
+		//		}
 	}
 }
 
@@ -643,6 +645,7 @@ func (table *Table) Analy() []*Table {
 			//进入每一个单元格
 			td := NewTD(selm, TR, table)
 			//num++
+			//log.Println(td.SortKV.Keys, td.SortKV.Map)
 			TR.AddTD(td)
 		})
 		table.AddTR(TR)
@@ -776,11 +779,7 @@ func (table *Table) Analy() []*Table {
 			}
 			//u.Debug(str)
 		}
-
 	}
-	//	if table.BPackage {
-	//		u.Debug(table, fmt.Sprintf("%v", table.BlockPackage.Map["1"]))
-	//	}
 	return ts
 }
 
@@ -1291,19 +1290,16 @@ func (table *Table) FindKV() {
 				}
 				**/
 				if !td.BH && td.KVDirect < 3 {
-					if !table.FindTdVal(td, direct, vdirect) {
-						if !table.FindTdVal(td, vdirect, direct) {
-							//都识别不到时,对第一、二中标候选人的处理
-							bo, res := GetBidOrder(td, bodirect, sort)
-							if res {
-								sort++
-								bodirect = bo
-							}
+					if !table.FindTdVal(td, vdirect, direct) {
+						//都识别不到时,对第一、二中标候选人的处理
+						bo, res := GetBidOrder(td, bodirect, sort)
+						if res {
+							sort++
+							bodirect = bo
 						}
 					}
 					//u.Debug(td.Val, td.BH, td.HeadTd, td.KVDirect)
 				}
-
 			}
 		}
 	} else if len(table.TRs) > 0 { //没有表头的表格处理,默认纵向吧

+ 2 - 1
src/jy/pretreated/tablev2.go

@@ -5,6 +5,7 @@ package pretreated
 import (
 	"fmt"
 	u "jy/util"
+	"log"
 	qutil "qfw/util"
 	"regexp"
 	"strings"
@@ -335,7 +336,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 func (t *Table) Print() {
 	for row, trs := range t.TRs {
 		for col, td := range trs.TDs {
-			u.Debug(row, col, td.Val, td.BH, td.SortKV.Map)
+			log.Println(row, col, td.Val, td.BH, td.SortKV.Map)
 		}
 	}
 }

+ 5 - 1
src/jy/util/clearHtml.go

@@ -10,6 +10,7 @@ type Cut struct {
 	tag           *regexp.Regexp
 	scripttag     *regexp.Regexp
 	inputag       *regexp.Regexp
+	hiddentag     *regexp.Regexp
 	styletag      *regexp.Regexp
 	colstag       *regexp.Regexp
 	rowstag       *regexp.Regexp
@@ -28,13 +29,15 @@ func NewCut() *Cut {
 	//sc, _ := regexp.Compile("\\<script[^\\>]*\\>*[^\\>]+\\</script\\>")
 	//ss, _ := regexp.Compile("\\<style[^\\>]*\\>*[^\\>]+\\</style\\>")
 	scs := regexp.MustCompile("<(script|style)[^>]*>[^>]+</(script|style)>")
-	input := regexp.MustCompile(`<\s*input.*value=("|')(.*)("|')/?>(</>)?`)
+	hiddentag := regexp.MustCompile(`<\s*input[^<]*type=("|')hidden("|')[^<]*>`)
+	input := regexp.MustCompile(`<\s*input[^<]*value=("|')([^>"']*)[^<]*>`)
 	cols, _ := regexp.Compile(`colspan="\d+"`)
 	rows, _ := regexp.Compile(`rowspan="\d+"`)
 	dis, _ := regexp.Compile(`display:none`)
 	return &Cut{
 		tag:           t,
 		scripttag:     scs,
+		hiddentag:     hiddentag,
 		inputag:       input,
 		colstag:       cols,
 		rowstag:       rows,
@@ -55,6 +58,7 @@ func (c *Cut) ClearHtml(src string) string {
 	//清script,style
 	src = c.scripttag.ReplaceAllString(src, "")
 	//清理input
+	src = c.hiddentag.ReplaceAllString(src, "")
 	src = c.inputag.ReplaceAllString(src, "$2")
 	//换结束标签
 	src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string {

+ 8 - 10
src/main_test.go

@@ -11,19 +11,17 @@ import (
 )
 
 func Test_task(t *testing.T) {
-	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
+	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_v3")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")
-	extract.StartExtractTestTask("5b8f804025e29a290415aee1", "5b8dd21ca5cb26b9b7fa4afa", "1", "result_v3", "track_v3")
-	time.Sleep(300 * time.Second)
+	extract.StartExtractTestTask("5b8f804025e29a290415aee1", "5beb9ba2a5cb26b9b74e0a64", "1", "zjk_v3", "zjk_v3")
+	time.Sleep(3 * time.Second)
 }
 func Test_reg(t *testing.T) {
-	context := ` 项目名称:   新碶街道2018年美女姜河、向家村河、塘湾河①河道清淤工程招标公告`
-	str := `[\s\\u3000\\u2003\\u00a0a-z0-9.、一二三]*项目名称[::][\n\s\\u3000\\u2003\\u00a0]*([^,,。;;::\n]{4,40})`
-	reg := regexp.MustCompile(str)
-	tmp := reg.FindAllStringSubmatch(context, -1)
-	for k, v := range tmp {
-		log.Println(k, v[1])
-	}
+	context := `sss<input  name="AgentCode" size="30" maxsize="50" value="91370800688271668P" class="textbox">
+    dfdf<input type="hidden" name="AgentCode" size="30" maxsize="50" value="tttt" class="textbox"></input>`
+	reg := regexp.MustCompile(`<\s*input.*value=['"](.[^'"]+).+>`)
+	tmp := reg.ReplaceAllString(context, "$1")
+	log.Println(tmp)
 }
 
 func Test_paths(t *testing.T) {

+ 30 - 22
src/web/templates/admin/result_list.html

@@ -30,6 +30,7 @@
 		            <table id="dataTable" class="table table-bordered table-hover">
 		              <thead>
 		              <tr>
+                        <th>公告类型</th> 
 		                <th>项目名称</th> 
 						<th>预算金额</th> 
 						<th>中标金额</th>
@@ -89,6 +90,13 @@ $(function () {
             "url": "/res/dist/js/dataTables.chinese.lang"
         },
 		"columns": [
+            { "data": "toptype",render:function(val,a,row){
+                if(val){                    
+    				return val
+                }else{
+                    return ""
+                }
+			}},
             { "data": "projectname","width":"16%",render:function(val,a,row){
 				var tmpval = val;
 				if(tmpval == undefined){
@@ -97,7 +105,7 @@ $(function () {
 				if(tmpval.length>10){
 					tmpval = tmpval.substr(0,10)+"...";
 				}
-				return"<a style='color:#333' target='_blank' title='"+val+"'>"+tmpval+"</a>"
+				return "<a style='color:#333' target='_blank' title='"+val+"'>"+tmpval+"</a>"
 			}},
 			{ "data": "budget",render:function(val,a,row){
 				if(val == undefined){
@@ -116,34 +124,34 @@ $(function () {
 				
 			}},
 			{ "data": "buyer",render:function(val,a,row){
-				if(val == undefined){
-					return "";
-				}else{
-					if(val.length > 10){
-						val = val.substr(0,10)+"...";
-					}
-					return val;
+				var tmpval = val;
+				if(tmpval == undefined){
+					tmpval = "";
+				} 
+				if(tmpval.length>10){
+					tmpval = tmpval.substr(0,10)+"...";
 				}
-				
+				return "<a style='color:#333' target='_blank' title='"+val+"'>"+tmpval+"</a>"
 			}},
 			{ "data": "winner",render:function(val,a,row){
-				if(val == undefined){
-					return "";
-				}else{
-					if(val.length > 10){
-						val = val.substr(0,10)+"...";
-					}
-					return val;
+				var tmpval = val;
+				if(tmpval == undefined){
+					tmpval = "";
+				} 
+				if(tmpval.length>10){
+					tmpval = tmpval.substr(0,10)+"...";
 				}
-				
+				return "<a style='color:#333' target='_blank' title='"+val+"'>"+tmpval+"</a>"	
 			}},
 			{ "data": "agency",render:function(val,a,row){
-				if(val == undefined){
-					return "";
-				}else{
-					return val;
+				var tmpval = val;
+				if(tmpval == undefined){
+					tmpval = "";
+				} 
+				if(tmpval.length>10){
+					tmpval = tmpval.substr(0,10)+"...";
 				}
-				
+				return "<a style='color:#333' target='_blank' title='"+val+"'>"+tmpval+"</a>"
 			}},
 			{ "data": "_id",render:function(val,a,row){
 				rowdata[val]= row;

+ 1 - 1
src/web/templates/admin/rule_logicore.html

@@ -154,7 +154,7 @@ $(function () {
 			case "new":
 				comtag=[{label:"名称",s_label:"s_name",placeholder:"",must:true},{label:"描述",s_label:"s_descript",type:"tpl_text"},{label:"启用",s_label:"isuse",type:"tpl_list_local",list:[{"s_name":"是","_id":true},{"s_name":"否","_id":false}],default:true}]
 				regtag=[{label:"字段",s_label:"s_field",type:"tpl_list_local",url:"/admin/getfields",default:{{.field}}},{label:"正则",s_label:"s_rule",type:"tpl_text",must:true}]
-				luatag=[{label:"脚本",s_label:"s_luascript",type:"tpl_text",must:true}]
+				luatag=[{label:"字段",s_label:"s_field",type:"tpl_list_local",url:"/admin/getfields",default:{{.field}}},{label:"脚本",s_label:"s_luascript",type:"tpl_text",must:true}]
 				testcon=[{label:"测试内容",s_label:"s_testcon",type:"tpl_text",must:true}]
 				hiddentag=[{s_label:"_id",type:"tpl_hidden"},{s_label:"vid",type:"tpl_hidden"},{s_label:"pid",type:"tpl_hidden"},{s_label:"sid",type:"tpl_hidden"},{s_label:"s_type",type:"tpl_hidden"}]
 				islua=false