瀏覽代碼

Merge branch 'master' of http://192.168.3.207:10080/qmx/jy-data-extract

unknown 6 年之前
父節點
當前提交
9de8c3f7cf
共有 5 個文件被更改,包括 163 次插入和 126 次删除
  1. 55 38
      src/jy/admin/rulecheck.go
  2. 32 19
      src/jy/extract/extract.go
  3. 69 46
      src/jy/extract/extractInit.go
  4. 7 4
      src/jy/util/clearHtml.go
  5. 0 19
      src/main_test.go

+ 55 - 38
src/jy/admin/rulecheck.go

@@ -2,6 +2,7 @@
 package admin
 
 import (
+	"fmt"
 	"jy/extract"
 	. "jy/mongodbutil"
 	ju "jy/util"
@@ -149,58 +150,74 @@ func getCheckInfos() *[]map[string]interface{} {
 
 // checkPreReg checks a pre-filter regex rule: ruleText is split on "__" into pattern and replacement, and every match of the pattern in content is replaced (with "" unless the split yields exactly two parts).
 func checkPreReg(content, ruleText string) string {
-	tmp := strings.Split(ruleText, "__")
-	if len(tmp) == 2 {
-		reg := regexp.MustCompile(tmp[0])
-		return reg.ReplaceAllString(content, tmp[1])
-	} else {
-		reg := regexp.MustCompile(tmp[0])
-		return reg.ReplaceAllString(content, "")
-	}
+	tmpstr := ""
+	qu.Try(func() {
+		tmp := strings.Split(ruleText, "__")
+		if len(tmp) == 2 {
+			reg := regexp.MustCompile(tmp[0])
+			tmpstr = reg.ReplaceAllString(content, tmp[1])
+		} else {
+			reg := regexp.MustCompile(tmp[0])
+			tmpstr = reg.ReplaceAllString(content, "")
+		}
+	}, func(err interface{}) { // presumably called by qu.Try when the first func panics (e.g. MustCompile on an invalid pattern) — confirm in qfw/util
+		tmpstr = fmt.Sprint(err)
+	})
+	return tmpstr
 }
 
 // checkBackReg checks a post-filter regex rule: ruleText is split on "__" into pattern and replacement, and every match of the pattern in content is replaced (with "" unless the split yields exactly two parts).
 func checkBackReg(content, ruleText string) string {
-	tmp := strings.Split(ruleText, "__")
-	if len(tmp) == 2 {
-		reg := regexp.MustCompile(tmp[0])
-		return reg.ReplaceAllString(content, tmp[1])
-	} else {
-		reg := regexp.MustCompile(tmp[0])
-		return reg.ReplaceAllString(content, "")
-	}
+	tmpstr := ""
+	qu.Try(func() {
+		tmp := strings.Split(ruleText, "__")
+		if len(tmp) == 2 {
+			reg := regexp.MustCompile(tmp[0])
+			tmpstr = reg.ReplaceAllString(content, tmp[1])
+		} else {
+			reg := regexp.MustCompile(tmp[0])
+			tmpstr = reg.ReplaceAllString(content, "")
+		}
+	}, func(err interface{}) { // presumably called by qu.Try when the first func panics (e.g. MustCompile on an invalid pattern) — confirm in qfw/util
+		tmpstr = fmt.Sprint(err)
+	})
+	return tmpstr
 }
 
 // checkCoreReg checks an extraction regex rule of the form "pattern__pos[:field],..." against content and returns the captured substrings of the first match keyed by field name; a rule without exactly one "__" yields an empty map.
 func checkCoreReg(field, content, ruleText string) map[string]string {
 	rep := map[string]string{}
-	tmp := strings.Split(ruleText, "__")
-	if len(tmp) == 2 {
-		epos := strings.Split(tmp[1], ",")
-		posm := map[string]int{}
-		for _, v := range epos {
-			ks := strings.Split(v, ":")
-			if len(ks) == 2 { // "pos:field" form, e.g. rule (.*)招标公告(.*)__2:projectname,4:area
-				posm[ks[1]] = qu.IntAll(ks[0])
-			} else {
-				posm[field] = qu.IntAll(ks[0])
+	qu.Try(func() {
+		tmp := strings.Split(ruleText, "__")
+		if len(tmp) == 2 {
+			epos := strings.Split(tmp[1], ",")
+			posm := map[string]int{}
+			for _, v := range epos {
+				ks := strings.Split(v, ":")
+				if len(ks) == 2 { // "pos:field" form, e.g. rule (.*)招标公告(.*)__2:projectname,4:area
+					posm[ks[1]] = qu.IntAll(ks[0])
+				} else {
+					posm[field] = qu.IntAll(ks[0])
+				}
 			}
-		}
-		reg := regexp.MustCompile(tmp[0])
-		apos := reg.FindAllStringSubmatchIndex(content, -1)
-		if len(apos) > 0 {
-			pos := apos[0]
-			for k, p := range posm {
-				if len(pos) > p {
-					if pos[p] == -1 || pos[p+1] == -1 {
-						continue
+			reg := regexp.MustCompile(tmp[0])
+			apos := reg.FindAllStringSubmatchIndex(content, -1)
+			if len(apos) > 0 {
+				pos := apos[0] // only the first match is inspected; p below indexes this flat start/end index slice directly
+				for k, p := range posm {
+					if len(pos) > p { // NOTE(review): pos[p+1] is read below, so p == len(pos)-1 would index out of range
+						if pos[p] == -1 || pos[p+1] == -1 {
+							continue
+						}
+						val := content[pos[p]:pos[p+1]]
+						rep[k] = val
 					}
-					val := content[pos[p]:pos[p+1]]
-					rep[k] = val
 				}
 			}
 		}
-	}
+	}, func(err interface{}) { // presumably called by qu.Try when the first func panics — confirm in qfw/util; the error text is stored under key "err"
+		rep["err"] = fmt.Sprint(err)
+	})
 	return rep
 }
 

+ 32 - 19
src/jy/extract/extract.go

@@ -1,7 +1,7 @@
 package extract
 
 import (
-	"encoding/json"
+	//"encoding/json"
 	"fmt"
 	"jy/clear"
 	db "jy/mongodbutil"
@@ -9,6 +9,7 @@ import (
 	ju "jy/util"
 	"log"
 	qu "qfw/util"
+	"regexp"
 	"strconv"
 	"strings"
 	"sync"
@@ -68,22 +69,27 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
 
 // StartExtractTaskId starts extraction for the task with the given id, stores it in TaskList with IsRun set, and always returns true.
 func StartExtractTaskId(taskId string) bool {
+	isgo := false
 	ext := TaskList[taskId]
 	if ext == nil {
 		ext = &ExtractTask{}
 		ext.Id = taskId
-		ext.IsRun = true
 		ext.InitTaskInfo()
-		ext.TaskInfo.DB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
-		ext.InitRulePres()
-		ext.InitRuleBacks()
-		ext.InitRuleCore()
-		ext.InitTag()
-		ext.InitClearFn()
-		// start each taskId only once
-		go RunExtractTask(ext)
+		isgo = true
+	} else {
+		ext.Id = taskId
+		ext.InitTaskInfo()
 	}
+	ext.TaskInfo.DB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
+	ext.InitRulePres()
+	ext.InitRuleBacks()
+	ext.InitRuleCore()
+	ext.InitTag()
+	ext.InitClearFn()
 	ext.IsRun = true
+	if isgo { // only a task that was not yet in TaskList launches a RunExtractTask goroutine
+		go RunExtractTask(taskId) // NOTE(review): started before TaskList[taskId] is assigned below, yet RunExtractTask looks the task up in TaskList
+	}
 	TaskList[taskId] = ext
 	return true
 }
@@ -101,10 +107,8 @@ func StopExtractTaskId(taskId string) bool {
 }
 
 // RunExtractTask begins an extraction pass for a task and re-schedules itself with time.AfterFunc once the pass is done.
-func RunExtractTask(ext *ExtractTask) {
-	if !ext.IsRun {
-		return
-	}
+func RunExtractTask(taskId string) {
+	ext := TaskList[taskId] // NOTE(review): if taskId is absent from TaskList, the dereference of ext on the next line would fail — confirm callers
 	query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
 	list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, -1, -1)
 	for k, v := range *list {
@@ -116,11 +120,10 @@ func RunExtractTask(ext *ExtractTask) {
 		ext.TaskInfo.ProcessPool <- true
 		go ext.ExtractProcess(j)
 		ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
-		time.Sleep(1 * time.Second)
 	}
 	// persist the last extracted id into task.s_extlastid
 	db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
-	time.AfterFunc(30*time.Minute, func() { RunExtractTask(ext) })
+	time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
 }
 
 //信息预处理
@@ -206,6 +209,7 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 				ExtRegBack(j, v, e.TaskInfo)
 			}
 			//log.Println("抽取-后置规则", tmp)
+
 		}
 		//全局后置规则
 		for _, v := range e.RuleBacks {
@@ -218,8 +222,8 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 				v.Value = data[0]
 			}
 		}
-		bs, _ := json.Marshal(j.Result)
-		log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
+		//bs, _ := json.Marshal(j.Result)
+		//log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
 		//分析抽取结果并保存 todo
 		AnalysisSaveResult(j.Data, j.Result, e.TaskInfo)
 
@@ -497,7 +501,16 @@ func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[stri
 			}
 		}
 	} else {
-		val := v.RegCore.Reg.ReplaceAllString(text, "")
+		pos := v.RegCore.Reg.FindStringIndex(text)
+		val := ""
+		if len(pos) == 2 {
+			text = text[pos[1]:]
+			rs := regexp.MustCompile("[^\r\n\t]+")
+			tmp := rs.FindAllString(text, -1)
+			if len(tmp) > 0 {
+				val = tmp[0]
+			}
+		}
 		if val != "" {
 			tmps := []map[string]interface{}{}
 			tmp := map[string]interface{}{

+ 69 - 46
src/jy/extract/extractInit.go

@@ -3,6 +3,7 @@ package extract
 
 import (
 	db "jy/mongodbutil"
+	"log"
 	qu "qfw/util"
 	"regexp"
 	"strings"
@@ -85,6 +86,7 @@ func (e *ExtractTask) InitTestTaskInfo(resultcoll, trackcoll string) {
 //加载任务信息
 func (e *ExtractTask) InitTaskInfo() {
 	task, _ := db.Mgo.FindById("task", e.Id, nil)
+	log.Println("task", task)
 	if len(*task) > 1 {
 		v, _ := db.Mgo.FindOne("version", `{"version":"`+(*task)["s_version"].(string)+`"}`)
 		e.TaskInfo = &TaskInfo{
@@ -100,6 +102,7 @@ func (e *ExtractTask) InitTaskInfo() {
 			LastExtId:   qu.ObjToString((*task)["s_extlastid"]),
 			ProcessPool: make(chan bool, qu.IntAllDef((*task)["i_process"], 1)),
 		}
+		log.Println(e.TaskInfo.Name, e.TaskInfo.ProcessPool)
 	} else {
 		return
 	}
@@ -118,13 +121,17 @@ func (e *ExtractTask) InitRulePres() {
 		if rinfo.IsLua {
 			rinfo.RuleText = v["s_luascript"].(string)
 		} else {
-			rinfo.RuleText = v["s_rule"].(string)
-			tmp := strings.Split(rinfo.RuleText, "__")
-			if len(tmp) == 2 {
-				rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
-			} else {
-				rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
-			}
+			qu.Try(func() {
+				rinfo.RuleText = v["s_rule"].(string)
+				tmp := strings.Split(rinfo.RuleText, "__")
+				if len(tmp) == 2 {
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+				} else {
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+				}
+			}, func(err interface{}) {
+				log.Println(rinfo.Code, rinfo.Field, err)
+			})
 		}
 		e.RulePres = append(e.RulePres, rinfo)
 	}
@@ -143,13 +150,17 @@ func (e *ExtractTask) InitRuleBacks() {
 		if rinfo.IsLua {
 			rinfo.RuleText = v["s_luascript"].(string)
 		} else {
-			rinfo.RuleText = v["s_rule"].(string)
-			tmp := strings.Split(rinfo.RuleText, "__")
-			if len(tmp) == 2 {
-				rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
-			} else {
-				rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
-			}
+			qu.Try(func() {
+				rinfo.RuleText = v["s_rule"].(string)
+				tmp := strings.Split(rinfo.RuleText, "__")
+				if len(tmp) == 2 {
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+				} else {
+					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+				}
+			}, func(err interface{}) {
+				log.Println(rinfo.Code, rinfo.Field, err)
+			})
 		}
 		e.RuleBacks = append(e.RuleBacks, rinfo)
 	}
@@ -184,14 +195,18 @@ func (e *ExtractTask) InitRuleCore() {
 				if rinfo.IsLua {
 					rinfo.RuleText = v["s_luascript"].(string)
 				} else {
-					rinfo.RuleText = v["s_rule"].(string)
-					rinfo.Field = v["s_field"].(string)
-					tmp := strings.Split(rinfo.RuleText, "__")
-					if len(tmp) == 2 {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
-					} else {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
-					}
+					qu.Try(func() {
+						rinfo.RuleText = v["s_rule"].(string)
+						rinfo.Field = v["s_field"].(string)
+						tmp := strings.Split(rinfo.RuleText, "__")
+						if len(tmp) == 2 {
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+						} else {
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+						}
+					}, func(err interface{}) {
+						log.Println(rinfo.Code, rinfo.Field, err)
+					})
 				}
 				rulePres = append(rulePres, rinfo)
 			}
@@ -209,14 +224,18 @@ func (e *ExtractTask) InitRuleCore() {
 				if rinfo.IsLua {
 					rinfo.RuleText = v["s_luascript"].(string)
 				} else {
-					rinfo.RuleText = v["s_rule"].(string)
-					rinfo.Field = v["s_field"].(string)
-					tmp := strings.Split(rinfo.RuleText, "__")
-					if len(tmp) == 2 {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
-					} else {
-						rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
-					}
+					qu.Try(func() {
+						rinfo.RuleText = v["s_rule"].(string)
+						rinfo.Field = v["s_field"].(string)
+						tmp := strings.Split(rinfo.RuleText, "__")
+						if len(tmp) == 2 {
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: tmp[1]}
+						} else {
+							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
+						}
+					}, func(err interface{}) {
+						log.Println(rinfo.Code, rinfo.Field, err)
+					})
 				}
 				ruleBacks = append(ruleBacks, rinfo)
 			}
@@ -244,24 +263,28 @@ func (e *ExtractTask) InitRuleCore() {
 						rinfo.IsHasFields = true
 					}*/
 				} else {
-					rinfo.RuleText = v["s_rule"].(string)
-					rinfo.Field = v["s_field"].(string)
-					tmp := strings.Split(rinfo.RuleText, "__")
-					if len(tmp) == 2 {
-						epos := strings.Split(tmp[1], ",")
-						posm := map[string]int{}
-						for _, v := range epos {
-							ks := strings.Split(v, ":")
-							if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
-								posm[ks[1]] = qu.IntAll(ks[0])
-							} else { //(.*)招标公告__2
-								posm[rinfo.Field] = qu.IntAll(ks[0])
+					qu.Try(func() {
+						rinfo.RuleText = v["s_rule"].(string)
+						rinfo.Field = v["s_field"].(string)
+						tmp := strings.Split(rinfo.RuleText, "__")
+						if len(tmp) == 2 {
+							epos := strings.Split(tmp[1], ",")
+							posm := map[string]int{}
+							for _, v := range epos {
+								ks := strings.Split(v, ":")
+								if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
+									posm[ks[1]] = qu.IntAll(ks[0])
+								} else { //(.*)招标公告__2
+									posm[rinfo.Field] = qu.IntAll(ks[0])
+								}
 							}
+							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: true, ExtractPos: posm}
+						} else {
+							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false}
 						}
-						rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: true, ExtractPos: posm}
-					} else {
-						rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false}
-					}
+					}, func(err interface{}) {
+						log.Println(rinfo.Code, rinfo.Field, err)
+					})
 				}
 				ruleCores = append(ruleCores, rinfo)
 			}

+ 7 - 4
src/jy/util/clearHtml.go

@@ -9,6 +9,7 @@ import (
 type Cut struct {
 	tag           *regexp.Regexp
 	scripttag     *regexp.Regexp
+	inputag       *regexp.Regexp
 	styletag      *regexp.Regexp
 	colstag       *regexp.Regexp
 	rowstag       *regexp.Regexp
@@ -27,13 +28,14 @@ func NewCut() *Cut {
 	//sc, _ := regexp.Compile("\\<script[^\\>]*\\>*[^\\>]+\\</script\\>")
 	//ss, _ := regexp.Compile("\\<style[^\\>]*\\>*[^\\>]+\\</style\\>")
 	scs := regexp.MustCompile("<(script|style)[^>]*>[^>]+</(script|style)>")
+	input := regexp.MustCompile(`<\s*input.*value=("|')(.*)("|')/?>(</>)?`)
 	cols, _ := regexp.Compile(`colspan="\d+"`)
 	rows, _ := regexp.Compile(`rowspan="\d+"`)
 	dis, _ := regexp.Compile(`display:none`)
 	return &Cut{
-		tag:       t,
-		scripttag: scs,
-		//styletag:      ss,
+		tag:           t,
+		scripttag:     scs,
+		inputag:       input,
 		colstag:       cols,
 		rowstag:       rows,
 		display:       dis,
@@ -52,7 +54,8 @@ func (c *Cut) ClearHtml(src string) string {
 	src = c.tag.ReplaceAllStringFunc(src, strings.ToLower)
 	//清script,style
 	src = c.scripttag.ReplaceAllString(src, "")
-	//
+	//清理input
+	src = c.inputag.ReplaceAllString(src, "$2")
 	//换结束标签
 	src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string {
 		tmp = strings.Replace(tmp, " ", "", -1)

+ 0 - 19
src/main_test.go

@@ -25,25 +25,6 @@ func Test_reg(t *testing.T) {
 		log.Println(k, v[1])
 	}
 }
-func Test_checkreg(t *testing.T) {
-	context := ` 项目名称:      新碶街道2018年美女姜河、向家村河、塘湾河①河道清淤工程招标公告`
-	rule := `(.*)项目名称[::][\n\s ]{0,10}__`
-	str := extract.PreRulesCheckReg(context, rule)
-	log.Println(str)
-	rule = `(.*)招标公告__$1`
-	str = extract.BackRulesCheckReg(str, rule)
-	log.Println(str)
-
-	context = `新碶街道2018年美女姜河、向家村河、塘湾河①河道清淤工程招标公告郑州`
-	rule = `(.*)招标公告(.*)__2:projectname,4:city`
-	tmp := extract.ExtRulesCheckReg("projectname", context, rule)
-	log.Println(tmp)
-
-	rule = `(关于|就)?(.{6,70})(招标|中标|成交|延期|变更)公告__4`
-	context = `关于新碶街道2018年美女姜河、向家村河、塘湾河①河道清淤工程招标公告`
-	tmp = extract.ExtRulesCheckReg("projectname", context, rule)
-	log.Println(tmp)
-}
 
 func Test_paths(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_v3")