Browse Source

添加附件抽取

fengweiqiang 6 years ago
parent
commit
10af9068a4

+ 2 - 1
src/config.json

@@ -38,5 +38,6 @@
         "LaunchTemplateId4": "lt-2ze19qyi8votdjgeq2ma",
         "LaunchTemplateId8": "lt-2zeidqiydzusn7hw7lt8",
         "VSwitchId": "vsw-2ze23am2bl9e3v6rnyhfb"
-    }
+    },
+    "filelength": 100000
 } 

+ 9 - 1
src/jy/extract/exportask.go

@@ -66,7 +66,15 @@ func extractAndExport(v string, t map[string]interface{}) {
 		if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 			continue
 		}
-		j, jf := PreInfo(v, false)
+		var j, jf *ju.Job
+		if e.IsFileField{
+			if v["projectinfo"] != nil {
+				v["isextFile"] = true
+				j, jf = PreInfo(v)
+			}
+		}else {
+			j, _ = PreInfo(v)
+		}
 		e.TaskInfo.ProcessPool <- true
 		go e.ExtractProcess(j, jf)
 	}

+ 168 - 7
src/jy/extract/extract.go

@@ -1,6 +1,7 @@
 package extract
 
 import (
+	"bytes"
 	"encoding/json"
 	"fmt"
 	"jy/clear"
@@ -15,6 +16,7 @@ import (
 	"strconv"
 	"sync"
 	"time"
+	"unicode/utf8"
 
 	"gopkg.in/mgo.v2/bson"
 )
@@ -27,7 +29,7 @@ var (
 	ClearTaskList map[string]*ClearTask                  //清理任务列表
 	saveLimit     = 200                                  //抽取日志批量保存
 	PageSize      = 5000                                 //查询分页
-	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1}`
+	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
 
@@ -77,8 +79,15 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
 			if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 				continue
 			}
-			//log.Println(v["_id"])
-			j, jf := PreInfo(v, false)
+			var j, jf *ju.Job
+			if ext.IsFileField{
+				if v["projectinfo"] != nil {
+					v["isextFile"] = true
+					j, jf = PreInfo(v)
+				}
+			}else {
+				j, _ = PreInfo(v)
+			}
 			ext.TaskInfo.ProcessPool <- true
 			go ext.ExtractProcess(j, jf)
 		}
@@ -171,7 +180,15 @@ func RunExtractTask(taskId string) {
 			if !ext.IsRun {
 				break
 			}
-			j, jf := PreInfo(v, false)
+			var j, jf *ju.Job
+			if ext.IsFileField{
+				if v["projectinfo"] != nil {
+					v["isextFile"] = true
+					j, jf = PreInfo(v)
+				}
+			}else {
+				j, _ = PreInfo(v)
+			}
 			ext.TaskInfo.ProcessPool <- true
 			go ext.ExtractProcess(j, jf)
 			ext.TaskInfo.LastExtId = _id
@@ -186,8 +203,13 @@ func RunExtractTask(taskId string) {
 }
 
 //信息预处理
-func PreInfo(doc map[string]interface{}, isextFile bool) (j, jf *ju.Job) {
+func PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 	defer qu.Catch()
+	//判断是否有附件这个字段
+	var isextFile bool
+	if doc["isextFile"] != nil{
+		isextFile = doc["isextFile"].(bool)
+	}
 	detail := ""
 	d1, _ := doc["detail"].(string)
 	d2, _ := doc["contenthtml"].(string)
@@ -199,7 +221,10 @@ func PreInfo(doc map[string]interface{}, isextFile bool) (j, jf *ju.Job) {
 	detail = ju.CutLableStr(detail)
 	detail = cut.ClearHtml(detail)
 	doc["detail"] = detail
-	doc["detailfile"] = "" //附件文本堆一起(后期可以考虑,分开处理)
+
+	if isextFile {
+		file2text(&doc)  //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
+	}
 	toptype := qu.ObjToString(doc["toptype"])
 	if qu.ObjToString(doc["type"]) == "bid" {
 		toptype = "结果"
@@ -246,13 +271,53 @@ func PreInfo(doc map[string]interface{}, isextFile bool) (j, jf *ju.Job) {
 	return j, jf
 }
 
+// file2text joins the text of every attachment found under
+// doc["projectinfo"]["attachments"] and stores the result in doc["detailfile"].
+// An attachment's "content" may be a plain string or a list of segment maps
+// that carry their text under the "context" key; every non-empty piece is
+// followed by " \n". All attachment text is piled together for now (handling
+// attachments separately is a possible later refinement).
+// doc["detailfile"] is left untouched when the joined text has
+// Config["filelength"] runes or more (default limit 100000).
+// NOTE(review): attachments are visited in Go map order, so the order of the
+// pieces inside detailfile can differ between runs.
+func file2text(doc *map[string]interface{}) {
+	if doc == nil || *doc == nil {
+		return // nothing to read from and nowhere to store the result
+	}
+	var buf bytes.Buffer // local to this call, so no lock is required
+	add := func(v interface{}) {
+		// Checked assertion: non-string and empty values are skipped, not a panic.
+		if s, ok := v.(string); ok && s != "" {
+			buf.WriteString(s + " \n")
+		}
+	}
+	// Failed assertions yield nil maps; indexing or ranging over them is a no-op.
+	info, _ := (*doc)["projectinfo"].(map[string]interface{})
+	atts, _ := info["attachments"].(map[string]interface{})
+	for _, att := range atts {
+		file, _ := att.(map[string]interface{})
+		switch content := file["content"].(type) {
+		case string:
+			add(content)
+		case []interface{}: // generic form in which decoded arrays usually arrive
+			for _, seg := range content {
+				if m, ok := seg.(map[string]interface{}); ok {
+					add(m["context"])
+				}
+			}
+		case []map[string]interface{}:
+			for _, seg := range content {
+				add(seg["context"])
+			}
+		}
+	}
+	text := buf.String() // convert once; Buffer.String copies on every call
+	if utf8.RuneCountInString(text) < qu.IntAllDef(ju.Config["filelength"], 100000) {
+		(*doc)["detailfile"] = text
+	}
+}
+
 //抽取
 func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
 	qu.Try(func() {
 		doc := *j.Data
+		docfile := make(map[string]interface{})
+		if jf != nil{
+			docfile = *jf.Data
+			docfile["dockey"]= "detailfile"
+		}
 		//全局前置规则,结果覆盖doc属性
 		for _, v := range e.RulePres {
 			doc = ExtRegPre(doc, j, v, e.TaskInfo)
+			if jf != nil{
+				docfile = ExtRegPre(docfile, jf, v, e.TaskInfo)
+			}
 		}
 		//抽取规则
 		for _, vc := range e.RuleCores {
@@ -286,9 +351,47 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
 			}
 			//log.Println("抽取-后置规则", tmp)
 		}
+		//抽取规则-附件
+		if jf != nil{
+			for _, vc := range e.RuleCores {
+				tmp := ju.DeepCopy(docfile).(map[string]interface{})
+				//是否进入逻辑
+				if !ju.Logic(vc.LuaLogic, tmp) {
+					continue
+				}
+				//抽取-前置规则
+				for _, v := range vc.RulePres {
+					tmp = ExtRegPre(tmp, jf, v, e.TaskInfo)
+				}
+				//log.Println("抽取-前置规则", tmp)
+
+				//抽取-规则
+				for _, v := range vc.RuleCores {
+					ExtRegCore(vc.ExtFrom, tmp, jf, v, e)
+				}
+				//log.Println("抽取-规则", tmp)
+
+				//项目名称未能抽取到,标题来凑
+				if vc.Field == "projectname" {
+					if len(jf.Result[vc.Field]) < 1 {
+						jf.Result[vc.Field] = append(jf.Result[vc.Field], &ju.ExtField{vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, jf.Title, 0})
+					}
+				}
+
+				//抽取-后置规则
+				for _, v := range vc.RuleBacks {
+					ExtRegBack(jf, v, e.TaskInfo)
+				}
+				//log.Println("抽取-后置规则", tmp)
+			}
+		}
+
 		//全局后置规则
 		for _, v := range e.RuleBacks {
 			ExtRegBack(j, v, e.TaskInfo)
+			if jf != nil {
+				ExtRegBack(jf, v, e.TaskInfo)
+			}
 		}
 		//候选人加入
 		if len(j.Winnerorder) > 0 {
@@ -314,6 +417,32 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
 			}
 			j.Result["winner"] = winners
 		}
+		//候选人加入-附件
+		if jf != nil{
+			if len(jf.Winnerorder) > 0 {
+				winner := &ju.ExtField{
+					Field:     "winner",
+					Code:      "",
+					RuleText:  "",
+					Type:      "winnerorder",
+					MatchType: "winnerorder",
+					ExtFrom:   "",
+					Value:     jf.Winnerorder[0]["entname"],
+					Score:     0,
+				}
+				if len([]rune(qu.ObjToString(jf.Winnerorder[0]["entname"]))) < 4 {
+					winner.Score = -5
+				}
+				winners := jf.Result["winner"]
+				if winners != nil {
+					winners = append(winners, winner)
+				} else {
+					winners = []*ju.ExtField{}
+					winners = append(winners, winner)
+				}
+				jf.Result["winner"] = winners
+			}
+		}
 		//函数清理
 		for key, val := range j.Result {
 			for _, v := range val {
@@ -333,11 +462,38 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
 				lock.Unlock()
 			}
 		}
+		//函数清理-附件
+		if jf != nil{
+			for key, val := range jf.Result {
+				for _, v := range val {
+					lock.Lock()
+					cfn := e.ClearFn[key]
+					lock.Unlock()
+					data := clear.DoClearFn(cfn, []interface{}{v.Value, jf.Content})
+					v.Value = data[0]
+					//清理特殊符号
+					lock.Lock()
+					if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
+						clear.MesField[key] != nil {
+						text := qu.ObjToString(v.Value)
+						text = clear.OtherClean(key, text)
+						v.Value = text
+					}
+					lock.Unlock()
+				}
+			}
+		}
 		PackageDetail(j, e) //处理分包信息
+		if jf != nil{
+			PackageDetail(jf, e) //处理分包信息-附件
+		}
 		//		bs, _ := json.Marshal(j.Result)
 		//		log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
 		//分析抽取结果并保存 todo
 		AnalysisSaveResult(j, e)
+		if jf != nil{
+			AnalysisSaveResult(jf, e) //分析抽取结果并保存-附件
+		}
 	}, func(err interface{}) {
 		log.Println("ExtractProcess err", err)
 	})
@@ -360,7 +516,12 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
 		}
 		AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
 	} else {
-		key := qu.If(in.Field == "", "detail", in.Field).(string)
+		var key string
+		if doc["dockey"]== nil{
+			key = qu.If(in.Field == "", "detail", in.Field).(string)
+		}else {
+			key = qu.If(in.Field == "", "detailfile", in.Field).(string)
+		}
 		text := qu.ObjToString(doc[key])
 		extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
 		doc[key] = extinfo[key]                                      //结果覆盖原doc

+ 3 - 0
src/jy/extract/extractInit.go

@@ -66,6 +66,9 @@ type ExtractTask struct {
 	IsExtractCity bool                //是否开启城市抽取
 	Fields        map[string]int      //抽取属性组
 
+	IsFileField       bool      //是否开启附件抽取
+	FileFields        map[string]int      //抽取附件属性组
+
 	ResultChanel chan bool                  //抽取结果详情
 	ResultArr    [][]map[string]interface{} //抽取结果详情
 	BidChanel    chan bool                  //抽取结果

+ 27 - 3
src/jy/extract/extractudp.go

@@ -145,7 +145,15 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 					}
 					_id := qu.BsonIdToSId(v["_id"])
 					log.Println(_id)
-					j, jf := PreInfo(v, false)
+					var j, jf *ju.Job
+					if ext.IsFileField{
+						if v["projectinfo"] != nil {
+							v["isextFile"] = true
+							j, jf = PreInfo(v)
+						}
+					}else {
+						j, _ = PreInfo(v)
+					}
 					ext.TaskInfo.ProcessPool <- true
 					go ext.ExtractProcess(j, jf)
 					sid = _id
@@ -165,7 +173,15 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 					}
 					_id := qu.BsonIdToSId(v["_id"])
 					log.Println(_id)
-					j, jf := PreInfo(v, false)
+					var j, jf *ju.Job
+					if ext.IsFileField{
+						if v["projectinfo"] != nil {
+							v["isextFile"] = true
+							j, jf = PreInfo(v)
+						}
+					}else {
+						j, _ = PreInfo(v)
+					}
 					ext.TaskInfo.ProcessPool <- true
 					go ext.ExtractProcess(j, jf)
 					sidback = _id
@@ -199,7 +215,15 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 				}
 				_id := qu.BsonIdToSId(v["_id"])
 				log.Println(_id)
-				j, jf := PreInfo(v, false)
+				var j, jf *ju.Job
+				if ext.IsFileField{
+					if v["projectinfo"] != nil {
+						v["isextFile"] = true
+						j, jf = PreInfo(v)
+					}
+				}else {
+					j, _ = PreInfo(v)
+				}
 				ext.TaskInfo.ProcessPool <- true
 				go ext.ExtractProcess(j, jf)
 				sid = _id