zhangjinkun 6 anos atrás
pai
commit
ddb5073cf1

+ 1 - 1
src/jy/extract/extpackage.go

@@ -23,7 +23,7 @@ func PackageDetail(j *ju.Job, e *ExtractTask) {
 				sonJobResult["origin"] = pkg.Origin
 				sonJobResult["type"] = pkg.Type
 				sonJobResult["winnerorder"] = pkg.WinnerOrder
-				//分包结果暂时不用
+				//分包暂不参与选举
 				/*
 					for k, tags := range e.Tag {
 					L:

+ 13 - 10
src/jy/extract/extract.go

@@ -101,8 +101,8 @@ func StartExtractTaskId(taskId string) bool {
 		ext.Id = taskId
 		ext.InitTaskInfo()
 	}
-	ext.TaskInfo.FDB = db.MgoFactory(2, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
-	ext.TaskInfo.TDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
+	ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
+	ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
 	ext.InitRulePres()
 	ext.InitRuleBacks()
 	ext.InitRuleCore()
@@ -165,14 +165,15 @@ func RunExtractTask(taskId string) {
 			if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 				continue
 			}
-			//log.Println(v["_id"])
+			_id := qu.BsonIdToSId(v["_id"])
+			log.Println(_id)
 			if !ext.IsRun {
 				break
 			}
 			j := PreInfo(v)
 			ext.TaskInfo.ProcessPool <- true
 			go ext.ExtractProcess(j)
-			ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
+			ext.TaskInfo.LastExtId = _id
 		}
 		db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
 		if !ext.IsRun {
@@ -303,12 +304,14 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
 				v.Value = data[0]
 				//清理特殊符号
+				lock.Lock()
 				if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
 					clear.MesField[key] != nil {
 					text := qu.ObjToString(v.Value)
 					text = clear.OtherClean(key, text)
 					v.Value = text
 				}
+				lock.Unlock()
 			}
 		}
 		PackageDetail(j, e) //处理分包信息
@@ -316,11 +319,10 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 		//		log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
 		//分析抽取结果并保存 todo
 		AnalysisSaveResult(j, e)
-		<-e.TaskInfo.ProcessPool
 	}, func(err interface{}) {
-		log.Println("ExtractProcess err", err, (*j.Data)["_id"])
-		<-e.TaskInfo.ProcessPool
+		log.Println("ExtractProcess err", err)
 	})
+	<-e.TaskInfo.ProcessPool
 }
 
 //前置过滤
@@ -648,9 +650,10 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 				tmp := j.Result[in.Field]
 				exts := []interface{}{}
 				for k, v := range tmp {
-					if v.Type == "table" && v.Field != "projectname" { //table抽取到的数据不清理
-						continue
-					}
+					//table抽取到的数据不清理
+					//					if v.Type == "table" && v.Field != "projectname" {
+					//						continue
+					//					}
 					text := qu.ObjToString(v.Value)
 					if text != "" {
 						text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)

+ 10 - 7
src/jy/extract/extractudp.go

@@ -88,7 +88,7 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 	ext := &ExtractTask{}
 	ext.Id = qu.ObjToString(ju.Config["udptaskid"])
 	ext.InitTaskInfo()
-	ext.TaskInfo.FDB = db.MgoFactory(2, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
+	ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
 	ext.InitRulePres()
 	ext.InitRuleBacks()
 	ext.InitRuleCore()
@@ -140,11 +140,12 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 					if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 						continue
 					}
-					//log.Println(v["_id"])
+					_id := qu.BsonIdToSId(v["_id"])
+					log.Println(_id)
 					j := PreInfo(v)
 					ext.TaskInfo.ProcessPool <- true
 					go ext.ExtractProcess(j)
-					sid = qu.BsonIdToSId(v["_id"])
+					sid = _id
 				}
 				db.Mgo.Update("ecs", `{"InstanceId":"`+instanceId[0]+`"}`,
 					map[string]interface{}{"$set": map[string]interface{}{
@@ -159,11 +160,12 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 					if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 						continue
 					}
-					//log.Println(v["_id"])
+					_id := qu.BsonIdToSId(v["_id"])
+					log.Println(_id)
 					j := PreInfo(v)
 					ext.TaskInfo.ProcessPool <- true
 					go ext.ExtractProcess(j)
-					sidback = qu.BsonIdToSId(v["_id"])
+					sidback = _id
 				}
 				db.Mgo.Update("ecs", `{"InstanceId":"`+instanceId[0]+`"}`,
 					map[string]interface{}{"$set": map[string]interface{}{
@@ -192,11 +194,12 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 				if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
 					continue
 				}
-				//log.Println(v["_id"])
+				_id := qu.BsonIdToSId(v["_id"])
+				log.Println(_id)
 				j := PreInfo(v)
 				ext.TaskInfo.ProcessPool <- true
 				go ext.ExtractProcess(j)
-				sid = qu.BsonIdToSId(v["_id"])
+				sid = _id
 			}
 
 		}

+ 3 - 2
src/jy/pretreated/tablev2.go

@@ -185,7 +185,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 						tb1.AddKey(k, v)
 					} else {
 						bp := tb1.Map[k].(*u.BlockPackage)
-						if v1.TableKV != nil && v1.TableKV.Kv != nil {
+						if bp != nil && v1.TableKV != nil && v1.TableKV.Kv != nil {
 							for k2, v2 := range v1.TableKV.Kv {
 								if bp.TableKV.Kv != nil && bp.TableKV.Kv[k2] == "" {
 									bp.TableKV.Kv[k2] = v2
@@ -753,7 +753,9 @@ strtype 1全文 2块文本
 func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
 	defer qutil.Catch()
 	doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
+	cons := doc.Text()
 	tables := doc.Find("table")
+	doc = nil
 	if tables.Size() > 0 {
 		tabs = []*goquery.Selection{}
 		for i := 0; i < tables.Size(); i++ {
@@ -769,7 +771,6 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
 			}
 		}
 		tlen := 0
-		cons := doc.Text()
 		for _, t := range tabs {
 			tlen += len(t.Text())
 		}

+ 1 - 1
src/jy/util/script.go

@@ -103,7 +103,7 @@ func (s *LuaScript) RunScript(stype string) map[string]interface{} {
 			}
 		}
 	}, func(err interface{}) {
-		log.Println("lua err:", data["err"])
+		log.Println("lua err:", err)
 	})
 	return data
 }