|
@@ -79,9 +79,6 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
|
|
query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
|
|
query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
|
|
list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
|
|
list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
|
|
for _, v := range *list {
|
|
for _, v := range *list {
|
|
- //if qu.ObjToString(v["sensitive"]) != ""||ggtest.MatchString(qu.ObjToString(v[""])) { //去除含敏感词数据
|
|
|
|
- // continue
|
|
|
|
- //}
|
|
|
|
if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
|
|
if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
@@ -191,9 +188,6 @@ func RunExtractTask(taskId string) {
|
|
list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
|
|
list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
|
|
fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
|
|
fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
|
|
for _, v := range *list {
|
|
for _, v := range *list {
|
|
- //if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
|
|
|
|
- // continue
|
|
|
|
- //}
|
|
|
|
//根据标题判断是否抽取
|
|
//根据标题判断是否抽取
|
|
b := IsExtract("title", qu.ObjToString(v["title"]), "")
|
|
b := IsExtract("title", qu.ObjToString(v["title"]), "")
|
|
if !b {
|
|
if !b {
|
|
@@ -244,9 +238,8 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
d2 := CleanDetailText(qu.ObjToString(doc["contenthtml"]), summary)
|
|
d2 := CleanDetailText(qu.ObjToString(doc["contenthtml"]), summary)
|
|
if len(d1) >= len(d2) || d2 == "" {
|
|
if len(d1) >= len(d2) || d2 == "" {
|
|
detail = d1
|
|
detail = d1
|
|
- } else {
|
|
|
|
|
|
+ } else { //选用contenthtml有一种特殊情况与detail不一致,综合考虑选取逻辑
|
|
detail = d2
|
|
detail = d2
|
|
- //选用contenthtml有一种特殊情况与detail不一致,综合考虑选取逻辑
|
|
|
|
if SelectDetailSourceText(d1, d2) {
|
|
if SelectDetailSourceText(d1, d2) {
|
|
detail = d1
|
|
detail = d1
|
|
}
|
|
}
|
|
@@ -278,7 +271,6 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
toptype, subtype = "招标", "招标" //暂时按照"招标"
|
|
toptype, subtype = "招标", "招标" //暂时按照"招标"
|
|
}
|
|
}
|
|
toMap := qu.ObjToMap(doc["jsondata"])
|
|
toMap := qu.ObjToMap(doc["jsondata"])
|
|
- //log.Debug("toMap", toMap)
|
|
|
|
if (*toMap) != nil {
|
|
if (*toMap) != nil {
|
|
if (*toMap)["extweight"] == nil {
|
|
if (*toMap)["extweight"] == nil {
|
|
(*toMap)["extweight"] = ju.Config["jsondata_extweight"]
|
|
(*toMap)["extweight"] = ju.Config["jsondata_extweight"]
|
|
@@ -390,7 +382,6 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
|
|
}
|
|
}
|
|
} else {
|
|
} else {
|
|
if tmpk == "winner" && j.Category == "招标" && j.CategorySecond != "单一" {
|
|
if tmpk == "winner" && j.Category == "招标" && j.CategorySecond != "单一" {
|
|
- //log.Debug("不采用~招标类附件中标信息")
|
|
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
|
|
j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
|
|
@@ -406,7 +397,6 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
|
|
}
|
|
}
|
|
if !isUsed {
|
|
if !isUsed {
|
|
if j.Category == "招标" && j.CategorySecond != "单一" {
|
|
if j.Category == "招标" && j.CategorySecond != "单一" {
|
|
- //log.Debug("不采用~招标类附件中标信息~")
|
|
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
|
|
j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
|
|
@@ -416,7 +406,7 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
|
|
}
|
|
}
|
|
if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
|
|
if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
|
|
if j.Category == "招标" && j.CategorySecond != "单一" {
|
|
if j.Category == "招标" && j.CategorySecond != "单一" {
|
|
- //log.Debug("不采用~招标类附件中标信息~~")
|
|
|
|
|
|
+
|
|
} else {
|
|
} else {
|
|
j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
|
|
j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
|
|
}
|
|
}
|
|
@@ -451,9 +441,6 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
|
|
log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
|
|
log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
|
|
})
|
|
})
|
|
e.ExtractDetail(tmpj, false, "")
|
|
e.ExtractDetail(tmpj, false, "")
|
|
- //if jf != nil && jf.IsFile {
|
|
|
|
- // e.ExtractFile(jf, false, "")
|
|
|
|
- //}
|
|
|
|
//合并数据
|
|
//合并数据
|
|
j.Block = append(j.Block, tmpj.Block...)
|
|
j.Block = append(j.Block, tmpj.Block...)
|
|
j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
|
|
j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
|
|
@@ -478,10 +465,6 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
|
|
func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
qu.Try(func() {
|
|
qu.Try(func() {
|
|
doc := *j.Data
|
|
doc := *j.Data
|
|
- //全局前置规则,结果覆盖doc属性
|
|
|
|
- //for _, v := range e.RulePres {
|
|
|
|
- // doc = ExtRegPre(doc, j, v, e.TaskInfo)
|
|
|
|
- //}
|
|
|
|
tmprules := map[string][]*RuleCore{}
|
|
tmprules := map[string][]*RuleCore{}
|
|
lockrule.Lock()
|
|
lockrule.Lock()
|
|
//加载分类抽取配置
|
|
//加载分类抽取配置
|
|
@@ -557,8 +540,6 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
for _, v := range vc.KVRuleCores {
|
|
for _, v := range vc.KVRuleCores {
|
|
ExtRuleKV(j, v, e.TaskInfo)
|
|
ExtRuleKV(j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
- // log.Debug("抽取-后置规则", tmp)
|
|
|
|
-
|
|
|
|
//项目名称未能抽取到,标题来凑
|
|
//项目名称未能抽取到,标题来凑
|
|
if vc.Field == "projectname" {
|
|
if vc.Field == "projectname" {
|
|
if vc.ExtFrom == "title" {
|
|
if vc.ExtFrom == "title" {
|
|
@@ -632,8 +613,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
before, _ := v.Value.(string)
|
|
before, _ := v.Value.(string)
|
|
v.Value = data[0]
|
|
v.Value = data[0]
|
|
BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
|
|
BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
|
|
- //添加行数清理的日志
|
|
|
|
- //清理特殊符号
|
|
|
|
|
|
+ //添加行数清理的日志 , 清理特殊符号
|
|
lockclear.Lock()
|
|
lockclear.Lock()
|
|
if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
|
|
if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
|
|
text := qu.ObjToString(v.Value)
|
|
text := qu.ObjToString(v.Value)
|
|
@@ -651,8 +631,8 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
PackageDetail(j, e, isSite, codeSite) //处理分包信息-去重
|
|
PackageDetail(j, e, isSite, codeSite) //处理分包信息-去重
|
|
- // bs, _ := json.Marshal(j.Result)
|
|
|
|
- // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
|
|
|
|
+ //bs, _ := json.Marshal(j.Result)
|
|
|
|
+ //log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
}, func(err interface{}) {
|
|
}, func(err interface{}) {
|
|
log.Debug("ExtractProcess err", err, j.SourceMid)
|
|
log.Debug("ExtractProcess err", err, j.SourceMid)
|
|
})
|
|
})
|
|
@@ -660,12 +640,6 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
|
|
func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
|
|
qu.Try(func() {
|
|
qu.Try(func() {
|
|
doc := *j.Data
|
|
doc := *j.Data
|
|
- //全局前置规则,结果覆盖doc属性
|
|
|
|
- // for _, v := range e.RulePres {
|
|
|
|
- // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
|
|
- // doc = ExtRegPre(doc, j, v, e.TaskInfo)
|
|
|
|
- // }
|
|
|
|
- // }
|
|
|
|
//抽取规则
|
|
//抽取规则
|
|
tmprules := map[string][]*RuleCore{}
|
|
tmprules := map[string][]*RuleCore{}
|
|
lockrule.Lock()
|
|
lockrule.Lock()
|
|
@@ -687,30 +661,23 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
//抽取-前置规则
|
|
//抽取-前置规则
|
|
- // for _, v := range vc.RulePres {
|
|
|
|
- // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
|
|
- // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
|
|
- // }
|
|
|
|
- // }
|
|
|
|
- // log.Debug("抽取-前置规则", tmp)
|
|
|
|
-
|
|
|
|
|
|
+ //for _, v := range vc.RulePres {
|
|
|
|
+ // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
|
|
+ // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
|
|
+ // }
|
|
|
|
+ //}
|
|
//抽取-规则
|
|
//抽取-规则
|
|
if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
ExtRuleCore(tmp, e, vc, j, isSite)
|
|
ExtRuleCore(tmp, e, vc, j, isSite)
|
|
}
|
|
}
|
|
-
|
|
|
|
- // log.Debug("抽取-规则", tmp)
|
|
|
|
-
|
|
|
|
//抽取-后置规则
|
|
//抽取-后置规则
|
|
for _, v := range vc.RuleBacks {
|
|
for _, v := range vc.RuleBacks {
|
|
if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
ExtRegBack(j, v, e.TaskInfo, vc)
|
|
ExtRegBack(j, v, e.TaskInfo, vc)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- // log.Debug("抽取-后置规则", tmp)
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
-
|
|
|
|
//全局后置规则
|
|
//全局后置规则
|
|
for _, v := range e.RuleBacks {
|
|
for _, v := range e.RuleBacks {
|
|
if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
@@ -749,8 +716,8 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
|
|
}
|
|
}
|
|
|
|
|
|
PackageDetail(j, e, isSite, codeSite) //处理分包信息
|
|
PackageDetail(j, e, isSite, codeSite) //处理分包信息
|
|
- // bs, _ := json.Marshal(j.Result)
|
|
|
|
- // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
|
|
|
|
+ //bs, _ := json.Marshal(j.Result)
|
|
|
|
+ //log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
}, func(err interface{}) {
|
|
}, func(err interface{}) {
|
|
log.Debug("ExtractProcess err", err)
|
|
log.Debug("ExtractProcess err", err)
|
|
})
|
|
})
|