|
@@ -30,7 +30,7 @@ var (
|
|
ClearTaskList map[string]*ClearTask //清理任务列表
|
|
ClearTaskList map[string]*ClearTask //清理任务列表
|
|
saveLimit = 200 //抽取日志批量保存
|
|
saveLimit = 200 //抽取日志批量保存
|
|
PageSize = 5000 //查询分页
|
|
PageSize = 5000 //查询分页
|
|
- Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
|
|
|
|
|
|
+ Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
)
|
|
)
|
|
|
|
|
|
@@ -231,10 +231,10 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
|
|
} else {
|
|
} else {
|
|
detail = d2
|
|
detail = d2
|
|
}
|
|
}
|
|
- detail = ju.CutLableStr(detail)
|
|
|
|
- detail = cut.ClearHtml(detail)
|
|
|
|
|
|
+ d3, _ := doc["summary"].(string)
|
|
|
|
+ detail = ju.CutLableStr(d3 + "\n" + detail)
|
|
|
|
+ detail = cut.ClearHtml(detail) // summary (d3) was already prepended by the CutLableStr call above; prepending it again would duplicate it in detail
|
|
doc["detail"] = detail
|
|
doc["detail"] = detail
|
|
-
|
|
|
|
if isextFile {
|
|
if isextFile {
|
|
file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
|
|
file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
|
|
}
|
|
}
|
|
@@ -385,11 +385,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
//项目名称未能抽取到,标题来凑
|
|
//项目名称未能抽取到,标题来凑
|
|
if vc.Field == "projectname" {
|
|
if vc.Field == "projectname" {
|
|
//if len(j.Result[vc.Field]) < 1 {//如果抽取有结果,不走标题。待验证,暂时标题加入选举逻辑
|
|
//if len(j.Result[vc.Field]) < 1 {//如果抽取有结果,不走标题。待验证,暂时标题加入选举逻辑
|
|
- items := make([]*ju.ScoreItem, 1)
|
|
|
|
- items[0] = &ju.ScoreItem{Des: "项目名称未能抽取到,标题来凑初始化", ExtFrom: "title", Value: j.Title, Score: 4}
|
|
|
|
- field := &ju.ExtField{Field: vc.Field, Code: "title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title, Score: 4, ScoreItem: items}
|
|
|
|
|
|
+ field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
|
|
if tmp["blocktag"] != nil {
|
|
if tmp["blocktag"] != nil {
|
|
- field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
|
|
|
+ btag := make(map[string]string)
|
|
|
|
+ for k := range tmp["blocktag"].(map[string]bool) {
|
|
|
|
+ btag[k] = TagConfigDesc[k]
|
|
|
|
+ }
|
|
|
|
+ field.BlockTag = btag
|
|
}
|
|
}
|
|
j.Result[vc.Field] = append(j.Result[vc.Field], field)
|
|
j.Result[vc.Field] = append(j.Result[vc.Field], field)
|
|
//}
|
|
//}
|
|
@@ -625,23 +627,13 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
}
|
|
}
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
for _, tmp := range tmps {
|
|
for _, tmp := range tmps {
|
|
- field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
|
|
|
|
- if extfrom == "title" {
|
|
|
|
- field.Score = 4
|
|
|
|
- }
|
|
|
|
|
|
+ field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
|
|
if tmp["blocktag"] != nil {
|
|
if tmp["blocktag"] != nil {
|
|
- field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
|
- }
|
|
|
|
- item := &ju.ScoreItem{Des: "初始化", Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
|
|
|
|
- if extfrom == "title" {
|
|
|
|
- item.Score = 4
|
|
|
|
- }
|
|
|
|
- if tmp["scoreitem"] == nil {
|
|
|
|
- scoreItems := make([]*ju.ScoreItem, 0)
|
|
|
|
- scoreItems = append(scoreItems, item)
|
|
|
|
- field.ScoreItem = scoreItems
|
|
|
|
- } else {
|
|
|
|
- field.ScoreItem = append(field.ScoreItem, item)
|
|
|
|
|
|
+ btag := make(map[string]string)
|
|
|
|
+ for k := range tmp["blocktag"].(map[string]bool) {
|
|
|
|
+ btag[k] = TagConfigDesc[k]
|
|
|
|
+ }
|
|
|
|
+ field.BlockTag = btag
|
|
}
|
|
}
|
|
j.Result[k] = append(j.Result[k], field)
|
|
j.Result[k] = append(j.Result[k], field)
|
|
}
|
|
}
|
|
@@ -663,13 +655,17 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
|
|
//块抽取
|
|
//块抽取
|
|
if in.Field != "" {
|
|
if in.Field != "" {
|
|
if extfrom == "title" {
|
|
if extfrom == "title" {
|
|
- extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]bool{"title": true}, j, in)
|
|
|
|
|
|
+ extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in)
|
|
if len(extinfo) > 0 {
|
|
if len(extinfo) > 0 {
|
|
AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
}
|
|
}
|
|
} else {
|
|
} else {
|
|
for _, v := range j.Block {
|
|
for _, v := range j.Block {
|
|
- extinfo := extRegCoreToResult(extfrom, v.Text, &v.Classify, j, in)
|
|
|
|
|
|
+ btag := make(map[string]string)
|
|
|
|
+ for k := range v.Classify {
|
|
|
|
+ btag[k] = TagConfigDesc[k]
|
|
|
|
+ }
|
|
|
|
+ extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
|
|
if len(extinfo) > 0 {
|
|
if len(extinfo) > 0 {
|
|
AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
}
|
|
}
|
|
@@ -720,7 +716,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
}
|
|
}
|
|
|
|
|
|
//正则提取结果
|
|
//正则提取结果
|
|
-func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
|
|
|
|
|
|
+func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
extinfo := map[string][]map[string]interface{}{}
|
|
extinfo := map[string][]map[string]interface{}{}
|
|
if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
|
|
if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
|
|
@@ -733,6 +729,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
val := text[pos[p]:pos[p+1]]
|
|
val := text[pos[p]:pos[p+1]]
|
|
|
|
+ sourcevalue := val
|
|
if val == "招标公告" {
|
|
if val == "招标公告" {
|
|
return extinfo
|
|
return extinfo
|
|
}
|
|
}
|
|
@@ -756,27 +753,9 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
if v.RegCore.NumSign == -1 { //正负值修正
|
|
if v.RegCore.NumSign == -1 { //正负值修正
|
|
val = "-" + val
|
|
val = "-" + val
|
|
}
|
|
}
|
|
- exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
|
|
|
|
- if extfrom == "title" {
|
|
|
|
- exfield.Score = 4
|
|
|
|
- }
|
|
|
|
|
|
+ exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: sourcevalue, Value: val}
|
|
if tmp["blocktag"] != nil {
|
|
if tmp["blocktag"] != nil {
|
|
- exfield.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
|
- }
|
|
|
|
- item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
|
|
|
|
- if extfrom == "title" {
|
|
|
|
- item.Score = 4
|
|
|
|
- }
|
|
|
|
- if strings.Contains(val, "\n") {
|
|
|
|
- item.Score -= 1
|
|
|
|
- exfield.Score -= 1
|
|
|
|
- }
|
|
|
|
- if tmp["scoreitem"] == nil {
|
|
|
|
- sitems := make([]*ju.ScoreItem, 0)
|
|
|
|
- sitems = append(sitems, &item)
|
|
|
|
- exfield.ScoreItem = sitems
|
|
|
|
- } else {
|
|
|
|
- exfield.ScoreItem = append(exfield.ScoreItem, &item)
|
|
|
|
|
|
+ exfield.BlockTag = tmp["blocktag"].(map[string]string)
|
|
}
|
|
}
|
|
j.Result[k] = append(j.Result[k], &exfield)
|
|
j.Result[k] = append(j.Result[k], &exfield)
|
|
//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
@@ -797,7 +776,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
}
|
|
}
|
|
tmp := map[string]interface{}{
|
|
tmp := map[string]interface{}{
|
|
"field": v.Field,
|
|
"field": v.Field,
|
|
- "code": v.Code + "去除__*后",
|
|
|
|
|
|
+ "code": v.Code,
|
|
"ruletext": regArr[0],
|
|
"ruletext": regArr[0],
|
|
"extfrom": extfrom,
|
|
"extfrom": extfrom,
|
|
"value": value,
|
|
"value": value,
|
|
@@ -807,28 +786,9 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
}
|
|
}
|
|
tmps = append(tmps, tmp)
|
|
tmps = append(tmps, tmp)
|
|
extinfo[v.Field] = tmps
|
|
extinfo[v.Field] = tmps
|
|
-
|
|
|
|
exfield := ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: value}
|
|
exfield := ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: value}
|
|
- if extfrom == "title" {
|
|
|
|
- exfield.Score = 4
|
|
|
|
- }
|
|
|
|
if tmp["blocktag"] != nil {
|
|
if tmp["blocktag"] != nil {
|
|
- exfield.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
|
- }
|
|
|
|
- item := ju.ScoreItem{Des: "初始化抽取规则去除__*", Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: value}
|
|
|
|
- if extfrom == "title" {
|
|
|
|
- item.Score = 4
|
|
|
|
- }
|
|
|
|
- if strings.Contains(value, "\n") {
|
|
|
|
- item.Score -= 1
|
|
|
|
- exfield.Score -= 1
|
|
|
|
- }
|
|
|
|
- if tmp["scoreitem"] == nil {
|
|
|
|
- sitems := make([]*ju.ScoreItem, 0)
|
|
|
|
- sitems = append(sitems, &item)
|
|
|
|
- exfield.ScoreItem = sitems
|
|
|
|
- } else {
|
|
|
|
- exfield.ScoreItem = append(exfield.ScoreItem, &item)
|
|
|
|
|
|
+ exfield.BlockTag = tmp["blocktag"].(map[string]string)
|
|
}
|
|
}
|
|
j.Result[v.Field] = append(j.Result[v.Field], &exfield)
|
|
j.Result[v.Field] = append(j.Result[v.Field], &exfield)
|
|
//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
@@ -866,22 +826,8 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]bool, j *ju.Job, v
|
|
j.Result[v.Field] = [](*ju.ExtField){}
|
|
j.Result[v.Field] = [](*ju.ExtField){}
|
|
}
|
|
}
|
|
field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
|
|
field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
|
|
- if extfrom == "title" {
|
|
|
|
- field.Score = 4
|
|
|
|
- }
|
|
|
|
if tmp["blocktag"] != nil {
|
|
if tmp["blocktag"] != nil {
|
|
- field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
|
- }
|
|
|
|
- item := ju.ScoreItem{Des: "初始化", Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, Value: val}
|
|
|
|
- if extfrom == "title" {
|
|
|
|
- item.Score = 4
|
|
|
|
- }
|
|
|
|
- if tmp["scoreitem"] == nil {
|
|
|
|
- sitems := make([]*ju.ScoreItem, 0)
|
|
|
|
- sitems = append(sitems, &item)
|
|
|
|
- field.ScoreItem = sitems
|
|
|
|
- } else {
|
|
|
|
- field.ScoreItem = append(field.ScoreItem, &item)
|
|
|
|
|
|
+ field.BlockTag = tmp["blocktag"].(map[string]string)
|
|
}
|
|
}
|
|
j.Result[v.Field] = append(j.Result[v.Field], field)
|
|
j.Result[v.Field] = append(j.Result[v.Field], field)
|
|
}
|
|
}
|
|
@@ -905,15 +851,7 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
for _, tmp := range tmps {
|
|
for _, tmp := range tmps {
|
|
field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
|
|
field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
|
|
if tmp["blocktag"] != nil {
|
|
if tmp["blocktag"] != nil {
|
|
- field.BlockTag = tmp["blocktag"].(map[string]bool)
|
|
|
|
- }
|
|
|
|
- item := ju.ScoreItem{Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
|
|
|
|
- if tmp["scoreitem"] == nil {
|
|
|
|
- scoreItems := make([]*ju.ScoreItem, 0)
|
|
|
|
- scoreItems = append(scoreItems, &item)
|
|
|
|
- field.ScoreItem = scoreItems
|
|
|
|
- } else {
|
|
|
|
- field.ScoreItem = append(field.ScoreItem, &item)
|
|
|
|
|
|
+ field.BlockTag = tmp["blocktag"].(map[string]string)
|
|
}
|
|
}
|
|
j.Result[k] = append(j.Result[k], field)
|
|
j.Result[k] = append(j.Result[k], field)
|
|
//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
|
|
//j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0})
|
|
@@ -935,7 +873,7 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
// continue
|
|
// continue
|
|
// }
|
|
// }
|
|
text := qu.ObjToString(v.Value)
|
|
text := qu.ObjToString(v.Value)
|
|
- if text != "" && v.ExtFrom != "title" {
|
|
|
|
|
|
+ if text != "" {
|
|
text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
|
|
text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
|
|
}
|
|
}
|
|
j.Result[in.Field][k].Value = text
|
|
j.Result[in.Field][k].Value = text
|
|
@@ -1189,6 +1127,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
if len(blocks) > 0 {
|
|
if len(blocks) > 0 {
|
|
tmp["blocks"] = blocks
|
|
tmp["blocks"] = blocks
|
|
}
|
|
}
|
|
|
|
+ //tmp["extract_content"] = j.Content
|
|
if e.TaskInfo.TestColl == "" {
|
|
if e.TaskInfo.TestColl == "" {
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
for field, _ := range e.Fields {
|
|
for field, _ := range e.Fields {
|
|
@@ -1326,7 +1265,7 @@ func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
|
|
|
|
|
|
//去重冗余字段
|
|
//去重冗余字段
|
|
// delFiled reports whether k is one of the source-document keys this file
// treats as redundant: "summary", "detail", "contenthtml", "site",
// "spidercode", "projectinfo" or "jsondata". Matching is exact and
// case-sensitive; any other key (including the empty string) yields false.
//
// NOTE(review): presumably callers use this to drop bulky raw fields before
// the extraction result is saved — confirm against the call sites.
func delFiled(k string) bool {
	switch k {
	case "summary", "detail", "contenthtml", "site", "spidercode", "projectinfo", "jsondata":
		return true
	}
	return false
}
|
|
|
|
|
|
func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
|
|
func funcAnalysis(j *ju.Job) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
|