|
@@ -172,6 +172,30 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
|
|
|
if len(extinfo) > 0 {
|
|
|
AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
}
|
|
|
+ } else if in.Field == "enterprise_qualification" || in.Field == "personnel_qualification" || in.Field == "performance_qualification" || in.Field == "enterprise_credit" {
|
|
|
+ hasResult := false
|
|
|
+ //1.通过文中的资质要求抽取四个资质
|
|
|
+ qualifications := ju.GetQualifications(pretreated.HtmlToText(qu.ObjToString(doc[extfrom])))
|
|
|
+ if qualifications != "" {
|
|
|
+ extinfo := extRegCoreToResult(extfrom, qualifications, &map[string]string{}, j, in, isSite, "")
|
|
|
+ if len(extinfo) > 0 {
|
|
|
+ hasResult = true
|
|
|
+ AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //2.通过抽取资质要求段落匹配不到时,匹配全文或者附件内容,
|
|
|
+ if !hasResult {
|
|
|
+ content := ""
|
|
|
+ if j.IsFile {
|
|
|
+ content = j.Content
|
|
|
+ } else {
|
|
|
+ content = doc["detail"].(string)
|
|
|
+ }
|
|
|
+ extinfo := extRegCoreToResult(extfrom, content, &map[string]string{}, j, in, isSite, "")
|
|
|
+ if len(extinfo) > 0 {
|
|
|
+ AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
+ }
|
|
|
+ }
|
|
|
} else {
|
|
|
for _, v := range j.Block {
|
|
|
btag := make(map[string]string)
|
|
@@ -651,14 +675,43 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job,
|
|
|
}
|
|
|
}
|
|
|
} else {
|
|
|
- pos := vre.RegCore.Reg.FindStringIndex(text)
|
|
|
val := ""
|
|
|
- if len(pos) == 2 {
|
|
|
- text = text[pos[1]:]
|
|
|
- rs := regexp.MustCompile("[^\r\n\t]+")
|
|
|
- tmp := rs.FindAllString(text, -1)
|
|
|
- if len(tmp) > 0 {
|
|
|
- val = tmp[0]
|
|
|
+ //正则表达式含有"<<",表示需要所有匹配数据
|
|
|
+ regs := strings.Split(vre.RuleText, "<<")
|
|
|
+ if len(regs) == 2 {
|
|
|
+ pattern := regs[0]
|
|
|
+ parses := regs[1] //定义的排除关键词,含有这些关键词,则视为作废
|
|
|
+ reg := regexp.MustCompile(pattern)
|
|
|
+ results := reg.FindAllString(text, -1)
|
|
|
+ if len(results) > 0 {
|
|
|
+ results = ju.RemoveDuplicates(results)
|
|
|
+ result := ""
|
|
|
+ //有排除词时需要过滤
|
|
|
+ if len(parses) > 0 {
|
|
|
+ var res []string
|
|
|
+ for _, v := range results {
|
|
|
+ reg2 := regexp.MustCompile(parses)
|
|
|
+ res2 := reg2.FindAllString(v, -1)
|
|
|
+ if len(res2) == 0 {
|
|
|
+ res = append(res, v)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ result = strings.Join(res, "\n")
|
|
|
+ } else {
|
|
|
+ result = strings.Join(results, "\n")
|
|
|
+ }
|
|
|
+
|
|
|
+ val = result
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ pos := vre.RegCore.Reg.FindStringIndex(text)
|
|
|
+ if len(pos) == 2 {
|
|
|
+ text = text[pos[1]:]
|
|
|
+ rs := regexp.MustCompile("[^\r\n\t]+")
|
|
|
+ tmp := rs.FindAllString(text, -1)
|
|
|
+ if len(tmp) > 0 {
|
|
|
+ val = tmp[0]
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
if val != "" {
|