Ver código fonte

添加清洗后的正文

wcc 1 ano atrás
pai
commit
3cd15b8ae0

+ 2 - 8
src/jy/extract/extractrule.go

@@ -175,7 +175,7 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
 		} else if in.Field == "enterprise_qualification" || in.Field == "personnel_qualification" || in.Field == "performance_qualification" || in.Field == "enterprise_credit" {
 			hasResult := false
 			//1.通过文中的资质要求抽取四个资质
-			qualifications := ju.GetQualifications(pretreated.HtmlToText(qu.ObjToString(doc[extfrom])))
+			qualifications := ju.GetQualifications(j.ContentClean)
 			if qualifications != "" {
 				extinfo := extRegCoreToResult(extfrom, qualifications, &map[string]string{}, j, in, isSite, "")
 				if len(extinfo) > 0 {
@@ -185,13 +185,7 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
 			}
 			//2.通过抽取资质要求段落匹配不到时,匹配全文或者附件内容,
 			if !hasResult {
-				content := ""
-				if j.IsFile {
-					content = j.Content
-				} else {
-					content = doc["detail"].(string)
-				}
-				extinfo := extRegCoreToResult(extfrom, content, &map[string]string{}, j, in, isSite, "")
+				extinfo := extRegCoreToResult(extfrom, j.ContentClean, &map[string]string{}, j, in, isSite, "")
 				if len(extinfo) > 0 {
 					AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
 				}

+ 1 - 0
src/jy/pretreated/analymethod.go

@@ -345,6 +345,7 @@ func AnalyStart(job *u.Job, isSite bool, codeSite string) {
 		}
 	}
 	con = formatText(con, "all")
+	job.ContentClean = HtmlToText(job.Content)
 	job.Content = con
 	//计算表格占比,返回表格数组、占比
 	tabs, _ := ComputeConRatio(con, 1)

+ 1 - 0
src/jy/util/article.go

@@ -9,6 +9,7 @@ type Job struct {
 	Category          string                            //类别
 	CategorySecond    string                            //二级分类
 	Content           string                            //正文
+	ContentClean      string                            //清洗后的正文
 	Title             string                            //标题
 	SpiderCode        string                            //爬虫代码
 	Site              string                            //站点