Sfoglia il codice sorgente

1、实体识别调整
2、bidding携带字段限制

zhengkun 1 anno fa
parent
commit
6b82e312c8

+ 7 - 2
src/jy/extract/extractsave.go

@@ -379,6 +379,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if j.IsUnRulesTab {
 			tmp["is_UnRules_Tab"] = j.IsUnRulesTab
 		}
+		//补充源表中的字段数据(仅限BiddingFields中列出的字段)
 		for k, v := range *doc {
 			if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
 				(*doc)[k] = []rune(qu.ObjToString(v))[:100000]
@@ -387,7 +388,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			if delFiled(k) {
 				continue
 			}
-			if tmp[k] == nil && k != "project_completedate" && k != "project_startdate" {
+			if tmp[k] == nil && BiddingFields[k] != nil {
 				tmp[k] = v
 			}
 		}
@@ -396,7 +397,11 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			e.QualityAudit(tmp)
 		}
 		//落款识别
-		e.inscribeRecognize(&tmp, *j.Data)
+		jf_text := ""
+		if jf != nil {
+			jf_text = jf.ContentClean
+		}
+		e.inscribeRecognize(&tmp, *j.Data, jf_text)
 		//根据正文获取资质要求
 		e.getQualifications(&tmp, *j.Data)
 		//城市抽取

+ 86 - 43
src/jy/extract/extraxtmethod.go

@@ -26,17 +26,30 @@ type scoreIndex struct {
 }
 
 var (
-	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
-	JYUrl                                        = "https://www.jianyu360.cn/article/content/%s.html"
-	cut                                          = ju.NewCut()                                 //获取正文并清理
-	ExtLogs                                      map[*TaskInfo][]map[string]interface{}        //抽取日志
-	TaskList                                     map[string]*ExtractTask                       //任务列表
-	ClearTaskList                                map[string]*ClearTask                         //清理任务列表
-	saveLimit                                                                           = 100  //抽取日志批量保存
-	PageSize                                                                            = 5000 //查询分页
-	Fields                                                                              = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
-	Fields2                                                                             = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
-	NiJianField                                                                         = []string{
+	lock, lockrule     sync.RWMutex
+	lockclear, locktag sync.RWMutex
+	blocktag           sync.RWMutex
+	JYUrl              = "https://www.jianyu360.cn/article/content/%s.html"
+	cut                = ju.NewCut()                                 //获取正文并清理
+	ExtLogs            map[*TaskInfo][]map[string]interface{}        //抽取日志
+	TaskList           map[string]*ExtractTask                       //任务列表
+	ClearTaskList      map[string]*ClearTask                         //清理任务列表
+	saveLimit                                                 = 100  //抽取日志批量保存
+	PageSize                                                  = 5000 //查询分页
+	Fields                                                    = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
+	BiddingFields                                             = map[string]interface{}{
+		"title":       1,
+		"site":        1,
+		"spidercode":  1,
+		"toptype":     1,
+		"subtype":     1,
+		"comeintime":  1,
+		"publishtime": 1,
+		"href":        1,
+		"dataging":    1,
+	}
+	Fields2     = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
+	NiJianField = []string{
 		"string#approvecode",
 		"string#total_investment",
 		"string#funds",
@@ -81,12 +94,13 @@ var (
 		"ah_whsggzyjyfww_kbxx_cgxm":       true,
 		"ah_whsggzyjyfww_kbxx_gcxm":       true,
 	}
-	clearMoneyReg                *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
-	sortStrReg                   *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
-	clearStrReg                  *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
-	clearbondReg                 *regexp.Regexp = regexp.MustCompile("(无|不|否|金额)") //保证金
-	textSelectReg                *regexp.Regexp = regexp.MustCompile("(中标(单位|供应商|金额|价格))")
-	winorderLock, jfwinorderLock sync.Mutex
+	clearMoneyReg  *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
+	sortStrReg     *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
+	clearStrReg    *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
+	clearbondReg   *regexp.Regexp = regexp.MustCompile("(无|不|否|金额)") //保证金
+	textSelectReg  *regexp.Regexp = regexp.MustCompile("(中标(单位|供应商|金额|价格))")
+	winorderLock   sync.Mutex
+	jfwinorderLock sync.Mutex
 )
 
 var clearWinnerReg = regexp.MustCompile("(名称|施工|拟定供应商名称|[::])")
@@ -107,7 +121,13 @@ var effectivefirm = regexp.MustCompile("^[\u4E00-\u9FA5]{4,15}(公司|集团|委
 
 // 发布时间识别
 var inscribe_publishtime_1 = regexp.MustCompile("(\\d{4}[年-]\\d{1,2}[月-]\\d{1,2}[日-]*)")
-var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|投资|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
+var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
+
+//var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|投资|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
+
+// 实体通用企业
+var entdfa_entity = regexp.MustCompile("^([\u4E00-\u9FA5]{4,25}(公司|集团|委员会|机构|企业|设计|厂|场|院|所|店|中心|局|站|城|处|行|部|队|联合[会|体]|总站|管委会|联合会|联合体|医院|卫计委|机关|社区|中心站|中心校|分校|办公室|学校|幼儿园|动物园|管理站|馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|[初高]中|银行|[大中小]学|段|社|室|厅|监狱|监测站|血站|检查站|工作站|供应站|分行|文明办)|.{2}([大小中学][学院]|公司|某部|学社|大队|党校|某(部|中心|单位)|(联通|移动|电信))|某部|某单位)$")
+var entdfa_clean = regexp.MustCompile("([\\s \n]+)")
 
 // 清洗正文
 func CleanDetailText(detail string, summary string) string {
@@ -246,7 +266,7 @@ func (e *ExtractTask) getQualifications(tmp *map[string]interface{}, j_data map[
 }
 
 // 落款识别~采购单位
-func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}) {
+func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}, jf_text string) {
 	//落款实体
 	if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
 		!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
@@ -261,20 +281,18 @@ func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[
 			(*tmp)["buyer"] = new_buyer
 		}
 	}
-	//暂时关闭实体识别
+	//实体服务识别(暂未启用,以下代码仍处于注释状态)
 	//if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
 	//	!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
-	//	if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]),*tmp); new_buyer != "" {
+	//	if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]), jf_text, *tmp); new_buyer != "" {
 	//		(*tmp)["buyer"] = new_buyer
 	//	}
 	//}
-
 	//拟建不能存buyer
 	if qu.ObjToString((*tmp)["toptype"]) == "拟建" &&
 		qu.ObjToString((*tmp)["subtype"]) == "拟建" {
 		delete((*tmp), "buyer")
 	}
-
 	//识别发布时间
 	if qu.IntAll(j_data["publishtime"]) == -1 {
 		if qu.IntAll((*tmp)["ext_publishtime"]) == 0 {
@@ -320,37 +338,62 @@ func InscribeEntity(detail string, tmp map[string]interface{}) string {
 }
 
 // 识别实体
-func InscribeEntityDfa(detail string, tmp map[string]interface{}) string {
+func InscribeEntityDfa(detail string, jf_detail string, tmp map[string]interface{}) string {
 	new_str := ""
 	projectname := qu.ObjToString(tmp["projectname"])
 	title := qu.ObjToString(tmp["title"])
 	winner := qu.ObjToString(tmp["winner"])
 	agency := qu.ObjToString(tmp["agency"])
+	//采用-排除表格的文本识别
 	new_detail := pretreated.TextAfterRemoveTable(detail)
-	if len(new_detail) > 200 {
-		new_detail = detail[len(new_detail)-200:]
+	new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
+	if len(new_detail) > 500 {
+		new_detail = new_detail[len(new_detail)-500:]
 	}
-	dfa_info1, l_1 := EmployPostEntDfa(bson.M{"detail": new_detail}), 0
-	if res_1 := ju.ConvertInterface(dfa_info1["result"]); len(res_1) > 0 {
-		for _, v := range res_1 {
-			if cl := utf8.RuneCountInString(v); cl > l_1 && cl > 3 && !exclude_entity.MatchString(v) {
-				l_1 = cl
-				new_str = v
-			}
-		}
+	if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
+		return new_str
+	}
+
+	//采用-去除标签的纯文本(含表格)
+	new_detail = pretreated.HtmlToText(detail)
+	new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
+	if len(new_detail) > 500 {
+		new_detail = new_detail[len(new_detail)-500:]
+	}
+	if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
+		return new_str
 	}
-	if new_str != "" {
+
+	//采用-标题项目名称
+	if new_str = EmployEntDfaText(title+"\n"+projectname, winner, agency); new_str != "" {
+		return new_str
+	}
+
+	//采用-附件识别
+	if len(jf_detail) > 500 {
+		jf_detail = jf_detail[len(jf_detail)-500:]
+	}
+	if new_str = EmployEntDfaText(jf_detail, winner, agency); new_str != "" {
+		return new_str
+	}
+
+	return new_str
+}
+
+// 实体识别方法
+func EmployEntDfaText(text string, winner string, agency string) string {
+	new_str := ""
+	if text == "" {
 		return new_str
 	}
-	dfa_info2, l_2 := EmployPostEntDfa(bson.M{"detail": title + "\n" + projectname}), 0
-	if res_2 := ju.ConvertInterface(dfa_info2["result"]); len(res_2) > 0 {
-		for _, v := range res_2 {
-			if cl := utf8.RuneCountInString(v); v != "" && cl > l_2 && cl > 3 && !exclude_entity.MatchString(v) {
-				if v != "" && (v == winner || v == agency) {
-					continue //识别异常
+	dfa_info, l := EmployPostEntDfa(bson.M{"detail": text}), 0
+	if res := ju.ConvertInterface(dfa_info["result"]); len(res) > 0 {
+		for _, v := range res {
+			if cl := utf8.RuneCountInString(v); cl > l && cl > 3 && !exclude_entity.MatchString(v) && entdfa_entity.MatchString(v) {
+				if !(v == winner || v == agency) {
+					l = cl
+					new_str = v
 				}
-				l_2 = cl
-				new_str = v
 			}
 		}
 	}

+ 1 - 0
src/jy/pretreated/analymethod.go

@@ -444,6 +444,7 @@ func AnalyStart(job *u.Job, isSite bool, codeSite string) {
 				}
 			}
 		}
+
 		//特殊-指定处理-结构转化formattext100
 		if formattext100.MatchString(bl.Text) {
 			new_str := formattext100.FindString(bl.Text)