|
@@ -26,17 +26,30 @@ type scoreIndex struct {
|
|
}
|
|
}
|
|
|
|
|
|
var (
|
|
var (
|
|
- lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
|
|
|
|
- JYUrl = "https://www.jianyu360.cn/article/content/%s.html"
|
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
|
- saveLimit = 100 //抽取日志批量保存
|
|
|
|
- PageSize = 5000 //查询分页
|
|
|
|
- Fields = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
|
|
|
|
- Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
|
- NiJianField = []string{
|
|
|
|
|
|
+ lock, lockrule sync.RWMutex
|
|
|
|
+ lockclear, locktag sync.RWMutex
|
|
|
|
+ blocktag sync.RWMutex
|
|
|
|
+ JYUrl = "https://www.jianyu360.cn/article/content/%s.html"
|
|
|
|
+ cut = ju.NewCut() //获取正文并清理
|
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
|
+ TaskList map[string]*ExtractTask //任务列表
|
|
|
|
+ ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
|
+ saveLimit = 100 //抽取日志批量保存
|
|
|
|
+ PageSize = 5000 //查询分页
|
|
|
|
+ Fields = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
|
|
|
|
+ BiddingFields = map[string]interface{}{
|
|
|
|
+ "title": 1,
|
|
|
|
+ "site": 1,
|
|
|
|
+ "spidercode": 1,
|
|
|
|
+ "toptype": 1,
|
|
|
|
+ "subtype": 1,
|
|
|
|
+ "comeintime": 1,
|
|
|
|
+ "publishtime": 1,
|
|
|
|
+ "href": 1,
|
|
|
|
+ "dataging": 1,
|
|
|
|
+ }
|
|
|
|
+ Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
|
+ NiJianField = []string{
|
|
"string#approvecode",
|
|
"string#approvecode",
|
|
"string#total_investment",
|
|
"string#total_investment",
|
|
"string#funds",
|
|
"string#funds",
|
|
@@ -81,12 +94,13 @@ var (
|
|
"ah_whsggzyjyfww_kbxx_cgxm": true,
|
|
"ah_whsggzyjyfww_kbxx_cgxm": true,
|
|
"ah_whsggzyjyfww_kbxx_gcxm": true,
|
|
"ah_whsggzyjyfww_kbxx_gcxm": true,
|
|
}
|
|
}
|
|
- clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
|
|
|
|
- sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
|
|
|
|
- clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
|
|
|
|
- clearbondReg *regexp.Regexp = regexp.MustCompile("(无|不|否|金额)") //保证金
|
|
|
|
- textSelectReg *regexp.Regexp = regexp.MustCompile("(中标(单位|供应商|金额|价格))")
|
|
|
|
- winorderLock, jfwinorderLock sync.Mutex
|
|
|
|
|
|
+ clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
|
|
|
|
+ sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
|
|
|
|
+ clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
|
|
|
|
+ clearbondReg *regexp.Regexp = regexp.MustCompile("(无|不|否|金额)") //保证金
|
|
|
|
+ textSelectReg *regexp.Regexp = regexp.MustCompile("(中标(单位|供应商|金额|价格))")
|
|
|
|
+ winorderLock sync.Mutex
|
|
|
|
+ jfwinorderLock sync.Mutex
|
|
)
|
|
)
|
|
|
|
|
|
var clearWinnerReg = regexp.MustCompile("(名称|施工|拟定供应商名称|[::])")
|
|
var clearWinnerReg = regexp.MustCompile("(名称|施工|拟定供应商名称|[::])")
|
|
@@ -107,7 +121,13 @@ var effectivefirm = regexp.MustCompile("^[\u4E00-\u9FA5]{4,15}(公司|集团|委
|
|
|
|
|
|
// 发布时间识别
|
|
// 发布时间识别
|
|
var inscribe_publishtime_1 = regexp.MustCompile("(\\d{4}[年-]\\d{1,2}[月-]\\d{1,2}[日-]*)")
|
|
var inscribe_publishtime_1 = regexp.MustCompile("(\\d{4}[年-]\\d{1,2}[月-]\\d{1,2}[日-]*)")
|
|
-var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|投资|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
|
|
|
|
|
|
+var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
|
|
|
|
+
|
|
|
|
+//var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|投资|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
|
|
|
|
+
|
|
|
|
+// 实体通用企业
|
|
|
|
+var entdfa_entity = regexp.MustCompile("^([\u4E00-\u9FA5]{4,25}(公司|集团|委员会|机构|企业|设计|厂|场|院|所|店|中心|局|站|城|处|行|部|队|联合[会|体]|总站|管委会|联合会|联合体|医院|卫计委|机关|社区|中心站|中心校|分校|办公室|学校|幼儿园|动物园|管理站|馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|[初高]中|银行|[大中小]学|段|社|室|厅|监狱|监测站|血站|检查站|工作站|供应站|分行|文明办)|.{2}([大小中学][学院]|公司|某部|学社|大队|党校|某(部|中心|单位)|(联通|移动|电信))|某部|某单位)$")
|
|
|
|
+var entdfa_clean = regexp.MustCompile("([\\s \n]+)")
|
|
|
|
|
|
// 清洗正文
|
|
// 清洗正文
|
|
func CleanDetailText(detail string, summary string) string {
|
|
func CleanDetailText(detail string, summary string) string {
|
|
@@ -246,7 +266,7 @@ func (e *ExtractTask) getQualifications(tmp *map[string]interface{}, j_data map[
|
|
}
|
|
}
|
|
|
|
|
|
// 落款识别~采购单位
|
|
// 落款识别~采购单位
|
|
-func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}) {
|
|
|
|
|
|
+func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}, jf_text string) {
|
|
//落款实体
|
|
//落款实体
|
|
if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
|
|
if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
|
|
!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
|
|
!(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
|
|
@@ -261,20 +281,18 @@ func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[
|
|
(*tmp)["buyer"] = new_buyer
|
|
(*tmp)["buyer"] = new_buyer
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- //暂时关闭实体识别
|
|
|
|
|
|
+ //实体服务识别
|
|
//if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
|
|
//if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe &&
|
|
// !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
|
|
// !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") {
|
|
- // if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]),*tmp); new_buyer != "" {
|
|
|
|
|
|
+ // if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]), jf_text, *tmp); new_buyer != "" {
|
|
// (*tmp)["buyer"] = new_buyer
|
|
// (*tmp)["buyer"] = new_buyer
|
|
// }
|
|
// }
|
|
//}
|
|
//}
|
|
-
|
|
|
|
//拟建不能存buyer
|
|
//拟建不能存buyer
|
|
if qu.ObjToString((*tmp)["toptype"]) == "拟建" &&
|
|
if qu.ObjToString((*tmp)["toptype"]) == "拟建" &&
|
|
qu.ObjToString((*tmp)["subtype"]) == "拟建" {
|
|
qu.ObjToString((*tmp)["subtype"]) == "拟建" {
|
|
delete((*tmp), "buyer")
|
|
delete((*tmp), "buyer")
|
|
}
|
|
}
|
|
-
|
|
|
|
//识别发布时间
|
|
//识别发布时间
|
|
if qu.IntAll(j_data["publishtime"]) == -1 {
|
|
if qu.IntAll(j_data["publishtime"]) == -1 {
|
|
if qu.IntAll((*tmp)["ext_publishtime"]) == 0 {
|
|
if qu.IntAll((*tmp)["ext_publishtime"]) == 0 {
|
|
@@ -320,37 +338,62 @@ func InscribeEntity(detail string, tmp map[string]interface{}) string {
|
|
}
|
|
}
|
|
|
|
|
|
// 识别实体
|
|
// 识别实体
|
|
-func InscribeEntityDfa(detail string, tmp map[string]interface{}) string {
|
|
|
|
|
|
+func InscribeEntityDfa(detail string, jf_detail string, tmp map[string]interface{}) string {
|
|
new_str := ""
|
|
new_str := ""
|
|
projectname := qu.ObjToString(tmp["projectname"])
|
|
projectname := qu.ObjToString(tmp["projectname"])
|
|
title := qu.ObjToString(tmp["title"])
|
|
title := qu.ObjToString(tmp["title"])
|
|
winner := qu.ObjToString(tmp["winner"])
|
|
winner := qu.ObjToString(tmp["winner"])
|
|
agency := qu.ObjToString(tmp["agency"])
|
|
agency := qu.ObjToString(tmp["agency"])
|
|
|
|
+ //采用-排除表格的文本识别
|
|
new_detail := pretreated.TextAfterRemoveTable(detail)
|
|
new_detail := pretreated.TextAfterRemoveTable(detail)
|
|
- if len(new_detail) > 200 {
|
|
|
|
- new_detail = detail[len(new_detail)-200:]
|
|
|
|
|
|
+ new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
|
|
|
|
+ if len(new_detail) > 500 {
|
|
|
|
+ new_detail = new_detail[len(new_detail)-500:]
|
|
}
|
|
}
|
|
- dfa_info1, l_1 := EmployPostEntDfa(bson.M{"detail": new_detail}), 0
|
|
|
|
- if res_1 := ju.ConvertInterface(dfa_info1["result"]); len(res_1) > 0 {
|
|
|
|
- for _, v := range res_1 {
|
|
|
|
- if cl := utf8.RuneCountInString(v); cl > l_1 && cl > 3 && !exclude_entity.MatchString(v) {
|
|
|
|
- l_1 = cl
|
|
|
|
- new_str = v
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
|
|
+ if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
|
|
|
|
+ return new_str
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ //采用-去除标签的纯文本(含表格)
|
|
|
|
+ new_detail = pretreated.HtmlToText(detail)
|
|
|
|
+ new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n")
|
|
|
|
+ if len(new_detail) > 500 {
|
|
|
|
+ new_detail = new_detail[len(new_detail)-500:]
|
|
|
|
+ }
|
|
|
|
+ if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" {
|
|
|
|
+ return new_str
|
|
}
|
|
}
|
|
- if new_str != "" {
|
|
|
|
|
|
+
|
|
|
|
+ //采用-标题项目名称
|
|
|
|
+ if new_str = EmployEntDfaText(title+"\n"+projectname, winner, agency); new_str != "" {
|
|
|
|
+ return new_str
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ //采用-附件识别
|
|
|
|
+ if len(jf_detail) > 500 {
|
|
|
|
+ jf_detail = jf_detail[len(jf_detail)-500:]
|
|
|
|
+ }
|
|
|
|
+ if new_str = EmployEntDfaText(jf_detail, winner, agency); new_str != "" {
|
|
|
|
+ return new_str
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ return new_str
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+// 实体识别方法
|
|
|
|
+func EmployEntDfaText(text string, winner string, agency string) string {
|
|
|
|
+ new_str := ""
|
|
|
|
+ if text == "" {
|
|
return new_str
|
|
return new_str
|
|
}
|
|
}
|
|
- dfa_info2, l_2 := EmployPostEntDfa(bson.M{"detail": title + "\n" + projectname}), 0
|
|
|
|
- if res_2 := ju.ConvertInterface(dfa_info2["result"]); len(res_2) > 0 {
|
|
|
|
- for _, v := range res_2 {
|
|
|
|
- if cl := utf8.RuneCountInString(v); v != "" && cl > l_2 && cl > 3 && !exclude_entity.MatchString(v) {
|
|
|
|
- if v != "" && (v == winner || v == agency) {
|
|
|
|
- continue //识别异常
|
|
|
|
|
|
+ dfa_info, l := EmployPostEntDfa(bson.M{"detail": text}), 0
|
|
|
|
+ if res := ju.ConvertInterface(dfa_info["result"]); len(res) > 0 {
|
|
|
|
+ for _, v := range res {
|
|
|
|
+ if cl := utf8.RuneCountInString(v); cl > l && cl > 3 && !exclude_entity.MatchString(v) && entdfa_entity.MatchString(v) {
|
|
|
|
+ if !(v == winner || v == agency) {
|
|
|
|
+ l = cl
|
|
|
|
+ new_str = v
|
|
}
|
|
}
|
|
- l_2 = cl
|
|
|
|
- new_str = v
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|