|
@@ -33,9 +33,26 @@ var (
|
|
ClearTaskList map[string]*ClearTask //清理任务列表
|
|
ClearTaskList map[string]*ClearTask //清理任务列表
|
|
saveLimit = 100 //抽取日志批量保存
|
|
saveLimit = 100 //抽取日志批量保存
|
|
PageSize = 5000 //查询分页
|
|
PageSize = 5000 //查询分页
|
|
- Fields = `{"jyfb_data":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
|
|
|
|
- //Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1,"review_experts":1,"purchasing":1}`
|
|
|
|
- Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
|
|
|
+ Fields = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
|
|
|
|
+ Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
|
+ NiJianField = []string{
|
|
|
|
+ "string#approvecode",
|
|
|
|
+ "string#total_investment",
|
|
|
|
+ "string#funds",
|
|
|
|
+ "string#owner",
|
|
|
|
+ "string#projectaddr",
|
|
|
|
+ "string#projectperiod",
|
|
|
|
+ "string#project_scale",
|
|
|
|
+ "string#project_person",
|
|
|
|
+ "string#project_phone",
|
|
|
|
+ "string#approvenumber",
|
|
|
|
+ "string#projecttype",
|
|
|
|
+ "string#approvestatus",
|
|
|
|
+ "time#project_startdate",
|
|
|
|
+ "time#project_completedate",
|
|
|
|
+ "map#construction_area",
|
|
|
|
+ "map#floor_area",
|
|
|
|
+ }
|
|
spidercode = map[string]bool{
|
|
spidercode = map[string]bool{
|
|
"gd_zhsggzyjyzx_jsgc_fjczbgg": true,
|
|
"gd_zhsggzyjyzx_jsgc_fjczbgg": true,
|
|
"js_szgyyqggzyjyzx_jsgc_zjfbgs": true,
|
|
"js_szgyyqggzyjyzx_jsgc_zjfbgs": true,
|
|
@@ -65,8 +82,6 @@ var (
|
|
}
|
|
}
|
|
)
|
|
)
|
|
|
|
|
|
-//var packageUnUsedReg = regexp.MustCompile("1[0-9].投标报价\n1[0-9].1")
|
|
|
|
-
|
|
|
|
//启动测试抽取-、、、、结果追踪
|
|
//启动测试抽取-、、、、结果追踪
|
|
func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
|
|
func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
|
|
defer qu.Catch()
|
|
defer qu.Catch()
|
|
@@ -131,7 +146,7 @@ func BsonTOStringId(id interface{}) string {
|
|
return id.(primitive.ObjectID).Hex()
|
|
return id.(primitive.ObjectID).Hex()
|
|
}
|
|
}
|
|
|
|
|
|
-//开始测试任务抽取
|
|
|
|
|
|
+//开始测试任务抽取~结果追踪
|
|
func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
|
|
func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
|
|
n, _ := strconv.Atoi(num)
|
|
n, _ := strconv.Atoi(num)
|
|
id := IdTrans(startId)
|
|
id := IdTrans(startId)
|
|
@@ -161,6 +176,7 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
|
|
} else {
|
|
} else {
|
|
return false
|
|
return false
|
|
}
|
|
}
|
|
|
|
+
|
|
}
|
|
}
|
|
|
|
|
|
//启动抽取
|
|
//启动抽取
|
|
@@ -312,12 +328,17 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
detail = d2
|
|
detail = d2
|
|
}
|
|
}
|
|
detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
|
|
detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
|
|
|
|
+
|
|
d3, _ := doc["summary"].(string)
|
|
d3, _ := doc["summary"].(string)
|
|
//全文的需要修复表格
|
|
//全文的需要修复表格
|
|
detail = pretreated.RepairCon(detail)
|
|
detail = pretreated.RepairCon(detail)
|
|
detail = ju.CutLableStr(d3 + "\n" + detail)
|
|
detail = ju.CutLableStr(d3 + "\n" + detail)
|
|
detail = cut.ClearHtml(d3 + "\n" + detail)
|
|
detail = cut.ClearHtml(d3 + "\n" + detail)
|
|
|
|
|
|
|
|
+ if len(detail) < 30 && len(d1) > len(detail) {
|
|
|
|
+ detail = d1
|
|
|
|
+ }
|
|
|
|
+
|
|
doc["detail"] = detail
|
|
doc["detail"] = detail
|
|
isClearnMoney := !clearMoneyReg.MatchString(detail)
|
|
isClearnMoney := !clearMoneyReg.MatchString(detail)
|
|
if isClearnMoney {
|
|
if isClearnMoney {
|
|
@@ -686,7 +707,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
|
|
|
|
- if vc.Field == "addressing" {
|
|
|
|
|
|
+ if vc.Field == "winner" {
|
|
//log.Debug("调试抽取字段")
|
|
//log.Debug("调试抽取字段")
|
|
}
|
|
}
|
|
////抽取-前置规则
|
|
////抽取-前置规则
|
|
@@ -1907,13 +1928,20 @@ var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:
|
|
var unPackageWinnerReg = regexp.MustCompile("(重新招标)")
|
|
var unPackageWinnerReg = regexp.MustCompile("(重新招标)")
|
|
|
|
|
|
//包含字母的实体单位
|
|
//包含字母的实体单位
|
|
-var letter_entity = regexp.MustCompile("^[\u4E00-\u9FA5]{1,10}[A-Za-z]{1,5}[\u4E00-\u9FA5]{1,10}(公司|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体])$")
|
|
|
|
|
|
// letter_entity matches a whole organisation name made of 1-10 CJK
// characters, 1-5 Latin letters, 1-10 more CJK characters, and a closing
// organisation suffix (公司, 集团, 委员会, 联合会, 联合体, ...).
// The last alternative is 联合[会体]; it was previously written 联合[会|体],
// which also accepted a literal "|" after 联合.
var letter_entity = regexp.MustCompile("^[\u4E00-\u9FA5]{1,10}[A-Za-z]{1,5}[\u4E00-\u9FA5]{1,10}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体])$")

// Signature ("inscription") extraction patterns.
//
// inscribe_entity_1 matches an organisation name on its own line followed,
// on the next line, by a date written as N年N月N日 or N-N-N.
// Capture group 2 is the organisation name; group 5 is the date.
var inscribe_entity_1 = regexp.MustCompile("\n([\\s]+)?([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]))\n([\\s]+)?([0-9]+年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)")

// inscribe_entity_2 matches, after a newline or a "。", an organisation name
// followed on the same line by a date, and then a second organisation name on
// the following line. Capture group 2 is the first name, group 5 the date and
// group 6 the second name.
var inscribe_entity_2 = regexp.MustCompile("[\n。]([\\s]+)?([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]))([\\s]+)?([0-9]+年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)\n([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]))")

// exclude_entity matches names containing any of the listed keywords
// (咨询, 工程造价, 交易, 代理, 投资, 采购, 监理, 服务, 招标, ...).
// InscribeEntity discards a candidate that matches this pattern.
var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|投资|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")
|
|
|
|
|
|
//特殊金额-处理判断-倍率关系
|
|
//特殊金额-处理判断-倍率关系
|
|
func calculateAbnormalMoney(val []*ju.ExtField) (bool, int) {
|
|
func calculateAbnormalMoney(val []*ju.ExtField) (bool, int) {
|
|
//金额结果只有两种 - 倍率关系10000 - 过10E
|
|
//金额结果只有两种 - 倍率关系10000 - 过10E
|
|
moneyIndex := []int{}
|
|
moneyIndex := []int{}
|
|
moneyArr := []float64{}
|
|
moneyArr := []float64{}
|
|
|
|
+ first_money := float64(0)
|
|
difValue := map[string]interface{}{}
|
|
difValue := map[string]interface{}{}
|
|
for k, v := range val { //取第一个非负数,项目名称除外
|
|
for k, v := range val { //取第一个非负数,项目名称除外
|
|
if v.IsTrue && v.Score > -1 {
|
|
if v.IsTrue && v.Score > -1 {
|
|
@@ -1928,9 +1956,9 @@ func calculateAbnormalMoney(val []*ju.ExtField) (bool, int) {
|
|
if difValue[key] == nil {
|
|
if difValue[key] == nil {
|
|
difValue[key] = 1
|
|
difValue[key] = 1
|
|
}
|
|
}
|
|
- if len(difValue) > 2 {
|
|
|
|
- return false, 0
|
|
|
|
- }
|
|
|
|
|
|
+ //if len(difValue) > 2 {
|
|
|
|
+ // return false, 0
|
|
|
|
+ //}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
//计算金额数组
|
|
//计算金额数组
|
|
@@ -1965,7 +1993,25 @@ func calculateAbnormalMoney(val []*ju.ExtField) (bool, int) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+ } else if len(difValue) > 2 { //多组金额
|
|
|
|
+ is_exists := false
|
|
|
|
+ for _, v := range moneyArr {
|
|
|
|
+ if v >= 1000000000 {
|
|
|
|
+ is_exists = true
|
|
|
|
+ first_money = v
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if is_exists {
|
|
|
|
+ for k, v := range moneyArr {
|
|
|
|
+ if v*10000 == first_money {
|
|
|
|
+ return true, moneyIndex[k]
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+
|
|
}
|
|
}
|
|
|
|
+
|
|
return false, 0
|
|
return false, 0
|
|
}
|
|
}
|
|
|
|
|
|
@@ -2230,7 +2276,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- } else if tmp["winner"] != nil && tmp["winner"] != "" {
|
|
|
|
|
|
+ } else if tmp["winner"] != nil {
|
|
//没有分包取winner
|
|
//没有分包取winner
|
|
tmp["s_winner"] = tmp["winner"]
|
|
tmp["s_winner"] = tmp["winner"]
|
|
fieldSource["s_winner"] = fieldSource["winner"]
|
|
fieldSource["s_winner"] = fieldSource["winner"]
|
|
@@ -2434,13 +2480,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
- //只要项目名称
|
|
|
|
- //p_name := qu.ObjToString(tmp["projectname"])
|
|
|
|
- //tmp = map[string]interface{}{}
|
|
|
|
- //if p_name!="" {
|
|
|
|
- // tmp["projectname"] = p_name
|
|
|
|
- //}
|
|
|
|
-
|
|
|
|
if e.TaskInfo.TestColl == "" {
|
|
if e.TaskInfo.TestColl == "" {
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
delete(tmp, "_id")
|
|
delete(tmp, "_id")
|
|
@@ -2497,6 +2536,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
|
|
|
//检查字段-
|
|
//检查字段-
|
|
func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[string]interface{} {
|
|
func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[string]interface{} {
|
|
|
|
+
|
|
delete(tmp, "contenthtml")
|
|
delete(tmp, "contenthtml")
|
|
delete(tmp, "detail")
|
|
delete(tmp, "detail")
|
|
|
|
|
|
@@ -2534,12 +2574,19 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
|
|
//金额比例异常-
|
|
//金额比例异常-
|
|
if _, ok := tmp["bidamount"].(string); ok {
|
|
if _, ok := tmp["bidamount"].(string); ok {
|
|
delete(tmp, "bidamount")
|
|
delete(tmp, "bidamount")
|
|
- } else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && (fb/10 > qu.Float64All(tmp["budget"]) || qu.Float64All(tmp["budget"])/1000 > fb) {
|
|
|
|
- if fb > 1000.0 && fb < 100000000.0 {
|
|
|
|
- } else {
|
|
|
|
- delete(tmp, "bidamount")
|
|
|
|
- }
|
|
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ /*
|
|
|
|
+ else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && (fb/10 > qu.Float64All(tmp["budget"]) || qu.Float64All(tmp["budget"])/1000 > fb) {
|
|
|
|
+ //比例限制打开
|
|
|
|
+ if fb > 1000.0 && fb < 100000000.0 {
|
|
|
|
+
|
|
|
|
+ } else {
|
|
|
|
+ delete(tmp, "bidamount")
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ */
|
|
|
|
+
|
|
if _, ok := tmp["budget"].(string); ok {
|
|
if _, ok := tmp["budget"].(string); ok {
|
|
delete(tmp, "budget")
|
|
delete(tmp, "budget")
|
|
}
|
|
}
|
|
@@ -2566,7 +2613,7 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
|
|
delete(tmp, k)
|
|
delete(tmp, k)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 || strings.Contains(fmt.Sprint(v), "**") {
|
|
|
|
|
|
+ if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 {
|
|
delete(tmp, k)
|
|
delete(tmp, k)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -2663,9 +2710,108 @@ func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[
|
|
tmp[k] = v
|
|
tmp[k] = v
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ //return tmp
|
|
|
|
+
|
|
|
|
+ //针对拟建单位~需要验证~各种字段优先级
|
|
|
|
+ if qu.ObjToString(tmp["toptype"]) == "拟建" &&
|
|
|
|
+ qu.ObjToString(tmp["subtype"]) == "拟建" {
|
|
|
|
+ nj_record := map[string]interface{}{}
|
|
|
|
+ for _, v := range NiJianField {
|
|
|
|
+ arr := strings.Split(v, "#")
|
|
|
|
+ k_type, k_field := "", ""
|
|
|
|
+ if len(arr) == 2 {
|
|
|
|
+ k_type, k_field = arr[0], arr[1]
|
|
|
|
+ } else {
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ tmpValue := tmp[k_field]
|
|
|
|
+ is_use := false
|
|
|
|
+ if k_type == "string" {
|
|
|
|
+ if qu.ObjToString(j_data[k_field]) != "" {
|
|
|
|
+ is_use = true
|
|
|
|
+ tmp[k_field] = qu.ObjToString(j_data[k_field])
|
|
|
|
+ }
|
|
|
|
+ } else if k_type == "time" {
|
|
|
|
+ //开竣工日期~采集为字符串
|
|
|
|
+ if qu.ObjToString(j_data[k_field]) != "" {
|
|
|
|
+ //特殊~需要转换
|
|
|
|
+ new_data := clear.ObjToTimestamp([]interface{}{j_data[k_field]}, "")
|
|
|
|
+ if len(new_data) > 0 {
|
|
|
|
+ if qu.Int64All(new_data[0]) > 0 {
|
|
|
|
+ is_use = true
|
|
|
|
+ tmp[k_field] = qu.Int64All(new_data[0])
|
|
|
|
+ //记录历史日期值
|
|
|
|
+ new_k := "s_" + k_field
|
|
|
|
+ nj_record[new_k] = map[string]interface{}{
|
|
|
|
+ k_field: j_data[k_field],
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } else if k_type == "map" {
|
|
|
|
+ p_info := *qu.ObjToMap(j_data["project_scale_info"])
|
|
|
|
+ if qu.ObjToString(p_info[k_field]) != "" {
|
|
|
|
+ is_use = true
|
|
|
|
+ tmp[k_field] = qu.ObjToString(p_info[k_field])
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if tmpValue != nil {
|
|
|
|
+ nj_record[k_field] = map[string]interface{}{
|
|
|
|
+ k_field: tmpValue,
|
|
|
|
+ "is_use": is_use,
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ if len(nj_record) > 0 {
|
|
|
|
+ tmp["nj_record"] = nj_record
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ //落款实体
|
|
|
|
+ if qu.ObjToString(tmp["buyer"]) == "" && ju.Inscribe &&
|
|
|
|
+ !(qu.ObjToString(tmp["toptype"]) == "拟建" && qu.ObjToString(tmp["subtype"]) == "拟建") {
|
|
|
|
+ new_buyer := InscribeEntity(qu.ObjToString(j_data["detail"]))
|
|
|
|
+ if new_buyer != "" {
|
|
|
|
+ tmp["buyer"] = new_buyer
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ //拟建不能存buyer
|
|
|
|
+ if qu.ObjToString(tmp["toptype"]) == "拟建" &&
|
|
|
|
+ qu.ObjToString(tmp["subtype"]) == "拟建" {
|
|
|
|
+ delete(tmp, "buyer")
|
|
|
|
+ }
|
|
|
|
+
|
|
return tmp
|
|
return tmp
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+func InscribeEntity(detail string) string {
|
|
|
|
+ //去除标签
|
|
|
|
+ new_str := ""
|
|
|
|
+ new_detail := pretreated.TextAfterRemoveTable(detail)
|
|
|
|
+ if len(new_detail) > 200 {
|
|
|
|
+ new_detail = detail[len(new_detail)-200:]
|
|
|
|
+ }
|
|
|
|
+ new_str = inscribe_entity_1.FindString(new_detail)
|
|
|
|
+ if new_str == "" {
|
|
|
|
+ new_str = inscribe_entity_2.FindString(new_detail)
|
|
|
|
+ if new_str != "" {
|
|
|
|
+ str1 := inscribe_entity_2.ReplaceAllString(new_str, "${2}")
|
|
|
|
+ str2 := inscribe_entity_2.ReplaceAllString(new_str, "${6}")
|
|
|
|
+ if str1 == str2 && str1 != "" {
|
|
|
|
+ new_str = str1
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ new_str = inscribe_entity_1.ReplaceAllString(new_str, "${2}")
|
|
|
|
+ }
|
|
|
|
+ if new_str != "" && exclude_entity.MatchString(new_str) {
|
|
|
|
+ new_str = ""
|
|
|
|
+ }
|
|
|
|
+ return new_str
|
|
|
|
+}
|
|
|
|
+
|
|
//处理折扣系数-
|
|
//处理折扣系数-
|
|
func dealWithDiscountBid(tmp map[string]interface{}) float64 {
|
|
func dealWithDiscountBid(tmp map[string]interface{}) float64 {
|
|
biddiscount := qu.Float64All(tmp["biddiscount"])
|
|
biddiscount := qu.Float64All(tmp["biddiscount"])
|