package extract

import (
	"fmt"
	"regexp"
	"strings"
	"unicode/utf8"

	"jy/clear"
	ju "jy/util"
	qu "qfw/util"
)

// fieldAmountLimit is the amount at or above which budget / bidamount are
// moved aside into "<field>_threshold" instead of being kept.
const fieldAmountLimit = 1000000000

var (
	// projectPeriodValueReg matches a project period that contains a digit,
	// a Chinese numeral, a date unit, or the word "合同".
	projectPeriodValueReg = regexp.MustCompile(`([0-9俩两一二三四五六七八九年月日天周]|合同)`)

	// projectTimeUnitReg matches a duration unit (year/month/day/week).
	projectTimeUnitReg = regexp.MustCompile(`[年月日天周]`)

	// enterpriseQualificationReg matches lines that mention a qualification
	// grade or category.
	// NOTE(review): "承装(修、试)" uses unescaped parentheses, so it is a regex
	// group matching "承装修、试" rather than the literal "承装(修、试)" —
	// confirm which one is intended before changing the pattern.
	enterpriseQualificationReg = regexp.MustCompile(`(甲级|乙级|丙级|丁级|一级|二级|三级|叁级|壹级|贰级|四级|五级|专业承包资质|贰 级|叁 级|二类|一类|三类|综合资质|工程设计|市政公用工程|铁路工程|建筑工程|公路工程|人防工程|工程勘察|岩土工程|水文地质勘察|工程测量|工程钻探|承装(修、试)|电力工程|大地测量|消防设施工程|特种工程|房屋建筑工程|房屋建筑监理|信息技术服务|信息系统安全|机电工程|建筑机电安装工程|消防设施工程|建筑智能化|水利水电工程|城乡规划资质|水利工程|环境工程|市政工程|公路行业|交通工程|建筑行业|电子与智能化工程|工程监理|建筑工程|土地规划|地基基础工程)`)
)

// delFiled reports whether k is one of the redundant field names
// (detailfile, summary, detail, contenthtml, site, spidercode, projectinfo,
// jsondata).
func delFiled(k string) bool {
	switch k {
	case "detailfile", "summary", "detail", "contenthtml",
		"site", "spidercode", "projectinfo", "jsondata":
		return true
	}
	return false
}

// checkFields validates and normalises the extracted fields in tmp, using
// j_data as the source of crawler-supplied values, and returns tmp.
//
// tmp is modified in place: redundant fields are removed, amounts are
// range-checked, the winner is reconciled with the candidate list, flag-like
// fields are normalised to true or dropped, and values from
// j_data["jyfb_data"] overwrite the extracted ones.
//
// NOTE(review): tmp["package"], when present, is asserted to be a
// map[string]map[string]interface{} without a check; any other type panics.
func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[string]interface{} {
	delete(tmp, "contenthtml")
	delete(tmp, "detail")

	// Link to the article, kept for convenient manual review.
	tmp["jytest_href"] = fmt.Sprintf(JYUrl, qu.CommonEncodeArticle("content", qu.BsonIdToSId(tmp["_id"])))

	// Tender notices (other than single-source ones) carry no award
	// information, so drop every winner-related field.
	if qu.ObjToString(tmp["toptype"]) == "招标" && qu.ObjToString(tmp["subtype"]) != "单一" {
		delete(tmp, "winner")
		delete(tmp, "s_winner")
		delete(tmp, "bidamount")
		delete(tmp, "winnerorder")
	}
	tmp["repeat"] = 0

	// Spider-specific amount corrections for abnormal budget / bid amounts.
	switch qu.ObjToString(tmp["spidercode"]) {
	case "xz_xzzzqjzscjgycxxxpt_zbtzs":
		if budget, ok := tmp["budget"].(float64); ok && budget > 0 && budget < 1000000 {
			tmp["budget"] = budget * 10000.0
		}
		if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount > fieldAmountLimit {
			tmp["bidamount"] = bidamount / 10000.0
		}
	case "js_jsszbtbw_zbhxrgs":
		if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount > fieldAmountLimit {
			tmp["bidamount"] = bidamount / 10000.0
		}
	}

	// Amounts that are still strings are considered abnormal and removed.
	if _, ok := tmp["bidamount"].(string); ok {
		delete(tmp, "bidamount")
	}
	if _, ok := tmp["budget"].(string); ok {
		delete(tmp, "budget")
	}

	// Upper limit for budget / bidamount.
	moveAmountOverLimit(tmp, "budget")
	moveAmountOverLimit(tmp, "bidamount")

	// Validate the stored sub-package information.
	if tmp["package"] != nil {
		if isExistsPackage(tmp["package"].(map[string]map[string]interface{})) {
			tmp["is_exist_package"] = true
		} else {
			tmp["package_c"] = tmp["package"]
			delete(tmp, "package")
		}
	}

	// Cross-check winner, amount and candidate list against each other.
	reconcileWinnerWithCandidates(tmp)

	// Quick pass over special fields. Deleting from, or overwriting existing
	// keys of, a map while ranging over it is permitted in Go.
	for k, v := range tmp {
		switch k {
		case "qualifies":
			continue
		case "contract_guarantee", "bid_guarantee", "is_acquire_tender":
			if len(fmt.Sprint(v)) > 0 {
				tmp[k] = true
			} else {
				delete(tmp, k)
			}
		case "is_joint_bidding", "is_payment_deposit":
			if fmt.Sprint(v) == "true" {
				tmp[k] = true
			} else {
				delete(tmp, k)
			}
		}
		// Uses the value seen at iteration time, so a whitespace-only flag
		// that was just set to true above is still removed here.
		if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 {
			delete(tmp, k)
		}
	}

	// Derive is_payment_deposit from the bid bond text when not already set.
	if bidBond := qu.ObjToString(tmp["bid_bond"]); bidBond != "" && tmp["is_payment_deposit"] == nil {
		if strings.Contains(bidBond, "保证金") && !clearbondReg.MatchString(bidBond) {
			tmp["is_payment_deposit"] = true
		}
	}

	// Derive bidopen_shape ("线下开标") from a bid-opening address longer
	// than five runes when not already set.
	if addr := qu.ObjToString(tmp["bidopenaddress"]); addr != "" && tmp["bidopen_shape"] == nil {
		if utf8.RuneCountInString(addr) > 5 {
			tmp["bidopen_shape"] = "线下开标"
		}
	}

	// Project period: keep only values that contain a number, a date unit or
	// "合同".
	if period := qu.ObjToString(tmp["projectperiod"]); period != "" {
		if !projectPeriodValueReg.MatchString(period) {
			delete(tmp, "projectperiod")
		}
	}

	// Duration unit: must contain a date unit and be at most four runes.
	if unit, ok := tmp["project_timeunit"].(string); ok {
		if !projectTimeUnitReg.MatchString(unit) || utf8.RuneCountInString(unit) > 4 {
			delete(tmp, "project_timeunit")
		}
		// A unit of years is dropped when the duration is 0 or above 5.
		if unit == "年" {
			if years := qu.Int64All(tmp["project_duration"]); years == 0 || years > 5 {
				delete(tmp, "project_timeunit")
			}
		}
	}

	// Keep winner and s_winner consistent.
	if tmp["winner"] != nil && tmp["s_winner"] != nil {
		single := qu.ObjToString(tmp["winner"])
		multi := qu.ObjToString(tmp["s_winner"])
		if !strings.Contains(multi, single) {
			tmp["s_winner"] = single
		}
	} else if qu.ObjToString(tmp["s_winner"]) == "" && qu.ObjToString(tmp["winner"]) != "" {
		tmp["s_winner"] = tmp["winner"]
	}

	// Bid submission method: 1 -> paper, 2 -> electronic, otherwise removed.
	switch qu.IntAll(tmp["bidway"]) {
	case 1:
		tmp["bidway"] = "纸质投标"
	case 2:
		tmp["bidway"] = "电子投标"
	default:
		delete(tmp, "bidway")
	}

	// Discount coefficient.
	if discount := dealWithDiscountBid(tmp); discount > 0.0 {
		tmp["biddiscount"] = discount
	} else {
		delete(tmp, "biddiscount")
	}
	delete(tmp, "biddiscount_up")
	delete(tmp, "biddiscount_down")

	// Apply the amount limit again, since bidamount may have been replaced by
	// a candidate price above.
	moveAmountOverLimit(tmp, "budget")
	moveAmountOverLimit(tmp, "bidamount")

	// Values published through jyfb_data override the extracted ones. The
	// pointer is checked before dereferencing so a missing entry is skipped
	// instead of panicking.
	if published := qu.ObjToMap(j_data["jyfb_data"]); published != nil {
		for k, v := range *published {
			if k == "area" {
				delete(tmp, "district")
			}
			tmp[k] = v
		}
	}

	// Proposed-construction records: crawler fields take priority.
	if qu.ObjToString(tmp["toptype"]) == "拟建" && qu.ObjToString(tmp["subtype"]) == "拟建" {
		mergeNiJianFields(tmp, j_data)
	}

	// Cross-check bid opening / bid deadline against the publish time: for
	// tender notices with dataging == 0, a time more than seven days before
	// publication is dropped.
	publishtime := qu.Int64All(tmp["publishtime"])
	bidopentime := qu.Int64All(tmp["bidopentime"])
	bidendtime := qu.Int64All(tmp["bidendtime"])
	if qu.ObjToString(tmp["toptype"]) == "招标" && qu.Int64All(tmp["dataging"]) == 0 {
		const maxLead = 7 * 86400
		if publishtime > 0 && bidopentime > 0 && publishtime-bidopentime > maxLead {
			delete(tmp, "bidopentime")
		}
		// Guard on and delete bidendtime here; checking bidopentime would
		// drop a valid opening time whenever the deadline is missing.
		if publishtime > 0 && bidendtime > 0 && publishtime-bidendtime > maxLead {
			delete(tmp, "bidendtime")
		}
	}

	// Enterprise qualification: keep only lines that mention a qualification,
	// and delete the field when none does.
	if raw, ok := tmp["enterprise_qualification"]; ok {
		lines := strings.Split(qu.ObjToString(raw), "\n")
		kept := make([]string, 0, len(lines))
		for _, line := range lines {
			if enterpriseQualificationReg.MatchString(line) {
				kept = append(kept, line)
			}
		}
		if len(kept) == 0 {
			delete(tmp, "enterprise_qualification")
		} else {
			tmp["enterprise_qualification"] = strings.Join(kept, "\n")
		}
	}
	return tmp
}

// moveAmountOverLimit moves tmp[field] to tmp[field+"_threshold"] when it is
// a float64 at or above fieldAmountLimit.
func moveAmountOverLimit(tmp map[string]interface{}, field string) {
	if amount, ok := tmp[field].(float64); ok && amount >= fieldAmountLimit {
		tmp[field+"_threshold"] = amount
		delete(tmp, field)
	}
}

// reconcileWinnerWithCandidates aligns winner / bidamount in tmp with the
// candidate list in tmp["winnerorder"].
func reconcileWinnerWithCandidates(tmp map[string]interface{}) {
	winner := qu.ObjToString(tmp["winner"])
	if winner == "" {
		return
	}
	winnerorder := ju.IsMarkInterfaceMap(tmp["winnerorder"])
	if len(winnerorder) == 0 {
		return
	}
	// With multiple packages in use, only proceed when winner equals s_winner.
	if tmp["package"] != nil {
		if isUsedMultiPackage(tmp["package"].(map[string]map[string]interface{})) &&
			winner != qu.ObjToString(tmp["s_winner"]) {
			return
		}
	}

	// Only the first two candidates are compared: when the winner is one of
	// them and has a positive price, that price becomes the bid amount.
	for i, cand := range winnerorder {
		if i >= 2 {
			break
		}
		if winner == qu.ObjToString(cand["entname"]) && qu.Float64All(cand["price"]) > float64(0) {
			tmp["bidamount"] = qu.Float64All(cand["price"])
			return
		}
	}

	// Winner not found among the candidates: with two or three candidates,
	// when the first-ranked one has the same amount and an unrelated name
	// that looks like a valid firm, use it as the winner.
	if n := len(winnerorder); n <= 1 || n >= 4 {
		return
	}
	first := winnerorder[0]
	entname := qu.ObjToString(first["entname"])
	if entname == "" || qu.IntAll(first["sort"]) != 1 {
		return
	}
	price := qu.Float64All(first["price"])
	if !(price > 0.0 && qu.Float64All(tmp["bidamount"]) == price) {
		return
	}
	if strings.Contains(entname, winner) || strings.Contains(winner, entname) {
		return
	}
	if effectivefirm.MatchString(entname) {
		tmp["winner"] = entname
		tmp["s_winner"] = entname
	}
}

// mergeNiJianFields overwrites the fields listed in NiJianField with values
// from j_data and records the replaced values in tmp["nj_record"].
func mergeNiJianFields(tmp map[string]interface{}, j_data map[string]interface{}) {
	// Looked up once; a missing project_scale_info leaves scaleInfo nil, and
	// reading from a nil map simply yields zero values.
	var scaleInfo map[string]interface{}
	if p := qu.ObjToMap(j_data["project_scale_info"]); p != nil {
		scaleInfo = *p
	}

	record := map[string]interface{}{}
	for _, spec := range NiJianField {
		// Each entry has the form "<type>#<field>"; anything else is skipped.
		parts := strings.Split(spec, "#")
		if len(parts) != 2 {
			continue
		}
		kind, field := parts[0], parts[1]
		previous := tmp[field]
		used := false

		switch kind {
		case "string":
			if s := qu.ObjToString(j_data[field]); s != "" {
				used = true
				tmp[field] = s
			}
		case "time":
			raw := j_data[field]
			if raw != nil {
				tmp["s_"+field] = raw
			}
			if qu.ObjToString(raw) != "" {
				// Start / completion dates collected as strings.
				if stamps := clear.ObjToTimestamp([]interface{}{raw}, ""); len(stamps) > 0 {
					if sec := qu.Int64All(stamps[0]); sec > 0 {
						used = true
						tmp[field] = sec
					}
				}
			} else if sec := qu.Int64All(raw); sec > int64(0) {
				used = true
				tmp[field] = sec
			}
		case "map":
			if s := qu.ObjToString(scaleInfo[field]); s != "" {
				used = true
				tmp[field] = s
			}
		}

		if previous != nil {
			record[field] = map[string]interface{}{
				field:    previous,
				"is_use": used,
			}
		}
	}
	if len(record) > 0 {
		tmp["nj_record"] = record
	}
}