123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314 |
- package extract
- import (
- "fmt"
- "jy/clear"
- ju "jy/util"
- qu "qfw/util"
- "regexp"
- "strings"
- "unicode/utf8"
- )
// delFiled reports whether k is one of the field names this package treats
// as redundant: "detailfile", "summary", "detail", "contenthtml", "site",
// "spidercode", "projectinfo" or "jsondata". The comparison is exact and
// case-sensitive.
func delFiled(k string) bool {
	switch k {
	case "detailfile", "summary", "detail", "contenthtml",
		"site", "spidercode", "projectinfo", "jsondata":
		return true
	}
	return false
}
- // 检查字段-
- func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[string]interface{} {
- delete(tmp, "contenthtml")
- delete(tmp, "detail")
- //剑鱼链接方便查阅
- jyhref := fmt.Sprintf(JYUrl, qu.CommonEncodeArticle("content", qu.BsonIdToSId(tmp["_id"])))
- tmp["jytest_href"] = jyhref
- //对于招标类信息~若winner没有值~过滤掉中标相关信息
- if qu.ObjToString(tmp["toptype"]) == "招标" &&
- qu.ObjToString(tmp["subtype"]) != "单一" {
- delete(tmp, "winner")
- delete(tmp, "s_winner")
- delete(tmp, "bidamount")
- delete(tmp, "winnerorder")
- }
- tmp["repeat"] = 0
- //指定爬虫-金额处理-预算-中标金额异常
- if qu.ObjToString(tmp["spidercode"]) == "xz_xzzzqjzscjgycxxxpt_zbtzs" {
- if budget, ok := tmp["budget"].(float64); ok && budget > 0 && budget < 1000000 {
- tmp["budget"] = budget * 10000.0
- }
- if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount > 0 && bidamount > 1000000000 {
- tmp["bidamount"] = bidamount / 10000.0
- }
- }
- if qu.ObjToString(tmp["spidercode"]) == "js_jsszbtbw_zbhxrgs" {
- if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount > 0 && bidamount > 1000000000 {
- tmp["bidamount"] = bidamount / 10000.0
- }
- }
- //异常金额类型清洗-
- if _, ok := tmp["bidamount"].(string); ok {
- delete(tmp, "bidamount")
- }
- if _, ok := tmp["budget"].(string); ok {
- delete(tmp, "budget")
- }
- //budget bidamount 阈值限定
- if bg, ok := tmp["budget"].(float64); ok && bg >= 1000000000 {
- tmp["budget_threshold"] = bg
- delete(tmp, "budget")
- }
- if bg, ok := tmp["bidamount"].(float64); ok && bg >= 1000000000 {
- tmp["bidamount_threshold"] = bg
- delete(tmp, "bidamount")
- }
- //对分包存储校验···package
- if tmp["package"] != nil {
- if isExistsPackage(tmp["package"].(map[string]map[string]interface{})) {
- tmp["is_exist_package"] = true
- } else {
- tmp["package_c"] = tmp["package"]
- delete(tmp, "package")
- }
- }
- //对于单位,金额与候选信息进行相互校验与选取
- if winner := qu.ObjToString(tmp["winner"]); winner != "" {
- if winnerorder := ju.IsMarkInterfaceMap(tmp["winnerorder"]); len(winnerorder) > 0 {
- isWin := false
- if tmp["package"] == nil {
- isWin = true
- } else {
- if !isUsedMultiPackage(tmp["package"].(map[string]map[string]interface{})) || winner == qu.ObjToString(tmp["s_winner"]) {
- isWin = true
- }
- }
- if isWin {
- isExists := false
- for k, v := range winnerorder {
- if k >= 2 { //仅对比前两名
- break
- }
- if winner == qu.ObjToString(v["entname"]) && qu.Float64All(v["price"]) > float64(0) {
- tmp["bidamount"] = qu.Float64All(v["price"])
- isExists = true
- break
- }
- }
- //单位不在候选人里面--金额一致
- if !isExists && len(winnerorder) > 1 && len(winnerorder) < 4 { //单位未在候选人里面找到-
- if entname := qu.ObjToString(winnerorder[0]["entname"]); entname != "" && qu.IntAll(winnerorder[0]["sort"]) == 1 {
- if price := qu.Float64All(winnerorder[0]["price"]); price > 0.0 && qu.Float64All(tmp["bidamount"]) == price {
- if !(strings.Contains(entname, winner) || strings.Contains(winner, entname)) {
- if effectivefirm.MatchString(entname) {
- tmp["winner"] = entname
- tmp["s_winner"] = entname
- }
- }
- }
- }
- }
- }
- }
- }
- //快速过滤一遍特殊字段
- for k, v := range tmp {
- if k == "qualifies" {
- continue
- }
- if k == "contract_guarantee" || k == "bid_guarantee" ||
- k == "is_acquire_tender" {
- if len(fmt.Sprint(v)) > 0 {
- tmp[k] = true
- } else {
- delete(tmp, k)
- }
- }
- if k == "is_joint_bidding" || k == "is_payment_deposit" {
- if fmt.Sprint(v) == "true" {
- tmp[k] = true
- } else {
- delete(tmp, k)
- }
- }
- if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 {
- delete(tmp, k)
- }
- }
- //特殊字段~根据其他字段处理
- bid_bond := qu.ObjToString(tmp["bid_bond"])
- if bid_bond != "" && tmp["is_payment_deposit"] == nil {
- if strings.Contains(bid_bond, "保证金") && !clearbondReg.MatchString(bid_bond) {
- tmp["is_payment_deposit"] = true
- }
- }
- //特殊字段~根据其他字段处理
- bidopenaddress := qu.ObjToString(tmp["bidopenaddress"])
- if bidopenaddress != "" && tmp["bidopen_shape"] == nil {
- if utf8.RuneCountInString(bidopenaddress) > 5 {
- tmp["bidopen_shape"] = "线下开标"
- }
- }
- //项目周期-有效值
- projectperiod := qu.ObjToString(tmp["projectperiod"])
- if projectperiod != "" {
- //项目周期包含日期,数字及日期单位可保留,其余可清洗
- isNeedValueReg := regexp.MustCompile(`([0-9俩两一二三四五六七八九年月日天周]|合同)`)
- if !isNeedValueReg.MatchString(projectperiod) {
- delete(tmp, "projectperiod")
- }
- }
- //工期单位是否有效-清理
- if project_timeunit, ok := tmp["project_timeunit"].(string); ok {
- dateReg := regexp.MustCompile(`[年|月|日|天|周]`)
- if !dateReg.MatchString(project_timeunit) || utf8.RuneCountInString(project_timeunit) > 4 {
- delete(tmp, "project_timeunit")
- }
- //年-0 >5 删除
- if project_timeunit == "年" && (qu.Int64All(tmp["project_duration"]) == 0 || qu.Int64All(tmp["project_duration"]) > 5) {
- delete(tmp, "project_timeunit")
- }
- }
- //中标单位统一
- if tmp["winner"] != nil && tmp["s_winner"] != nil {
- strwin := qu.ObjToString(tmp["winner"])
- strwin_s := qu.ObjToString(tmp["s_winner"])
- if !strings.Contains(strwin_s, strwin) {
- tmp["s_winner"] = strwin
- }
- } else if qu.ObjToString(tmp["s_winner"]) == "" && qu.ObjToString(tmp["winner"]) != "" {
- tmp["s_winner"] = tmp["winner"]
- }
- //投标方式-
- bidway := qu.IntAll(tmp["bidway"])
- if bidway == 1 {
- tmp["bidway"] = "纸质投标"
- } else if bidway == 2 {
- tmp["bidway"] = "电子投标"
- } else {
- delete(tmp, "bidway")
- }
- //折扣系数
- discount := dealWithDiscountBid(tmp)
- if discount > 0.0 {
- tmp["biddiscount"] = discount
- } else {
- delete(tmp, "biddiscount")
- }
- delete(tmp, "biddiscount_up")
- delete(tmp, "biddiscount_down")
- //budget bidamount 阈值限定再次
- if bg, ok := tmp["budget"].(float64); ok && bg >= 1000000000 {
- tmp["budget_threshold"] = bg
- delete(tmp, "budget")
- }
- if bg, ok := tmp["bidamount"].(float64); ok && bg >= 1000000000 {
- tmp["bidamount_threshold"] = bg
- delete(tmp, "bidamount")
- }
- //检查剑鱼发布-爬虫
- jyfb_data := *qu.ObjToMap(j_data["jyfb_data"])
- if jyfb_data != nil {
- for k, v := range jyfb_data {
- if k == "area" {
- delete(tmp, "district")
- }
- tmp[k] = v
- }
- }
- //针对拟建单位~需要验证~各种字段优先级
- if qu.ObjToString(tmp["toptype"]) == "拟建" &&
- qu.ObjToString(tmp["subtype"]) == "拟建" {
- nj_record := map[string]interface{}{}
- for _, v := range NiJianField {
- arr := strings.Split(v, "#")
- k_type, k_field := "", ""
- if len(arr) == 2 {
- k_type, k_field = arr[0], arr[1]
- } else {
- continue
- }
- tmpValue := tmp[k_field]
- is_use := false
- if k_type == "string" {
- if qu.ObjToString(j_data[k_field]) != "" {
- is_use = true
- tmp[k_field] = qu.ObjToString(j_data[k_field])
- }
- } else if k_type == "time" {
- if j_data[k_field] != nil {
- tmp["s_"+k_field] = j_data[k_field]
- }
- //开竣工日期,采集为字符串
- if qu.ObjToString(j_data[k_field]) != "" {
- new_data := clear.ObjToTimestamp([]interface{}{j_data[k_field]}, "")
- if len(new_data) > 0 {
- if qu.Int64All(new_data[0]) > 0 {
- is_use = true
- tmp[k_field] = qu.Int64All(new_data[0])
- }
- }
- } else {
- if qu.Int64All(j_data[k_field]) > int64(0) {
- is_use = true
- tmp[k_field] = qu.Int64All(j_data[k_field])
- }
- }
- } else if k_type == "map" {
- p_info := *qu.ObjToMap(j_data["project_scale_info"])
- if qu.ObjToString(p_info[k_field]) != "" {
- is_use = true
- tmp[k_field] = qu.ObjToString(p_info[k_field])
- }
- } else {
- }
- if tmpValue != nil {
- nj_record[k_field] = map[string]interface{}{
- k_field: tmpValue,
- "is_use": is_use,
- }
- }
- }
- if len(nj_record) > 0 {
- tmp["nj_record"] = nj_record
- }
- }
- //投标截止日期与开始日期-核对
- publishtime := qu.Int64All(tmp["publishtime"])
- bidopentime := qu.Int64All(tmp["bidopentime"])
- bidendtime := qu.Int64All(tmp["bidendtime"])
- if qu.ObjToString(tmp["toptype"]) == "招标" && qu.Int64All(tmp["dataging"]) == 0 {
- if publishtime-bidopentime > 7*86400 && publishtime > 0 && bidopentime > 0 {
- delete(tmp, "bidopentime")
- }
- if publishtime-bidendtime > 7*86400 && publishtime > 0 && bidopentime > 0 {
- delete(tmp, "bidopentime")
- }
- }
- //企业资质检验,不含有资质时删除
- if enterprise_qualification, ok := tmp["enterprise_qualification"]; ok {
- special := `(甲级|乙级|丙级|丁级|一级|二级|三级|叁级|壹级|贰级|四级|五级|专业承包资质|贰 级|叁 级|二类|一类|三类|综合资质|工程设计|市政公用工程|铁路工程|建筑工程|公路工程|人防工程|工程勘察|岩土工程|水文地质勘察|工程测量|工程钻探|承装(修、试)|电力工程|大地测量|消防设施工程|特种工程|房屋建筑工程|房屋建筑监理|信息技术服务|信息系统安全|机电工程|建筑机电安装工程|消防设施工程|建筑智能化|水利水电工程|城乡规划资质|水利工程|环境工程|市政工程|公路行业|交通工程|建筑行业|电子与智能化工程|工程监理|建筑工程|土地规划|地基基础工程)`
- reg := regexp.MustCompile(special)
- var res = make([]string, 0)
- datas := strings.Split(qu.ObjToString(enterprise_qualification), "\n")
- for _, data := range datas {
- results := reg.FindAllString(data, -1)
- if len(results) > 0 {
- res = append(res, data)
- }
- }
- if len(res) == 0 {
- delete(tmp, "enterprise_qualification")
- } else {
- tmp["enterprise_qualification"] = strings.Join(res, "\n")
- }
- }
- return tmp
- }
|