package extract

import (
	"bytes"
	"encoding/json"
	"fmt"
	"github.com/shopspring/decimal"
	"gopkg.in/mgo.v2/bson"
	"io"
	"jy/clear"
	"jy/pretreated"
	ju "jy/util"
	"net/http"
	qu "qfw/util"
	"qfw/util/redis"
	"regexp"
	"strings"
	"sync"
	"time"
	"unicode/utf8"
)

// scoreIndex pairs a score with the index of the element it belongs to.
type scoreIndex struct {
	Score float64
	Index int
}

var (
	lock, lockrule     sync.RWMutex
	lockclear, locktag sync.RWMutex
	blocktag           sync.RWMutex

	// JYUrl is the article URL template; %s is filled in by callers.
	JYUrl = "https://www.jianyu360.cn/article/content/%s.html"
	cut   = ju.NewCut() // extracts the body text and cleans it

	ExtLogs       map[*TaskInfo][]map[string]interface{} // extraction logs
	TaskList      map[string]*ExtractTask                // task list
	ClearTaskList map[string]*ClearTask                  // cleaning task list

	saveLimit = 100  // batch size when saving extraction logs
	PageSize  = 5000 // query page size

	// Fields is the projection (as a JSON document) used when loading source documents.
	Fields = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`

	// BiddingFields is a reduced projection in map form.
	BiddingFields = map[string]interface{}{
		"_id":         1,
		"title":       1,
		"site":        1,
		"spidercode":  1,
		"toptype":     1,
		"subtype":     1,
		"comeintime":  1,
		"publishtime": 1,
		"href":        1,
		"dataging":    1,
	}

	// Fields2 is a projection restricted to money and party fields.
	Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`

	// NiJianField lists "<type>#<field>" descriptors.
	// NOTE(review): judging by the name these are the planned-project ("拟建")
	// fields; the consumer is not visible here - confirm.
	NiJianField = []string{
		"string#approvecode",
		"string#total_investment",
		"string#funds",
		"string#owner",
		"string#projectaddr",
		"string#projectperiod",
		"string#project_scale",
		"string#project_person",
		"string#project_phone",
		"string#approvenumber",
		"string#projecttype",
		"string#approvestatus",
		"time#project_startdate",
		"time#project_completedate",
		"map#construction_area",
		"map#floor_area",
	}

	// spidercode is a set of crawler codes that receive special handling
	// elsewhere in the package (the use site is outside this section).
	spidercode = map[string]bool{
		"gd_zhsggzyjyzx_jsgc_fjczbgg":     true,
		"js_szgyyqggzyjyzx_jsgc_zjfbgs":   true,
		"zj_tzsyhggzyjyzx_jsgc_kbqk":      true,
		"hb_tmsggzyjyxxw_jsgc_kbqk":       true,
		"zj_nbsyyggzyjyw_jsgc_kbqk":       true,
		"zj_zjsggzyjyzx_jyxx_kbjg":        true,
		"zj_zjzdgcjyw_ztbjglxx_kbjg":      true,
		"zj_lssggzyjyw_jsgc_kbsk":         true,
		"zj_qzslyxggzyjyzx_gggs_xkbjl":    true,
		"sc_mssggzydzjypt_jsgc_kbjl":      true,
		"sc_pzhsggzyjyfwzx_jsgc_kbylb":    true,
		"a_zgzbtbggfwpt_wasjgf_ss_kbjl":   true,
		"a_hbszbtbggfwpt_kbjl":            true,
		"a_szsjsgcjyfwzxbafzx_kbqkgs":     true,
		"a_szldzbyxgs_kbxx":               true,
		"zj_zssssxggzyjyw_gcjs_kbjggs":    true,
		"gd_szszfhjsj_kbqkgs":             true,
		"a_gjggzyjypt_gcjs_kbjl":          true,
		"a_gjggzyjypt_gcjs_kbjl_new":      true,
		"zj_tzsyhggzyjyzx_kbjggg":         true,
		"a_zgzbtbggfwpy_wasjgf_kbjl_lsbl": true,
		"ah_czsggzyjyw_jsgc_kbjl":         true,
		"ah_czsggzyjyw_zfcg_kbxx":         true,
		"ah_whsggzyjyfww_kbxx_cgxm":       true,
		"ah_whsggzyjyfww_kbxx_gcxm":       true,
	}

	clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)")
	// sortStrReg marks attachment names that are read first by file2text.
	sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)")
	// clearStrReg marks attachment names that file2text skips entirely.
	clearStrReg   *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)")
	clearbondReg  *regexp.Regexp = regexp.MustCompile("(无|不|否|金额)") // bid bond (deposit)
	textSelectReg *regexp.Regexp = regexp.MustCompile("(中标(单位|供应商|金额|价格))")

	winorderLock   sync.Mutex
	jfwinorderLock sync.Mutex
)

var clearWinnerReg = regexp.MustCompile("(名称|施工|拟定供应商名称|[::])")
var unPackageWinnerReg = regexp.MustCompile("(重新招标)")

// Organization names that contain Latin letters.
var letter_entity = regexp.MustCompile("^[\u4E00-\u9FA5]{1,10}[A-Za-z]{1,5}[\u4E00-\u9FA5]{1,10}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]|政府)$")

// Sign-off (signature block) organization extraction: an organization name
// followed by a date. Group 2 is the organization, group 5 the date.
var inscribe_entity_1 = regexp.MustCompile("\n([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府|段))[\\s  ]*[\n]+([\\s ]+|发布时间[::\\s ]+)?([0-9]+[\\s ]*年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)")

// Variant with a second organization name after the date (group 6).
var inscribe_entity_2 = regexp.MustCompile("[\n。]([\\s]+)?([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府|段))[\\s  ]*([\\s ]+|发布时间[::\\s ]+)?([0-9]+[\\s ]*年[0-9]+月[0-9]+日|[0-9]+[-][0-9]+[-][0-9]+)\n([\u4E00-\u9FA5].{4,20}(公司|集团|单位|委员会|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会|体]))")

// Special entity: organization introduced by an explicit label.
var inscribe_entity_3 = regexp.MustCompile("(招标组织部门|招标机构)[::]([\u4E00-\u9FA5].{2,25}(公司|集团|单位|委员会|办公室|车务段|机构|企业|厂|场|院|所|店|中心|市|校|学|局|站|城|处|行|部|队|联合[会体]|政府))")

// Valid enterprise name.
var effectivefirm = regexp.MustCompile("^[\u4E00-\u9FA5]{4,15}(公司|集团|委员会|办公室|车务段|机构|企业|设计|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合[会体]|政府)$")

// Publish-time recognition.
var inscribe_publishtime_1 = regexp.MustCompile("(\\d{4}[年-]\\d{1,2}[月-]\\d{1,2}[日-]*)")

// Names matching exclude_entity are rejected as buyer candidates.
var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")

//var exclude_entity = regexp.MustCompile("(咨询|工程造价|交易|代理|投资|(管理|工程)有限|(项目|工程)管理|采购|监理|服务|招标|招投标)")

// Generic organization pattern applied to entity-service results.
var entdfa_entity = regexp.MustCompile("^([\u4E00-\u9FA5]{4,25}(公司|集团|委员会|机构|企业|设计|厂|场|院|所|店|中心|局|站|城|处|行|部|队|联合[会|体]|总站|管委会|联合会|联合体|医院|卫计委|机关|社区|中心站|中心校|分校|办公室|学校|幼儿园|动物园|管理站|馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|[初高]中|银行|[大中小]学|段|社|室|厅|监狱|监测站|血站|检查站|工作站|供应站|分行|文明办)|.{2}([大小中学][学院]|公司|某部|学社|大队|党校|某(部|中心|单位)|(联通|移动|电信))|某部|某单位)$")
var entdfa_clean = regexp.MustCompile("([\\s \n]+)")
var entdfa_filtration = regexp.MustCompile("(开标记录)")

// Period validity.
var isNeedValueReg = regexp.MustCompile(`([0-9俩两一二三四五六七八九年月日天周]|合同)`)

// detailStripReg is applied to detail before any other cleaning step. It is
// compiled once here instead of on every CleanDetailText call.
// NOTE(review): the pattern is empty as it appears in this file, which makes
// the replacement a no-op; it may originally have held an invisible
// character - confirm against version control.
var detailStripReg = regexp.MustCompile(``)

// CleanDetailText cleans the announcement body. It repairs the content,
// prepends summary, cuts label strings and finally strips HTML.
//
// NOTE(review): summary is prepended twice (once before ju.CutLableStr and
// once before cut.ClearHtml). This is kept exactly as found - confirm whether
// the duplication is intended.
func CleanDetailText(detail string, summary string) string {
	detail = detailStripReg.ReplaceAllString(detail, "")
	detail = pretreated.RepairCon(detail)
	detail = ju.CutLableStr(summary + "\n" + detail)
	return cut.ClearHtml(summary + "\n" + detail)
}

// SelectDetailSourceText chooses between detail and contenthtml; true means
// the detail text should be used. That is the case only when detail is at
// least 1000 bytes long and mentions a winning-bid keyword (textSelectReg)
// that contenthtml does not.
func SelectDetailSourceText(detail string, contenthtml string) bool {
	if len(detail) < 1000 {
		return false
	}
	return textSelectReg.MatchString(detail) && !textSelectReg.MatchString(contenthtml)
}

// Combined choice between detail and contenthtml: true means use the HTML source.
func SelectSourceStructText(detail string, contenthtml string) bool { arr1 := strings.Split(detail, "\n") arr2 := strings.Split(contenthtml, "\n") //正文长度相差不大且源码有效 if len(detail)-len(contenthtml) < 500 && len(contenthtml) > 500 && len(arr1) == 1 && len(arr2) > len(arr1) { return true } return false } // 遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果 func file2text(doc *map[string]interface{}) { mnameone := map[string]bool{} mname := map[string]bool{} murl := map[string]string{} //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok { if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok { for _, attachs := range attach_text { if fileinfos, ok := attachs.(map[string]interface{}); ok { for _, fileinfo := range fileinfos { if ff, ok := fileinfo.(map[string]interface{}); ok { attach_url := qu.ObjToString(ff["attach_url"]) ffname := qu.ObjToString(ff["file_name"]) if clearStrReg.MatchString(ffname) { continue } mname[ffname] = true murl[ffname] = attach_url if sortStrReg.MatchString(ffname) { mnameone[ffname] = true } } } } } } tmpstr := "" for k := range mnameone { if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) { (*doc)["detailfile"] = tmpstr return } bs := ju.OssGetObject(murl[k]) if utf8.RuneCountInString(bs) <= qu.IntAllDef(ju.Config["filelength"], 150000) { tmpstr += bs + "\n" } else { tmpstr += bs[:qu.IntAllDef(ju.Config["filelength"], 150000)] + "\n" } } for k := range mname { if mnameone[k] { continue } if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) { (*doc)["detailfile"] = tmpstr return } bs := ju.OssGetObject(murl[k]) if utf8.RuneCountInString(bs) <= qu.IntAllDef(ju.Config["filelength"], 150000) { tmpstr += bs + "\n" } else { tmpstr += bs[:qu.IntAllDef(ju.Config["filelength"], 150000)] + "\n" } } (*doc)["detailfile"] = strings.ReplaceAll(tmpstr, "附件", "") } // 判断-附件分包是否无效判定(不通用) func isUsedPackageJF(jf_package 
map[string]map[string]interface{}) bool { if jf_package == nil || len(jf_package) == 0 { return false } for _, pack := range jf_package { budget := qu.Float64All(pack["budget"]) bidamount := qu.Float64All(pack["bidamount"]) if budget > 0.0 && budget <= 1.0 { return false } if bidamount > 0.0 && bidamount <= 1.0 { return false } } return true } // 是否有效分包 func isUsedMultiPackage(pkg map[string]map[string]interface{}) bool { if pkg == nil || len(pkg) == 0 { return false } for _, v := range pkg { p_winner := qu.ObjToString(v["winner"]) p_budget := qu.Float64All(v["budget"]) p_bidamout := qu.Float64All(v["bidamount"]) if (p_winner != "" && effectivefirm.MatchString(p_winner)) || p_budget > float64(0) || p_bidamout > float64(0) { return true } } return false } // 判断-附件分包是否无效判定(不通用) func isExistsPackage(pkg map[string]map[string]interface{}) bool { if pkg == nil || len(pkg) == 0 { return false } if len(pkg) == 1 { for _, v := range pkg { winner := qu.ObjToString(v["winner"]) budget := qu.Float64All(v["budget"]) bidamout := qu.Float64All(v["bidamount"]) if winner != "" || budget > float64(0) || bidamout > float64(0) { return true } } return false } return true } // getQualifications 添加所有资质新字段 func (e *ExtractTask) getQualifications(tmp *map[string]interface{}, j_data map[string]interface{}) { /** qualifications 资质要求 */ detail := qu.ObjToString(j_data["detail"]) new_detail := pretreated.HtmlToText(detail) qualifications := ju.GetQualifications(new_detail) if qualifications != "" { (*tmp)["qualifications"] = qualifications } } // 落款识别~采购单位 func (e *ExtractTask) inscribeRecognize(tmp *map[string]interface{}, j_data map[string]interface{}, jf_text string) { //落款实体 if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe && !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") { if new_buyer := InscribeEntity(qu.ObjToString(j_data["detail"]), *tmp); new_buyer != "" { (*tmp)["buyer"] = new_buyer (*tmp)["inscribe_buyer"] = "落款结构实体" } } //落款特殊实体 
if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe && qu.ObjToString(j_data["spidercode"]) == "a_zgwkjtyxgscgdzswpt_cgxx_qb" && !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") { if new_buyer := InscribeSpecEntity(qu.ObjToString(j_data["detail"])); new_buyer != "" { (*tmp)["buyer"] = new_buyer (*tmp)["inscribe_buyer"] = "落款特殊实体" } } //实体服务识别 if qu.ObjToString((*tmp)["buyer"]) == "" && ju.Inscribe && !(qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建") { if new_buyer := InscribeEntityDfa(qu.ObjToString(j_data["detail"]), jf_text, *tmp); new_buyer != "" { (*tmp)["buyer"] = new_buyer (*tmp)["inscribe_buyer"] = "实体识别服务" } } //拟建不能存buyer if qu.ObjToString((*tmp)["toptype"]) == "拟建" && qu.ObjToString((*tmp)["subtype"]) == "拟建" { delete((*tmp), "buyer") } //识别发布时间 if qu.IntAll(j_data["publishtime"]) == -1 { if qu.IntAll((*tmp)["ext_publishtime"]) == 0 { if ext_publishtime := InscribePublishtime(j_data); ext_publishtime > int64(0) { (*tmp)["ext_publishtime"] = ext_publishtime } } } else { delete((*tmp), "ext_publishtime") } } // 识别实体 func InscribeEntity(detail string, tmp map[string]interface{}) string { new_str := "" new_detail := pretreated.TextAfterRemoveTable(detail) if len(new_detail) > 200 { new_detail = detail[len(new_detail)-200:] } new_str = inscribe_entity_1.FindString(new_detail) if new_str == "" { new_str = inscribe_entity_2.FindString(new_detail) if new_str != "" { str1 := inscribe_entity_2.ReplaceAllString(new_str, "${2}") str2 := inscribe_entity_2.ReplaceAllString(new_str, "${6}") if str1 == str2 && str1 != "" { new_str = str1 } } } else { new_str = inscribe_entity_1.ReplaceAllString(new_str, "${2}") } winner := qu.ObjToString(tmp["winner"]) agency := qu.ObjToString(tmp["agency"]) //与其它单位发生了重叠 if new_str != "" && (new_str == winner || new_str == agency) { new_str = "" } if new_str != "" && exclude_entity.MatchString(new_str) { new_str = "" } return new_str } // 识别实体 func 
InscribeEntityDfa(detail string, jf_detail string, tmp map[string]interface{}) string { new_str := "" projectname := qu.ObjToString(tmp["projectname"]) title := qu.ObjToString(tmp["title"]) winner := qu.ObjToString(tmp["winner"]) agency := qu.ObjToString(tmp["agency"]) toptype := qu.ObjToString(tmp["toptype"]) //采用-标题项目名称 if new_str = EmployEntDfaText(title+"\n"+projectname, winner, agency); new_str != "" { return new_str } if !entdfa_filtration.MatchString(title) { //采用-排除表格的文本识别 new_detail := pretreated.TextAfterRemoveTable(detail) new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n") if len(new_detail) > 500 { new_detail = new_detail[len(new_detail)-500:] } if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" { return new_str } if toptype != "结果" { //采用-去除标签的纯文本(含表格) new_detail = pretreated.HtmlToText(detail) new_detail = entdfa_clean.ReplaceAllString(new_detail, "\n") if len(new_detail) > 500 { new_detail = new_detail[len(new_detail)-500:] } if new_str = EmployEntDfaText(new_detail, winner, agency); new_str != "" { return new_str } } } //采用-附件识别 if !entdfa_filtration.MatchString(title) { if len(jf_detail) > 500 { jf_detail = jf_detail[len(jf_detail)-500:] } if new_str = EmployEntDfaText(jf_detail, winner, agency); new_str != "" { return new_str } } return new_str } // 实体识别方法 func EmployEntDfaText(text string, winner string, agency string) string { new_str := "" if text == "" { return new_str } dfa_info, l := EmployPostEntDfa(bson.M{"detail": text}), 0 if res := ju.ConvertInterface(dfa_info["result"]); len(res) > 0 { for _, v := range res { if cl := utf8.RuneCountInString(v); cl > l && cl > 3 && !exclude_entity.MatchString(v) && entdfa_entity.MatchString(v) { if !(v == winner || v == agency) { l = cl new_str = v } } } } return new_str } // 识别发布时间 func InscribePublishtime(j_data map[string]interface{}) int64 { //落款文本识别 detail := pretreated.TextAfterRemoveTable(qu.ObjToString(j_data["detail"])) if len(detail) > 200 { detail = 
detail[len(detail)-200:] } new_str := inscribe_entity_1.FindString(detail) if new_str == "" { new_str = inscribe_entity_2.FindString(detail) if new_str != "" { new_str = inscribe_entity_2.ReplaceAllString(new_str, "${5}") } } else { new_str = inscribe_entity_1.ReplaceAllString(new_str, "${5}") } if data := clear.ObjToTimestamp([]interface{}{new_str}, ""); len(data) > 0 { if ext_publishtime := qu.Int64All(data[0]); ext_publishtime > int64(0) { return ext_publishtime } } //附件名称识别 projectinfo := *qu.ObjToMap(j_data["projectinfo"]) attachments := *qu.ObjToMap(projectinfo["attachments"]) for _, v := range attachments { info := *qu.ObjToMap(v) filename := qu.ObjToString(info["filename"]) if pt_str := inscribe_publishtime_1.FindString(filename); pt_str != "" { if data := clear.ObjToTimestamp([]interface{}{pt_str}, ""); len(data) > 0 { if ext_publishtime := qu.Int64All(data[0]); ext_publishtime > int64(0) { return ext_publishtime } } } } return int64(0) } // 识别特殊采购单位 func InscribeSpecEntity(detail string) string { new_str := "" new_detail := pretreated.TextAfterRemoveTable(detail) if len(new_detail) > 200 { new_detail = detail[len(new_detail)-200:] } find_str := inscribe_entity_3.FindString(new_detail) if find_str != "" { new_str = inscribe_entity_3.ReplaceAllString(find_str, "${2}") } return new_str } func EmployPostEntDfa(data map[string]interface{}) map[string]interface{} { info := map[string]interface{}{} client := &http.Client{Timeout: 2 * time.Second} jsonStr, _ := json.Marshal(data) //172.17.4.238:9996,extcity.spdata.jianyu360.com resp, err := client.Post("http://172.17.4.238:9996/service/entity/", "application/json", bytes.NewBuffer(jsonStr)) if err != nil { return info } res, err := io.ReadAll(resp.Body) if err != nil { return info } err = json.Unmarshal(res, &info) if err != nil { return info } return info } // 处理折扣系数- func dealWithDiscountBid(tmp map[string]interface{}) float64 { biddiscount := qu.Float64All(tmp["biddiscount"]) biddiscount_up := 
qu.Float64All(tmp["biddiscount_up"]) biddiscount_down := qu.Float64All(tmp["biddiscount_down"]) baseCount := float64(1) if biddiscount_down > 0.0 { num1 := decimal.NewFromFloat(baseCount) num2 := decimal.NewFromFloat(biddiscount_down) decimalValue := num1.Sub(num2) res, _ := decimalValue.Float64() return res } if biddiscount_up > 0.0 { num1 := decimal.NewFromFloat(baseCount) num2 := decimal.NewFromFloat(biddiscount_up) decimalValue := num1.Add(num2) res, _ := decimalValue.Float64() return res } if biddiscount > 0.0 { if biddiscount > 1.0 && biddiscount <= 10.0 { num1 := decimal.NewFromFloat(10.0) num2 := decimal.NewFromFloat(biddiscount) decimalValue := num2.Div(num1) res, _ := decimalValue.Float64() return res } else if biddiscount > 10.0 { num1 := decimal.NewFromFloat(100.0) num2 := decimal.NewFromFloat(biddiscount) decimalValue := num2.Div(num1) res, _ := decimalValue.Float64() return res } else { return biddiscount } } return 0.0 } // 精度丢失-相加 func precisionAddFloat(tmp1, tmp2 float64) float64 { num1 := decimal.NewFromFloat(tmp1) num2 := decimal.NewFromFloat(tmp2) decimalValue := num2.Add(num1) res, _ := decimalValue.Float64() return res } // 特殊金额-处理判断-倍率关系 func calculateAbnormalMoney(val []*ju.ExtField) (bool, int) { //金额结果只有两种 - 倍率关系10000 - 过10E moneyIndex := []int{} moneyArr := []float64{} first_money := float64(0) difValue := map[string]interface{}{} for k, v := range val { //取第一个非负数,项目名称除外 if v.IsTrue && v.Score > -1 { moneyArr = append(moneyArr, qu.Float64All(v.Value)) moneyIndex = append(moneyIndex, k) key := "" if m, ok := v.Value.(float64); ok { key = fmt.Sprintf("%f", m) } else { key = qu.ObjToString(v.Value) } if difValue[key] == nil { difValue[key] = 1 } //if len(difValue) > 2 { // return false, 0 //} } } //计算金额数组 if len(difValue) == 2 { money_1, money_2 := float64(0), float64(0) for k, v := range moneyArr { if k == 0 { money_1 = v } else { if v != money_1 { money_2 = v break } } } isRatio, new_money := false, float64(0) //判断金额是否为倍率关系 if money_1 != 
float64(0) && money_2 != float64(0) { if money_1 == money_2*float64(10000) && money_1 >= 100000000 { isRatio = true new_money = money_2 } if money_2 == money_1*float64(10000) && money_2 >= 100000000 { isRatio = true new_money = money_1 } if isRatio { //采用新值 for k, v := range moneyArr { if v == new_money { return true, moneyIndex[k] } } } } } else if len(difValue) > 2 { //多组金额 is_exists := false for _, v := range moneyArr { if v >= 1000000000 { is_exists = true first_money = v } } if is_exists { for k, v := range moneyArr { if v*10000 == first_money { return true, moneyIndex[k] } } } } else { } return false, 0 } // 筛选重复候选人-相关 func filterRepeatWinArr(j *ju.Job) { if j.SpiderCode == "sh_shszfhcxjsglwyh_jsgc_zhbhxrgs" { sort_WinOrder_Arr := make([][]map[string]interface{}, 0) sort_arr := make([]map[string]interface{}, 0) for _, v := range j.Winnerorder { sort := qu.IntAll(v["sort"]) if sort == 1 { //为一组 if len(sort_arr) > 0 { sort_WinOrder_Arr = append(sort_WinOrder_Arr, sort_arr) } sort_arr = make([]map[string]interface{}, 0) } sort_arr = append(sort_arr, v) } if len(sort_arr) > 0 { sort_WinOrder_Arr = append(sort_WinOrder_Arr, sort_arr) } if len(sort_WinOrder_Arr) > 0 { //有重复排序组-开始筛选清理 isIndex := 0 for index, winArr := range sort_WinOrder_Arr { if len(winArr) > 0 { if qu.ObjToString(winArr[0]["price"]) != "" && qu.ObjToString(winArr[0]["entname"]) != "" { isIndex = index break } } } j.Winnerorder = sort_WinOrder_Arr[isIndex] } } } // 中标候选人经过清理之后,重新取出赋值 func (e *ExtractTask) ResetWinnerorder(j *ju.Job) { if len(j.Winnerorder) == 0 { return } maxlen := len(j.Winnerorder) - 1 //中标单位 //i := 0 winners := []*ju.ExtField{} bidamounts := []*ju.ExtField{} //对候选人单位名称进行清洗 winorderLock.Lock() ruleArr := []*RuleCore{} ruleArr = e.RuleCores["all_all"]["winner"] for _, v := range j.Winnerorder { new_winner := qu.ObjToString(v["entname"]) if new_winner != "" { for _, v1 := range ruleArr { for _, v2 := range v1.KVRuleCores { if new_winner == "" { break } new_winner = 
v2.RegPreBac.Reg.ReplaceAllString(new_winner, v2.RegPreBac.Replace) } } } v["entname"] = new_winner } winorderLock.Unlock() if maxlen > 0 { //新增-指定爬虫中标候选人过滤 filterRepeatWinArr(j) if qu.Float64All(j.Winnerorder[0]["sort"]) != 1 { return } winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5}) if j.Winnerorder[0]["price"] != nil { tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney) if tmpPrice[len(tmpPrice)-1].(bool) { bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true}) } } } if j.Result["winner"] == nil && len(winners) > 0 { j.Result["winner"] = winners } else if len(winners) > 0 { j.Result["winner"] = append(j.Result["winner"], winners...) } if j.Result["bidamount"] == nil && len(bidamounts) > 0 { j.Result["bidamount"] = bidamounts } else if len(bidamounts) > 0 { j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...) 
} if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 { winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5}) j.Result["winner"] = winners if j.Winnerorder[0]["price"] != nil { tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney) if tmpPrice[len(tmpPrice)-1].(bool) { bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true}) } j.Result["bidamount"] = bidamounts } } } func RemoveReplicaSliceString(slc []string) []string { result := make([]string, 0) tempMap := make(map[string]bool, len(slc)) for _, e := range slc { if tempMap[e] == false { tempMap[e] = true result = append(result, e) } } return result } // 分包中标单位是否-合理 func isValidPkgWinner(winner string) bool { if utf8.RuneCountInString(winner) < 4 { return false } return true } // 组装kv func assembleKVText(j *ju.Job, tmp *map[string]interface{}) { var kvtext bytes.Buffer blocks := make([]ju.BlockAndTag, 0) for _, v := range j.Block { //分包和标签 if ju.SaveBlock { xx, _ := json.Marshal(v) tmpblock := new(ju.TmpBlock) err := json.Unmarshal(xx, &tmpblock) if err != nil { if v.BPackage != nil { bpb, _ := json.Marshal(v.BPackage) tmpblock.BPackage = string(bpb) } tmpblock = rangeBlockToJson(v, *tmpblock) } blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock}) } //把所有kv组装成一个字符串,存库 for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} { if jv == nil { continue } for jv_k, jv_v := range jv.KvTags { for _, jv_vv := range jv_v { kvtext.WriteString(jv_k) kvtext.WriteString(":") kvtext.WriteString(jv_vv.Value) kvtext.WriteString("\n") } } } } if kvtext.Len() > 0 { (*tmp)["kvtext"] = kvtext.String() } if len(blocks) > 0 { if blocksBytes, err := json.Marshal(blocks); err == nil { if 
utf8.RuneCount(blocksBytes) < 100000 { (*tmp)["blocks"] = string(blocksBytes) } } } } // 辅助信息,如果没有排序先排序 func auxInfo(j *ju.Job) map[string][]map[string]interface{} { fieldalls := map[string][]map[string]interface{}{} if j == nil { return fieldalls } qykredis := redis.RedisPool[ju.QYK_RedisName].Get() defer qykredis.Close() db := 0 for field, val := range j.Result { //ju.Sort(val) if field == "buyer" { db = ju.BuyerDB } else if field == "winner" { db = ju.WinnerDB } else if field == "agency" { db = ju.AgencyDB } sfields := []map[string]interface{}{} for _, v := range val { standardized := false if _, err := qykredis.Do("SELECT", db); err != nil { fmt.Println("redis select err", err) } else { rep, err := qykredis.Do("GET", v.Value) if rep != nil && err == nil { standardized = true } } if field == "budget" || field == "bidamount" { if !v.IsTrue { continue } } sfield := map[string]interface{}{ "val": v.Value, "type": v.Type, "score": v.Score, "blocktag": v.BlockTag, "sourceval": v.SourceValue, "standardized": standardized, } sfields = append(sfields, sfield) } fieldalls[field] = sfields } return fieldalls }