package extract import ( "bytes" "encoding/json" "fmt" "github.com/shopspring/decimal" "go.mongodb.org/mongo-driver/bson/primitive" "jy/clear" db "jy/mongodbutil" "jy/pretreated" ju "jy/util" qu "qfw/util" "qfw/util/redis" "regexp" "sort" "strconv" "strings" "sync" "time" "unicode/utf8" log "github.com/donnie4w/go-logger/logger" "gopkg.in/mgo.v2/bson" ) var ( lock, lockrule, lockclear, locktag, blocktag sync.RWMutex JYUrl = "https://www.jianyu360.com/article/content/%s.html" cut = ju.NewCut() //获取正文并清理 ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志 TaskList map[string]*ExtractTask //任务列表 ClearTaskList map[string]*ClearTask //清理任务列表 saveLimit = 100 //抽取日志批量保存 PageSize = 5000 //查询分页 Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}` //Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1,"review_experts":1,"purchasing":1}` Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}` /*f = map[string]bool{ "T": true, "_d": true, "area": true, "channel": true, "comeintime": true, "competehref": true, "href": true, "l_np_publishtime": true, "publishtime": true, "sendflag": true, "site": true, "spidercode": true, "title": true, "projectname": true, }*/ /*f = map[string]bool{ "contentid": true, "progName": true, "updateTime": true, "url": true, "areaId": true, "areaName": true, "popTitle": true, "showTitle": true, "progId": true, "catid": true, "isConcern": true, "followCount": true, "followSuggestion": true, "isBoutique": true, "canTj": true, "tenderAmountNumber": true, "tenderAmountUnit": true, "bidderAmountNumber": true, "bidderAmountUnit": true, "registrationBeginTime": true, "registrationEndTime": true, "starNum": true, "title": true, "proInvested": true, "projectname": true, }*/ spidercode = map[string]bool{ "gd_zhsggzyjyzx_jsgc_fjczbgg": true, "js_szgyyqggzyjyzx_jsgc_zjfbgs": true, "zj_tzsyhggzyjyzx_jsgc_kbqk": true, "hb_tmsggzyjyxxw_jsgc_kbqk": true, "zj_nbsyyggzyjyw_jsgc_kbqk": true, "zj_zjsggzyjyzx_jyxx_kbjg": true, "zj_zjzdgcjyw_ztbjglxx_kbjg": true, "zj_lssggzyjyw_jsgc_kbsk": true, "zj_qzslyxggzyjyzx_gggs_xkbjl": true, "sc_mssggzydzjypt_jsgc_kbjl": true, "sc_pzhsggzyjyfwzx_jsgc_kbylb": true, "a_zgzbtbggfwpt_wasjgf_ss_kbjl": true, "a_hbszbtbggfwpt_kbjl": true, "a_szsjsgcjyfwzxbafzx_kbqkgs": true, "a_szldzbyxgs_kbxx": true, "zj_zssssxggzyjyw_gcjs_kbjggs": true, "gd_szszfhjsj_kbqkgs": true, "a_gjggzyjypt_gcjs_kbjl": true, "a_gjggzyjypt_gcjs_kbjl_new": true, "zj_tzsyhggzyjyzx_kbjggg": true, "a_zgzbtbggfwpy_wasjgf_kbjl_lsbl": true, "ah_czsggzyjyw_jsgc_kbjl": true, "ah_czsggzyjyw_zfcg_kbxx": true, "ah_whsggzyjyfww_kbxx_cgxm": true, "ah_whsggzyjyfww_kbxx_gcxm": true, } ) //启动测试抽取-、、、、结果追踪 func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool { defer qu.Catch() ext := TaskList[taskId] if ext == nil { ext = &ExtractTask{} ext.Id = taskId ext.InitTestTaskInfo(resultcoll, trackcoll) ext.IsRun = true ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB) } ext.InitSite() ext.InitRulePres() ext.InitRuleBacks(false) ext.InitRuleBacks(true) ext.InitRuleCore(false) ext.InitRuleCore(true) ext.InitPkgCore() ext.InitBlockRule() ext.InfoTypeList() ext.InitTag(false) ext.InitTag(true) ext.InitClearFn(false) ext.InitClearFn(true) ext.Lock() //ext.IsExtractCity = false if ext.IsExtractCity { //版本上控制是否开始城市抽取 //初始化城市DFA信息 ext.InitCityInfo() //ext.InitCityDFA() ext.InitAreaCode() ext.InitPostCode() } ext.Unlock() //质量审核 ext.InitAuditFields() ext.InitAuditRule() ext.InitAuditClass() ext.InitAuditRecogField() //品牌抽取是否开启 ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool) //价格个数抽取是否开启 ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool) //附件抽取是否开启 ext.InitFile() ext.TaskInfo.TestColl = resultcoll TaskList[taskId] = ext return RunExtractTestTask(ext, startId, num) } func IdTrans(startId string) bson.ObjectId { defer qu.Catch() return bson.ObjectIdHex(startId) } func StringTOBsonId(id string) primitive.ObjectID { objectId, _ := primitive.ObjectIDFromHex(id) return objectId } func BsonTOStringId(id interface{}) string { return id.(primitive.ObjectID).Hex() } //开始测试任务抽取 func RunExtractTestTask(ext *ExtractTask, startId, num string) bool { n, _ := strconv.Atoi(num) id := IdTrans(startId) if id.Valid() { //query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}} query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}} list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n) for _, v := range *list { //if qu.ObjToString(v["sensitive"]) != ""||ggtest.MatchString(qu.ObjToString(v[""])) { //去除含敏感词数据 // continue //} if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录 continue } var j, jf *ju.Job var isSite bool if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) { v["isextFile"] = true j, jf, isSite = ext.PreInfo(v) } else {//无附件 j, _, isSite = ext.PreInfo(v) } go ext.ExtractProcess(j, jf, isSite) //抽取-打分-保存 ext.TaskInfo.ProcessPool <- true } return true } else { return false } } //启动抽取 func StartExtractTaskId(taskId string) bool { defer qu.Catch() isgo := false ext := TaskList[taskId] if ext == nil { ext = &ExtractTask{} ext.Id = taskId ext.InitTaskInfo() isgo = true } else { ext.Id = taskId ext.InitTaskInfo() } ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB) ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB) ext.InitSite() ext.InitRulePres() ext.InitRuleBacks(false) ext.InitRuleBacks(true) ext.InitRuleCore(false) ext.InitRuleCore(true) ext.InitPkgCore() ext.InitBlockRule() ext.InfoTypeList() ext.InitTag(false) ext.InitTag(true) ext.InitClearFn(false) ext.InitClearFn(true) ext.Lock() if ext.IsExtractCity { //版本上控制是否开始城市抽取 //初始化城市DFA信息 //ext.InitCityDFA() ext.InitCityInfo() ext.InitAreaCode() ext.InitPostCode() } ext.Unlock() //质量审核 ext.InitAuditFields() ext.InitAuditRule() ext.InitAuditClass() ext.InitAuditRecogField() //品牌抽取是否开启 ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool) //价格个数抽取是否开启 ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool) //附件抽取是否开启 ext.InitFile() ext.IsRun = true go ext.ResultSave(true) go ext.BidSave(true) if isgo { go RunExtractTask(taskId) } TaskList[taskId] = ext return true } //停止抽取 func StopExtractTaskId(taskId string) bool { defer qu.Catch() ext := TaskList[taskId] if ext != nil { ext.IsRun = false TaskList[taskId] = ext } //更新task.s_extlastid db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`) return true } //开始抽取 func RunExtractTask(taskId string) { defer qu.Catch() ext := TaskList[taskId] query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}} count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query) pageNum := (count + PageSize - 1) / PageSize limit := PageSize if count < PageSize { limit = count } fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query) for i := 0; i < pageNum; i++ { query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}} list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit) fmt.Printf("page=%d,query=%v", i+1, query, len(*list)) for _, v := range *list { //if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据 // continue //} //根据标题判断是否抽取 b := IsExtract("title", qu.ObjToString(v["title"]), "") if !b { continue } _id := qu.BsonIdToSId(v["_id"]) //log.Debug(_id) if !ext.IsRun { break } var j, jf *ju.Job var isSite bool if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) { v["isextFile"] = true j, jf, isSite = ext.PreInfo(v) } else { j, _, isSite = ext.PreInfo(v) } go ext.ExtractProcess(j, jf, isSite) ext.TaskInfo.LastExtId = _id ext.TaskInfo.ProcessPool <- true } db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`) if !ext.IsRun { break } } //更新task.s_extlastid time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) }) } //信息预处理-不和版本关联,取最新版本的配置项 func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) { return (&ExtractTask{}).PreInfo(doc) } var clearMoneyReg *regexp.Regexp = regexp.MustCompile("(PPP[\\s]?项目|新城镇建设|国土资源局|基金管理|高速公路|水系生态治理|水生态建设|棚改旧改|棚户区改造|棚改项目|危房改造项目|土地整理|高速公路项目)") //信息预处理-和版本关联-处理表格-附件-kv标签库-中标候选人 func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) { defer qu.Catch() //判断是否有附件这个字段 var isextFile bool if doc["isextFile"] != nil { isextFile = doc["isextFile"].(bool) } detail := "" d1, _ := doc["detail"].(string) d2, _ := doc["contenthtml"].(string) if len(d1) >= len(d2) || d2 == "" { detail = d1 } else { detail = d2 } detail = regexp.MustCompile(``).ReplaceAllString(detail, "") d3, _ := doc["summary"].(string) //全文的需要修复表格 detail = pretreated.RepairCon(detail) detail = ju.CutLableStr(d3 + "\n" + detail) detail = cut.ClearHtml(d3 + "\n" + detail) doc["detail"] = detail isClearnMoney := !clearMoneyReg.MatchString(detail) if isClearnMoney { isClearnMoney = !clearMoneyReg.MatchString(qu.ObjToString(doc["title"])) } isClearnMoneystr := qu.ObjToString(qu.If(isClearnMoney, "T", "")) if isextFile { file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果 } //正文小于200个字,有附件把附件内容加到正文 //tmpDeatil := detail //tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil)) //if err == nil { // conlen := utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " ")) // if conlen < 2000 { // if isextFile { // detail += qu.ObjToString(doc["detailfile"]) // doc["detail"] = detail // } // } else if conlen > qu.IntAllDef(ju.Config["filelength"], 1000000) { // //防止文本过长,造成抽取阻塞 // log.Debug("文本太长", doc["_id"], conlen) // doc["detail"] = d3 // } //} toptype := qu.ObjToString(doc["toptype"]) subtype := qu.ObjToString(doc["subtype"]) if qu.ObjToString(doc["type"]) == "bid" { toptype = "结果" } if toptype == "" { toptype = "all" } if subtype == "" { subtype = "all" } if subtype == "其他" { subtype = "其它" } toMap := qu.ObjToMap(doc["jsondata"]) //log.Debug("toMap", toMap) if (*toMap) != nil { if (*toMap)["extweight"] == nil { (*toMap)["extweight"] = ju.Config["jsondata_extweight"] } if (*toMap)["jsoncontent"] != nil { delete(*toMap, "jsoncontent") } for k, v := range *toMap { if _, ok := v.(float64); ok { continue } else if _, ok := v.(int64); ok { continue } else if _, ok2 := v.(string); ok2 { continue } else { delete(*toMap, k) } } } j = &ju.Job{ SourceMid: qu.BsonIdToSId(doc["_id"]), Category: toptype, CategorySecond: subtype, Content: qu.ObjToString(doc["detail"]), SpiderCode: qu.ObjToString(doc["spidercode"]), Site: qu.ObjToString(doc["site"]), //Domain: qu.ObjToString(doc["domain"]), //Href: qu.ObjToString(doc["href"]), Title: qu.ObjToString(doc["title"]), Data: &doc, City: qu.ObjToString(doc["city"]), Province: qu.ObjToString(doc["area"]), Jsondata: toMap, Result: map[string][]*ju.ExtField{}, BuyerAddr: qu.ObjToString(doc["buyeraddr"]), RuleBlock: e.RuleBlock, Dataging: qu.IntAll(doc["dataging"]), IsClearnMoney: isClearnMoneystr, } if isextFile { jf = &ju.Job{ SourceMid: qu.BsonIdToSId(doc["_id"]), Category: toptype, CategorySecond: subtype, Content: qu.ObjToString(doc["detailfile"]), SpiderCode: qu.ObjToString(doc["spidercode"]), Site: qu.ObjToString(doc["site"]), Title: qu.ObjToString(doc["title"]), Data: &doc, City: qu.ObjToString(doc["city"]), Province: qu.ObjToString(doc["area"]), Jsondata: toMap, Result: map[string][]*ju.ExtField{}, BuyerAddr: qu.ObjToString(doc["buyeraddr"]), RuleBlock: e.RuleBlock, IsFile: isextFile, Dataging: qu.IntAll(doc["dataging"]), IsClearnMoney: isClearnMoneystr, } } codeSite := j.SpiderCode //是否启用站点 if value, ok := e.SiteMerge.Load(codeSite); ok { isSite = value.(bool) } if isSite { //是否配置站点 exp, isSite := e.Luacodes.Load(codeSite) if isSite { if exp.(map[string]interface{})["e.SiteClearFn"] != nil { e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string) } if exp.(map[string]interface{})["e.SiteTag"] != nil { e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag) } if exp.(map[string]interface{})["e.SiteRuleCores"] != nil { e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore) } if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil { e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo) } } } qu.Try(func() { pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块 if isextFile && strings.TrimSpace(jf.Content) != "" { pretreated.AnalyStart(jf, isSite, codeSite) } }, func(err interface{}) { log.Debug("pretreated.AnalyStart", err, j.SourceMid) }) return j, jf, isSite } var sortStrReg *regexp.Regexp = regexp.MustCompile("(招标|采购|需求|投标|[竞询议]报价|公示|单一来源|询价|成交|中标)") var clearStrReg *regexp.Regexp = regexp.MustCompile("((设计|施工|招标)图|业绩|图纸)") //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果 func file2text(doc *map[string]interface{}) { mnameone := map[string]bool{} mname := map[string]bool{} murl := map[string]string{} //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok { if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok { for _, attachs := range attach_text { if fileinfos, ok := attachs.(map[string]interface{}); ok { for _, fileinfo := range fileinfos { if ff, ok := fileinfo.(map[string]interface{}); ok { attach_url := qu.ObjToString(ff["attach_url"]) ffname := qu.ObjToString(ff["file_name"]) if clearStrReg.MatchString(ffname) { continue } mname[ffname] = true murl[ffname] = attach_url if sortStrReg.MatchString(ffname) { mnameone[ffname] = true } } } } } } tmpstr := "" for k := range mnameone { if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) { (*doc)["detailfile"] = tmpstr return } bs := ju.OssGetObject(murl[k]) if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) { tmpstr += bs + "\n" } } for k := range mname { if utf8.RuneCountInString(tmpstr) > qu.IntAllDef(ju.Config["filelength"], 150000) { (*doc)["detailfile"] = tmpstr return } bs := ju.OssGetObject(murl[k]) if utf8.RuneCountInString(bs) < qu.IntAllDef(ju.Config["filelength"], 150000) { tmpstr += bs + "\n" } } (*doc)["detailfile"] = strings.ReplaceAll(tmpstr, "附件", "") } //抽取-正文 func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) { e.ExtractDetail(j, isSite, j.SpiderCode) //正文抽取属性 if jf != nil && jf.IsFile { //附件jf → j 合并 e.ExtractDetail(jf, isSite, j.SpiderCode) for tmpk, xs := range jf.Result { if len(j.Result[tmpk]) == 0 { if tmpk == "budget" || tmpk == "bidamount" { for _, v := range xs { if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 { j.Result[tmpk] = append(j.Result[tmpk], v) } } } else { j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...) } }else { if tmpk=="winner" && len(j.Result[tmpk]) == 1 { if j.Result[tmpk][0].Value == "" { j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...) } } //if tmpk=="buyer" { //附件数据-没有正文靠谱 // j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...) //} } } if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 { j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...) } if len(j.PackageInfo) == 0 && jf.PackageInfo != nil && len(jf.PackageInfo) > 0 { j.PackageInfo = jf.PackageInfo } } if isSite { ismerge, ok := e.SiteMerge.Load(j.SpiderCode) if ok && ismerge.(bool) { tmpj := &ju.Job{ SourceMid: j.SourceMid, Category: j.Category, CategorySecond: j.CategorySecond, Content: j.Content, SpiderCode: j.SpiderCode, //Domain: qu.ObjToString(doc["domain"]), //Href: qu.ObjToString(doc["href"]), Title: j.Title, Data: j.Data, City: j.City, Province: j.Province, Jsondata: j.Jsondata, Result: map[string][]*ju.ExtField{}, BuyerAddr: j.BuyerAddr, RuleBlock: e.RuleBlock, } qu.Try(func() { pretreated.AnalyStart(tmpj, false, "") //job.Block分块 }, func(err interface{}) { log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid) }) e.ExtractDetail(tmpj, false, "") //if jf != nil && jf.IsFile { // e.ExtractFile(jf, false, "") //} //合并数据 j.Block = append(j.Block, tmpj.Block...) j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...) for tmpk, _ := range j.Result { if len(tmpj.Result[tmpk]) > 0 { j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...) } } for tmpk, _ := range tmpj.Result { if len(j.Result[tmpk]) == 0 { j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...) } } } } //分析抽取结果并保存 AnalysisSaveResult(j, jf, e) <-e.TaskInfo.ProcessPool } //抽取-正文-规则等 detail func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) { qu.Try(func() { doc := *j.Data //全局前置规则,结果覆盖doc属性 //for _, v := range e.RulePres { // doc = ExtRegPre(doc, j, v, e.TaskInfo) //} tmprules := map[string][]*RuleCore{} lockrule.Lock() //加载分类抽取配置 if j.Category == "all" || j.CategorySecond == "all" { if isSite { for k, vc1 := range e.SiteRuleCores["all_all"] { tmprules[k] = vc1 } } else { for k, vc1 := range e.RuleCores["all_all"] { tmprules[k] = vc1 } } } else { if isSite { for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] { tmprules[k] = vc1 } } else { for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] { tmprules[k] = vc1 } } } if len(tmprules) < 1 { //分类未覆盖部分 if isSite { for k, vc1 := range e.RuleCores["all_all"] { tmprules[k] = vc1 } } else { for k, vc1 := range e.SiteRuleCores["all_all"] { tmprules[k] = vc1 } } } lockrule.Unlock() //抽取规则 for _, vc1 := range tmprules { for _, vc := range vc1 { tmp := ju.DeepCopy(doc).(map[string]interface{}) //是否进入逻辑 if !ju.Logic(vc.LuaLogic, tmp) { continue } if vc.Field =="buyer" { //log.Debug("调试抽取字段") } ////抽取-前置规则 //for _, v := range vc.RulePres { // tmp = ExtRegPre(tmp, j, v, e.TaskInfo) //} // log.Debug("抽取-前置规则", tmp) //抽取-规则 ExtRuleCore(tmp, e, vc, j, isSite) // log.Debug("抽取-规则", tmp) //抽取-后置规则 for _, v := range vc.RuleBacks { ExtRegBack(j, v, e.TaskInfo, vc) } //kv规则 for _, v := range vc.KVRuleCores { ExtRuleKV(j, v, e.TaskInfo) } // log.Debug("抽取-后置规则", tmp) //项目名称未能抽取到,标题来凑 if vc.Field == "projectname" { if vc.ExtFrom == "title" { isextitle := true for _, v := range j.Result[vc.Field] { if len([]rune(qu.ObjToString(v.Value))) > 5 { isextitle = false break } } if isextitle { //标题加入选举 field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title} if isSite { field.Score = 1 } j.Result[vc.Field] = append(j.Result[vc.Field], field) } } for i := 0; i < 3; i++ { for _, v := range vc.RuleBacks { ExtRegBack(j, v, e.TaskInfo, vc) } } } } } //全局后置规则 if isSite { for _, v := range e.SiteRuleBacks { ExtRegBack(j, v, e.TaskInfo, nil) } } else { for _, v := range e.RuleBacks { ExtRegBack(j, v, e.TaskInfo, nil) } } //函数清理 for key, val := range j.Result { for i, v := range val { if v.Field == "projectname" && v.Type == "table" { break } if key == "budget" || key == "bidamount" { if _, ok := v.Value.(float64); ok && !v.IsTrue { continue } } lockclear.Lock() var cfn = []string{} if isSite { cfn = e.SiteClearFn[key] if len(cfn) == 0 { cfn = e.ClearFn[key] } } else { cfn = e.ClearFn[key] } lockclear.Unlock() if len(cfn) == 0 { continue } data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney) if key == "budget" || key == "bidamount" { if istrue, ok := data[len(data)-1].(bool); istrue && ok { j.Result[key][i].IsTrue = true } else { j.Result[key][i].Value = data[0] continue } } before, _ := v.Value.(string) v.Value = data[0] BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e) //添加行数清理的日志 //清理特殊符号 lockclear.Lock() if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil { text := qu.ObjToString(v.Value) before = text //指定清理--新增-函数清理-其他清理 if key=="winner"||key=="agency"||key=="buyer" { text = strings.ReplaceAll(text,"【","") text = strings.ReplaceAll(text,"】","") } v.Value = clear.OtherClean(key, text) BeforeAddClearFnLog("clear.OtherClean", "特殊符号清理clear.OtherClean", j.SourceMid, before, v.MatchType, v, e) } //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo) lockclear.Unlock() } } PackageDetail(j, e, isSite, codeSite) //处理分包信息-去重 // bs, _ := json.Marshal(j.Result) // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs)) }, func(err interface{}) { log.Debug("ExtractProcess err", err, j.SourceMid) }) } func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) { qu.Try(func() { doc := *j.Data //全局前置规则,结果覆盖doc属性 // for _, v := range e.RulePres { // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 { // doc = ExtRegPre(doc, j, v, e.TaskInfo) // } // } //抽取规则 tmprules := map[string][]*RuleCore{} lockrule.Lock() if j.Category == "all" || j.CategorySecond == "all" { for k, vc1 := range e.RuleCores["all_all"] { tmprules[k] = vc1 } } else { for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] { tmprules[k] = vc1 } } lockrule.Unlock() for _, vc1 := range tmprules { for _, vc := range vc1 { tmp := ju.DeepCopy(doc).(map[string]interface{}) //是否进入逻辑 if !ju.Logic(vc.LuaLogic, tmp) { continue } //抽取-前置规则 // for _, v := range vc.RulePres { // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 { // tmp = ExtRegPre(tmp, j, v, e.TaskInfo) // } // } // log.Debug("抽取-前置规则", tmp) //抽取-规则 if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 { ExtRuleCore(tmp, e, vc, j, isSite) } // log.Debug("抽取-规则", tmp) //抽取-后置规则 for _, v := range vc.RuleBacks { if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 { ExtRegBack(j, v, e.TaskInfo, vc) } } // log.Debug("抽取-后置规则", tmp) } } //全局后置规则 for _, v := range e.RuleBacks { if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 { ExtRegBack(j, v, e.TaskInfo, nil) } } //函数清理 for key, val := range j.Result { for _, v := range val { lockclear.Lock() var cfn = []string{} if isSite { cfn = e.SiteClearFn[key] if len(cfn) == 0 { cfn = e.ClearFn[key] } } else { cfn = e.ClearFn[key] } lockclear.Unlock() if len(cfn) == 0 { continue } data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney) v.Value = data[0] //清理特殊符号 lockclear.Lock() if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil { text := qu.ObjToString(v.Value) text = clear.OtherClean(key, text) v.Value = text } lockclear.Unlock() } } PackageDetail(j, e, isSite, codeSite) //处理分包信息 // bs, _ := json.Marshal(j.Result) // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs)) }, func(err interface{}) { log.Debug("ExtractProcess err", err) }) } //前置过滤 func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} { defer qu.Catch() before := ju.DeepCopy(doc).(map[string]interface{}) extinfo := map[string]interface{}{} if in.IsLua { lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText} if j != nil { lua.Block = j.Block } extinfo = lua.RunScript("pre") for k, v := range extinfo { //结果覆盖原doc doc[k] = v } AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志 } else { var key string if !j.IsFile { key = qu.If(in.Field == "", "detail", in.Field).(string) } else { key = qu.If(in.Field == "", "detailfile", in.Field).(string) } text := qu.ObjToString(doc[key]) extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "") doc[key] = extinfo[key] //结果覆盖原doc AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志 } return doc } //抽取-规则 func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) { //候选人加入 var kvMap map[string][]map[string]interface{} extByReg := true if vc.ExtFrom != "title" { kvMap, extByReg = getKvByLuaFields(vc, j, e) } for _, v := range vc.RuleCores { if v.IsLua { ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e) } else if extByReg { ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite) } } //如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面 if vc.Field == "budget" && len(kvMap) == 0 { if len(j.BlockPackage) == 1 { for _, bp := range j.BlockPackage { for fieldname, field := range vc.LFields { if field != vc.Field { continue } tp := "" for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} { if k == 0 { tp = "colon" } else if k == 1 { tp = "space" } else if k == 2 { tp = "table" } if v == nil || v.KvTags == nil { continue } for _, vv := range v.KvTags[fieldname] { text := ju.TrimLRSpace(vv.Value, "") if text != "" { tmp := &ju.ExtField{ ExtFrom: "package", Field: vc.Field, Code: "CL_分包", Type: tp, MatchType: "package", RuleText: bp.Text, SourceValue: vv.Key, Value: text, } if isSite { tmp.Score = 1 } j.Result[vc.Field] = append(j.Result[vc.Field], tmp) } } } } break } } } else { for k, v := range kvMap { if j.Result[k] == nil { j.Result[k] = [](*ju.ExtField){} } for _, tmp := range v { field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]} if k == "bidamount" && field.ExtFrom == "第一候选人" { field.Score = 1 } if isSite { field.Score = 1 } if (field.Field == "bidamount" || field.Field == "budget") && field.Type == "table" { moneys := clear.ObjToMoney([]interface{}{field.Value, ""}, j.SpiderCode, j.IsClearnMoney) if len(moneys) > 0 { if vf, ok := moneys[0].(float64); ok { field.Value = vf field.IsTrue = moneys[len(moneys)-1].(bool) } else if vi, ok := moneys[0].(int); ok { field.Value = float64(vi) field.IsTrue = moneys[len(moneys)-1].(bool) } } } if tmp["blocktag"] != nil { btag := make(map[string]string) for k := range tmp["blocktag"].(map[string]bool) { blocktag.Lock() if TagConfigDesc[k] != "" { btag[k] = TagConfigDesc[k] } blocktag.Unlock() } field.BlockTag = btag } j.Result[k] = append(j.Result[k], field) } } } } //抽取-规则-kv func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) { defer qu.Catch() if extfrom == "title" || !in.IsLua { return } lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText} lua.KvMap = *kvMap lua.Block = j.Block extinfo := lua.RunScript("core") if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok { for _, v := range tmps { v["core"] = in.Code } (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...) } if len(extinfo) > 0 { AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志 } } //抽取-规则-正则 func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) { defer qu.Catch() //根据field配置项目,是否抽取。例如:废标、流标等跳过, b := IsExtract(in.Field, j.Title, j.Content) if !b { return } //全文正则 //text := qu.ObjToString(doc[extfrom]) //if in.Field != "" { // extinfo := extRegCoreToResult(extfrom, text, j, in) // if len(extinfo) > 0 { // AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志 // } //} //块抽取 if in.Field != "" { if extfrom == "title" { extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite) if len(extinfo) > 0 { AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志 } } else if in.Field == "qualifies" { extinfo := extRegCoreToResult(extfrom, pretreated.HtmlToText(qu.ObjToString(doc[extfrom])), &map[string]string{}, j, in, isSite) if len(extinfo) > 0 { AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志 } } else { for _, v := range j.Block { btag := make(map[string]string) for k := range v.Classify { blocktag.Lock() btag[k] = TagConfigDesc[k] blocktag.Unlock() } extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite) if len(extinfo) > 0 { AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志 } } } } } //pkg抽取-规则-正则 func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) { defer qu.Catch() //根据field配置项目,是否抽取。例如:废标、流标等跳过, b := IsExtract(in.Field, j.Title, j.Content) if !b { return } //块抽取 if in.Field != "" { for k, vbpkg := range j.BlockPackage { rep := map[string]string{} if in.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线) if in.Field == "budget" && vbpkg.Budget > 0 { continue } if in.Field == "agencyfee" && vbpkg.Agencyfee > 0 { continue } if in.Field == "bidamount" && vbpkg.Bidamount > 0 { continue } if in.Field == "winner" && vbpkg.Winner != "" { continue } if in.Field == "bidstatus" && vbpkg.BidStatus != "" { continue } if in.Field == "projectname" && vbpkg.Name != "" { continue } if in.Field == "winner" && vbpkg.Winner != "" { continue } if in.Field == "winnerperson" { if vbpkg.Winner == "" || len(vbpkg.Winner) < 4 { continue } if !strings.Contains(vbpkg.Text, vbpkg.Winner) { continue } } if in.Field == "winnertel" { if vbpkg.WinnerPerson == "" { continue } } //处理正负数修正 ptmp := strings.Split(in.RuleText, "#") sign := 0 if len(ptmp) == 2 { if ptmp[1] == "正" { sign = 1 } else if ptmp[1] == "负" { sign = -1 } } tmp := strings.Split(ptmp[0], "__") if len(tmp) == 2 { epos := strings.Split(tmp[1], ",") posm := map[string]int{} for _, v := range epos { ks := strings.Split(v, ":") if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area posm[ks[1]] = qu.IntAll(ks[0]) } else { posm[in.Field] = qu.IntAll(ks[0]) } } var pattern string if strings.Contains(tmp[0], "\\u") { tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1) tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1) pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`) } else { pattern = tmp[0] } //log.Debug("pattern", pattern) //fmt.Println(text) reg := regexp.MustCompile(pattern) apos := reg.FindAllStringSubmatchIndex(vbpkg.Text, -1) for i, _ := range apos { pos := apos[i] for k, p := range posm { if len(pos) > p { if pos[p] == -1 || pos[p+1] == -1 { continue } val := vbpkg.Text[pos[p]:pos[p+1]] if string(val) == "" { continue } if sign == -1 { rep[k+"_"+fmt.Sprint(i)] = "-" + val } else { rep[k+"_"+fmt.Sprint(i)] = val } } } } //fmt.Println(text) for i := 0; i < len(apos); i++ { if strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]) != "" { if in.Field == "budget" && vbpkg.Budget <= 0 { lock.Lock() cfn := e.ClearFn[in.Field] lock.Unlock() data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney) if data[len(data)-1].(bool) { j.BlockPackage[k].Budget = qu.Float64All(data[0]) j.BlockPackage[k].IsTrueBudget = true } break } else if in.Field == "agencyfee" && vbpkg.Agencyfee <= 0 { lock.Lock() cfn := e.ClearFn[in.Field] lock.Unlock() data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney) if data[len(data)-1].(bool) { j.BlockPackage[k].Agencyfee = qu.Float64All(data[0]) j.BlockPackage[k].IsTrueAgencyfee = true } break }else if in.Field == "bidamount" && vbpkg.Bidamount <= 0 { lock.Lock() cfn := e.ClearFn[in.Field] lock.Unlock() data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode, j.IsClearnMoney) if data[len(data)-1].(bool) { j.BlockPackage[k].Bidamount = qu.Float64All(data[0]) j.BlockPackage[k].IsTrueBidamount = true } break } else if in.Field == "winner" { if j.BlockPackage[k].Winner == "" { j.BlockPackage[k].Winner = rep[in.Field+"_"+fmt.Sprint(i)] break } } else if in.Field == "winnertel" { if j.BlockPackage[k].WinnerTel == "" { j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)] break } } else if in.Field == "winnerperson" { if j.BlockPackage[k].WinnerPerson == "" { j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)] break } } else if in.Field == "bidstatus" { if j.BlockPackage[k].BidStatus == "" { j.BlockPackage[k].BidStatus = rep[in.Field+"_"+fmt.Sprint(i)] break } } else if in.Field == "projectname" { if j.BlockPackage[k].Name == "" { j.BlockPackage[k].Name = rep[in.Field+"_"+fmt.Sprint(i)] break } } else if in.Field == "winnerperson" { if j.BlockPackage[k].WinnerPerson == "" { j.BlockPackage[k].WinnerPerson = rep[in.Field+"_"+fmt.Sprint(i)] break } } else if in.Field == "winnertel" { if j.BlockPackage[k].WinnerTel == "" && j.BlockPackage[k].Winner != "" && j.BlockPackage[k].WinnerPerson != "" { j.BlockPackage[k].WinnerTel = rep[in.Field+"_"+fmt.Sprint(i)] break } } } } } } else { pos := in.RegCore.Reg.FindStringIndex(vbpkg.Text) val := "" if len(pos) == 2 { //"text" = "text"[pos[1]:] val = "text"[pos[1]:] rs := regexp.MustCompile("[^\r\n\t]+") tmp := rs.FindAllString("text", -1) if len(tmp) > 0 { val = tmp[0] } } if val != "" { if in.Field == "budget" && vbpkg.Budget <= 0 { lock.Lock() cfn := e.ClearFn[in.Field] lock.Unlock() data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney) if data[len(data)-1].(bool) { j.BlockPackage[k].Budget = qu.Float64All(data[0]) j.BlockPackage[k].IsTrueBudget = true } break } if in.Field == "bidamount" && vbpkg.Bidamount <= 0 { lock.Lock() cfn := e.ClearFn[in.Field] lock.Unlock() data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode, j.IsClearnMoney) if data[len(data)-1].(bool) { j.BlockPackage[k].Bidamount = qu.Float64All(data[0]) j.BlockPackage[k].IsTrueBidamount = true } break } else if in.Field == "bidstatus" { if j.BlockPackage[k].BidStatus == "" { j.BlockPackage[k].BidStatus = val break } } else if in.Field == "projectname" { if j.BlockPackage[k].Name == "" { j.BlockPackage[k].Name = val break } } } } } } } //lua脚本根据属性设置提取kv值 func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) { kvmap := map[string][]map[string]interface{}{} if len(j.Winnerorder) > 1 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 { if vc.Field == "bidamount" { for _, v := range j.Winnerorder { if v["price"] == nil { continue } kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{ "code": "winnerorder", "field": vc.Field, "ruletext": "中标候选人_" + fmt.Sprint(v["sortstr"]), "extfrom": v["sortstr"], "sourcevalue": v["price"], "value": v["price"], "type": "winnerorder", "matchtype": "winnerorder", }) return kvmap, false } //候选人中标金额 if price := j.Winnerorder[0]["price"]; price != nil { kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{ "code": "CL_中标候选人", "field": vc.Field, "ruletext": "中标候选人", "extfrom": j.Winnerorder[0]["sortstr"], "sourcevalue": price, "value": price, "type": "winnerorder", "matchtype": "winnerorder", }) return kvmap, false } } } for fieldname, field := range vc.LFields { if field != vc.Field { continue } extractFromKv(field, fieldname, j.Block, vc, kvmap, j.Category) } AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) //抽取日志 return kvmap, true } func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}, Category string) { //qu.Debug("fieldname+++", fieldname) for _, bl := range blocks { tp := "" if strings.Contains(bl.Title, "保证金") && (field == "bid_bond" || field == "contract_bond") { if text := ju.TrimLRSpace(bl.Text, ""); text != "" { if Category == "招标" || Category == "拟建" || Category == "预告" { kvmap[field] = append(kvmap[field], map[string]interface{}{ "code": "CL_块内容", "field": field, "ruletext": "投标保证金", "extfrom": "投标保证金_块内容", "sourcevalue": bl.Text, "value": text, "type": "投标保证金_块内容", "matchtype": "tag_string", "blocktag": bl.Classify, "weight": 0, }) } else if Category == "结果" { kvmap[field] = append(kvmap[field], map[string]interface{}{ "code": "CL_", "field": field, "ruletext": "履约保证金", "extfrom": "履约保证金_块内容", "sourcevalue": bl.Text, "value": text, "type": "履约保证金_块内容", "matchtype": "tag_string", "blocktag": bl.Classify, "weight": 0, }) } } return } for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} { if k == 0 { tp = "colon" } else if k == 1 { tp = "space" } else if k == 2 { tp = "table" } if v == nil || v.KvTags == nil { continue } for _, vv := range v.KvTags[fieldname] { text := ju.TrimLRSpace(vv.Value, "") if text != "" { kvmap[field] = append(kvmap[field], map[string]interface{}{ "code": "CL_" + vv.Key, "field": field, "ruletext": vv.Key, "extfrom": vc.ExtFrom, "sourcevalue": text, "value": text, "type": tp, "matchtype": "tag_string", "blocktag": bl.Classify, "weight": vv.Weight, }) //if field != "winnertel" && field != "winnerperson" { // //break //暂定取第一个 //} } } } if len(kvmap[field]) == 0 { extractFromKv(field, fieldname, bl.Block, vc, kvmap, Category) } } } //正则提取结果 func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo, isSite bool) map[string][]map[string]interface{} { defer qu.Catch() var score float64 score = vre.Score if isSite { score = score + 1.0 } extinfo := map[string][]map[string]interface{}{} rep := map[string]string{} if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线) //处理正负数修正 ptmp := strings.Split(vre.RuleText, "#") sign := 0 if len(ptmp) == 2 { if ptmp[1] == "正" { sign = 1 } else if ptmp[1] == "负" { sign = -1 } } tmp := strings.Split(ptmp[0], "__") if len(tmp) == 2 { epos := strings.Split(tmp[1], ",") posm := map[string]int{} for _, v := range epos { ks := strings.Split(v, ":") if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area posm[ks[1]] = qu.IntAll(ks[0]) } else { posm[vre.Field] = qu.IntAll(ks[0]) } } var pattern string if strings.Contains(tmp[0], "\\u") { tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1) tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1) pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`) } else { pattern = tmp[0] } //log.Debug("pattern", pattern) //fmt.Println(text) reg := regexp.MustCompile(pattern) apos := reg.FindAllStringSubmatchIndex(text, -1) for i, _ := range apos { pos := apos[i] for k, p := range posm { if len(pos) > p { if pos[p] == -1 || pos[p+1] == -1 { continue } val := text[pos[p]:pos[p+1]] if string(val) == "" { continue } if sign == -1 { rep[k+"_"+fmt.Sprint(i)] = "-" + val } else { rep[k+"_"+fmt.Sprint(i)] = val } } } } tmps := []map[string]interface{}{} for i := 0; i < len(apos); i++ { if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" { tmp := map[string]interface{}{ "field": vre.Field, "code": vre.Code, "ruletext": vre.RuleText, "extfrom": text, "value": rep[vre.Field+"_"+fmt.Sprint(i)], "type": "regexp", "matchtype": "regcontent", "blocktag": *tag, "score": score, } exfield := ju.ExtField{ BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)], Value: rep[vre.Field+"_"+fmt.Sprint(i)], Score: score, } if vre.Field == "qualifies" { if len(rep) >= 2 { tmp["ruletext"] = rep[vre.Field+"_key_"+fmt.Sprint(i)] exfield.RuleText = rep[vre.Field+"_key_"+fmt.Sprint(i)] } } tmps = append(tmps, tmp) if tmp["blocktag"] != nil { exfield.BlockTag = tmp["blocktag"].(map[string]string) } j.Result[vre.Field] = append(j.Result[vre.Field], &exfield) } } if len(tmps) > 0 { //fmt.Println(tmps) extinfo[vre.Field] = tmps } } } else { pos := vre.RegCore.Reg.FindStringIndex(text) val := "" if len(pos) == 2 { text = text[pos[1]:] rs := regexp.MustCompile("[^\r\n\t]+") tmp := rs.FindAllString(text, -1) if len(tmp) > 0 { val = tmp[0] } } if val != "" { tmps := []map[string]interface{}{} tmp := map[string]interface{}{ "field": vre.Field, "code": vre.Code, "ruletext": vre.RuleText, "extfrom": text, "value": val, "type": "regexp", "matchtype": "regcontent", "blocktag": *tag, "score": score, } tmps = append(tmps, tmp) extinfo[vre.Field] = tmps if j.Result[vre.Field] == nil { j.Result[vre.Field] = [](*ju.ExtField){} } field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val, Score: score} if tmp["blocktag"] != nil { field.BlockTag = tmp["blocktag"].(map[string]string) } j.Result[vre.Field] = append(j.Result[vre.Field], field) } } return extinfo } //后置过滤 func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) { defer qu.Catch() if in.IsLua { result := GetResultMapForLua(j) lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText} if j != nil { lua.Block = j.Block } extinfo := lua.RunScript("back") for k, v := range extinfo { if tmps, ok := v.([]map[string]interface{}); ok { j.Result[k] = [](*ju.ExtField){} for _, tmp := range tmps { field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"]} if tmp["blocktag"] != nil { field.BlockTag = tmp["blocktag"].(map[string]string) } j.Result[k] = append(j.Result[k], field) //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0}) } } } if len(extinfo) > 0 { AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志 } } else { extinfo := map[string]interface{}{} if in.Field != "" { clearByTitle := false if vc != nil && vc.ExtFrom == "title" && in.Field == "buyer" { //buyer从title抽取到的单独走titile的清理 clearByTitle = true } if j.Result[in.Field] != nil { tmp := j.Result[in.Field] exts := []interface{}{} for k, v := range tmp { if clearByTitle && v.ExtFrom != "title" { continue } //table抽取到的数据不清理 if v.Type == "table" && v.Field == "projectname" { return } text := qu.ObjToString(v.Value) if v.Field == "bidamount" || v.Field == "budget" { if (strings.Contains(qu.ObjToString(v.SourceValue), "费率")|| strings.Contains(qu.ObjToString(v.SourceValue), "税率") || strings.Contains(qu.ObjToString(v.SourceValue), "(%)") ) && !strings.Contains(qu.ObjToString(v.SourceValue), "工程设计费"){ j.Result[in.Field][k].IsTrue = false continue } } if text != "" { text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace) } if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志 continue } j.Result[in.Field][k].Value = text exts = append(exts, map[string]interface{}{ "field": v.Field, "code": v.Code, "ruletext": v.RuleText, "type": v.Type, "matchtype": v.MatchType, "extfrom": v.ExtFrom, "value": text, }) } if len(exts) > 0 { extinfo[in.Field] = exts AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志 } } } else { for key, tmp := range j.Result { exts := []interface{}{} for k, v := range tmp { //table抽取到的数据不清理 if v.Type == "table" && v.Field == "projectname" { return } text := qu.ObjToString(v.Value) if text != "" { text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace) } if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志 continue } j.Result[key][k].Value = text exts = append(exts, map[string]interface{}{ "field": v.Field, "code": v.Code, "ruletext": v.RuleText, "type": v.Type, "matchtype": v.MatchType, "extfrom": v.ExtFrom, "value": text, }) } if len(exts) > 0 { extinfo[key] = exts AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志 } } } } } //后置过滤 func ExtRegBackPkg(j *ju.Job, in *RegLuaInfo) { defer qu.Catch() for k, v := range j.BlockPackage { if in.Field == "winner" { j.BlockPackage[k].Winner = in.RegPreBac.Reg.ReplaceAllString(v.Winner, in.RegPreBac.Replace) } else if in.Field == "bidstatus" { j.BlockPackage[k].BidStatus = in.RegPreBac.Reg.ReplaceAllString(v.BidStatus, in.RegPreBac.Replace) } else if in.Field == "" { j.BlockPackage[k].Text = in.RegPreBac.Reg.ReplaceAllString(v.Text, in.RegPreBac.Replace) } else if in.Field == "projectname" { j.BlockPackage[k].Name = in.RegPreBac.Reg.ReplaceAllString(v.Name, in.RegPreBac.Replace) } else if in.Field == "winnerperson" { j.BlockPackage[k].WinnerPerson = in.RegPreBac.Reg.ReplaceAllString(v.WinnerPerson, in.RegPreBac.Replace) } else if in.Field == "winnertel" { j.BlockPackage[k].WinnerTel = in.RegPreBac.Reg.ReplaceAllString(v.WinnerTel, in.RegPreBac.Replace) } } } //KV过滤 func ExtRuleKV(j *ju.Job, in *RegLuaInfo, t *TaskInfo) { defer qu.Catch() extinfo := map[string]interface{}{} if in.Field != "" { if j.Result[in.Field] != nil { tmp := j.Result[in.Field] exts := []interface{}{} for k, v := range tmp { if v.Type != "table" && !strings.Contains(v.Type, "colon") && !strings.Contains(v.Type, "space") { continue } if v.Field=="中标金额" ||v.Field=="bidamount" { //log.Debug("调试字段...") } text := qu.ObjToString(v.Value) if text != "" { text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace) } if text == qu.ObjToString(v.Value) { //值未发生改变,不存日志 continue } j.Result[in.Field][k].Value = text exts = append(exts, map[string]interface{}{ "field": v.Field, "code": v.Code, "ruletext": v.RuleText, "type": v.Type, "matchtype": v.MatchType, "extfrom": v.ExtFrom, "value": text, }) } if len(exts) > 0 { extinfo[in.Field] = exts AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志 } } } } //获取抽取结果map[string][]interface{},lua脚本使用 func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} { defer qu.Catch() result := map[string][]map[string]interface{}{} for key, val := range j.Result { if result[key] == nil { result[key] = []map[string]interface{}{} } for _, v := range val { tmp := map[string]interface{}{ "field": v.Field, "code": v.Code, "ruletext": v.RuleText, "value": v.Value, "type": v.Type, "matchtype": v.MatchType, "extfrom": v.ExtFrom, } result[key] = append(result[key], tmp) } } return result } //抽取日志 func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) { defer qu.Catch() if !t.IsEtxLog { return } logdata := map[string]interface{}{ "code": qu.If(v.Code == "", "kv", v.Code), "name": v.Name, "type": ftype, "ruletext": v.RuleText, "islua": v.IsLua, "field": v.Field, "version": t.Version, "taskname": t.Name, "before": before, "extinfo": extinfo, "sid": sid, "comeintime": time.Now().Unix(), } lock.Lock() ExtLogs[t] = append(ExtLogs[t], logdata) lock.Unlock() } func BeforeAddClearFnLog(ftype, name, sid, before, matchtype string, ext *ju.ExtField, e *ExtractTask) { exts := []map[string]interface{}{} exts = append(exts, map[string]interface{}{ "field": ext.Field, "code": ext.Code, "type": ftype, "matchtype": matchtype, "extfrom": ext.ExtFrom, "value": ext.Value, }) extinfo := map[string]interface{}{ ext.Field: exts, } AddClearFnLog(ftype, sid, before, extinfo, ext.Code, name, ext.Field, e.TaskInfo) } func AddClearFnLog(ftype, sid string, before interface{}, extinfo interface{}, code, name, field string, t *TaskInfo) { defer qu.Catch() if !t.IsEtxLog { return } logdata := map[string]interface{}{ "code": code, "name": name, "type": ftype, "ruletext": "", "islua": false, "field": field, "version": t.Version, "taskname": t.Name, "before": before, "extinfo": extinfo, "sid": sid, "comeintime": time.Now().Unix(), } lock.Lock() ExtLogs[t] = append(ExtLogs[t], logdata) lock.Unlock() } //保存抽取日志 func SaveExtLog() { defer qu.Catch() tmpLogs := map[*TaskInfo][]map[string]interface{}{} lock.Lock() tmpLogs = ExtLogs ExtLogs = map[*TaskInfo][]map[string]interface{}{} lock.Unlock() for k, v := range tmpLogs { if len(v) < saveLimit { db.Mgo.SaveBulk(k.TrackColl, v...) } else { for { if len(v) > saveLimit { tmp := v[:saveLimit] db.Mgo.SaveBulk(k.TrackColl, tmp...) v = v[saveLimit:] } else { db.Mgo.SaveBulk(k.TrackColl, v...) break } } } } time.AfterFunc(10*time.Second, SaveExtLog) } type FieldValue struct { Value interface{} Count int } var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:|:") var unPackageWinnerReg = regexp.MustCompile("(重新招标)") //分析抽取结果并保存 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) { qu.Try(func() { if (j.Category == "招标" || j.Category == "预告") && (len(j.BlockPackage) > 0 || len(j.PackageInfo) > 0 || len(j.Result) > 0) { if j.CategorySecond != "单一" { delete(j.Result, "winner") delete(j.Result, "bidamount") for _, v := range j.BlockPackage { v.Bidamount = 0 v.IsTrueBidamount = false if v.Winner != "" { v.Winner = "" if v.SpaceKV != nil { delete(v.SpaceKV.KvTags, "中标单位") } if v.TableKV != nil { delete(v.TableKV.KvTags, "中标单位") } if v.ColonKV != nil { delete(v.ColonKV.KvTags, "中标单位") } } } for _, v := range j.PackageInfo { delete(v, "winner") delete(v, "bidamount") } j.Winnerorder = nil if jf != nil && jf.Winnerorder != nil { jf.Winnerorder = nil } } } //重新取出清理过后的中标候选人 resetWinnerorder(j) //打分 doc, result, _id := funcAnalysis(j, e) //_, result, _id := funcAnalysis(j, e) if ju.IsSaveTag { go otherNeedSave(j, result, e) } //从排序结果中取值 tmp := map[string]interface{}{} //抽取值 tmp["spidercode"] = j.SpiderCode tmp["site"] = j.Site if len(*j.Jsondata) > 0 { tmp["jsondata"] = j.Jsondata } //字段-抽取来源 fieldSource := make(map[string]interface{},0) //字段-抽取来源 for k, val := range result { if k == "qualifies" { squalifies := make([]interface{}, 0) squalifiesMap := make(map[string]*scoreIndex, 0) for _, kv := range val { skey := kv.RuleText if kv.Score > 0 { if squalifiesMap[skey] == nil { squalifiesMap = map[string]*scoreIndex{ skey: &scoreIndex{ Score: kv.Score, Index: len(squalifies), }, } squalifies = append(squalifies, map[string]interface{}{ "key": skey, "value": kv.Value, }) } else { if squalifiesMap[skey].Score < kv.Score { squalifies[squalifiesMap[skey].Index] = map[string]interface{}{ "key": skey, "value": kv.Value, } } } } } tmp[k] = squalifies continue } for _, v := range val { //取第一个非负数,项目名称除外 //存0是否有效 if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Score > -1 { tmp[v.Field] = v.Value fieldSource[v.Field] = map[string]interface{}{ "ext_type":v.Type, "ext_from":v.ExtFrom, } break } if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 { tmp[v.Field] = v.Value fieldSource[v.Field] = map[string]interface{}{ "ext_type":v.Type, "ext_from":v.ExtFrom, } break } } } tmp["winner"] = strings.ReplaceAll(qu.ObjToString(tmp["winner"]), ",,", ",") if len(j.PackageInfo) > 15 { for k, v := range j.PackageInfo { j.PackageInfo = map[string]map[string]interface{}{} j.PackageInfo[k] = v break } } if len(j.PackageInfo) > 0 { //分包信息 tmp["package"] = j.PackageInfo //包预算,中标金额合并大于抽取就覆盖 tmpBidamount, tmpBudget,tmpAgencyfee:=qu.Float64All(0),qu.Float64All(0),qu.Float64All(0) //s_winner逗号分隔拼接,分包中标人 var tmpstr, savewinner []string //按包排序 for b, v := range j.PackageInfo { if v["winner"] != nil && v["winner"] != "" { tmpstr = append(tmpstr, b) } } //包预算,中标金额合并大于抽取就覆盖 if len(j.PackageInfo) >= 1 { //包数大于1累加 for _, v := range j.PackageInfo { if v["budget"] != nil { tmpBudget = precisionAddFloat(tmpBudget,qu.Float64All(v["budget"])) } if v["bidamount"] != nil { tmpBidamount = precisionAddFloat(tmpBidamount,qu.Float64All(v["bidamount"])) } if v["agencyfee"] != nil { tmpAgencyfee = precisionAddFloat(tmpAgencyfee,qu.Float64All(v["agencyfee"])) } } if qu.Float64All(tmp["budget"]) < tmpBudget { fieldSource["budget"] = map[string]interface{}{ "ext_type":"", "ext_from":"package", } tmp["budget"] = tmpBudget } if qu.Float64All(tmp["agencyfee"]) < tmpAgencyfee { fieldSource["agencyfee"] = map[string]interface{}{ "ext_type":"", "ext_from":"package", } tmp["agencyfee"] = tmpAgencyfee } if qu.Float64All(tmp["bidamount"]) > 0 && qu.Float64All(tmp["budget"]) > 0 && (qu.Float64All(tmp["bidamount"])/10 > qu.Float64All(tmp["budget"])) { fieldSource["bidamount"] = map[string]interface{}{ "ext_type":"", "ext_from":"package", } tmp["bidamount"] = tmpBidamount } else if qu.Float64All(tmp["bidamount"]) < tmpBidamount { fieldSource["bidamount"] = map[string]interface{}{ "ext_type":"", "ext_from":"package", } tmp["bidamount"] = tmpBidamount } } else { //包数等于1,tmp没有值取包里的值 if tmp["budget"] == nil || tmp["budget"] == 0 { for _, v := range j.PackageInfo { if v["budget"] != nil { fieldSource["budget"] = map[string]interface{}{ "ext_type":"", "ext_from":"package", } tmp["budget"] = v["budget"] } } } if tmp["agencyfee"] == nil || tmp["agencyfee"] == 0 { for _, v := range j.PackageInfo { if v["agencyfee"] != nil { fieldSource["agencyfee"] = map[string]interface{}{ "ext_type":"", "ext_from":"package", } tmp["agencyfee"] = v["agencyfee"] } } } if tmp["bidamount"] == nil || tmp["bidamount"] == 0 { for _, v := range j.PackageInfo { if v["bidamount"] != nil { fieldSource["bidamount"] = map[string]interface{}{ "ext_type":"", "ext_from":"package", } tmp["bidamount"] = v["bidamount"] } } } } //s_winner逗号分隔拼接,分包中标人 sort.Strings(tmpstr) for _, v := range tmpstr { winner := qu.ObjToString(j.PackageInfo[v]["winner"]) new_winner := clearWinnerReg.ReplaceAllString(winner, "") if new_winner == "" { continue } //名称黑名单 if unPackageWinnerReg.MatchString(new_winner) { continue } savewinner = append(savewinner, new_winner) } if (savewinner == nil || len(savewinner) == 0) && tmp["winner"] != nil { tmp["s_winner"] = tmp["winner"] fieldSource["s_winner"] = fieldSource["winner"] } else if savewinner != nil { savewinner = RemoveReplicaSliceString(savewinner) tmp["s_winner"] = strings.Join(savewinner, ",") if len(savewinner)==1 { fieldSource["s_winner"] = fieldSource["winner"] }else if len(savewinner)>1{ fieldSource["s_winner"] = map[string]interface{}{ "ext_type":"", "ext_from":"package", } } } } else if tmp["winner"] != nil && tmp["winner"] != "" { //没有分包取winner tmp["s_winner"] = tmp["winner"] fieldSource["s_winner"] = fieldSource["winner"] } if len(j.Winnerorder) > 0 { //候选人信息 for i, v := range j.Winnerorder { if v["price"] != nil { tmpPrice := clear.ObjToMoney([]interface{}{v["price"], ""}, j.SpiderCode, j.IsClearnMoney) if tmpPrice[len(tmpPrice)-1].(bool) { j.Winnerorder[i]["price"] = tmpPrice[0] } else { delete(j.Winnerorder[i], "price") } } } tmp["winnerorder"] = j.Winnerorder } //处理附件 var resultf map[string][]*ju.ExtField ffield := map[string]interface{}{} if jf != nil { _, resultf, _ = funcAnalysis(jf, e) for _, val := range resultf { for _, v := range val { //取第一个非负数 if v.Score > -1 { ffield[v.Field] = v.Value if tmp[v.Field] == nil { if (v.Field == "bidamount" || v.Field == "budget") && v.IsTrue && v.Value.(float64) > 100 && v.Value.(float64) < 50000000000 { tmp[v.Field] = v.Value fieldSource[v.Field] = map[string]interface{}{ "ext_type":v.Type, "ext_from":"ff", } break } if v.Score > -1 && (v.Field != "bidamount" && v.Field != "budget") && len(strings.TrimSpace(fmt.Sprint(v.Value))) > 0 { tmp[v.Field] = v.Value fieldSource[v.Field] = map[string]interface{}{ "ext_type":v.Type, "ext_from":"ff", } break } } break } } } if len(jf.PackageInfo) > 0 { //分包信息 ffield["package"] = jf.PackageInfo } if len(jf.Winnerorder) > 0 { //候选人信息 ffield["winnerorder"] = jf.Winnerorder } } //添加字段来源 tmp["field_source"] = fieldSource //添加字段来源 for k, v := range *doc { if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 { (*doc)[k] = []rune(qu.ObjToString(v))[:100000] } //去重冗余字段 if delFiled(k) { continue } if tmp[k] == nil { tmp[k] = v } } //质量审核 if ju.QualityAudit { e.QualityAudit(tmp) } //城市抽取 if e.IsExtractCity { e.NewExtractCity(j, &tmp, _id) } //品牌抽取 if ju.IsBrandGoods { tmp["checkhas"] = map[string]int{ "hastable": j.HasTable, "hasgoods": j.HasGoods, "hasbrand": j.HasBrand, "haskey": j.HasKey, } if len(j.BrandData) > 0 { tmp["tablebrand"] = j.BrandData } } //prince和number抽取 if ju.IsPriceNumber { priceNumberLen := len(j.PriceNumberData) if priceNumberLen > 1 { //table数据去重 tmpPriceNumberData := []map[string]interface{}{} tableStrs := map[string]bool{} for _, tb := range j.PriceNumberData { has := false bytes, _ := json.Marshal(tb) str := string(bytes) if len(tableStrs) > 0 && tableStrs[str] { has = true } else { tableStrs[str] = true } if !has { for _, data := range tb { tmpPriceNumberData = append(tmpPriceNumberData, data) } } } tmp["pricenumber"] = tmpPriceNumberData } else if priceNumberLen == 1 { tmp["pricenumber"] = j.PriceNumberData[0] } } //所有kv组成的字符串 var kvtext bytes.Buffer blocks := make([]ju.BlockAndTag, 0) for _, v := range j.Block { //分包和标签 if ju.SaveBlock { xx, _ := json.Marshal(v) tmpblock := new(ju.TmpBlock) err := json.Unmarshal(xx, &tmpblock) if err != nil { if v.BPackage != nil { bpb, _ := json.Marshal(v.BPackage) tmpblock.BPackage = string(bpb) } tmpblock = rangeBlockToJson(v, *tmpblock) } blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock}) } //把所有kv组装成一个字符串,存库 for _, jv := range []*ju.JobKv{v.ColonKV, v.SpaceKV, v.TableKV} { if jv == nil { continue } for jv_k, jv_v := range jv.KvTags { for _, jv_vv := range jv_v { kvtext.WriteString(jv_k) kvtext.WriteString(":") kvtext.WriteString(jv_vv.Value) kvtext.WriteString("\n") } } } } if kvtext.Len() > 0 { tmp["kvtext"] = kvtext.String() } if len(blocks) > 0 { if blocksBytes, err := json.Marshal(blocks); err == nil { if utf8.RuneCount(blocksBytes) < 100000 { tmp["blocks"] = string(blocksBytes) } } } tmp["dataging"] = j.Dataging /*for k, v := range *j.Data { if f[k] { tmp[k] = v } } for k := range tmp { if !f[k]{ delete(tmp,k) } }*/ //检查字段 tmp = checkFields(tmp) if tmp["projectname"] == nil || tmp["projectname"] == "" { tmp["projectname"] = j.Title } tmp["repeat"] = 0 if ju.Ffield { if len(ffield) > 0 { tmp["ffield"] = ffield } } if e.TaskInfo.TestColl == "" { if len(tmp) > 0 { //保存抽取结果 delete(tmp, "_id") tmparr := []map[string]interface{}{ map[string]interface{}{ "_id": qu.StringTOBsonId(_id), }, map[string]interface{}{"$set": tmp}, } e.RWMutex.Lock() e.BidArr = append(e.BidArr, tmparr) e.BidTotal++ e.RWMutex.Unlock() } if ju.SaveResult { id := tmp["_id"] tmp["result"] = result tmp["resultf"] = resultf delete(tmp, "_id") tmparr := []map[string]interface{}{ map[string]interface{}{ "_id": id, }, map[string]interface{}{"$set": tmp}, } e.RWMutex.Lock() e.ResultArr = append(e.ResultArr, tmparr) e.RWMutex.Unlock() } } else { //测试结果 delete(tmp, "_id") delete(tmp, "fieldall") if len(j.BlockPackage) > 0 { //分包详情 if len(j.BlockPackage) > 10 { tmp["epackage"] = "分包异常" } else { bs, _ := json.Marshal(j.BlockPackage) tmp["epackage"] = string(bs) } } tmp["result"] = result //tmp["resultf"] = resultf //_,err :=db.Mgo.Get().DB("zhengkun").C("result_data").Upsert(`{"_id":"`+_id+`"}`,map[string]interface{}{"$set": tmp}) //log.Debug("save:",err) b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false) if !b { log.Debug(e.TaskInfo.TestColl, _id) } } }, func(err interface{}) { log.Debug("AnalysisSaveResult err", err) }) } //检查字段- func checkFields(tmp map[string]interface{}) map[string]interface{} { delete(tmp, "contenthtml") delete(tmp, "detail") tmp["repeat"] = 0 //指定爬虫-金额处理-预算-中标金额异常 if qu.ObjToString(tmp["spidercode"])=="xz_xzzzqjzscjgycxxxpt_zbtzs" { if budget, ok := tmp["budget"].(float64); ok && budget>0 && budget < 1000000{ tmp["budget"] = budget*10000.0 } if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount>0 && bidamount > 1000000000{ tmp["bidamount"] = bidamount/10000.0 } } if qu.ObjToString(tmp["spidercode"])=="js_jsszbtbw_zbhxrgs" { if bidamount, ok := tmp["bidamount"].(float64); ok && bidamount>0 && bidamount > 1000000000{ tmp["bidamount"] = bidamount/10000.0 } } if _, ok := tmp["bidamount"].(string); ok { delete(tmp, "bidamount") } else if fb, ok := tmp["bidamount"].(float64); ok && fb > 0 && qu.Float64All(tmp["budget"]) > 0 && (fb/5 > qu.Float64All(tmp["budget"]) || qu.Float64All(tmp["budget"])/1000 > fb) { delete(tmp, "bidamount") } if _, ok := tmp["budget"].(string); ok { delete(tmp, "budget") } if _, ok := tmp["unitprice"].(string); ok { delete(tmp, "unitprice") } if _, ok := tmp["bidopentime"].(string); ok { delete(tmp, "bidopentime") } if _, ok := tmp["signaturedate"].(string); ok { delete(tmp, "signaturedate") } if _, ok := tmp["supervisorrate"].(string); ok { delete(tmp, "supervisorrate") } for k, v := range tmp { if k == "qualifies" { continue } if k == "contract_guarantee" || k == "bid_guarantee" { if len(fmt.Sprint(v)) > 0 { tmp[k] = true } else { delete(tmp, k) } } if v == "" || len(strings.TrimSpace(fmt.Sprint(v))) == 0 || strings.Contains(fmt.Sprint(v), "**") { delete(tmp, k) } } //项目周期-有效值 projectperiod := qu.ObjToString(tmp["projectperiod"]) if projectperiod !="" { //项目周期包含日期,数字及日期单位可保留,其余可清洗 isNeedValueReg := regexp.MustCompile(`([0-9俩两一二三四五六七八九年月日天周]|合同)`) if !isNeedValueReg.MatchString(projectperiod) { delete(tmp, "projectperiod") } } //工期单位是否有效-清理 if project_timeunit, ok := tmp["project_timeunit"].(string); ok { dateReg := regexp.MustCompile(`[年|月|日|天|周]`) if !dateReg.MatchString(project_timeunit) || utf8.RuneCountInString(project_timeunit)>4 { delete(tmp, "project_timeunit") } //年-0 >5 删除 if project_timeunit == "年" && (qu.Int64All(tmp["project_duration"])==0 || qu.Int64All(tmp["project_duration"])>5 ){ delete(tmp, "project_timeunit") } } if tmp["winner"] != nil && tmp["s_winner"] != nil { strwin := qu.ObjToString(tmp["winner"]) strwin_s := qu.ObjToString(tmp["s_winner"]) if !strings.Contains(strwin_s, strwin) { tmp["s_winner"] = strwin } } //budget bidamount if bg, ok := tmp["budget"].(float64); ok { if bg >= 50000000000 { tmp["budget_max_err"] = bg delete(tmp, "budget") } } if bg, ok := tmp["bidamount"].(float64); ok && bg >= 50000000000 { code := qu.ObjToString(tmp["spidercode"]) if bg >= 50000000000 && code != "xz_xzzzqjzscjgycxxxpt_zbtzs" && code != "js_jsszbtbw_zbhxrgs"{ tmp["bidamount_max_err"] = bg delete(tmp, "bidamount") } } //投标方式- bidway := qu.IntAll(tmp["bidway"]) if bidway == 1 { tmp["bidway"] = "纸质投标" }else if bidway == 2 { tmp["bidway"] = "电子投标" }else { delete(tmp, "bidway") } //折扣系数 discount := dealWithDiscountBid(tmp) if discount >0.0 { tmp["biddiscount"] = discount }else { delete(tmp, "biddiscount") } delete(tmp, "biddiscount_up") delete(tmp, "biddiscount_down") //临时 //bidopentime := qu.Int64All(tmp["bidopentime"]) //bidendtime := qu.Int64All(tmp["bidendtime"]) //timeLayout := "2006-01-02 15:04:05" // //if bidopentime>0 { // bidopentime_str := time.Unix(bidopentime, 0).Format(timeLayout) //设置时间戳 使用模板格式化为日期字符串 // tmp["bidopentime"] = bidopentime_str //} //if bidendtime>0 { // bidendtime_str := time.Unix(bidendtime, 0).Format(timeLayout) //设置时间戳 使用模板格式化为日期字符串 // tmp["bidendtime"] = bidendtime_str //} jyhref:= fmt.Sprintf(JYUrl, qu.CommonEncodeArticle("content", qu.BsonIdToSId(tmp["_id"]))) tmp["jytest_href"] = jyhref return tmp } //处理折扣系数 func dealWithDiscountBid(tmp map[string]interface{}) float64 { biddiscount := qu.Float64All(tmp["biddiscount"]) biddiscount_up := qu.Float64All(tmp["biddiscount_up"]) biddiscount_down := qu.Float64All(tmp["biddiscount_down"]) baseCount := float64(1) if biddiscount_down >0.0 { num1:=decimal.NewFromFloat(baseCount) num2:=decimal.NewFromFloat(biddiscount_down) decimalValue := num1.Sub(num2) res,_ := decimalValue.Float64() //log.Debug("下浮后折扣系数:",res) return res } if biddiscount_up >0.0 { num1:=decimal.NewFromFloat(baseCount) num2:=decimal.NewFromFloat(biddiscount_up) decimalValue := num1.Add(num2) res,_ := decimalValue.Float64() //log.Debug("上浮后折扣系数:",res) return res } if biddiscount>0.0 { if biddiscount > 1.0 && biddiscount<=10.0 { num1:=decimal.NewFromFloat(10.0) num2:=decimal.NewFromFloat(biddiscount_up) decimalValue := num2.Div(num1) res,_ := decimalValue.Float64() //log.Debug("标准-①折扣系数:",res) return res }else if biddiscount>10.0 { num1:=decimal.NewFromFloat(100.0) num2:=decimal.NewFromFloat(biddiscount_up) decimalValue := num2.Div(num1) res,_ := decimalValue.Float64() //log.Debug("标准-⑩折扣系数:",res) return res }else { //log.Debug("标准折扣系数:",biddiscount) return biddiscount } } return 0.0 } //精度丢失-相加 func precisionAddFloat(tmp1,tmp2 float64)float64{ num1:=decimal.NewFromFloat(tmp1) num2:=decimal.NewFromFloat(tmp2) decimalValue := num2.Add(num1) res,_ := decimalValue.Float64() return res } //保存其他 //kv、表格、块上的标签凡是新的标签都入库 //val type times firstid createtime 判定field func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) { now := time.Now().Unix() coll := e.TaskInfo.TestColl if coll == "" { coll = "extract_tag_result" } else { coll += "_tag" } datas := []map[string]interface{}{} kv := map[string]int{} for _, v := range j.Block { // for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} { if vv == nil || vv.KvTags == nil { continue } for kkk, vvv := range vv.KvTags { for _, vvvv := range vvv { if vvvv.IsInvalid { kv[kkk] = kv[kkk] + 1 break } } } } for _, vv := range v.NotClassifyTitles { datas = append(datas, map[string]interface{}{ "val": vv, "times": 0, "type": "block", "firstid": j.SourceMid, "createtime": now, }) if len(datas) == saveLimit { db.Mgo.SaveBulk(coll, datas...) datas = []map[string]interface{}{} } } } for k, v := range kv { datas = append(datas, map[string]interface{}{ "val": k, "times": v, "type": "kv", "firstid": j.SourceMid, "createtime": now, }) if len(datas) == saveLimit { db.Mgo.SaveBulk(coll, datas...) datas = []map[string]interface{}{} } } if len(datas) > 0 { db.Mgo.SaveBulk(coll, datas...) } } func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) { if j == nil { return nil } if len(j.Block) > 0 { for i, v := range j.Block { rangetmp := new(ju.TmpBlock) vb, _ := json.Marshal(v) json.Unmarshal(vb, &rangetmp) tmpblock.Block[i] = rangeBlockToJson(v, *rangetmp) } } if j.ColonKV != nil { cb, _ := json.Marshal(j.ColonKV) tmpblock.ColonKV = string(cb) } if j.SpaceKV != nil { sb, _ := json.Marshal(j.SpaceKV) tmpblock.SpaceKV = string(sb) } if j.TableKV != nil { tb, _ := json.Marshal(j.TableKV) tmpblock.TableKV = string(tb) } return &tmpblock } //去重冗余字段 func delFiled(k string) bool { return k == "detailfile" || k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata" } //分析-打分排序 func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) { defer qu.Catch() doc := j.Data result := j.Result _id := qu.BsonIdToSId((*doc)["_id"]) result = ScoreFields(j, e.Tag) //正负面词打分 //结果排序 for _, val := range result { ju.Sort(val) } if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) { clearJd(j.Jsondata, e, j.SpiderCode, j.IsClearnMoney) marshalbt, _ := json.Marshal(j.Jsondata) tmpjddata := make(map[string]interface{}) json.Unmarshal(marshalbt, &tmpjddata) for _, jdkey := range ju.JsonData { if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 { for tmpk, tmpv := range j.Result[jdkey][:5] { if jdkey == "budget" || jdkey == "bidamount" { lockclear.Lock() cfn := e.ClearFn[jdkey] lockclear.Unlock() if len(cfn) == 0 { continue } newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""}, j.SpiderCode, j.IsClearnMoney) if tmpv.Value == newNum[0] { extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)} j.Result[jdkey] = append(j.Result[jdkey], extField) ju.Sort(j.Result[jdkey]) delete((*j.Jsondata), jdkey) break } } else { if (*j.Jsondata)[jdkey] == tmpv.Value { extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100} j.Result[jdkey] = append(j.Result[jdkey], extField) ju.Sort(j.Result[jdkey]) delete((*j.Jsondata), jdkey) break } } } } } if len(*j.Jsondata) > 0 { j.Result = JsonDataMergeProcessing(j, e) } j.Jsondata = &tmpjddata } return doc, result, _id } //辅助信息,如果没有排序先排序 func auxInfo(j *ju.Job) map[string][]map[string]interface{} { fieldalls := map[string][]map[string]interface{}{} if j == nil { return fieldalls } qykredis := redis.RedisPool[ju.QYK_RedisName].Get() defer qykredis.Close() db := 0 for field, val := range j.Result { //ju.Sort(val) if field == "buyer" { db = ju.BuyerDB } else if field == "winner" { db = ju.WinnerDB } else if field == "agency" { db = ju.AgencyDB } sfields := []map[string]interface{}{} for _, v := range val { standardized := false if _, err := qykredis.Do("SELECT", db); err != nil { fmt.Println("redis select err", err) } else { rep, err := qykredis.Do("GET", v.Value) if rep != nil && err == nil { standardized = true } } if field == "budget" || field == "bidamount" { if !v.IsTrue { continue } } sfield := map[string]interface{}{ "val": v.Value, "type": v.Type, "score": v.Score, "blocktag": v.BlockTag, "sourceval": v.SourceValue, "standardized": standardized, } sfields = append(sfields, sfield) } fieldalls[field] = sfields } return fieldalls } func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) { defer qu.Catch() //获取审核字段 for _, field := range e.AuditFields { //1.分包 if resulttmp["package"] != nil { packagedata := resulttmp["package"].(map[string]map[string]interface{}) for _, val := range packagedata { if val[field] != nil { fv := qu.ObjToString(val[field]) if fv != "" { if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则 e.RedisMatch(field, fv, val) //redis匹配 } else { //除了buyer和winner,其他字段走规则匹配 e.RuleMatch(field, fv, val) } } } } } //2.外围 if resulttmp[field] != nil { fv := qu.ObjToString(resulttmp[field]) if fv != "" { if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则 e.RedisMatch(field, fv, resulttmp) //redis匹配 } else { //除了buyer和winner,其他字段走规则匹配 e.RuleMatch(field, fv, resulttmp) } } } } } //Redis匹配 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) { defer qu.Catch() i := redis.GetInt(field, field+"_"+fv) //查找redis if i == 0 { //reids未找到,执行规则匹配 val[field+"_isredis"] = false e.RuleMatch(field, fv, val) //规则匹配 } else { //redis找到,打标识存库 val[field+"_isredis"] = true } } //规则匹配 func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) { defer qu.Catch() if fieldval != "" { SMap := e.StartMatch(field, fieldval) //SMap.AddKey(field+"_isaudit", false) for _, k := range SMap.Keys { tmpMap[k] = SMap.Map[k] } tmpMap[field+"_isaudit"] = false //添加字段未审核信息 } } //开始规则匹配 func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap { defer qu.Catch() SMap := pretreated.NewSortMap() lock.Lock() f := e.RecogFieldMap[field] lock.Unlock() if len(f) > 0 { fid := qu.BsonIdToSId(f["_id"]) recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"]) textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤 if textAfterRecogFieldPrerule != "" { lock.Lock() classMap := e.FidClassMap[fid] lock.Unlock() L: for _, c := range classMap { //class classid := qu.BsonIdToSId(c["_id"]) classPrerule := qu.ObjToString(c["s_class_prerule"]) savefield := qu.ObjToString(c["s_savefield"]) //保存字段 textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤 if textAfterClassPrerule != "" { lock.Lock() ruleMap := e.CidRuleMap[classid] lock.Unlock() for _, r := range ruleMap { //rule rulePrerule := qu.ObjToString(r["s_rule_prerule"]) s_name := qu.ObjToString(r["s_name"]) rule := r["rule"].([]interface{}) textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤 if textAfterRulePrerule != "" { b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule) if b { //匹配到一个分类下某个规则时,不再继续匹配 if savefield != "" { //保存字段不为空,存储代码信息 SMap.AddKey(field+"_"+savefield, s_name) } break L } } } } } } } return SMap } //筛选重复候选人-相关 func filterRepeatWinArr(j *ju.Job) { if j.SpiderCode=="sh_shszfhcxjsglwyh_jsgc_zhbhxrgs" { sort_WinOrder_Arr := make([][]map[string]interface{},0) sort_arr := make([]map[string]interface{},0) for _,v := range j.Winnerorder{ sort := qu.IntAll(v["sort"]) if sort==1 { //为一组 if len(sort_arr)>0 { sort_WinOrder_Arr = append(sort_WinOrder_Arr,sort_arr) } sort_arr = make([]map[string]interface{},0) } sort_arr = append(sort_arr,v) } if len(sort_arr)>0 { sort_WinOrder_Arr = append(sort_WinOrder_Arr,sort_arr) } if len(sort_WinOrder_Arr)>0 { //有重复排序组-开始筛选清理 isIndex :=0 for index,winArr := range sort_WinOrder_Arr { if len(winArr)>0 { if qu.ObjToString(winArr[0]["price"])!=""&& qu.ObjToString(winArr[0]["entname"])!="" { isIndex = index break } } } j.Winnerorder = sort_WinOrder_Arr[isIndex] } } } //中标候选人经过清理之后,重新取出赋值 func resetWinnerorder(j *ju.Job) { if len(j.Winnerorder) == 0 { return } maxlen := len(j.Winnerorder) - 1 //中标单位 //i := 0 winners := []*ju.ExtField{} bidamounts := []*ju.ExtField{} if maxlen > 0 { //新增-指定爬虫中标候选人过滤 filterRepeatWinArr(j) if qu.Float64All(j.Winnerorder[0]["sort"]) != 1 { return } winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5}) if j.Winnerorder[0]["price"] != nil { tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney) if tmpPrice[len(tmpPrice)-1].(bool) { bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true}) } } } if j.Result["winner"] == nil && len(winners) > 0 { j.Result["winner"] = winners } else if len(winners) > 0 { j.Result["winner"] = append(j.Result["winner"], winners...) } if j.Result["bidamount"] == nil && len(bidamounts) > 0 { j.Result["bidamount"] = bidamounts } else if len(bidamounts) > 0 { j.Result["bidamount"] = append(j.Result["bidamount"], bidamounts...) } if j.Result["winner"] == nil && len(j.Winnerorder) > 0 && qu.Float64All(j.Winnerorder[0]["sort"]) == 1 { winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5}) j.Result["winner"] = winners if j.Winnerorder[0]["price"] != nil { tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode, j.IsClearnMoney) if tmpPrice[len(tmpPrice)-1].(bool) { bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 2.5, IsTrue: true}) } j.Result["bidamount"] = bidamounts } } } func RemoveReplicaSliceString(slc []string) []string { result := make([]string, 0) tempMap := make(map[string]bool, len(slc)) for _, e := range slc { if tempMap[e] == false { tempMap[e] = true result = append(result, e) } } return result } type scoreIndex struct { Score float64 Index int }