package extract import ( "encoding/json" "fmt" "jy/clear" db "jy/mongodbutil" "jy/pretreated" ju "jy/util" "log" qu "qfw/util" redis "qfw/util/redis" "reflect" "regexp" "strconv" "sync" "time" "gopkg.in/mgo.v2/bson" ) var ( lock sync.RWMutex cut = ju.NewCut() //获取正文并清理 ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志 TaskList map[string]*ExtractTask //任务列表 ClearTaskList map[string]*ClearTask //清理任务列表 saveLimit = 200 //抽取日志批量保存 PageSize = 5000 //查询分页 Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1}` Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}` ) //启动测试抽取 func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool { defer qu.Catch() ext := &ExtractTask{} ext.Id = taskId ext.IsRun = true ext.InitTestTaskInfo(resultcoll, trackcoll) ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB) ext.InitRulePres() ext.InitRuleBacks() ext.InitRuleCore() ext.InitPkgCore() ext.InitTag() ext.InitClearFn() if ext.IsExtractCity { //版本上控制是否开始城市抽取 //初始化城市DFA信息 ext.InitDFA() } //质量审核 ext.InitAuditFields() ext.InitAuditRule() ext.InitAuditClass() ext.InitAuditRecogField() //品牌抽取是否开启 ju.IsBrandGoods = ju.Config["brandgoods"].(bool) return RunExtractTestTask(ext, startId, num) } func IdTrans(startId string) bson.ObjectId { defer qu.Catch() return bson.ObjectIdHex(startId) } //开始测试任务抽取 func RunExtractTestTask(ext *ExtractTask, startId, num string) bool { n, _ := strconv.Atoi(num) id := IdTrans(startId) if id.Valid() { query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}} list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n) for _, v := range *list { if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据 continue } //log.Println(v["_id"]) j := PreInfo(v) //fmt.Println(j.HasTable, j.HasGoods, j.HasBrand, j.HasKey, "j-------", j.BrandData) ext.TaskInfo.ProcessPool <- true go ext.ExtractProcess(j) } return true } else { return false } } //启动抽取 func StartExtractTaskId(taskId string) bool { isgo := false ext := TaskList[taskId] if ext == nil { ext = &ExtractTask{} ext.Id = taskId ext.InitTaskInfo() isgo = true } else { ext.Id = taskId ext.InitTaskInfo() } ext.TaskInfo.FDB = db.MgoFactory(2, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB) ext.TaskInfo.TDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB) ext.InitRulePres() ext.InitRuleBacks() ext.InitRuleCore() ext.InitPkgCore() ext.InitTag() ext.InitClearFn() if ext.IsExtractCity { //版本上控制是否开始城市抽取 //初始化城市DFA信息 ext.InitDFA() } //质量审核 ext.InitAuditFields() ext.InitAuditRule() ext.InitAuditClass() ext.InitAuditRecogField() ext.IsRun = true go ext.ResultSave() go ext.BidSave() if isgo { go RunExtractTask(taskId) } TaskList[taskId] = ext return true } //停止抽取 func StopExtractTaskId(taskId string) bool { ext := TaskList[taskId] if ext != nil { ext.IsRun = false TaskList[taskId] = ext } //更新task.s_extlastid db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`) return true } //开始抽取 func RunExtractTask(taskId string) { ext := TaskList[taskId] query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}} count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query) pageNum := (count + PageSize - 1) / PageSize limit := PageSize if count < PageSize { limit = count } log.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query) for i := 0; i < pageNum; i++ { query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}} log.Printf("page=%d,query=%v", i+1, query) list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit) for _, v := range *list { if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据 continue } //log.Println(v["_id"]) if !ext.IsRun { break } j := PreInfo(v) ext.TaskInfo.ProcessPool <- true go ext.ExtractProcess(j) ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"]) } db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`) if !ext.IsRun { break } } //更新task.s_extlastid time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) }) } //信息预处理 func PreInfo(doc map[string]interface{}) *ju.Job { detail := "" d1, _ := doc["detail"].(string) d2, _ := doc["contenthtml"].(string) if len(d1) >= len(d2) || d2 == "" { detail = d1 } else { detail = d2 } detail = ju.CutLableStr(detail) detail = cut.ClearHtml(detail) doc["detail"] = detail toptype := qu.ObjToString(doc["toptype"]) if qu.ObjToString(doc["type"]) == "bid" { toptype = "结果" } if toptype == "" { toptype = "*" } j := &ju.Job{ SourceMid: qu.BsonIdToSId(doc["_id"]), Category: toptype, Content: qu.ObjToString(doc["detail"]), SpiderCode: qu.ObjToString(doc["spidercode"]), //Domain: qu.ObjToString(doc["domain"]), //Href: qu.ObjToString(doc["href"]), Title: qu.ObjToString(doc["title"]), Data: &doc, City: qu.ObjToString(doc["city"]), Province: qu.ObjToString(doc["area"]), Result: map[string][]*ju.ExtField{}, BuyerAddr: qu.ObjToString(doc["buyeraddr"]), } qu.Try(func() { pretreated.AnalyStart(j) }, func(err interface{}) { log.Println("pretreated.AnalyStart", err) }) return j } //抽取 func (e *ExtractTask) ExtractProcess(j *ju.Job) { qu.Try(func() { doc := *j.Data //全局前置规则,结果覆盖doc属性 for _, v := range e.RulePres { doc = ExtRegPre(doc, j, v, e.TaskInfo) } //抽取规则 for _, vc := range e.RuleCores { tmp := ju.DeepCopy(doc).(map[string]interface{}) //是否进入逻辑 if !ju.Logic(vc.LuaLogic, tmp) { continue } //抽取-前置规则 for _, v := range vc.RulePres { tmp = ExtRegPre(tmp, j, v, e.TaskInfo) } //log.Println("抽取-前置规则", tmp) //抽取-规则 for _, v := range vc.RuleCores { ExtRegCore(vc.ExtFrom, tmp, j, v, e) } //log.Println("抽取-规则", tmp) //项目名称未能抽取到,标题来凑 if vc.Field == "projectname" { if len(j.Result[vc.Field]) < 1 { j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, j.Title, 0}) } } //抽取-后置规则 for _, v := range vc.RuleBacks { ExtRegBack(j, v, e.TaskInfo) } //log.Println("抽取-后置规则", tmp) } //全局后置规则 for _, v := range e.RuleBacks { ExtRegBack(j, v, e.TaskInfo) } //候选人加入 if len(j.Winnerorder) > 0 { winner := &ju.ExtField{ Field: "winner", Code: "", RuleText: "", Type: "winnerorder", MatchType: "winnerorder", ExtFrom: "", Value: j.Winnerorder[0]["entname"], Score: 0, } if len([]rune(qu.ObjToString(j.Winnerorder[0]["entname"]))) < 4 { winner.Score = -5 } winners := j.Result["winner"] if winners != nil { winners = append(winners, winner) } else { winners = []*ju.ExtField{} winners = append(winners, winner) } j.Result["winner"] = winners } //函数清理 for key, val := range j.Result { for _, v := range val { lock.Lock() cfn := e.ClearFn[key] lock.Unlock() data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}) v.Value = data[0] //清理特殊符号 if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil { text := qu.ObjToString(v.Value) if key == "projectname" { fmt.Println("1===========", text) } text = clear.OtherClean(key, text) if key == "projectname" { fmt.Println("2===========", text) } v.Value = text } } } PackageDetail(j, e) //处理分包信息 // bs, _ := json.Marshal(j.Result) // log.Println("抽取结果", j.Title, j.SourceMid, string(bs)) //分析抽取结果并保存 todo AnalysisSaveResult(j, e) }, func(err interface{}) { log.Println((*j.Data)["_id"], err) <-e.TaskInfo.ProcessPool }) <-e.TaskInfo.ProcessPool } //前置过滤 func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} { before := ju.DeepCopy(doc).(map[string]interface{}) extinfo := map[string]interface{}{} if in.IsLua { lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText} if j != nil { lua.Block = j.Block } extinfo = lua.RunScript("pre") for k, v := range extinfo { //结果覆盖原doc doc[k] = v } AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志 } else { key := qu.If(in.Field == "", "detail", in.Field).(string) text := qu.ObjToString(doc[key]) extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "") doc[key] = extinfo[key] //结果覆盖原doc AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志 } return doc } //抽取-规则 func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) { //废标、流标、ppp等跳过 b := IsExtract(in.Field, j.Title, j.Content) if !b { return } if in.IsLua { lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText} lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag) lua.Block = j.Block extinfo := lua.RunScript("core") for k, v := range extinfo { if k == in.Field { if j.Result[k] == nil { j.Result[k] = [](*ju.ExtField){} } if tmps, ok := v.([]map[string]interface{}); ok { for _, tmp := range tmps { j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"], 0}) } } } } if len(extinfo) > 0 { AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志 } } else { //全文正则 text := qu.ObjToString(doc[extfrom]) if in.Field != "" { extinfo := extRegCoreToResult(extfrom, text, j, in) if len(extinfo) > 0 { AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志 } } } } //lua脚本根据属性设置提取kv值 func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} { kvmap := map[string][]map[string]interface{}{} for fieldname, field := range in.LFields { lock.Lock() tags := t[field] //获取对应标签库 lock.Unlock() for _, bl := range j.Block { //冒号kv if bl.ColonKV != nil { kvs := bl.ColonKV.Kvs kvs2 := bl.ColonKV.Kvs_2 //log.Println("ColonKV1", kvs) //log.Println("ColonKV2", kvs2) for _, tag := range tags { for _, kv := range kvs { if tag.Type == "string" { if kv.Key == tag.Key { text := ju.TrimLRSpace(kv.Value, "") if text != "" { kvmap[field] = append(kvmap[field], map[string]interface{}{ "field": field, "code": in.Code, "ruletext": tag.Key, "extfrom": extfrom, "value": text, "type": "colon1", "matchtype": "tag_string", }) } break } } else if tag.Type == "regexp" { if tag.Reg.MatchString(kv.Key) { text := ju.TrimLRSpace(kv.Value, "") if text != "" { kvmap[field] = append(kvmap[field], map[string]interface{}{ "field": field, "code": in.Code, "ruletext": tag.Key, "extfrom": extfrom, "value": text, "type": "colon1", "matchtype": "tag_regexp", }) } break } } } for _, kv := range kvs2 { if tag.Type == "string" { if kv.Key == tag.Key { text := ju.TrimLRSpace(kv.Value, "") if text != "" { kvmap[field] = append(kvmap[field], map[string]interface{}{ "field": field, "code": in.Code, "ruletext": tag.Key, "extfrom": extfrom, "value": text, "type": "colon2", "matchtype": "tag_string", }) } break } } else if tag.Type == "regexp" { if tag.Reg.MatchString(kv.Key) { text := ju.TrimLRSpace(kv.Value, "") if text != "" { kvmap[field] = append(kvmap[field], map[string]interface{}{ "field": field, "code": in.Code, "ruletext": tag.Key, "extfrom": extfrom, "value": text, "type": "colon2", "matchtype": "tag_regexp", }) } break } } } } } //空格kv if bl.SpaceKV != nil { kvs := bl.SpaceKV.Kvs //log.Println("SpaceKV", kvs) for _, tag := range tags { for _, kv := range kvs { if tag.Type == "string" { if kv.Key == tag.Key { text := ju.TrimLRSpace(kv.Value, "") if text != "" { kvmap[field] = append(kvmap[field], map[string]interface{}{ "field": field, "code": in.Code, "ruletext": tag.Key, "extfrom": extfrom, "value": text, "type": "space", "matchtype": "tag_string", }) } break } } else if tag.Type == "regexp" { if tag.Reg.MatchString(kv.Key) { text := ju.TrimLRSpace(kv.Value, "") if text != "" { kvmap[field] = append(kvmap[field], map[string]interface{}{ "field": field, "code": in.Code, "ruletext": tag.Key, "extfrom": extfrom, "value": text, "type": "space", "matchtype": "tag_regexp", }) } break } } } } } //表格kv if bl.TableKV != nil { tkv := bl.TableKV //log.Println("tkv", tkv) for k, v := range tkv.Kv { if k == fieldname { if len(tags) > -tkv.KvIndex[fieldname] { ruletext := "" if fieldname == "项目名称" && -tkv.KvIndex[fieldname] == -100 { ruletext = "项目名称" } else { ruletext = tags[-tkv.KvIndex[fieldname]].Key } kvmap[field] = append(kvmap[field], map[string]interface{}{ "field": field, "code": in.Code, "ruletext": ruletext, "extfrom": "table", "value": v, "type": "table", "matchtype": "tag_string", }) } else { //涉及其他待处理 //log.Println(tags) } } } } } } return kvmap } //正则提取结果 func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} { extinfo := map[string][]map[string]interface{}{} if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线) apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1) if len(apos) > 0 { pos := apos[0] for k, p := range v.RegCore.ExtractPos { if len(pos) > p { if pos[p] == -1 || pos[p+1] == -1 { continue } val := text[pos[p]:pos[p+1]] tmps := []map[string]interface{}{} tmp := map[string]interface{}{ "field": v.Field, "code": v.Code, "ruletext": v.RuleText, "extfrom": extfrom, "value": val, "type": "regexp", "matchtype": "regcontent", } tmps = append(tmps, tmp) extinfo[k] = tmps if val != "" { if j.Result[v.Field] == nil { j.Result[k] = [](*ju.ExtField){} } j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0}) } } } } } else { pos := v.RegCore.Reg.FindStringIndex(text) val := "" if len(pos) == 2 { text = text[pos[1]:] rs := regexp.MustCompile("[^\r\n\t]+") tmp := rs.FindAllString(text, -1) if len(tmp) > 0 { val = tmp[0] } } if val != "" { tmps := []map[string]interface{}{} tmp := map[string]interface{}{ "field": v.Field, "code": v.Code, "ruletext": v.RuleText, "extfrom": extfrom, "value": val, "type": "regexp", "matchtype": "regcontent", } tmps = append(tmps, tmp) extinfo[v.Field] = tmps if j.Result[v.Field] == nil { j.Result[v.Field] = [](*ju.ExtField){} } j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0}) } } return extinfo } //后置过滤 func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) { if in.IsLua { result := GetResultMapForLua(j) lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText} if j != nil { lua.Block = j.Block } extinfo := lua.RunScript("back") for k, v := range extinfo { if tmps, ok := v.([]map[string]interface{}); ok { j.Result[k] = [](*ju.ExtField){} for _, tmp := range tmps { j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"], 0}) } } } if len(extinfo) > 0 { AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志 } } else { extinfo := map[string]interface{}{} if in.Field != "" { if j.Result[in.Field] != nil { tmp := j.Result[in.Field] exts := []interface{}{} for k, v := range tmp { if v.Type == "table" { //table抽取到的数据不清理 continue } text := qu.ObjToString(v.Value) if text != "" { text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace) } j.Result[in.Field][k].Value = text exts = append(exts, map[string]interface{}{ "field": v.Field, "code": v.Code, "ruletext": v.RuleText, "type": v.Type, "matchtype": v.MatchType, "extfrom": v.ExtFrom, "value": text, }) } extinfo[in.Field] = exts if len(extinfo) > 0 { AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志 } } } else { for key, tmp := range j.Result { exts := []interface{}{} for k, v := range tmp { if v.Type == "table" { //table抽取到的数据不清理 continue } text := qu.ObjToString(v.Value) if text != "" { text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace) } j.Result[key][k].Value = text exts = append(exts, map[string]interface{}{ "field": v.Field, "code": v.Code, "ruletext": v.RuleText, "type": v.Type, "matchtype": v.MatchType, "extfrom": v.ExtFrom, "value": text, }) } extinfo[key] = exts } if len(extinfo) > 0 { AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志 } } } } //获取抽取结果map[string][]interface{},lua脚本使用 func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} { result := map[string][]map[string]interface{}{} for key, val := range j.Result { if result[key] == nil { result[key] = []map[string]interface{}{} } for _, v := range val { tmp := map[string]interface{}{ "field": v.Field, "code": v.Code, "ruletext": v.RuleText, "value": v.Value, "type": v.Type, "matchtype": v.MatchType, "extfrom": v.ExtFrom, } result[key] = append(result[key], tmp) } } return result } //抽取日志 func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) { if !t.IsEtxLog { return } logdata := map[string]interface{}{ "code": v.Code, "name": v.Name, "type": ftype, "ruletext": v.RuleText, "islua": v.IsLua, "field": v.Field, "version": t.Version, "taskname": t.Name, "before": before, "extinfo": extinfo, "sid": sid, "comeintime": time.Now().Unix(), } lock.Lock() ExtLogs[t] = append(ExtLogs[t], logdata) lock.Unlock() } //保存抽取日志 func SaveExtLog() { tmpLogs := map[*TaskInfo][]map[string]interface{}{} lock.Lock() tmpLogs = ExtLogs ExtLogs = map[*TaskInfo][]map[string]interface{}{} lock.Unlock() for k, v := range tmpLogs { if len(v) < saveLimit { db.Mgo.SaveBulk(k.TrackColl, v...) } else { for { if len(v) > saveLimit { tmp := v[:saveLimit] db.Mgo.SaveBulk(k.TrackColl, tmp...) v = v[saveLimit:] } else { db.Mgo.SaveBulk(k.TrackColl, v...) break } } } } time.AfterFunc(10*time.Second, SaveExtLog) } type FieldValue struct { Value interface{} Count int } //分析抽取结果并保存 func AnalysisSaveResult(j *ju.Job, e *ExtractTask) { log.Println("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData) doc := j.Data result := j.Result _id := qu.BsonIdToSId((*doc)["_id"]) iscore, _ := ju.Config["fieldscore"].(bool) if iscore { //打分 result = ScoreFields(j) } //结果排序 values := map[string][]*ju.SortObject{} for key, val := range result { fieldValue := map[string][]interface{}{} if iscore { //走打分 for _, v := range val { if len(fmt.Sprint(v.Value)) < 1 { continue //去除空串 } fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value} } } else { //不走打分,按出现频次 for _, v := range val { if len(fmt.Sprint(v.Value)) < 1 { continue //去除空串 } if fieldValue[fmt.Sprint(v.Value)] == nil { fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value} } else { fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1 } } } objects := []*ju.SortObject{} for k, v := range fieldValue { ValueStr := "" //第二排序 if reflect.TypeOf(v[1]).String() == "string" { ValueStr = qu.ObjToString(v[1]) } tmp := &ju.SortObject{ Key: k, Value: qu.IntAll(v[0]), Object: v[1], ValueStr: ValueStr, } objects = append(objects, tmp) } values[key] = ju.ExtSort(objects) } //从排序结果中取值 tmp := map[string]interface{}{} //抽取值 for key, val := range values { for _, v := range val { //取第一个非负数 if v.Key != "" && v.Value > -1 { tmp[key] = v.Object break } } } if len(j.PackageInfo) > 0 { //分包信息 tmp["package"] = j.PackageInfo } if len(j.Winnerorder) > 0 { //候选人信息 tmp["winnerorder"] = j.Winnerorder } for k, v := range *doc { //去重冗余字段 if k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" { continue } if tmp[k] == nil { tmp[k] = v } } //质量审核 if ju.Config["qualityaudit"].(bool) { e.QualityAudit(tmp) } if e.IsExtractCity { //城市抽取 b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市 //log.Println("省份---", p, "城市---", c, "区---", d) tmp["district"] = d if b { tmp["city"] = c tmp["area"] = p } } if e.TaskInfo.TestColl == "" { if len(tmp) > 0 { //保存抽取结果 tmparr := []map[string]interface{}{ map[string]interface{}{ "_id": qu.StringTOBsonId(_id), }, map[string]interface{}{"$set": tmp}, } e.BidArr = append(e.BidArr, tmparr) } if b, ok := ju.Config["saveresult"].(bool); ok && b { id := tmp["_id"] tmp["result"] = result delete(tmp, "_id") tmparr := []map[string]interface{}{ map[string]interface{}{ "_id": id, }, map[string]interface{}{"$set": tmp}, } e.ResultArr = append(e.ResultArr, tmparr) } } else { //测试结果 delete(tmp, "_id") if len(j.BlockPackage) > 0 { //分包详情 bs, _ := json.Marshal(j.BlockPackage) tmp["epackage"] = string(bs) } tmp["result"] = result b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false) if !b { log.Println(e.TaskInfo.TestColl, _id) } } } func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) { defer qu.Catch() //获取审核字段 for _, field := range e.AuditFields { //1.分包 if resulttmp["package"] != nil { packagedata := resulttmp["package"].(map[string]map[string]interface{}) for _, val := range packagedata { if val[field] != nil { fv := qu.ObjToString(val[field]) if fv != "" { if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则 e.RedisMatch(field, fv, val) //redis匹配 } else { //除了buyer和winner,其他字段走规则匹配 e.RuleMatch(field, fv, val) } } } } } //2.外围 if resulttmp[field] != nil { fv := qu.ObjToString(resulttmp[field]) if fv != "" { if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则 e.RedisMatch(field, fv, resulttmp) //redis匹配 } else { //除了buyer和winner,其他字段走规则匹配 e.RuleMatch(field, fv, resulttmp) } } } } } //Redis匹配 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) { defer qu.Catch() i := redis.GetInt(field, field+"_"+fv) //查找redis if i == 0 { //reids未找到,执行规则匹配 val[field+"_isredis"] = false e.RuleMatch(field, fv, val) //规则匹配 } else { //redis找到,打标识存库 val[field+"_isredis"] = true } } //规则匹配 func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) { defer qu.Catch() if fieldval != "" { SMap := e.StartMatch(field, fieldval) //SMap.AddKey(field+"_isaudit", false) for _, k := range SMap.Keys { tmpMap[k] = SMap.Map[k] } tmpMap[field+"_isaudit"] = false //添加字段未审核信息 } } //开始规则匹配 func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap { defer qu.Catch() SMap := pretreated.NewSortMap() lock.Lock() f := e.RecogFieldMap[field] lock.Unlock() if len(f) > 0 { fid := qu.BsonIdToSId(f["_id"]) recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"]) textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤 if textAfterRecogFieldPrerule != "" { lock.Lock() classMap := e.FidClassMap[fid] lock.Unlock() L: for _, c := range classMap { //class classid := qu.BsonIdToSId(c["_id"]) classPrerule := qu.ObjToString(c["s_class_prerule"]) savefield := qu.ObjToString(c["s_savefield"]) //保存字段 textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤 if textAfterClassPrerule != "" { lock.Lock() ruleMap := e.CidRuleMap[classid] lock.Unlock() for _, r := range ruleMap { //rule rulePrerule := qu.ObjToString(r["s_rule_prerule"]) s_code := qu.ObjToString(r["s_code"]) rule := r["rule"].([]interface{}) textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤 if textAfterRulePrerule != "" { b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule) if b { //匹配到一个分类下某个规则时,不再继续匹配 if savefield != "" { //保存字段不为空,存储代码信息 SMap.AddKey(field+"_"+savefield, s_code) } break L } } } } } } } return SMap }