12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946 |
- package extract
- import (
- //"encoding/json"
- "fmt"
- "jy/clear"
- db "jy/mongodbutil"
- "jy/pretreated"
- ju "jy/util"
- "log"
- qu "qfw/util"
- "regexp"
- "strconv"
- "strings"
- "sync"
- "time"
- "gopkg.in/mgo.v2/bson"
- )
- var (
- lock sync.RWMutex
- cut = ju.NewCut() //获取正文并清理
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
- TaskList map[string]*ExtractTask //任务列表
- saveLimit = 200 //抽取日志批量保存
- AreaGet DFA //敏感词
- AreaProvinceGet DFA //敏感词
- AreaSimGet DFA //敏感词
- Fields = `{"title":1,"detail":1,"contenthtml":1,"href":1,"site":1,"spidercode":1,"toptype":1,"area":1,"city":1}`
- )
- var CitySimConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市简称
- var CityAllConfig map[string]map[string]interface{} = make(map[string]map[string]interface{}) //城市全称
- var ProviceConfig map[string]interface{} = make(map[string]interface{}) //省份
- var ProvinceMap map[string]string = make(map[string]string)
- var CityBrief map[string]*City = make(map[string]*City) //只加载一次即可
- var ProvinceBrief map[string]*Province = make(map[string]*Province) //只加载一次
- var AreaToCity map[string][]*City = make(map[string][]*City) //两个文件共用
- //启动测试抽取
- func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
- defer qu.Catch()
- ext := &ExtractTask{}
- ext.Id = taskId
- ext.IsRun = true
- ext.InitTestTaskInfo(resultcoll, trackcoll)
- ext.TaskInfo.DB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
- ext.InitRulePres()
- ext.InitRuleBacks()
- ext.InitRuleCore()
- ext.InitTag()
- ext.InitClearFn()
- return RunExtractTestTask(ext, startId, num)
- }
- func IdTrans(startId string) bson.ObjectId {
- defer qu.Catch()
- return bson.ObjectIdHex(startId)
- }
- //开始测试任务抽取
- func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
- n, _ := strconv.Atoi(num)
- id := IdTrans(startId)
- if id.Valid() {
- query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
- list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
- for _, v := range *list {
- j := PreInfo(v)
- ext.TaskInfo.ProcessPool <- true
- go ext.ExtractProcess(j)
- }
- return true
- } else {
- return false
- }
- }
- //启动抽取
- func StartExtractTaskId(taskId string) bool {
- isgo := false
- ext := TaskList[taskId]
- if ext == nil {
- ext = &ExtractTask{}
- ext.Id = taskId
- ext.InitTaskInfo()
- isgo = true
- } else {
- ext.Id = taskId
- ext.InitTaskInfo()
- }
- ext.TaskInfo.DB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
- ext.InitRulePres()
- ext.InitRuleBacks()
- ext.InitRuleCore()
- ext.InitTag()
- ext.InitClearFn()
- // ext.InitProvince()
- // ext.InitCityAll()
- // ext.InitCitySim()
- ext.IsRun = true
- if isgo {
- go RunExtractTask(taskId)
- }
- TaskList[taskId] = ext
- return true
- }
- //停止抽取
- func StopExtractTaskId(taskId string) bool {
- ext := TaskList[taskId]
- if ext != nil {
- ext.IsRun = false
- TaskList[taskId] = ext
- }
- //更新task.s_extlastid
- db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
- return true
- }
- //开始抽取
- func RunExtractTask(taskId string) {
- ext := TaskList[taskId]
- query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
- list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, -1, -1)
- for k, v := range *list {
- log.Println(k, v["_id"])
- if !ext.IsRun {
- break
- }
- j := PreInfo(v)
- ext.TaskInfo.ProcessPool <- true
- go ext.ExtractProcess(j)
- ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
- }
- //更新task.s_extlastid
- db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
- time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
- }
- //信息预处理
- func PreInfo(doc map[string]interface{}) *ju.Job {
- detail := ""
- d1, _ := doc["detail"].(string)
- d2, _ := doc["contenthtml"].(string)
- if len(d1) >= len(d2) || d2 == "" {
- detail = d1
- } else {
- detail = d2
- }
- detail = ju.CutLableStr(detail)
- detail = cut.ClearHtml(detail)
- doc["detail"] = detail
- href := qu.ObjToString(doc["href"])
- if strings.HasPrefix(href, "http://") {
- href = href[7:]
- } else if strings.HasPrefix(href, "https://") {
- href = href[8:]
- }
- pos := strings.Index(href, "/")
- if pos > 0 {
- href = href[:pos]
- }
- doc["domain"] = href
- toptype := qu.ObjToString(doc["toptype"])
- if qu.ObjToString(doc["type"]) == "bid" {
- toptype = "结果"
- }
- if toptype == "" {
- toptype = "*"
- }
- j := &ju.Job{
- SourceMid: qu.BsonIdToSId(doc["_id"]),
- Category: toptype,
- Content: qu.ObjToString(doc["detail"]),
- SpiderCode: qu.ObjToString(doc["spidercode"]),
- Domain: qu.ObjToString(doc["domain"]),
- Href: qu.ObjToString(doc["href"]),
- Title: qu.ObjToString(doc["title"]),
- Data: &doc,
- City: qu.ObjToString(doc["city"]),
- Province: qu.ObjToString(doc["area"]),
- Result: map[string][]*ju.ExtField{},
- //BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
- }
- pretreated.AnalyStart(j)
- return j
- }
- //抽取
- func (e *ExtractTask) ExtractProcess(j *ju.Job) {
- qu.Catch()
- qu.Try(func() {
- doc := *j.Data
- //全局前置规则,结果覆盖doc属性
- for _, v := range e.RulePres {
- doc = ExtRegPre(doc, j, v, e.TaskInfo)
- }
- //log.Println("全局前置规则", doc)
- //抽取规则
- for _, vc := range e.RuleCores {
- tmp := ju.DeepCopy(doc).(map[string]interface{})
- //是否进入逻辑
- if !ju.Logic(vc.LuaLogic, tmp) {
- continue
- }
- //抽取-前置规则
- for _, v := range vc.RulePres {
- tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
- }
- //log.Println("抽取-前置规则", tmp)
- //抽取-规则
- for _, v := range vc.RuleCores {
- ExtRegCore(vc.ExtFrom, tmp, j, v, e)
- }
- //log.Println("抽取-规则", tmp)
- //抽取-后置规则
- for _, v := range vc.RuleBacks {
- ExtRegBack(j, v, e.TaskInfo)
- }
- //log.Println("抽取-后置规则", tmp)
- }
- //全局后置规则
- for _, v := range e.RuleBacks {
- ExtRegBack(j, v, e.TaskInfo)
- }
- //函数清理
- for key, val := range j.Result {
- for _, v := range val {
- data := clear.DoClearFn(e.ClearFn[key], []interface{}{v.Value, j.Content})
- v.Value = data[0]
- }
- }
- //bs, _ := json.Marshal(j.Result)
- //log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
- //抽取省份城市县
- //fmt.Println("-----------", j.Province, j.City, j.BuyerAddr, j.Title) //j.Address
- //ExtractPC(j.Result, j.Province, j.City, j.Title, j.BuyerAddr, j.SourceMid) //j.Address
- ExtractPC2(j.Result, "Province", "City", "Title", "Addr", j.SourceMid)
- //分析抽取结果并保存 todo
- AnalysisSaveResult(j.Data, j.Result, e.TaskInfo)
- }, func(err interface{}) {
- log.Println(err)
- <-e.TaskInfo.ProcessPool
- })
- <-e.TaskInfo.ProcessPool
- }
- //前置过滤
- func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInfo) map[string]interface{} {
- before := ju.DeepCopy(doc).(map[string]interface{})
- extinfo := map[string]interface{}{}
- if in.IsLua {
- lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
- if j != nil {
- lua.Block = j.Block
- }
- extinfo = lua.RunScript("pre")
- for k, v := range extinfo { //结果覆盖原doc
- doc[k] = v
- }
- AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
- } else {
- key := qu.If(in.Field == "", "detail", in.Field).(string)
- text := qu.ObjToString(doc[key])
- extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
- doc[key] = extinfo[key] //结果覆盖原doc
- AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
- }
- return doc
- }
- //抽取-规则
- func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
- if in.IsLua {
- lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
- if in.IsHasFields { //lua脚本配置有属性字段
- lua.KvMap = getKvByLuaFields(extfrom, j, in, et.Tag)
- } else {
- lua.KvMap = map[string][]map[string]interface{}{}
- }
- lua.Block = j.Block
- extinfo := lua.RunScript("core")
- for k, v := range extinfo {
- if j.Result[k] == nil {
- j.Result[k] = [](*ju.ExtField){}
- }
- if tmps, ok := v.([]map[string]interface{}); ok {
- for _, tmp := range tmps {
- j.Result[k] = append(j.Result[k],
- &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), extfrom, tmp["value"]})
- }
- }
- }
- if len(extinfo) > 0 {
- AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
- }
- } else {
- //全文正则
- text := qu.ObjToString(doc[extfrom])
- if in.Field != "" {
- extinfo := extRegCoreToResult(extfrom, text, j, in)
- if len(extinfo) > 0 {
- AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
- }
- }
- }
- }
- //lua脚本根据属性设置提取kv值
- func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]*Tag) map[string][]map[string]interface{} {
- kvmap := map[string][]map[string]interface{}{}
- for _, vv := range in.LFields {
- field := qu.ObjToString(vv)
- tags := t[qu.ObjToString(vv)] //获取对应标签库
- for _, bl := range j.Block {
- //冒号kv
- if bl.ColonKV != nil {
- kvs := bl.ColonKV.Kvs
- kvs2 := bl.ColonKV.Kvs_2
- for _, tag := range tags {
- for _, kv := range kvs {
- if tag.Type == "string" {
- if kv.Key == tag.Key {
- text := ju.TrimLRSpace(kv.Value, "")
- if text != "" {
- kvmap[field] = append(kvmap[field], map[string]interface{}{
- "field": field,
- "code": in.Code,
- "ruletext": tag.Key,
- "extfrom": extfrom,
- "value": text,
- "type": "colon1",
- "matchtype": "tag_string",
- })
- }
- break
- }
- } else if tag.Type == "regexp" {
- if tag.Reg.MatchString(kv.Key) {
- text := ju.TrimLRSpace(kv.Value, "")
- if text != "" {
- kvmap[field] = append(kvmap[field], map[string]interface{}{
- "field": field,
- "code": in.Code,
- "ruletext": tag.Key,
- "extfrom": extfrom,
- "value": text,
- "type": "colon1",
- "matchtype": "tag_regexp",
- })
- }
- break
- }
- }
- }
- for _, kv := range kvs2 {
- if tag.Type == "string" {
- if kv.Key == tag.Key {
- text := ju.TrimLRSpace(kv.Value, "")
- if text != "" {
- kvmap[field] = append(kvmap[field], map[string]interface{}{
- "field": field,
- "code": in.Code,
- "ruletext": tag.Key,
- "extfrom": extfrom,
- "value": text,
- "type": "colon2",
- "matchtype": "tag_string",
- })
- }
- break
- }
- } else if tag.Type == "regexp" {
- if tag.Reg.MatchString(kv.Key) {
- text := ju.TrimLRSpace(kv.Value, "")
- if text != "" {
- kvmap[field] = append(kvmap[field], map[string]interface{}{
- "field": field,
- "code": in.Code,
- "ruletext": tag.Key,
- "extfrom": extfrom,
- "value": text,
- "type": "colon2",
- "matchtype": "tag_regexp",
- })
- }
- break
- }
- }
- }
- }
- }
- //空格kv
- if bl.SpaceKV != nil {
- kvs := bl.SpaceKV.Kvs
- for _, tag := range tags {
- for _, kv := range kvs {
- if tag.Type == "string" {
- if kv.Key == tag.Key {
- text := ju.TrimLRSpace(kv.Value, "")
- if text != "" {
- kvmap[field] = append(kvmap[field], map[string]interface{}{
- "field": field,
- "code": in.Code,
- "ruletext": tag.Key,
- "extfrom": extfrom,
- "value": text,
- "type": "space",
- "matchtype": "tag_string",
- })
- }
- break
- }
- } else if tag.Type == "regexp" {
- if tag.Reg.MatchString(kv.Key) {
- text := ju.TrimLRSpace(kv.Value, "")
- if text != "" {
- kvmap[field] = append(kvmap[field], map[string]interface{}{
- "field": field,
- "code": in.Code,
- "ruletext": tag.Key,
- "extfrom": extfrom,
- "value": text,
- "type": "space",
- "matchtype": "tag_regexp",
- })
- }
- break
- }
- }
- }
- }
- }
- //表格kv
- if bl.TableKV != nil {
- kv := bl.TableKV.Kv
- for _, tag := range tags {
- for k, val := range kv {
- if tag.Type == "string" {
- if k == tag.Key {
- text := ju.TrimLRSpace(val, "")
- if text != "" {
- kvmap[field] = append(kvmap[field], map[string]interface{}{
- "field": field,
- "code": in.Code,
- "ruletext": tag.Key,
- "extfrom": extfrom,
- "value": text,
- "type": "table",
- "matchtype": "tag_string",
- })
- }
- break
- }
- } else if tag.Type == "regexp" {
- if tag.Reg.MatchString(k) {
- text := ju.TrimLRSpace(val, "")
- if text != "" {
- kvmap[field] = append(kvmap[field], map[string]interface{}{
- "field": field,
- "code": in.Code,
- "ruletext": tag.Key,
- "extfrom": extfrom,
- "value": text,
- "type": "table",
- "matchtype": "tag_regexp",
- })
- }
- break
- }
- }
- }
- }
- }
- }
- }
- return kvmap
- }
- //正则提取结果
- func extRegCoreToResult(extfrom, text string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
- extinfo := map[string][]map[string]interface{}{}
- if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
- apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
- if len(apos) > 0 {
- pos := apos[0]
- for k, p := range v.RegCore.ExtractPos {
- if len(pos) > p {
- if pos[p] == -1 || pos[p+1] == -1 {
- continue
- }
- val := text[pos[p]:pos[p+1]]
- tmps := []map[string]interface{}{}
- tmp := map[string]interface{}{
- "field": v.Field,
- "code": v.Code,
- "ruletext": v.RuleText,
- "extfrom": extfrom,
- "value": val,
- "type": "regexp",
- "matchtype": "regcontent",
- }
- tmps = append(tmps, tmp)
- extinfo[k] = tmps
- if val != "" {
- if j.Result[v.Field] == nil {
- j.Result[k] = [](*ju.ExtField){}
- }
- j.Result[k] = append(j.Result[k], &ju.ExtField{k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val})
- }
- }
- }
- }
- } else {
- pos := v.RegCore.Reg.FindStringIndex(text)
- val := ""
- if len(pos) == 2 {
- text = text[pos[1]:]
- rs := regexp.MustCompile("[^\r\n\t]+")
- tmp := rs.FindAllString(text, -1)
- if len(tmp) > 0 {
- val = tmp[0]
- }
- }
- if val != "" {
- tmps := []map[string]interface{}{}
- tmp := map[string]interface{}{
- "field": v.Field,
- "code": v.Code,
- "ruletext": v.RuleText,
- "extfrom": extfrom,
- "value": val,
- "type": "regexp",
- "matchtype": "regcontent",
- }
- tmps = append(tmps, tmp)
- extinfo[v.Field] = tmps
- if j.Result[v.Field] == nil {
- j.Result[v.Field] = [](*ju.ExtField){}
- }
- j.Result[v.Field] = append(j.Result[v.Field], &ju.ExtField{v.Field, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val})
- }
- }
- return extinfo
- }
- //后置过滤
- func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
- if in.IsLua {
- result := GetResultMapForLua(j)
- lua := ju.LuaScript{Code: in.Code, Name: in.Name, Result: result, Script: in.RuleText}
- if j != nil {
- lua.Block = j.Block
- }
- extinfo := lua.RunScript("back")
- for k, v := range extinfo {
- if tmps, ok := v.([]map[string]interface{}); ok {
- j.Result[k] = [](*ju.ExtField){}
- for _, tmp := range tmps {
- j.Result[k] = append(j.Result[k], &ju.ExtField{k, qu.ObjToString(tmp["code"]), qu.ObjToString(tmp["ruletext"]), qu.ObjToString(tmp["type"]), qu.ObjToString(tmp["matchtype"]), qu.ObjToString(tmp["extfrom"]), tmp["value"]})
- }
- }
- }
- if len(extinfo) > 0 {
- AddExtLog("clear", j.SourceMid, result, extinfo, in, t) //抽取日志
- }
- } else {
- extinfo := map[string]interface{}{}
- if in.Field != "" {
- if j.Result[in.Field] != nil {
- tmp := j.Result[in.Field]
- exts := []interface{}{}
- for k, v := range tmp {
- text := qu.ObjToString(v.Value)
- if text != "" {
- text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
- }
- j.Result[in.Field][k].Value = text
- exts = append(exts, map[string]interface{}{
- "field": v.Field,
- "code": v.Code,
- "ruletext": v.RuleText,
- "type": v.Type,
- "matchtype": v.MatchType,
- "extfrom": v.ExtFrom,
- "value": text,
- })
- }
- extinfo[in.Field] = exts
- if len(extinfo) > 0 {
- AddExtLog("clear", j.SourceMid, tmp, extinfo, in, t) //抽取日志
- }
- }
- } else {
- for key, tmp := range j.Result {
- exts := []interface{}{}
- for k, v := range tmp {
- text := qu.ObjToString(v.Value)
- if text != "" {
- text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
- }
- j.Result[key][k].Value = text
- exts = append(exts, map[string]interface{}{
- "field": v.Field,
- "code": v.Code,
- "ruletext": v.RuleText,
- "type": v.Type,
- "matchtype": v.MatchType,
- "extfrom": v.ExtFrom,
- "value": text,
- })
- }
- extinfo[key] = exts
- }
- if len(extinfo) > 0 {
- AddExtLog("clear", j.SourceMid, j.Result, extinfo, in, t) //抽取日志
- }
- }
- }
- }
- //获取抽取结果map[string][]interface{},lua脚本使用
- func GetResultMapForLua(j *ju.Job) map[string][]map[string]interface{} {
- result := map[string][]map[string]interface{}{}
- for key, val := range j.Result {
- if result[key] == nil {
- result[key] = []map[string]interface{}{}
- }
- for _, v := range val {
- tmp := map[string]interface{}{
- "field": v.Field,
- "code": v.Code,
- "ruletext": v.RuleText,
- "value": v.Value,
- "type": v.Type,
- "matchtype": v.MatchType,
- "extfrom": v.ExtFrom,
- }
- result[key] = append(result[key], tmp)
- }
- }
- return result
- }
- //抽取日志
- func AddExtLog(ftype, sid string, before interface{}, extinfo interface{}, v *RegLuaInfo, t *TaskInfo) {
- if !t.IsEtxLog {
- return
- }
- logdata := map[string]interface{}{
- "code": v.Code,
- "name": v.Name,
- "type": ftype,
- "ruletext": v.RuleText,
- "islua": v.IsLua,
- "field": v.Field,
- "version": t.Version,
- "taskname": t.Name,
- "before": before,
- "extinfo": extinfo,
- "sid": sid,
- "comeintime": time.Now().Unix(),
- }
- lock.Lock()
- ExtLogs[t] = append(ExtLogs[t], logdata)
- lock.Unlock()
- }
- //保存抽取日志
- func SaveExtLog() {
- tmpLogs := map[*TaskInfo][]map[string]interface{}{}
- lock.Lock()
- tmpLogs = ExtLogs
- ExtLogs = map[*TaskInfo][]map[string]interface{}{}
- lock.Unlock()
- for k, v := range tmpLogs {
- if len(v) < saveLimit {
- db.Mgo.SaveBulk(k.TrackColl, v...)
- } else {
- for {
- if len(v) > saveLimit {
- tmp := v[:saveLimit]
- db.Mgo.SaveBulk(k.TrackColl, tmp...)
- v = v[saveLimit:]
- } else {
- db.Mgo.SaveBulk(k.TrackColl, v...)
- break
- }
- }
- }
- }
- time.AfterFunc(10*time.Second, SaveExtLog)
- }
// FieldValue pairs a value with a count.
type FieldValue struct {
	// Value is the stored value.
	Value interface{}
	// Count is the counter associated with Value.
	Count int
}
- //分析抽取结果并保存
- func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.ExtField, task *TaskInfo) {
- _id := qu.BsonIdToSId((*doc)["_id"])
- //结果排序
- values := map[string][]*ju.SortObject{}
- for key, val := range result {
- fieldValue := map[string][]interface{}{}
- for _, v := range val {
- if fieldValue[fmt.Sprint(v.Value)] == nil {
- fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
- } else {
- fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
- }
- }
- objects := []*ju.SortObject{}
- for k, v := range fieldValue {
- tmp := &ju.SortObject{
- Key: k,
- Value: qu.IntAll(v[0]),
- Object: v[1],
- }
- objects = append(objects, tmp)
- }
- values[key] = ju.ExtSort(objects)
- }
- //从排序结果中取值
- tmp := map[string]interface{}{}
- for key, val := range values {
- for _, v := range val { //取第一个
- if v.Key != "" {
- tmp[key] = v.Object
- break
- }
- }
- }
- if task.TestColl == "" {
- if len(tmp) > 0 { //保存抽取结果
- task.DB.Update(task.SaveColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
- }
- //保存抽取详情
- tmp["result"] = result
- for k, v := range *doc {
- if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
- tmp[k] = v
- }
- }
- db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
- } else { //测试结果
- //保存抽取详情
- tmp["result"] = result
- for k, v := range *doc {
- if tmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
- tmp[k] = v
- }
- }
- db.Mgo.Update(task.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
- }
- }
- //抽取城市、省份
- func ExtractPC2(result map[string][]*ju.ExtField, province, city, title, addr, sourcemid string) (bres bool, c, p string) {
- var pjnarr, buyerarr []string
- var pb []interface{}
- for n, val := range result["projectname"] {
- pjnarr[n] = fmt.Sprint(val.Value)
- }
- for n, val := range result["buyer"] {
- buyerarr[n] = fmt.Sprint(val.Value)
- }
- pl := len(pjnarr)
- bl := len(buyerarr)
- max := 0
- if pl > bl {
- max = pl
- } else {
- max = bl
- }
- //city, buyer, addr, projectname, title
- if max == 0 { //没有projectname和buyer结果集
- tmp1 := []string{city, "", addr, "", title}
- pb = append(pb, tmp1)
- } else { //至少有一个结果集
- if max == pl {
- for i := 0; i < max; i++ {
- p := pjnarr[i]
- b := ""
- if i < bl {
- b = buyerarr[i]
- }
- tmp2 := []string{city, b, addr, p, title}
- pb = append(pb, tmp2)
- }
- } else {
- for i := 0; i < max; i++ {
- b := buyerarr[i]
- p := ""
- if i < pl {
- p = pjnarr[i]
- }
- tmp3 := []string{city, b, addr, p, title}
- pb = append(pb, tmp3)
- }
- }
- }
- log.Println(pb)
- return
- }
- func ExtractPC(buyer, projectname, title, city, province, addr string, id interface{}) (bres bool, c, p string) {
- defer qu.Catch()
- bc := true //是否继续抽取
- if city != "" {
- if CityBrief[city] == nil { //简称不存在
- //log.Println("city err:", city, id)
- } else { //简称存在
- if province != CityBrief[city].P.Brief { //省份不对
- log.Println("province err:", city, province, id)
- } else {
- bc = false
- //原值正确,不用抽取
- }
- }
- }
- //有省份
- bp := false
- if ProvinceBrief[province] != nil {
- bp = true
- } else { //没有省份,先识别省份
- for _, str := range []string{city, buyer, addr, projectname, title} {
- word := AreaProvinceGet.CheckSensitiveWord(str) //省全称
- if word != "" {
- province = ProvinceMap[word] //省简称
- bp = true
- break
- }
- }
- }
- //匹配城市
- if bc { //城市简称不存在CityBrief[city]==nil,或城市简称存在但省份不对,继续抽取
- //目前是全匹配模式,如果再加上精简匹配,加一层循环
- for pos, GET := range []DFA{AreaGet, AreaSimGet} {
- ws := make([]string, 5)
- for n, str := range []string{city, buyer, addr, projectname, title} {
- if str != "" {
- word := GET.CheckSensitiveWord(str)
- if pos == 1 { //用简称 后辍为路、集团替换
- str1 := strings.Replace(str, word+"路", "", 1)
- if str1 != str {
- word = GET.CheckSensitiveWord(str1)
- }
- }
- ws[n] = word
- if word != "" {
- res := AreaToCity[word]
- if len(res) == 1 {
- //判断省份
- if !bp || province == res[0].P.Brief { //省份不存在或一致直接返回
- bres = true
- c = res[0].Brief
- p = res[0].P.Brief
- break
- } else { //不一致时。。暂时不处理
- }
- } else { //多个时
- }
- }
- }
- }
- if !bres {
- mc := map[string]int{}
- for _, w := range ws {
- res := AreaToCity[w]
- for _, ct := range res {
- if ct == nil {
- continue
- }
- if bp { //有省份
- if ct.P != nil && ct.P.Brief == province {
- mc[ct.Brief]++
- }
- } else { //没有省份
- mc[ct.Brief]++
- }
- }
- }
- //计算mc中最大值且大于1
- max := 1
- v := ""
- for mk, mv := range mc {
- if mv > max {
- v = mk
- }
- }
- if v != "" {
- bres = true
- c = CityBrief[v].Brief
- p = CityBrief[v].P.Brief
- } else if len(mc) > 0 {
- //取级别更大的
- v := ""
- for mk, _ := range mc {
- if CityBrief[mk].P.Cap == mk {
- bres = true
- c = CityBrief[mk].Brief
- p = CityBrief[mk].P.Brief
- break
- } else {
- v = mk
- }
- }
- if !bres {
- bres = true
- c = CityBrief[v].Brief
- p = CityBrief[v].P.Brief
- }
- }
- }
- if bres {
- break
- }
- }
- } else {
- return
- }
- if !bres {
- //取默认省会
- if ProvinceBrief[province] != nil {
- bres = true
- c = ProvinceBrief[province].Cap
- p = province
- }
- }
- return
- }
|