123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815 |
- package extract
- import (
- "fmt"
- log "github.com/donnie4w/go-logger/logger"
- "gopkg.in/mgo.v2/bson"
- "jy/clear"
- db "jy/mongodbutil"
- "jy/pretreated"
- ju "jy/util"
- qu "qfw/util"
- "qfw/util/redis"
- "strconv"
- "strings"
- "time"
- "unicode/utf8"
- )
- // 结果追踪调试
- func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
- defer qu.Catch()
- ext := TaskList[taskId]
- if ext == nil {
- ext = &ExtractTask{}
- ext.Id = taskId
- ext.InitTestTaskInfo(resultcoll, trackcoll)
- ext.IsRun = true
- ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
- }
- ext.InitSite()
- ext.InitRulePres()
- ext.InitRuleBacks(false)
- ext.InitRuleBacks(true)
- ext.InitRuleCore(false)
- ext.InitRuleCore(true)
- ext.InitPkgCore()
- ext.InitBlockRule()
- ext.InfoTypeList()
- ext.InitTag(false)
- ext.InitTag(true)
- ext.InitClearFn(false)
- ext.InitClearFn(true)
- ext.Lock()
- if ext.IsExtractCity && ext.ProvinceMap == nil { //版本上控制是否开始城市抽取
- ext.InitCityInfo()
- ext.InitAreaCode()
- ext.InitPostCode()
- }
- ext.Unlock()
- //质量审核
- ext.InitAuditFields()
- ext.InitAuditRule()
- ext.InitAuditClass()
- ext.InitAuditRecogField()
- //品牌抽取是否开启
- ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
- //价格个数抽取是否开启
- ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
- //附件抽取是否开启
- ext.InitFile()
- ext.TaskInfo.TestColl = resultcoll
- TaskList[taskId] = ext
- return RunExtractTestTask(ext, startId, num)
- }
- func IdTrans(startId string) bson.ObjectId {
- defer qu.Catch()
- return bson.ObjectIdHex(startId)
- }
- // 开始测试任务抽取~结果追踪
- func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
- n, _ := strconv.Atoi(num)
- id := IdTrans(startId)
- if id.Valid() {
- query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
- list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
- for _, v := range *list {
- if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
- continue
- }
- var j, jf *ju.Job
- var isSite bool
- j, _, isSite = ext.PreInfo(v)
- go ext.ExtractProcess(j, jf, isSite) //抽取-打分-保存
- ext.TaskInfo.ProcessPool <- true
- }
- return true
- } else {
- return false
- }
- }
- // 启动抽取
- func StartExtractTaskId(taskId string) bool {
- defer qu.Catch()
- isgo := false
- ext := TaskList[taskId]
- if ext == nil {
- ext = &ExtractTask{}
- ext.Id = taskId
- ext.InitTaskInfo()
- isgo = true
- } else {
- ext.Id = taskId
- ext.InitTaskInfo()
- }
- ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
- ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
- ext.InitSite()
- ext.InitRulePres()
- ext.InitRuleBacks(false)
- ext.InitRuleBacks(true)
- ext.InitRuleCore(false)
- ext.InitRuleCore(true)
- ext.InitPkgCore()
- ext.InitBlockRule()
- ext.InfoTypeList()
- ext.InitTag(false)
- ext.InitTag(true)
- ext.InitClearFn(false)
- ext.InitClearFn(true)
- ext.Lock()
- if ext.IsExtractCity { //版本上控制是否开始城市抽取
- ext.InitCityInfo()
- ext.InitAreaCode()
- ext.InitPostCode()
- }
- ext.Unlock()
- //质量审核
- ext.InitAuditFields()
- ext.InitAuditRule()
- ext.InitAuditClass()
- ext.InitAuditRecogField()
- //品牌抽取是否开启
- ju.IsBrandGoods, _ = ju.Config["brandgoods"].(bool)
- //价格个数抽取是否开启
- ju.IsPriceNumber, _ = ju.Config["pricenumber"].(bool)
- //附件抽取是否开启
- ext.InitFile()
- ext.IsRun = true
- go ext.ResultSave(true)
- go ext.BidSave(true)
- if isgo {
- go RunExtractTask(taskId)
- }
- TaskList[taskId] = ext
- return true
- }
- // 停止抽取
- func StopExtractTaskId(taskId string) bool {
- defer qu.Catch()
- ext := TaskList[taskId]
- if ext != nil {
- ext.IsRun = false
- TaskList[taskId] = ext
- }
- //更新task.s_extlastid
- db.Mgo.UpdateById("task", taskId, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
- return true
- }
- // 开始抽取
- func RunExtractTask(taskId string) {
- defer qu.Catch()
- ext := TaskList[taskId]
- query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
- count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
- pageNum := (count + PageSize - 1) / PageSize
- limit := PageSize
- if count < PageSize {
- limit = count
- }
- fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
- for i := 0; i < pageNum; i++ {
- query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
- list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
- fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
- for _, v := range *list {
- //根据标题判断是否抽取
- b := IsExtract("title", qu.ObjToString(v["title"]), "")
- if !b {
- continue
- }
- _id := qu.BsonIdToSId(v["_id"])
- //log.Debug(_id)
- if !ext.IsRun {
- break
- }
- var j, jf *ju.Job
- var isSite bool
- if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
- v["isextFile"] = true
- j, jf, isSite = ext.PreInfo(v)
- } else {
- j, _, isSite = ext.PreInfo(v)
- }
- go ext.ExtractProcess(j, jf, isSite)
- ext.TaskInfo.LastExtId = _id
- ext.TaskInfo.ProcessPool <- true
- }
- db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
- if !ext.IsRun {
- break
- }
- }
- //更新task.s_extlastid
- time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
- }
- // 信息预处理-不和版本关联,取最新版本的配置项
- func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
- return (&ExtractTask{}).PreInfo(doc)
- }
- // 信息预处理-和版本关联-处理表格-附件-kv标签库-中标候选人
- func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
- defer qu.Catch()
- //判断是否有附件这个字段
- var isextFile bool
- if doc["isextFile"] != nil {
- isextFile = doc["isextFile"].(bool)
- }
- isextFile = false
- detail := ""
- summary := qu.ObjToString(doc["summary"])
- detail = CleanDetailText(qu.ObjToString(doc["detail"]), summary)
- //调整采用detail抽取
- if utf8.RuneCountInString(detail) > 10000 {
- detail = string(([]rune(detail))[:10000])
- }
- doc["detail"] = detail
- isClearnMoney := !clearMoneyReg.MatchString(detail)
- if isClearnMoney {
- isClearnMoney = !clearMoneyReg.MatchString(qu.ObjToString(doc["title"]))
- }
- isClearnMoneystr := qu.ObjToString(qu.If(isClearnMoney, "T", ""))
- if isextFile {
- file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
- }
- toptype := qu.ObjToString(doc["toptype"])
- subtype := qu.ObjToString(doc["subtype"])
- if qu.ObjToString(doc["type"]) == "bid" {
- toptype = "结果"
- }
- if subtype == "其他" {
- subtype = "其它"
- }
- if toptype == "" || subtype == "" {
- toptype, subtype = "all", "all"
- }
- if toptype == "采购意向" || subtype == "采购意向" {
- toptype, subtype = "招标", "招标" //暂时按照"招标"
- }
- toMap := qu.ObjToMap(doc["jsondata"])
- if (*toMap) != nil {
- if (*toMap)["extweight"] == nil {
- (*toMap)["extweight"] = ju.Config["jsondata_extweight"]
- }
- if (*toMap)["jsoncontent"] != nil {
- delete(*toMap, "jsoncontent")
- }
- for k, v := range *toMap {
- if _, ok := v.(float64); ok {
- continue
- } else if _, ok := v.(int64); ok {
- continue
- } else if _, ok2 := v.(string); ok2 {
- continue
- } else {
- delete(*toMap, k)
- }
- }
- }
- j = &ju.Job{
- SourceMid: qu.BsonIdToSId(doc["_id"]),
- Category: toptype,
- CategorySecond: subtype,
- Content: qu.ObjToString(doc["detail"]),
- SpiderCode: qu.ObjToString(doc["spidercode"]),
- Site: qu.ObjToString(doc["site"]),
- Title: qu.ObjToString(doc["title"]),
- Data: &doc,
- City: qu.ObjToString(doc["city"]),
- Province: qu.ObjToString(doc["area"]),
- Jsondata: toMap,
- Result: map[string][]*ju.ExtField{},
- BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
- RuleBlock: e.RuleBlock,
- Dataging: qu.IntAll(doc["dataging"]),
- IsClearnMoney: isClearnMoneystr,
- IsUnRulesTab: false,
- }
- if isextFile {
- jf = &ju.Job{
- SourceMid: qu.BsonIdToSId(doc["_id"]),
- Category: toptype,
- CategorySecond: subtype,
- Content: qu.ObjToString(doc["detailfile"]),
- SpiderCode: qu.ObjToString(doc["spidercode"]),
- Site: qu.ObjToString(doc["site"]),
- Title: qu.ObjToString(doc["title"]),
- Data: &doc,
- City: qu.ObjToString(doc["city"]),
- Province: qu.ObjToString(doc["area"]),
- Jsondata: toMap,
- Result: map[string][]*ju.ExtField{},
- BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
- RuleBlock: e.RuleBlock,
- IsFile: isextFile,
- Dataging: qu.IntAll(doc["dataging"]),
- IsClearnMoney: isClearnMoneystr,
- IsUnRulesTab: false,
- }
- }
- codeSite := j.SpiderCode
- //是否启用站点
- if value, ok := e.SiteMerge.Load(codeSite); ok {
- isSite = value.(bool)
- }
- if isSite {
- //是否配置站点
- exp, isSite := e.Luacodes.Load(codeSite)
- if isSite {
- if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
- e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
- }
- if exp.(map[string]interface{})["e.SiteTag"] != nil {
- e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
- }
- if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
- e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
- }
- if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
- e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
- }
- }
- }
- qu.Try(func() { //不解析表格
- pretreated.AnalyStartNoTable(j, isSite, codeSite) //job.Block分块
- if isextFile && strings.TrimSpace(jf.Content) != "" {
- pretreated.AnalyStartNoTable(jf, isSite, codeSite)
- }
- }, func(err interface{}) {
- log.Debug("pretreated.AnalyStart", err, j.SourceMid)
- })
- return j, jf, isSite
- }
- // 抽取-正文
- func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
- e.ExtractDetail(j, isSite, j.SpiderCode) //正文抽取属性
- if jf != nil && jf.IsFile { //附件jf → j 合并
- e.ExtractDetail(jf, isSite, j.SpiderCode)
- for tmpk, xs := range jf.Result {
- if len(j.Result[tmpk]) == 0 {
- if tmpk == "budget" || tmpk == "bidamount" {
- for _, v := range xs {
- if fv, ok := v.Value.(float64); ok && fv > 100 && fv < 50000000000 {
- j.Result[tmpk] = append(j.Result[tmpk], v)
- }
- }
- } else {
- if tmpk == "winner" && j.Category == "招标" && j.CategorySecond != "单一" {
- continue
- }
- j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
- }
- } else {
- if tmpk == "winner" { //均没有有效值~采用附件的
- isUsed := false
- for _, v := range j.Result[tmpk] {
- if v.Value != "" {
- isUsed = true
- break
- }
- }
- if !isUsed {
- if j.Category == "招标" && j.CategorySecond != "单一" {
- continue
- }
- j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
- }
- }
- }
- }
- if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
- if j.Category == "招标" && j.CategorySecond != "单一" {
- } else {
- j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
- }
- }
- if len(j.PackageInfo) == 0 && isUsedPackageJF(jf.PackageInfo) {
- j.PackageInfo = jf.PackageInfo
- }
- }
- if isSite {
- ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
- if ok && ismerge.(bool) {
- tmpj := &ju.Job{
- SourceMid: j.SourceMid,
- Category: j.Category,
- CategorySecond: j.CategorySecond,
- Content: j.Content,
- SpiderCode: j.SpiderCode,
- //Domain: qu.ObjToString(doc["domain"]),
- //Href: qu.ObjToString(doc["href"]),
- Title: j.Title,
- Data: j.Data,
- City: j.City,
- Province: j.Province,
- Jsondata: j.Jsondata,
- Result: map[string][]*ju.ExtField{},
- BuyerAddr: j.BuyerAddr,
- RuleBlock: e.RuleBlock,
- }
- qu.Try(func() {
- pretreated.AnalyStart(tmpj, false, "") //job.Block分块
- }, func(err interface{}) {
- log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
- })
- e.ExtractDetail(tmpj, false, "")
- //合并数据
- j.Block = append(j.Block, tmpj.Block...)
- j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
- for tmpk, _ := range j.Result {
- if len(tmpj.Result[tmpk]) > 0 {
- j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
- }
- }
- for tmpk, _ := range tmpj.Result {
- if len(j.Result[tmpk]) == 0 {
- j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
- }
- }
- }
- }
- //分析抽取结果并保存
- AnalysisSaveResult(j, jf, e)
- <-e.TaskInfo.ProcessPool
- }
- // 抽取-正文-规则等 detail
- func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
- qu.Try(func() {
- doc := *j.Data
- tmprules := map[string][]*RuleCore{}
- lockrule.Lock()
- //加载分类抽取配置
- if j.Category == "all" || j.CategorySecond == "all" {
- if isSite {
- for k, vc1 := range e.SiteRuleCores["all_all"] {
- tmprules[k] = vc1
- }
- } else {
- for k, vc1 := range e.RuleCores["all_all"] {
- tmprules[k] = vc1
- }
- }
- } else {
- if isSite {
- for k, vc1 := range e.SiteRuleCores[j.Category+"_"+j.CategorySecond] {
- tmprules[k] = vc1
- }
- //找不到配置类别全抽
- if tmprules == nil || len(tmprules) == 0 {
- for k, vc1 := range e.SiteRuleCores["all_all"] {
- tmprules[k] = vc1
- }
- }
- } else {
- for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
- tmprules[k] = vc1
- }
- //找不到配置类别全抽
- if tmprules == nil || len(tmprules) == 0 {
- for k, vc1 := range e.RuleCores["all_all"] {
- tmprules[k] = vc1
- }
- }
- }
- }
- if len(tmprules) < 1 { //分类未覆盖部分
- if isSite {
- for k, vc1 := range e.RuleCores["all_all"] {
- tmprules[k] = vc1
- }
- } else {
- for k, vc1 := range e.SiteRuleCores["all_all"] {
- tmprules[k] = vc1
- }
- }
- }
- lockrule.Unlock()
- //抽取规则
- for _, vc1 := range tmprules {
- for _, vc := range vc1 {
- tmp := ju.DeepCopy(doc).(map[string]interface{})
- //是否进入逻辑
- if !ju.Logic(vc.LuaLogic, tmp) {
- continue
- }
- if vc.Field == "bidamount" {
- //log.Debug("调试抽取字段")
- }
- //抽取-前置规则
- //for _, v := range vc.RulePres {
- // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
- //}
- // log.Debug("抽取-前置规则", tmp)
- //抽取-规则
- ExtRuleCore(tmp, e, vc, j, isSite)
- // log.Debug("抽取-规则", tmp)
- //抽取-后置规则
- for _, v := range vc.RuleBacks {
- ExtRegBack(j, v, e.TaskInfo, vc)
- }
- //kv规则
- for _, v := range vc.KVRuleCores {
- ExtRuleKV(j, v, e.TaskInfo)
- }
- //项目名称未能抽取到,标题来凑
- if vc.Field == "projectname" {
- if vc.ExtFrom == "title" {
- isextitle := true
- for _, v := range j.Result[vc.Field] {
- if len([]rune(qu.ObjToString(v.Value))) > 5 {
- isextitle = false
- break
- }
- }
- if isextitle { //标题加入选举
- field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
- if isSite {
- field.Score = 1
- }
- j.Result[vc.Field] = append(j.Result[vc.Field], field)
- }
- }
- for i := 0; i < 3; i++ {
- for _, v := range vc.RuleBacks {
- ExtRegBack(j, v, e.TaskInfo, vc)
- }
- }
- }
- }
- }
- //全局后置规则
- if isSite {
- for _, v := range e.SiteRuleBacks {
- ExtRegBack(j, v, e.TaskInfo, nil)
- }
- } else {
- for _, v := range e.RuleBacks {
- ExtRegBack(j, v, e.TaskInfo, nil)
- }
- }
- //函数清理
- for key, val := range j.Result {
- for i, v := range val {
- if v.Field == "projectname" && v.Type == "table" {
- break
- }
- if key == "budget" || key == "bidamount" {
- if _, ok := v.Value.(float64); ok && !v.IsTrue {
- continue
- }
- }
- lockclear.Lock()
- var cfn = []string{}
- if isSite {
- cfn = e.SiteClearFn[key]
- if len(cfn) == 0 {
- cfn = e.ClearFn[key]
- }
- } else {
- cfn = e.ClearFn[key]
- }
- lockclear.Unlock()
- if len(cfn) == 0 {
- continue
- }
- data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
- if key == "budget" || key == "bidamount" {
- if istrue, ok := data[len(data)-1].(bool); istrue && ok {
- j.Result[key][i].IsTrue = true
- } else {
- j.Result[key][i].Value = data[0]
- continue
- }
- }
- before, _ := v.Value.(string)
- v.Value = data[0]
- BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
- //添加行数清理的日志 , 清理特殊符号
- lockclear.Lock()
- if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
- text := qu.ObjToString(v.Value)
- before = text
- //指定清理--新增-函数清理-其他清理
- if key == "winner" || key == "agency" || key == "buyer" {
- text = strings.ReplaceAll(text, "【", "")
- text = strings.ReplaceAll(text, "】", "")
- }
- v.Value = clear.OtherClean(key, text)
- BeforeAddClearFnLog("clear.OtherClean", "特殊符号清理clear.OtherClean", j.SourceMid, before, v.MatchType, v, e)
- }
- //AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)
- lockclear.Unlock()
- }
- }
- PackageDetail(j, e, isSite, codeSite) //处理分包信息-去重
- //bs, _ := json.Marshal(j.Result)
- //log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
- }, func(err interface{}) {
- log.Debug("ExtractProcess err", err, j.SourceMid)
- })
- }
- func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
- qu.Try(func() {
- doc := *j.Data
- //抽取规则
- tmprules := map[string][]*RuleCore{}
- lockrule.Lock()
- if j.Category == "all" || j.CategorySecond == "all" {
- for k, vc1 := range e.RuleCores["all_all"] {
- tmprules[k] = vc1
- }
- } else {
- for k, vc1 := range e.RuleCores[j.Category+"_"+j.CategorySecond] {
- tmprules[k] = vc1
- }
- }
- lockrule.Unlock()
- for _, vc1 := range tmprules {
- for _, vc := range vc1 {
- tmp := ju.DeepCopy(doc).(map[string]interface{})
- //是否进入逻辑
- if !ju.Logic(vc.LuaLogic, tmp) {
- continue
- }
- //抽取-前置规则
- //for _, v := range vc.RulePres {
- // if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
- // tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
- // }
- //}
- //抽取-规则
- if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
- ExtRuleCore(tmp, e, vc, j, isSite)
- }
- //抽取-后置规则
- for _, v := range vc.RuleBacks {
- if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
- ExtRegBack(j, v, e.TaskInfo, vc)
- }
- }
- }
- }
- //全局后置规则
- for _, v := range e.RuleBacks {
- if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
- ExtRegBack(j, v, e.TaskInfo, nil)
- }
- }
- //函数清理
- for key, val := range j.Result {
- for _, v := range val {
- lockclear.Lock()
- var cfn = []string{}
- if isSite {
- cfn = e.SiteClearFn[key]
- if len(cfn) == 0 {
- cfn = e.ClearFn[key]
- }
- } else {
- cfn = e.ClearFn[key]
- }
- lockclear.Unlock()
- if len(cfn) == 0 {
- continue
- }
- data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode, j.IsClearnMoney)
- v.Value = data[0]
- //清理特殊符号
- lockclear.Lock()
- if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
- clear.MesField[key] != nil {
- text := qu.ObjToString(v.Value)
- text = clear.OtherClean(key, text)
- v.Value = text
- }
- lockclear.Unlock()
- }
- }
- PackageDetail(j, e, isSite, codeSite) //处理分包信息
- //bs, _ := json.Marshal(j.Result)
- //log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
- }, func(err interface{}) {
- log.Debug("ExtractProcess err", err)
- })
- }
- // 审查
- func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
- defer qu.Catch()
- //获取审核字段
- for _, field := range e.AuditFields {
- //1.分包
- if resulttmp["package"] != nil {
- packagedata := resulttmp["package"].(map[string]map[string]interface{})
- for _, val := range packagedata {
- if val[field] != nil {
- fv := qu.ObjToString(val[field])
- if fv != "" {
- if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
- e.RedisMatch(field, fv, val) //redis匹配
- } else { //除了buyer和winner,其他字段走规则匹配
- e.RuleMatch(field, fv, val)
- }
- }
- }
- }
- }
- //2.外围
- if resulttmp[field] != nil {
- fv := qu.ObjToString(resulttmp[field])
- if fv != "" {
- if field == "buyer" || field == "winner" { //field为buyer和winner时特殊处理,先从Redis中查,有直接通过,没有走匹配规则
- e.RedisMatch(field, fv, resulttmp) //redis匹配
- } else { //除了buyer和winner,其他字段走规则匹配
- e.RuleMatch(field, fv, resulttmp)
- }
- }
- }
- }
- }
- // Redis匹配
- func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
- defer qu.Catch()
- i := redis.GetInt(field, field+"_"+fv) //查找redis
- if i == 0 { //reids未找到,执行规则匹配
- val[field+"_isredis"] = false
- e.RuleMatch(field, fv, val) //规则匹配
- } else { //redis找到,打标识存库
- val[field+"_isredis"] = true
- }
- }
- // 规则匹配
- func (e *ExtractTask) RuleMatch(field, fieldval string, tmpMap map[string]interface{}) {
- defer qu.Catch()
- if fieldval != "" {
- SMap := e.StartMatch(field, fieldval)
- //SMap.AddKey(field+"_isaudit", false)
- for _, k := range SMap.Keys {
- tmpMap[k] = SMap.Map[k]
- }
- tmpMap[field+"_isaudit"] = false //添加字段未审核信息
- }
- }
- // 开始规则匹配
- func (e *ExtractTask) StartMatch(field, text string) *pretreated.SortMap {
- defer qu.Catch()
- SMap := pretreated.NewSortMap()
- lock.Lock()
- f := e.RecogFieldMap[field]
- lock.Unlock()
- if len(f) > 0 {
- fid := qu.BsonIdToSId(f["_id"])
- recogFieldPreRule := qu.ObjToString(f["s_recogfield_prerule"])
- textAfterRecogFieldPrerule := ju.PreFilter(text, recogFieldPreRule) //识别字段的前置过滤
- if textAfterRecogFieldPrerule != "" {
- lock.Lock()
- classMap := e.FidClassMap[fid]
- lock.Unlock()
- L:
- for _, c := range classMap { //class
- classid := qu.BsonIdToSId(c["_id"])
- classPrerule := qu.ObjToString(c["s_class_prerule"])
- savefield := qu.ObjToString(c["s_savefield"]) //保存字段
- textAfterClassPrerule := ju.PreFilter(textAfterRecogFieldPrerule, classPrerule) //class的前置过滤
- if textAfterClassPrerule != "" {
- lock.Lock()
- ruleMap := e.CidRuleMap[classid]
- lock.Unlock()
- for _, r := range ruleMap { //rule
- rulePrerule := qu.ObjToString(r["s_rule_prerule"])
- s_name := qu.ObjToString(r["s_name"])
- rule := r["rule"].([]interface{})
- textAfterRulePrerule := ju.PreFilter(textAfterClassPrerule, rulePrerule) //class的前置过滤
- if textAfterRulePrerule != "" {
- b, _ := ju.RecogAnalyRules(textAfterRulePrerule, rule)
- if b { //匹配到一个分类下某个规则时,不再继续匹配
- if savefield != "" { //保存字段不为空,存储代码信息
- SMap.AddKey(field+"_"+savefield, s_name)
- }
- break L
- }
- }
- }
- }
- }
- }
- }
- return SMap
- }
|