|
@@ -8,7 +8,6 @@ import (
|
|
db "jy/mongodbutil"
|
|
db "jy/mongodbutil"
|
|
"jy/pretreated"
|
|
"jy/pretreated"
|
|
ju "jy/util"
|
|
ju "jy/util"
|
|
- "log"
|
|
|
|
qu "qfw/util"
|
|
qu "qfw/util"
|
|
"qfw/util/redis"
|
|
"qfw/util/redis"
|
|
"reflect"
|
|
"reflect"
|
|
@@ -18,6 +17,7 @@ import (
|
|
"time"
|
|
"time"
|
|
"unicode/utf8"
|
|
"unicode/utf8"
|
|
|
|
|
|
|
|
+ log "github.com/donnie4w/go-logger/logger"
|
|
"gopkg.in/mgo.v2/bson"
|
|
"gopkg.in/mgo.v2/bson"
|
|
)
|
|
)
|
|
|
|
|
|
@@ -134,8 +134,8 @@ func StartExtractTaskId(taskId string) bool {
|
|
ext.InitFile()
|
|
ext.InitFile()
|
|
|
|
|
|
ext.IsRun = true
|
|
ext.IsRun = true
|
|
- go ext.ResultSave()
|
|
|
|
- go ext.BidSave()
|
|
|
|
|
|
+ go ext.ResultSave(true)
|
|
|
|
+ go ext.BidSave(true)
|
|
if isgo {
|
|
if isgo {
|
|
go RunExtractTask(taskId)
|
|
go RunExtractTask(taskId)
|
|
}
|
|
}
|
|
@@ -167,17 +167,17 @@ func RunExtractTask(taskId string) {
|
|
if count < PageSize {
|
|
if count < PageSize {
|
|
limit = count
|
|
limit = count
|
|
}
|
|
}
|
|
- log.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
|
|
|
|
|
|
+ fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
|
|
for i := 0; i < pageNum; i++ {
|
|
for i := 0; i < pageNum; i++ {
|
|
query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
|
|
query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
|
|
- log.Printf("page=%d,query=%v", i+1, query)
|
|
|
|
|
|
+ fmt.Printf("page=%d,query=%v", i+1, query)
|
|
list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
|
|
list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
|
|
for _, v := range *list {
|
|
for _, v := range *list {
|
|
if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
|
|
if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
_id := qu.BsonIdToSId(v["_id"])
|
|
_id := qu.BsonIdToSId(v["_id"])
|
|
- log.Println(_id)
|
|
|
|
|
|
+ log.Debug(_id)
|
|
if !ext.IsRun {
|
|
if !ext.IsRun {
|
|
break
|
|
break
|
|
}
|
|
}
|
|
@@ -266,7 +266,7 @@ func PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
|
|
pretreated.AnalyStart(jf)
|
|
pretreated.AnalyStart(jf)
|
|
}
|
|
}
|
|
}, func(err interface{}) {
|
|
}, func(err interface{}) {
|
|
- log.Println("pretreated.AnalyStart", err)
|
|
|
|
|
|
+ log.Debug("pretreated.AnalyStart", err)
|
|
})
|
|
})
|
|
return j, jf
|
|
return j, jf
|
|
}
|
|
}
|
|
@@ -332,13 +332,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
for _, v := range vc.RulePres {
|
|
for _, v := range vc.RulePres {
|
|
tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
- //log.Println("抽取-前置规则", tmp)
|
|
|
|
|
|
+ // log.Debug("抽取-前置规则", tmp)
|
|
|
|
|
|
//抽取-规则
|
|
//抽取-规则
|
|
for _, v := range vc.RuleCores {
|
|
for _, v := range vc.RuleCores {
|
|
ExtRegCore(vc.ExtFrom, tmp, j, v, e)
|
|
ExtRegCore(vc.ExtFrom, tmp, j, v, e)
|
|
}
|
|
}
|
|
- //log.Println("抽取-规则", tmp)
|
|
|
|
|
|
+ // log.Debug("抽取-规则", tmp)
|
|
|
|
|
|
//项目名称未能抽取到,标题来凑
|
|
//项目名称未能抽取到,标题来凑
|
|
if vc.Field == "projectname" {
|
|
if vc.Field == "projectname" {
|
|
@@ -351,7 +351,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
for _, v := range vc.RuleBacks {
|
|
for _, v := range vc.RuleBacks {
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
- //log.Println("抽取-后置规则", tmp)
|
|
|
|
|
|
+ // log.Debug("抽取-后置规则", tmp)
|
|
}
|
|
}
|
|
|
|
|
|
//全局后置规则
|
|
//全局后置规则
|
|
@@ -403,9 +403,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
|
|
}
|
|
}
|
|
PackageDetail(j, e) //处理分包信息
|
|
PackageDetail(j, e) //处理分包信息
|
|
// bs, _ := json.Marshal(j.Result)
|
|
// bs, _ := json.Marshal(j.Result)
|
|
- // log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
|
|
|
|
+ // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
}, func(err interface{}) {
|
|
}, func(err interface{}) {
|
|
- log.Println("ExtractProcess err", err)
|
|
|
|
|
|
+ log.Debug("ExtractProcess err", err)
|
|
})
|
|
})
|
|
}
|
|
}
|
|
func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
@@ -430,7 +430,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- //log.Println("抽取-前置规则", tmp)
|
|
|
|
|
|
+ // log.Debug("抽取-前置规则", tmp)
|
|
|
|
|
|
//抽取-规则
|
|
//抽取-规则
|
|
for _, v := range vc.RuleCores {
|
|
for _, v := range vc.RuleCores {
|
|
@@ -438,7 +438,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
ExtRegCore(vc.ExtFrom, tmp, j, v, e)
|
|
ExtRegCore(vc.ExtFrom, tmp, j, v, e)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- //log.Println("抽取-规则", tmp)
|
|
|
|
|
|
+ // log.Debug("抽取-规则", tmp)
|
|
|
|
|
|
//抽取-后置规则
|
|
//抽取-后置规则
|
|
for _, v := range vc.RuleBacks {
|
|
for _, v := range vc.RuleBacks {
|
|
@@ -446,7 +446,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- //log.Println("抽取-后置规则", tmp)
|
|
|
|
|
|
+ // log.Debug("抽取-后置规则", tmp)
|
|
}
|
|
}
|
|
|
|
|
|
//全局后置规则
|
|
//全局后置规则
|
|
@@ -501,9 +501,9 @@ func (e *ExtractTask) ExtractFile(j *ju.Job) {
|
|
|
|
|
|
PackageDetail(j, e) //处理分包信息
|
|
PackageDetail(j, e) //处理分包信息
|
|
// bs, _ := json.Marshal(j.Result)
|
|
// bs, _ := json.Marshal(j.Result)
|
|
- // log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
|
|
|
|
+ // log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
}, func(err interface{}) {
|
|
}, func(err interface{}) {
|
|
- log.Println("ExtractProcess err", err)
|
|
|
|
|
|
+ log.Debug("ExtractProcess err", err)
|
|
})
|
|
})
|
|
}
|
|
}
|
|
|
|
|
|
@@ -591,8 +591,8 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
if bl.ColonKV != nil {
|
|
if bl.ColonKV != nil {
|
|
kvs := bl.ColonKV.Kvs
|
|
kvs := bl.ColonKV.Kvs
|
|
kvs2 := bl.ColonKV.Kvs_2
|
|
kvs2 := bl.ColonKV.Kvs_2
|
|
- //log.Println("ColonKV1", kvs)
|
|
|
|
- //log.Println("ColonKV2", kvs2)
|
|
|
|
|
|
+ // log.Debug("ColonKV1", kvs)
|
|
|
|
+ // log.Debug("ColonKV2", kvs2)
|
|
for _, tag := range tags {
|
|
for _, tag := range tags {
|
|
for _, kv := range kvs {
|
|
for _, kv := range kvs {
|
|
if tag.Type == "string" {
|
|
if tag.Type == "string" {
|
|
@@ -669,7 +669,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
//空格kv
|
|
//空格kv
|
|
if bl.SpaceKV != nil {
|
|
if bl.SpaceKV != nil {
|
|
kvs := bl.SpaceKV.Kvs
|
|
kvs := bl.SpaceKV.Kvs
|
|
- //log.Println("SpaceKV", kvs)
|
|
|
|
|
|
+ // log.Debug("SpaceKV", kvs)
|
|
for _, tag := range tags {
|
|
for _, tag := range tags {
|
|
for _, kv := range kvs {
|
|
for _, kv := range kvs {
|
|
if tag.Type == "string" {
|
|
if tag.Type == "string" {
|
|
@@ -711,7 +711,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
//表格kv
|
|
//表格kv
|
|
if bl.TableKV != nil {
|
|
if bl.TableKV != nil {
|
|
tkv := bl.TableKV
|
|
tkv := bl.TableKV
|
|
- //log.Println("tkv", tkv)
|
|
|
|
|
|
+ // log.Debug("tkv", tkv)
|
|
for k, v := range tkv.Kv {
|
|
for k, v := range tkv.Kv {
|
|
if k == fieldname {
|
|
if k == fieldname {
|
|
if len(tags) > -tkv.KvIndex[fieldname] {
|
|
if len(tags) > -tkv.KvIndex[fieldname] {
|
|
@@ -731,7 +731,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
|
|
"matchtype": "tag_string",
|
|
"matchtype": "tag_string",
|
|
})
|
|
})
|
|
} else { //涉及其他待处理
|
|
} else { //涉及其他待处理
|
|
- //log.Println(tags)
|
|
|
|
|
|
+ // log.Debug(tags)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -1031,7 +1031,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
}
|
|
}
|
|
if e.IsExtractCity { //城市抽取
|
|
if e.IsExtractCity { //城市抽取
|
|
b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
|
|
b, p, c, d := e.TransmitData(tmp, _id) //抽取省份城市
|
|
- //log.Println("省份---", p, "城市---", c, "区---", d)
|
|
|
|
|
|
+ // log.Debug("省份---", p, "城市---", c, "区---", d)
|
|
tmp["district"] = d
|
|
tmp["district"] = d
|
|
if b {
|
|
if b {
|
|
tmp["city"] = c
|
|
tmp["city"] = c
|
|
@@ -1049,7 +1049,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
if len(j.BrandData) > 0 {
|
|
if len(j.BrandData) > 0 {
|
|
tmp["tablebrand"] = j.BrandData
|
|
tmp["tablebrand"] = j.BrandData
|
|
}
|
|
}
|
|
- //log.Println("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
|
|
|
|
|
|
+ // log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
|
|
}
|
|
}
|
|
if e.TaskInfo.TestColl == "" {
|
|
if e.TaskInfo.TestColl == "" {
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
@@ -1090,11 +1090,11 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
tmp["resultf"] = resultf
|
|
tmp["resultf"] = resultf
|
|
b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
|
|
b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
|
|
if !b {
|
|
if !b {
|
|
- log.Println(e.TaskInfo.TestColl, _id)
|
|
|
|
|
|
+ log.Debug(e.TaskInfo.TestColl, _id)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}, func(err interface{}) {
|
|
}, func(err interface{}) {
|
|
- log.Println("AnalysisSaveResult err", err)
|
|
|
|
|
|
+ log.Debug("AnalysisSaveResult err", err)
|
|
})
|
|
})
|
|
}
|
|
}
|
|
|
|
|