|
@@ -1,6 +1,7 @@
|
|
|
package extract
|
|
|
|
|
|
import (
|
|
|
+ "bytes"
|
|
|
"encoding/json"
|
|
|
"fmt"
|
|
|
"jy/clear"
|
|
@@ -15,6 +16,7 @@ import (
|
|
|
"strconv"
|
|
|
"sync"
|
|
|
"time"
|
|
|
+ "unicode/utf8"
|
|
|
|
|
|
"gopkg.in/mgo.v2/bson"
|
|
|
)
|
|
@@ -27,7 +29,7 @@ var (
|
|
|
ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
saveLimit = 200 //抽取日志批量保存
|
|
|
PageSize = 5000 //查询分页
|
|
|
- Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1}`
|
|
|
+ Fields = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1}`
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
|
|
|
@@ -77,8 +79,15 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
|
|
|
if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
|
|
|
continue
|
|
|
}
|
|
|
- //log.Println(v["_id"])
|
|
|
- j, jf := PreInfo(v, false)
|
|
|
+ var j, jf *ju.Job
|
|
|
+ if ext.IsFileField{
|
|
|
+ if v["projectinfo"] != nil {
|
|
|
+ v["isextFile"] = true
|
|
|
+ j, jf = PreInfo(v)
|
|
|
+ }
|
|
|
+ }else {
|
|
|
+ j, _ = PreInfo(v)
|
|
|
+ }
|
|
|
ext.TaskInfo.ProcessPool <- true
|
|
|
go ext.ExtractProcess(j, jf)
|
|
|
}
|
|
@@ -171,7 +180,15 @@ func RunExtractTask(taskId string) {
|
|
|
if !ext.IsRun {
|
|
|
break
|
|
|
}
|
|
|
- j, jf := PreInfo(v, false)
|
|
|
+ var j, jf *ju.Job
|
|
|
+ if ext.IsFileField{
|
|
|
+ if v["projectinfo"] != nil {
|
|
|
+ v["isextFile"] = true
|
|
|
+ j, jf = PreInfo(v)
|
|
|
+ }
|
|
|
+ }else {
|
|
|
+ j, _ = PreInfo(v)
|
|
|
+ }
|
|
|
ext.TaskInfo.ProcessPool <- true
|
|
|
go ext.ExtractProcess(j, jf)
|
|
|
ext.TaskInfo.LastExtId = _id
|
|
@@ -186,8 +203,13 @@ func RunExtractTask(taskId string) {
|
|
|
}
|
|
|
|
|
|
//信息预处理
|
|
|
-func PreInfo(doc map[string]interface{}, isextFile bool) (j, jf *ju.Job) {
|
|
|
+func PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
|
|
|
defer qu.Catch()
|
|
|
+ //判断是否有附件这个字段
|
|
|
+ var isextFile bool
|
|
|
+ if doc["isextFile"] != nil{
|
|
|
+ isextFile = doc["isextFile"].(bool)
|
|
|
+ }
|
|
|
detail := ""
|
|
|
d1, _ := doc["detail"].(string)
|
|
|
d2, _ := doc["contenthtml"].(string)
|
|
@@ -199,7 +221,10 @@ func PreInfo(doc map[string]interface{}, isextFile bool) (j, jf *ju.Job) {
|
|
|
detail = ju.CutLableStr(detail)
|
|
|
detail = cut.ClearHtml(detail)
|
|
|
doc["detail"] = detail
|
|
|
- doc["detailfile"] = "" //附件文本堆一起(后期可以考虑,分开处理)
|
|
|
+
|
|
|
+ if isextFile {
|
|
|
+ file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
|
|
|
+ }
|
|
|
toptype := qu.ObjToString(doc["toptype"])
|
|
|
if qu.ObjToString(doc["type"]) == "bid" {
|
|
|
toptype = "结果"
|
|
@@ -246,13 +271,53 @@ func PreInfo(doc map[string]interface{}, isextFile bool) (j, jf *ju.Job) {
|
|
|
return j, jf
|
|
|
}
|
|
|
|
|
|
+//遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
|
|
|
+func file2text(doc *map[string]interface{}) {
|
|
|
+ var strfileinfo bytes.Buffer
|
|
|
+ if v, ok := (*doc)["projectinfo"].(map[string]interface{}); ok {
|
|
|
+ if va, ok := v["attachments"].(map[string]interface{}); ok {
|
|
|
+ for _, vaatt := range va {
|
|
|
+ if fileinfo, ok := vaatt.(map[string]interface{}); ok {
|
|
|
+ if qu.ObjToString(fileinfo["content"]) != "" {
|
|
|
+ switch fileinfo["content"].(type) {
|
|
|
+ case string:
|
|
|
+ lock.Lock()
|
|
|
+ strfileinfo.WriteString(fileinfo["content"].(string) + " \n")
|
|
|
+ lock.Unlock()
|
|
|
+ case []map[string]interface{}:
|
|
|
+ for _, fv := range fileinfo["content"].([]map[string]interface{}) {
|
|
|
+ if fv["context"] != nil {
|
|
|
+ lock.Lock()
|
|
|
+ strfileinfo.WriteString(fv["context"].(string) + " \n")
|
|
|
+ lock.Unlock()
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if utf8.RuneCountInString(strfileinfo.String()) < qu.IntAllDef(ju.Config["filelength"],100000 ){
|
|
|
+ (*doc)["detailfile"] = strfileinfo.String() //附件文本堆一起(后期可以考虑,分开处理)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
//抽取
|
|
|
func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
|
|
|
qu.Try(func() {
|
|
|
doc := *j.Data
|
|
|
+ docfile := make(map[string]interface{})
|
|
|
+ if jf != nil{
|
|
|
+ docfile = *jf.Data
|
|
|
+ docfile["dockey"]= "detailfile"
|
|
|
+ }
|
|
|
//全局前置规则,结果覆盖doc属性
|
|
|
for _, v := range e.RulePres {
|
|
|
doc = ExtRegPre(doc, j, v, e.TaskInfo)
|
|
|
+ if jf != nil{
|
|
|
+ docfile = ExtRegPre(docfile, jf, v, e.TaskInfo)
|
|
|
+ }
|
|
|
}
|
|
|
//抽取规则
|
|
|
for _, vc := range e.RuleCores {
|
|
@@ -286,9 +351,47 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
|
|
|
}
|
|
|
//log.Println("抽取-后置规则", tmp)
|
|
|
}
|
|
|
+ //抽取规则-附件
|
|
|
+ if jf != nil{
|
|
|
+ for _, vc := range e.RuleCores {
|
|
|
+ tmp := ju.DeepCopy(docfile).(map[string]interface{})
|
|
|
+ //是否进入逻辑
|
|
|
+ if !ju.Logic(vc.LuaLogic, tmp) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ //抽取-前置规则
|
|
|
+ for _, v := range vc.RulePres {
|
|
|
+ tmp = ExtRegPre(tmp, jf, v, e.TaskInfo)
|
|
|
+ }
|
|
|
+ //log.Println("抽取-前置规则", tmp)
|
|
|
+
|
|
|
+ //抽取-规则
|
|
|
+ for _, v := range vc.RuleCores {
|
|
|
+ ExtRegCore(vc.ExtFrom, tmp, jf, v, e)
|
|
|
+ }
|
|
|
+ //log.Println("抽取-规则", tmp)
|
|
|
+
|
|
|
+ //项目名称未能抽取到,标题来凑
|
|
|
+ if vc.Field == "projectname" {
|
|
|
+ if len(jf.Result[vc.Field]) < 1 {
|
|
|
+ jf.Result[vc.Field] = append(jf.Result[vc.Field], &ju.ExtField{vc.Field, "title", "title", "regexp", "title", vc.ExtFrom, jf.Title, 0})
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ //抽取-后置规则
|
|
|
+ for _, v := range vc.RuleBacks {
|
|
|
+ ExtRegBack(jf, v, e.TaskInfo)
|
|
|
+ }
|
|
|
+ //log.Println("抽取-后置规则", tmp)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
//全局后置规则
|
|
|
for _, v := range e.RuleBacks {
|
|
|
ExtRegBack(j, v, e.TaskInfo)
|
|
|
+ if jf != nil {
|
|
|
+ ExtRegBack(jf, v, e.TaskInfo)
|
|
|
+ }
|
|
|
}
|
|
|
//候选人加入
|
|
|
if len(j.Winnerorder) > 0 {
|
|
@@ -314,6 +417,32 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
|
|
|
}
|
|
|
j.Result["winner"] = winners
|
|
|
}
|
|
|
+ //候选人加入-附件
|
|
|
+ if jf != nil{
|
|
|
+ if len(jf.Winnerorder) > 0 {
|
|
|
+ winner := &ju.ExtField{
|
|
|
+ Field: "winner",
|
|
|
+ Code: "",
|
|
|
+ RuleText: "",
|
|
|
+ Type: "winnerorder",
|
|
|
+ MatchType: "winnerorder",
|
|
|
+ ExtFrom: "",
|
|
|
+ Value: jf.Winnerorder[0]["entname"],
|
|
|
+ Score: 0,
|
|
|
+ }
|
|
|
+ if len([]rune(qu.ObjToString(jf.Winnerorder[0]["entname"]))) < 4 {
|
|
|
+ winner.Score = -5
|
|
|
+ }
|
|
|
+ winners := jf.Result["winner"]
|
|
|
+ if winners != nil {
|
|
|
+ winners = append(winners, winner)
|
|
|
+ } else {
|
|
|
+ winners = []*ju.ExtField{}
|
|
|
+ winners = append(winners, winner)
|
|
|
+ }
|
|
|
+ jf.Result["winner"] = winners
|
|
|
+ }
|
|
|
+ }
|
|
|
//函数清理
|
|
|
for key, val := range j.Result {
|
|
|
for _, v := range val {
|
|
@@ -333,11 +462,38 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job) {
|
|
|
lock.Unlock()
|
|
|
}
|
|
|
}
|
|
|
+ //函数清理-附件
|
|
|
+ if jf != nil{
|
|
|
+ for key, val := range jf.Result {
|
|
|
+ for _, v := range val {
|
|
|
+ lock.Lock()
|
|
|
+ cfn := e.ClearFn[key]
|
|
|
+ lock.Unlock()
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{v.Value, jf.Content})
|
|
|
+ v.Value = data[0]
|
|
|
+ //清理特殊符号
|
|
|
+ lock.Lock()
|
|
|
+ if clear.AsyField[key] != nil || clear.SymField[key] != nil ||
|
|
|
+ clear.MesField[key] != nil {
|
|
|
+ text := qu.ObjToString(v.Value)
|
|
|
+ text = clear.OtherClean(key, text)
|
|
|
+ v.Value = text
|
|
|
+ }
|
|
|
+ lock.Unlock()
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
PackageDetail(j, e) //处理分包信息
|
|
|
+ if jf != nil{
|
|
|
+ PackageDetail(jf, e) //处理分包信息-附件
|
|
|
+ }
|
|
|
// bs, _ := json.Marshal(j.Result)
|
|
|
// log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
|
//分析抽取结果并保存 todo
|
|
|
AnalysisSaveResult(j, e)
|
|
|
+ if jf != nil{
|
|
|
+ AnalysisSaveResult(jf, e) //分析抽取结果并保存-附件
|
|
|
+ }
|
|
|
}, func(err interface{}) {
|
|
|
log.Println("ExtractProcess err", err)
|
|
|
})
|
|
@@ -360,7 +516,12 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
|
|
|
}
|
|
|
AddExtLog("prereplace", j.SourceMid, before, extinfo, in, t) //抽取日志
|
|
|
} else {
|
|
|
- key := qu.If(in.Field == "", "detail", in.Field).(string)
|
|
|
+ var key string
|
|
|
+ if doc["dockey"]== nil{
|
|
|
+ key = qu.If(in.Field == "", "detail", in.Field).(string)
|
|
|
+ }else {
|
|
|
+ key = qu.If(in.Field == "", "detailfile", in.Field).(string)
|
|
|
+ }
|
|
|
text := qu.ObjToString(doc[key])
|
|
|
extinfo[key] = in.RegPreBac.Reg.ReplaceAllString(text, "")
|
|
|
doc[key] = extinfo[key] //结果覆盖原doc
|