|
@@ -35,8 +35,6 @@ var (
|
|
|
Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
|
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
|
-var Luacodes = sync.Map{}
|
|
|
-var SiteManages = sync.Map{}
|
|
|
|
|
|
//启动测试抽取
|
|
|
func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
|
|
@@ -97,15 +95,14 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
|
|
|
}
|
|
|
var j, jf *ju.Job
|
|
|
var isSite bool
|
|
|
- var codeSite string
|
|
|
if ext.IsFileField && v["projectinfo"] != nil {
|
|
|
v["isextFile"] = true
|
|
|
- j, jf, isSite,codeSite = ext.PreInfo(v)
|
|
|
+ j, jf, isSite = ext.PreInfo(v)
|
|
|
} else {
|
|
|
- j, _, isSite,codeSite = ext.PreInfo(v)
|
|
|
+ j, _, isSite = ext.PreInfo(v)
|
|
|
}
|
|
|
+ go ext.ExtractProcess(j, jf, isSite)
|
|
|
ext.TaskInfo.ProcessPool <- true
|
|
|
- go ext.ExtractProcess(j, jf, isSite,codeSite)
|
|
|
}
|
|
|
return true
|
|
|
} else {
|
|
@@ -197,8 +194,8 @@ func RunExtractTask(taskId string) {
|
|
|
fmt.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
|
|
|
for i := 0; i < pageNum; i++ {
|
|
|
query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
|
|
|
- fmt.Printf("page=%d,query=%v", i+1, query)
|
|
|
list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
|
|
|
+ fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
|
|
|
for _, v := range *list {
|
|
|
if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
|
|
|
continue
|
|
@@ -215,16 +212,15 @@ func RunExtractTask(taskId string) {
|
|
|
}
|
|
|
var j, jf *ju.Job
|
|
|
var isSite bool
|
|
|
- var codeSite string
|
|
|
if ext.IsFileField && v["projectinfo"] != nil {
|
|
|
v["isextFile"] = true
|
|
|
- j, jf, isSite,codeSite = ext.PreInfo(v)
|
|
|
+ j, jf, isSite = ext.PreInfo(v)
|
|
|
} else {
|
|
|
- j, _, isSite,codeSite = ext.PreInfo(v)
|
|
|
+ j, _, isSite = ext.PreInfo(v)
|
|
|
}
|
|
|
- ext.TaskInfo.ProcessPool <- true
|
|
|
- go ext.ExtractProcess(j, jf, isSite,codeSite)
|
|
|
+ go ext.ExtractProcess(j, jf, isSite)
|
|
|
ext.TaskInfo.LastExtId = _id
|
|
|
+ ext.TaskInfo.ProcessPool <- true
|
|
|
}
|
|
|
db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
|
|
|
if !ext.IsRun {
|
|
@@ -236,12 +232,12 @@ func RunExtractTask(taskId string) {
|
|
|
}
|
|
|
|
|
|
//信息预处理-不和版本关联,取最新版本的配置项
|
|
|
-func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool,codeSite string) {
|
|
|
+func PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
|
|
|
return (&ExtractTask{}).PreInfo(doc)
|
|
|
}
|
|
|
|
|
|
//信息预处理-和版本关联
|
|
|
-func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool,codeSite string) {
|
|
|
+func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite bool) {
|
|
|
defer qu.Catch()
|
|
|
//判断是否有附件这个字段
|
|
|
var isextFile bool
|
|
@@ -258,6 +254,8 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
|
}
|
|
|
detail = regexp.MustCompile(`<!--[\w\W]*?-->`).ReplaceAllString(detail, "")
|
|
|
d3, _ := doc["summary"].(string)
|
|
|
+ //全文的需要修复表格
|
|
|
+ detail = pretreated.RepairCon(detail)
|
|
|
detail = ju.CutLableStr(d3 + "\n" + detail)
|
|
|
detail = cut.ClearHtml(d3 + "\n" + detail)
|
|
|
doc["detail"] = detail
|
|
@@ -326,31 +324,31 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
|
}
|
|
|
}
|
|
|
//是否配置站点
|
|
|
- codeSite = qu.ObjToString(doc["spidercode"])
|
|
|
- exp, isSite := Luacodes.Load(codeSite)
|
|
|
- if isSite{
|
|
|
- if exp.( map[string]interface{})["e.SiteClearFn"]!= nil{
|
|
|
- e.SiteClearFn = exp.( map[string]interface{})["e.SiteClearFn"].( map[string][]string)
|
|
|
+ codeSite := j.SpiderCode
|
|
|
+ exp, isSite := e.Luacodes.Load(codeSite)
|
|
|
+ if isSite {
|
|
|
+ if exp.(map[string]interface{})["e.SiteClearFn"] != nil {
|
|
|
+ e.SiteClearFn = exp.(map[string]interface{})["e.SiteClearFn"].(map[string][]string)
|
|
|
}
|
|
|
- if exp.( map[string]interface{})["e.SiteTag"]!= nil{
|
|
|
- e.SiteTag = exp.( map[string]interface{})["e.SiteTag"].( map[string][]*Tag)
|
|
|
+ if exp.(map[string]interface{})["e.SiteTag"] != nil {
|
|
|
+ e.SiteTag = exp.(map[string]interface{})["e.SiteTag"].(map[string][]*Tag)
|
|
|
}
|
|
|
- if exp.( map[string]interface{})["e.SiteRuleCores"] != nil{
|
|
|
- e.SiteRuleCores = exp.( map[string]interface{})["e.SiteRuleCores"].( map[string]map[string][]*RuleCore)
|
|
|
+ if exp.(map[string]interface{})["e.SiteRuleCores"] != nil {
|
|
|
+ e.SiteRuleCores = exp.(map[string]interface{})["e.SiteRuleCores"].(map[string]map[string][]*RuleCore)
|
|
|
}
|
|
|
- if exp.( map[string]interface{})["e.SiteRuleBacks"]!= nil{
|
|
|
- e.SiteRuleBacks = exp.( map[string]interface{})["e.SiteRuleBacks"].( []*RegLuaInfo)
|
|
|
+ if exp.(map[string]interface{})["e.SiteRuleBacks"] != nil {
|
|
|
+ e.SiteRuleBacks = exp.(map[string]interface{})["e.SiteRuleBacks"].([]*RegLuaInfo)
|
|
|
}
|
|
|
}
|
|
|
qu.Try(func() {
|
|
|
- pretreated.AnalyStart(j, isSite,codeSite) //job.Block分块
|
|
|
+ pretreated.AnalyStart(j, isSite, codeSite) //job.Block分块
|
|
|
if isextFile {
|
|
|
- pretreated.AnalyStart(jf, isSite,codeSite)
|
|
|
+ pretreated.AnalyStart(jf, isSite, codeSite)
|
|
|
}
|
|
|
}, func(err interface{}) {
|
|
|
log.Debug("pretreated.AnalyStart", err, j.SourceMid)
|
|
|
})
|
|
|
- return j, jf, isSite,codeSite
|
|
|
+ return j, jf, isSite
|
|
|
}
|
|
|
|
|
|
//遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
|
|
@@ -386,17 +384,62 @@ func file2text(doc *map[string]interface{}) {
|
|
|
}
|
|
|
|
|
|
//抽取
|
|
|
-func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool,codeSite string) {
|
|
|
- e.ExtractDetail(j, isSite,codeSite)
|
|
|
+func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
|
|
|
+ e.ExtractDetail(j, isSite, j.SpiderCode)
|
|
|
if jf != nil && jf.IsFile {
|
|
|
- e.ExtractFile(jf, isSite,codeSite)
|
|
|
+ e.ExtractFile(jf, isSite, j.SpiderCode)
|
|
|
+ }
|
|
|
+ if isSite {
|
|
|
+ ismerge, ok := e.SiteMerge.Load(j.SpiderCode)
|
|
|
+ if ok && ismerge.(bool) {
|
|
|
+ tmpj := &ju.Job{
|
|
|
+ SourceMid: j.SourceMid,
|
|
|
+ Category: j.Category,
|
|
|
+ CategorySecond: j.CategorySecond,
|
|
|
+ Content: j.Content,
|
|
|
+ SpiderCode: j.SpiderCode,
|
|
|
+ //Domain: qu.ObjToString(doc["domain"]),
|
|
|
+ //Href: qu.ObjToString(doc["href"]),
|
|
|
+ Title: j.Title,
|
|
|
+ Data: j.Data,
|
|
|
+ City: j.City,
|
|
|
+ Province: j.Province,
|
|
|
+ Jsondata: j.Jsondata,
|
|
|
+ Result: map[string][]*ju.ExtField{},
|
|
|
+ BuyerAddr: j.BuyerAddr,
|
|
|
+ RuleBlock: e.RuleBlock,
|
|
|
+ }
|
|
|
+ qu.Try(func() {
|
|
|
+ pretreated.AnalyStart(tmpj, false, "") //job.Block分块
|
|
|
+ }, func(err interface{}) {
|
|
|
+ log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
|
|
|
+ })
|
|
|
+ e.ExtractDetail(tmpj, false, "")
|
|
|
+ //if jf != nil && jf.IsFile {
|
|
|
+ // e.ExtractFile(jf, false, "")
|
|
|
+ //}
|
|
|
+ //合并数据
|
|
|
+ j.Block = append(j.Block, tmpj.Block...)
|
|
|
+ j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
|
|
|
+ for tmpk,_:= range j.Result{
|
|
|
+ if len(tmpj.Result[tmpk]) >0 {
|
|
|
+ j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for tmpk ,_ :=range tmpj.Result{
|
|
|
+ if len(j.Result[tmpk]) == 0{
|
|
|
+ j.Result[tmpk] = append(j.Result[tmpk], tmpj.Result[tmpk]...)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
+
|
|
|
//分析抽取结果并保存 todo
|
|
|
AnalysisSaveResult(j, jf, e)
|
|
|
<-e.TaskInfo.ProcessPool
|
|
|
}
|
|
|
|
|
|
-func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool,codeSite string) {
|
|
|
+func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
|
qu.Try(func() {
|
|
|
doc := *j.Data
|
|
|
//全局前置规则,结果覆盖doc属性
|
|
@@ -453,7 +496,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool,codeSite string) {
|
|
|
// log.Debug("抽取-前置规则", tmp)
|
|
|
|
|
|
//抽取-规则
|
|
|
- ExtRuleCore(tmp, e, vc, j)
|
|
|
+ ExtRuleCore(tmp, e, vc, j, isSite)
|
|
|
// log.Debug("抽取-规则", tmp)
|
|
|
|
|
|
//抽取-后置规则
|
|
@@ -474,6 +517,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool,codeSite string) {
|
|
|
}
|
|
|
if isextitle { //标题加入选举
|
|
|
field := &ju.ExtField{Field: vc.Field, Code: vc.Id + "_title", RuleText: "title", Type: "title", MatchType: "title", ExtFrom: vc.ExtFrom, SourceValue: j.Title, Value: j.Title}
|
|
|
+ if isSite{
|
|
|
+ field.Score = 1
|
|
|
+ }
|
|
|
j.Result[vc.Field] = append(j.Result[vc.Field], field)
|
|
|
}
|
|
|
}
|
|
@@ -502,11 +548,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool,codeSite string) {
|
|
|
var cfn = []string{}
|
|
|
if isSite {
|
|
|
cfn = e.SiteClearFn[key]
|
|
|
-
|
|
|
} else {
|
|
|
cfn = e.ClearFn[key]
|
|
|
}
|
|
|
lockclear.Unlock()
|
|
|
+ if len(cfn) == 0 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
|
|
|
before, _ := v.Value.(string)
|
|
|
v.Value = data[0]
|
|
@@ -524,14 +572,14 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool,codeSite string) {
|
|
|
lockclear.Unlock()
|
|
|
}
|
|
|
}
|
|
|
- PackageDetail(j, e, isSite,codeSite) //处理分包信息
|
|
|
+ PackageDetail(j, e, isSite, codeSite) //处理分包信息
|
|
|
// bs, _ := json.Marshal(j.Result)
|
|
|
// log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
|
}, func(err interface{}) {
|
|
|
log.Debug("ExtractProcess err", err)
|
|
|
})
|
|
|
}
|
|
|
-func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool,codeSite string) {
|
|
|
+func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
|
|
|
qu.Try(func() {
|
|
|
doc := *j.Data
|
|
|
//全局前置规则,结果覆盖doc属性
|
|
@@ -570,7 +618,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool,codeSite string) {
|
|
|
|
|
|
//抽取-规则
|
|
|
if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
|
|
|
- ExtRuleCore(tmp, e, vc, j)
|
|
|
+ ExtRuleCore(tmp, e, vc, j, isSite)
|
|
|
}
|
|
|
// log.Debug("抽取-规则", tmp)
|
|
|
|
|
@@ -610,7 +658,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool,codeSite string) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- PackageDetail(j, e, isSite,codeSite) //处理分包信息
|
|
|
+ PackageDetail(j, e, isSite, codeSite) //处理分包信息
|
|
|
// bs, _ := json.Marshal(j.Result)
|
|
|
// log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
|
|
|
}, func(err interface{}) {
|
|
@@ -649,7 +697,7 @@ func ExtRegPre(doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, t *TaskInf
|
|
|
}
|
|
|
|
|
|
//抽取-规则
|
|
|
-func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job) {
|
|
|
+func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju.Job, isSite bool) {
|
|
|
//候选人加入
|
|
|
var kvMap map[string][]map[string]interface{}
|
|
|
extByReg := true
|
|
@@ -658,9 +706,9 @@ func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju
|
|
|
}
|
|
|
for _, v := range vc.RuleCores {
|
|
|
if v.IsLua {
|
|
|
- ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, kvMap,e)
|
|
|
+ ExtRuleCoreByKv(vc.ExtFrom, doc, j, v, &kvMap, e)
|
|
|
} else if extByReg {
|
|
|
- ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e)
|
|
|
+ ExtRuleCoreByReg(vc.ExtFrom, doc, j, v, e, isSite)
|
|
|
}
|
|
|
}
|
|
|
//如果只有一个分包,预算没有抽取到,把分包中的预算保存到外面
|
|
@@ -686,7 +734,7 @@ func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju
|
|
|
for _, vv := range v.KvTags[fieldname] {
|
|
|
text := ju.TrimLRSpace(vv.Value, "")
|
|
|
if text != "" {
|
|
|
- j.Result[vc.Field] = append(j.Result[vc.Field], &ju.ExtField{
|
|
|
+ tmp := &ju.ExtField{
|
|
|
Field: vc.Field,
|
|
|
Code: "CL_分包",
|
|
|
Type: tp,
|
|
@@ -694,7 +742,11 @@ func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju
|
|
|
RuleText: bp.Text,
|
|
|
SourceValue: vv.Key,
|
|
|
Value: text,
|
|
|
- })
|
|
|
+ }
|
|
|
+ if isSite{
|
|
|
+ tmp.Score = 1
|
|
|
+ }
|
|
|
+ j.Result[vc.Field] = append(j.Result[vc.Field],tmp)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -709,6 +761,9 @@ func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju
|
|
|
}
|
|
|
for _, tmp := range v {
|
|
|
field := &ju.ExtField{Weight: qu.IntAll(tmp["weight"]), Field: k, Code: qu.ObjToString(tmp["code"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), RuleText: qu.ObjToString(tmp["ruletext"]), SourceValue: tmp["sourcevalue"], Value: tmp["value"]}
|
|
|
+ if isSite {
|
|
|
+ field.Score = 1
|
|
|
+ }
|
|
|
if tmp["blocktag"] != nil {
|
|
|
btag := make(map[string]string)
|
|
|
for k := range tmp["blocktag"].(map[string]bool) {
|
|
@@ -728,20 +783,20 @@ func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju
|
|
|
}
|
|
|
|
|
|
//抽取-规则-kv
|
|
|
-func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap map[string][]map[string]interface{}, et *ExtractTask) {
|
|
|
+func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, kvMap *map[string][]map[string]interface{}, et *ExtractTask) {
|
|
|
defer qu.Catch()
|
|
|
if extfrom == "title" || !in.IsLua {
|
|
|
return
|
|
|
}
|
|
|
lua := ju.LuaScript{Code: in.Code, Name: in.Name, Doc: doc, Script: in.RuleText}
|
|
|
- lua.KvMap = kvMap
|
|
|
+ lua.KvMap = *kvMap
|
|
|
lua.Block = j.Block
|
|
|
extinfo := lua.RunScript("core")
|
|
|
if tmps, ok := extinfo[in.Field].([]map[string]interface{}); ok {
|
|
|
for _, v := range tmps {
|
|
|
v["core"] = in.Code
|
|
|
}
|
|
|
- kvMap[in.Field] = tmps
|
|
|
+ (*kvMap)[in.Field] = append((*kvMap)[in.Field], tmps...)
|
|
|
}
|
|
|
if len(extinfo) > 0 {
|
|
|
AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
@@ -749,7 +804,7 @@ func ExtRuleCoreByKv(extfrom string, doc map[string]interface{}, j *ju.Job, in *
|
|
|
}
|
|
|
|
|
|
//抽取-规则-正则
|
|
|
-func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask) {
|
|
|
+func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLuaInfo, et *ExtractTask, isSite bool) {
|
|
|
defer qu.Catch()
|
|
|
//根据field配置项目,是否抽取。例如:废标、流标等跳过,
|
|
|
b := IsExtract(in.Field, j.Title, j.Content)
|
|
@@ -767,7 +822,7 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
|
|
|
//块抽取
|
|
|
if in.Field != "" {
|
|
|
if extfrom == "title" {
|
|
|
- extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in)
|
|
|
+ extinfo := extRegCoreToResult(extfrom, qu.ObjToString(doc[extfrom]), &map[string]string{}, j, in, isSite)
|
|
|
if len(extinfo) > 0 {
|
|
|
AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
}
|
|
@@ -779,7 +834,7 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
|
|
|
btag[k] = TagConfigDesc[k]
|
|
|
blocktag.Unlock()
|
|
|
}
|
|
|
- extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in)
|
|
|
+ extinfo := extRegCoreToResult(extfrom, v.Text, &btag, j, in, isSite)
|
|
|
if len(extinfo) > 0 {
|
|
|
AddExtLog("extract", j.SourceMid, nil, extinfo, in, et.TaskInfo) //抽取日志
|
|
|
}
|
|
@@ -897,89 +952,98 @@ func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kv
|
|
|
}
|
|
|
|
|
|
//正则提取结果
|
|
|
-func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, v *RegLuaInfo) map[string][]map[string]interface{} {
|
|
|
+func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job, vre *RegLuaInfo,isSite bool) map[string][]map[string]interface{} {
|
|
|
defer qu.Catch()
|
|
|
+ var score int
|
|
|
+ if isSite{
|
|
|
+ score = 1
|
|
|
+ }
|
|
|
extinfo := map[string][]map[string]interface{}{}
|
|
|
- if v.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
|
|
|
- apos := v.RegCore.Reg.FindAllStringSubmatchIndex(text, -1)
|
|
|
- if len(apos) > 0 {
|
|
|
- pos := apos[0]
|
|
|
- for k, p := range v.RegCore.ExtractPos {
|
|
|
- if len(pos) > p {
|
|
|
- if pos[p] == -1 || pos[p+1] == -1 {
|
|
|
- continue
|
|
|
- }
|
|
|
- val := text[pos[p]:pos[p+1]]
|
|
|
- sourcevalue := val
|
|
|
- if val == "招标公告" {
|
|
|
- return extinfo
|
|
|
- }
|
|
|
- if utf8.RuneCountInString(val) < 2 && extfrom == "title" {
|
|
|
- val = text
|
|
|
+ rep := map[string]string{}
|
|
|
+ if vre.RegCore.Bextract { //正则是两部分的,可以直接抽取的(含下划线)
|
|
|
+ //处理正负数修正
|
|
|
+ ptmp := strings.Split(vre.RuleText, "#")
|
|
|
+ sign := 0
|
|
|
+ if len(ptmp) == 2 {
|
|
|
+ if ptmp[1] == "正" {
|
|
|
+ sign = 1
|
|
|
+ } else if ptmp[1] == "负" {
|
|
|
+ sign = -1
|
|
|
+ }
|
|
|
+ }
|
|
|
+ tmp := strings.Split(ptmp[0], "__")
|
|
|
+ if len(tmp) == 2 {
|
|
|
+ epos := strings.Split(tmp[1], ",")
|
|
|
+ posm := map[string]int{}
|
|
|
+ for _, v := range epos {
|
|
|
+ ks := strings.Split(v, ":")
|
|
|
+ if len(ks) == 2 { //(.*)招标公告(.*)__2:projectname,4:area
|
|
|
+ posm[ks[1]] = qu.IntAll(ks[0])
|
|
|
+ } else {
|
|
|
+ posm[vre.Field] = qu.IntAll(ks[0])
|
|
|
+ }
|
|
|
+ }
|
|
|
+ var pattern string
|
|
|
+ if strings.Contains(tmp[0], "\\u") {
|
|
|
+ tmp[0] = strings.Replace(tmp[0], "\\", "\\\\", -1)
|
|
|
+ tmp[0] = strings.Replace(tmp[0], "\\\\u", "\\u", -1)
|
|
|
+ pattern, _ = strconv.Unquote(`"` + tmp[0] + `"`)
|
|
|
+ } else {
|
|
|
+ pattern = tmp[0]
|
|
|
+ }
|
|
|
+ //log.Debug("pattern", pattern)
|
|
|
+ //fmt.Println(text)
|
|
|
+ reg := regexp.MustCompile(pattern)
|
|
|
+ apos := reg.FindAllStringSubmatchIndex(text, -1)
|
|
|
+ for i, _ := range apos {
|
|
|
+ pos := apos[i]
|
|
|
+ for k, p := range posm {
|
|
|
+ if len(pos) > p {
|
|
|
+ if pos[p] == -1 || pos[p+1] == -1 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ val := text[pos[p]:pos[p+1]]
|
|
|
+ if string(val) == "" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ if sign == -1 {
|
|
|
+ rep[k+"_"+fmt.Sprint(i)] = "-" + val
|
|
|
+ } else {
|
|
|
+ rep[k+"_"+fmt.Sprint(i)] = val
|
|
|
+ }
|
|
|
}
|
|
|
- tmps := []map[string]interface{}{}
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //fmt.Println(text)
|
|
|
+ tmps := []map[string]interface{}{}
|
|
|
+ for i := 0; i < len(apos); i++ {
|
|
|
+ if strings.TrimSpace(rep[vre.Field+"_"+fmt.Sprint(i)]) != "" {
|
|
|
tmp := map[string]interface{}{
|
|
|
- "field": v.Field,
|
|
|
- "code": v.Code,
|
|
|
- "ruletext": v.RuleText,
|
|
|
+ "field": vre.Field,
|
|
|
+ "code": vre.Code,
|
|
|
+ "ruletext": vre.RuleText,
|
|
|
"extfrom": text,
|
|
|
- "value": val,
|
|
|
+ "value": rep[vre.Field+"_"+fmt.Sprint(i)],
|
|
|
"type": "regexp",
|
|
|
"matchtype": "regcontent",
|
|
|
"blocktag": *tag,
|
|
|
+ "score" :score,
|
|
|
}
|
|
|
tmps = append(tmps, tmp)
|
|
|
- extinfo[k] = tmps
|
|
|
- if strings.TrimSpace(val) != "" {
|
|
|
- if v.RegCore.NumSign == -1 { //正负值修正
|
|
|
- val = "-" + val
|
|
|
- }
|
|
|
- exfield := ju.ExtField{BlockTag: *tag, Field: k, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: sourcevalue, Value: val}
|
|
|
- if tmp["blocktag"] != nil {
|
|
|
- exfield.BlockTag = tmp["blocktag"].(map[string]string)
|
|
|
- }
|
|
|
- j.Result[k] = append(j.Result[k], &exfield)
|
|
|
- //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
+
|
|
|
+ exfield := ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: rep[vre.Field+"_"+fmt.Sprint(i)], Value: rep[vre.Field+"_"+fmt.Sprint(i)]}
|
|
|
+ if tmp["blocktag"] != nil {
|
|
|
+ exfield.BlockTag = tmp["blocktag"].(map[string]string)
|
|
|
}
|
|
|
+ j.Result[vre.Field] = append(j.Result[vre.Field], &exfield)
|
|
|
}
|
|
|
}
|
|
|
- if len(extinfo) == 0 {
|
|
|
- regArr := strings.Split(v.RuleText, "__")
|
|
|
- //fmt.Println(regArr[0])
|
|
|
- if len(regArr) > 0 {
|
|
|
- reg, err := regexp.Compile(regArr[0])
|
|
|
- if err == nil {
|
|
|
- datavals := reg.FindStringSubmatch(text)
|
|
|
- tmps := []map[string]interface{}{}
|
|
|
- for _, value := range datavals {
|
|
|
- if value == "" {
|
|
|
- continue
|
|
|
- }
|
|
|
- tmp := map[string]interface{}{
|
|
|
- "field": v.Field,
|
|
|
- "code": v.Code,
|
|
|
- "ruletext": regArr[0],
|
|
|
- "extfrom": text,
|
|
|
- "value": value,
|
|
|
- "type": "regexp",
|
|
|
- "matchtype": "regcontent",
|
|
|
- "blocktag": *tag,
|
|
|
- }
|
|
|
- tmps = append(tmps, tmp)
|
|
|
- extinfo[v.Field] = tmps
|
|
|
- exfield := ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code + "去除__*后", RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: value}
|
|
|
- if tmp["blocktag"] != nil {
|
|
|
- exfield.BlockTag = tmp["blocktag"].(map[string]string)
|
|
|
- }
|
|
|
- j.Result[v.Field] = append(j.Result[v.Field], &exfield)
|
|
|
- //j.Result[k] = append(j.Result[k], &ju.ExtField{tmp["blocktag"].(map[string]bool), k, v.Code, v.RuleText, "regexp", "regcontent", extfrom, val, 0})
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
+ if len(tmps) > 0 {
|
|
|
+ extinfo[vre.Field] = tmps
|
|
|
}
|
|
|
}
|
|
|
} else {
|
|
|
- pos := v.RegCore.Reg.FindStringIndex(text)
|
|
|
+ pos := vre.RegCore.Reg.FindStringIndex(text)
|
|
|
val := ""
|
|
|
if len(pos) == 2 {
|
|
|
text = text[pos[1]:]
|
|
@@ -992,25 +1056,26 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job,
|
|
|
if val != "" {
|
|
|
tmps := []map[string]interface{}{}
|
|
|
tmp := map[string]interface{}{
|
|
|
- "field": v.Field,
|
|
|
- "code": v.Code,
|
|
|
- "ruletext": v.RuleText,
|
|
|
+ "field": vre.Field,
|
|
|
+ "code": vre.Code,
|
|
|
+ "ruletext": vre.RuleText,
|
|
|
"extfrom": text,
|
|
|
"value": val,
|
|
|
"type": "regexp",
|
|
|
"matchtype": "regcontent",
|
|
|
"blocktag": *tag,
|
|
|
+ "score" :score,
|
|
|
}
|
|
|
tmps = append(tmps, tmp)
|
|
|
- extinfo[v.Field] = tmps
|
|
|
- if j.Result[v.Field] == nil {
|
|
|
- j.Result[v.Field] = [](*ju.ExtField){}
|
|
|
+ extinfo[vre.Field] = tmps
|
|
|
+ if j.Result[vre.Field] == nil {
|
|
|
+ j.Result[vre.Field] = [](*ju.ExtField){}
|
|
|
}
|
|
|
- field := &ju.ExtField{BlockTag: *tag, Field: v.Field, Code: v.Code, RuleText: v.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
|
|
|
+ field := &ju.ExtField{BlockTag: *tag, Field: vre.Field, Code: vre.Code, RuleText: vre.RuleText, Type: "regexp", MatchType: "regcontent", ExtFrom: extfrom, SourceValue: text, Value: val}
|
|
|
if tmp["blocktag"] != nil {
|
|
|
field.BlockTag = tmp["blocktag"].(map[string]string)
|
|
|
}
|
|
|
- j.Result[v.Field] = append(j.Result[v.Field], field)
|
|
|
+ j.Result[vre.Field] = append(j.Result[vre.Field], field)
|
|
|
}
|
|
|
}
|
|
|
return extinfo
|
|
@@ -1030,7 +1095,7 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
|
|
|
if tmps, ok := v.([]map[string]interface{}); ok {
|
|
|
j.Result[k] = [](*ju.ExtField){}
|
|
|
for _, tmp := range tmps {
|
|
|
- field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"], Score: 0}
|
|
|
+ field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: qu.ObjToString(tmp["extfrom"]), Value: tmp["value"]}
|
|
|
if tmp["blocktag"] != nil {
|
|
|
field.BlockTag = tmp["blocktag"].(map[string]string)
|
|
|
}
|
|
@@ -1661,11 +1726,15 @@ func resetWinnerorder(j *ju.Job) {
|
|
|
if len(j.Winnerorder) == 0 {
|
|
|
return
|
|
|
}
|
|
|
+ maxlen := len(j.Winnerorder)-1
|
|
|
//中标单位
|
|
|
i := 0
|
|
|
winners := []*ju.ExtField{}
|
|
|
for _, v := range j.Result["winner"] {
|
|
|
if v.Code == "winnerorder" {
|
|
|
+ if maxlen < i {
|
|
|
+ continue
|
|
|
+ }
|
|
|
j.Winnerorder[i]["entname"] = v.Value
|
|
|
i++
|
|
|
} else {
|
|
@@ -1678,6 +1747,9 @@ func resetWinnerorder(j *ju.Job) {
|
|
|
bidamounts := []*ju.ExtField{}
|
|
|
for _, v := range j.Result["bidamount"] {
|
|
|
if v.Code == "winnerorder" {
|
|
|
+ if maxlen < i {
|
|
|
+ continue
|
|
|
+ }
|
|
|
j.Winnerorder[i]["price"] = v.Value
|
|
|
i++
|
|
|
} else {
|