|
@@ -27,12 +27,12 @@ import (
|
|
var (
|
|
var (
|
|
lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
|
|
lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
|
|
|
|
|
|
- cut = ju.NewCut() //获取正文并清理
|
|
|
|
- ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
|
- TaskList map[string]*ExtractTask //任务列表
|
|
|
|
- ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
|
- saveLimit = 100 //抽取日志批量保存
|
|
|
|
- PageSize = 5000 //查询分页
|
|
|
|
|
|
+ cut = ju.NewCut() //获取正文并清理
|
|
|
|
+ ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
|
|
|
|
+ TaskList map[string]*ExtractTask //任务列表
|
|
|
|
+ ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
|
+ saveLimit = 100 //抽取日志批量保存
|
|
|
|
+ PageSize = 5000 //查询分页
|
|
Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
|
|
Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
)
|
|
)
|
|
@@ -331,7 +331,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
Result: map[string][]*ju.ExtField{},
|
|
Result: map[string][]*ju.ExtField{},
|
|
BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
RuleBlock: e.RuleBlock,
|
|
RuleBlock: e.RuleBlock,
|
|
- Dataging: qu.IntAll(doc["dataging"]),
|
|
|
|
|
|
+ Dataging: qu.IntAll(doc["dataging"]),
|
|
}
|
|
}
|
|
if (j.Jsondata != nil || (*j.Jsondata) != nil) && (*j.Jsondata)["jsoncontent"] != nil {
|
|
if (j.Jsondata != nil || (*j.Jsondata) != nil) && (*j.Jsondata)["jsoncontent"] != nil {
|
|
delete((*j.Jsondata), "jsoncontent")
|
|
delete((*j.Jsondata), "jsoncontent")
|
|
@@ -353,7 +353,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
RuleBlock: e.RuleBlock,
|
|
RuleBlock: e.RuleBlock,
|
|
IsFile: isextFile,
|
|
IsFile: isextFile,
|
|
- Dataging: qu.IntAll(doc["dataging"]),
|
|
|
|
|
|
+ Dataging: qu.IntAll(doc["dataging"]),
|
|
}
|
|
}
|
|
if (jf.Jsondata != nil || (*jf.Jsondata) != nil) && (*jf.Jsondata)["jsoncontent"] != nil {
|
|
if (jf.Jsondata != nil || (*jf.Jsondata) != nil) && (*jf.Jsondata)["jsoncontent"] != nil {
|
|
delete((*jf.Jsondata), "jsoncontent")
|
|
delete((*jf.Jsondata), "jsoncontent")
|
|
@@ -402,6 +402,11 @@ func file2text(doc *map[string]interface{}) {
|
|
for _, fileinfo := range fileinfos {
|
|
for _, fileinfo := range fileinfos {
|
|
if ff, ok := fileinfo.(map[string]interface{}); ok {
|
|
if ff, ok := fileinfo.(map[string]interface{}); ok {
|
|
attach_url := qu.ObjToString(ff["attach_url"])
|
|
attach_url := qu.ObjToString(ff["attach_url"])
|
|
|
|
+ //if utf8.RuneCountInString(tmpstr+attach_url) < qu.IntAllDef(ju.Config["filelength"], 100000) {
|
|
|
|
+ // tmpstr += attach_url + "\n"
|
|
|
|
+ //} else {
|
|
|
|
+ // break
|
|
|
|
+ //}
|
|
bs := ju.OssGetObject(attach_url)
|
|
bs := ju.OssGetObject(attach_url)
|
|
if utf8.RuneCountInString(tmpstr+bs) < qu.IntAllDef(ju.Config["filelength"], 100000) {
|
|
if utf8.RuneCountInString(tmpstr+bs) < qu.IntAllDef(ju.Config["filelength"], 100000) {
|
|
tmpstr += bs + "\n"
|
|
tmpstr += bs + "\n"
|
|
@@ -579,11 +584,9 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
//函数清理
|
|
//函数清理
|
|
for key, val := range j.Result {
|
|
for key, val := range j.Result {
|
|
for i, v := range val {
|
|
for i, v := range val {
|
|
- // if v.ExtFrom == "title"&& v.Field == "buyer"{
|
|
|
|
- // qu.Debug("title---",v.Value)
|
|
|
|
- // }else if v.Field == "buyer"{
|
|
|
|
- // qu.Debug("text---",v.Value)
|
|
|
|
- // }
|
|
|
|
|
|
+ if v.Field == "projectname" && v.Type == "table" {
|
|
|
|
+ break
|
|
|
|
+ }
|
|
lockclear.Lock()
|
|
lockclear.Lock()
|
|
var cfn = []string{}
|
|
var cfn = []string{}
|
|
if isSite {
|
|
if isSite {
|
|
@@ -1441,9 +1444,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
//table抽取到的数据不清理
|
|
//table抽取到的数据不清理
|
|
- // if v.Type == "table" && v.Field != "projectname" {
|
|
|
|
- // continue
|
|
|
|
- // }
|
|
|
|
|
|
+ if v.Type == "table" && v.Field == "projectname" {
|
|
|
|
+ return
|
|
|
|
+ }
|
|
text := qu.ObjToString(v.Value)
|
|
text := qu.ObjToString(v.Value)
|
|
if text != "" {
|
|
if text != "" {
|
|
text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
|
|
text = in.RegPreBac.Reg.ReplaceAllString(text, in.RegPreBac.Replace)
|
|
@@ -1471,8 +1474,9 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
|
|
for key, tmp := range j.Result {
|
|
for key, tmp := range j.Result {
|
|
exts := []interface{}{}
|
|
exts := []interface{}{}
|
|
for k, v := range tmp {
|
|
for k, v := range tmp {
|
|
- if v.Type == "table" { //table抽取到的数据不清理
|
|
|
|
- continue
|
|
|
|
|
|
+ //table抽取到的数据不清理
|
|
|
|
+ if v.Type == "table" && v.Field == "projectname" {
|
|
|
|
+ return
|
|
}
|
|
}
|
|
text := qu.ObjToString(v.Value)
|
|
text := qu.ObjToString(v.Value)
|
|
if text != "" {
|
|
if text != "" {
|
|
@@ -1683,6 +1687,7 @@ var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:
|
|
//分析抽取结果并保存
|
|
//分析抽取结果并保存
|
|
func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
qu.Try(func() {
|
|
qu.Try(func() {
|
|
|
|
+
|
|
//重新取出清理过后的中标候选人
|
|
//重新取出清理过后的中标候选人
|
|
resetWinnerorder(j)
|
|
resetWinnerorder(j)
|
|
doc, result, _id := funcAnalysis(j, e)
|
|
doc, result, _id := funcAnalysis(j, e)
|
|
@@ -1782,7 +1787,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
savewinner = RemoveReplicaSliceString(savewinner)
|
|
savewinner = RemoveReplicaSliceString(savewinner)
|
|
tmp["s_winner"] = strings.Join(savewinner, ",")
|
|
tmp["s_winner"] = strings.Join(savewinner, ",")
|
|
}
|
|
}
|
|
-
|
|
|
|
} else if tmp["winner"] != nil && tmp["winner"] != "" {
|
|
} else if tmp["winner"] != nil && tmp["winner"] != "" {
|
|
//没有分包取winner
|
|
//没有分包取winner
|
|
tmp["s_winner"] = tmp["winner"]
|
|
tmp["s_winner"] = tmp["winner"]
|
|
@@ -2084,7 +2088,7 @@ func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
|
|
|
|
|
|
//去重冗余字段
|
|
//去重冗余字段
|
|
func delFiled(k string) bool {
|
|
func delFiled(k string) bool {
|
|
- return k=="detailfile"||k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
|
|
|
|
|
|
+ return k == "detailfile" || k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
|
|
}
|
|
}
|
|
|
|
|
|
func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
|
|
func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
|