|
@@ -33,7 +33,8 @@ var (
|
|
ClearTaskList map[string]*ClearTask //清理任务列表
|
|
ClearTaskList map[string]*ClearTask //清理任务列表
|
|
saveLimit = 100 //抽取日志批量保存
|
|
saveLimit = 100 //抽取日志批量保存
|
|
PageSize = 5000 //查询分页
|
|
PageSize = 5000 //查询分页
|
|
- Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
|
|
|
|
|
|
+ //Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
|
|
|
|
+ Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1}`
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
)
|
|
)
|
|
|
|
|
|
@@ -323,15 +324,15 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
Site: qu.ObjToString(doc["site"]),
|
|
Site: qu.ObjToString(doc["site"]),
|
|
//Domain: qu.ObjToString(doc["domain"]),
|
|
//Domain: qu.ObjToString(doc["domain"]),
|
|
//Href: qu.ObjToString(doc["href"]),
|
|
//Href: qu.ObjToString(doc["href"]),
|
|
- Title: qu.ObjToString(doc["title"]),
|
|
|
|
- Data: &doc,
|
|
|
|
- City: qu.ObjToString(doc["city"]),
|
|
|
|
- Province: qu.ObjToString(doc["area"]),
|
|
|
|
- Jsondata: toMap,
|
|
|
|
- Result: map[string][]*ju.ExtField{},
|
|
|
|
- BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
|
- RuleBlock: e.RuleBlock,
|
|
|
|
- Dataging: qu.IntAll(doc["dataging"]),
|
|
|
|
|
|
+ Title: qu.ObjToString(doc["title"]),
|
|
|
|
+ Data: &doc,
|
|
|
|
+ City: qu.ObjToString(doc["city"]),
|
|
|
|
+ Province: qu.ObjToString(doc["area"]),
|
|
|
|
+ Jsondata: toMap,
|
|
|
|
+ Result: map[string][]*ju.ExtField{},
|
|
|
|
+ BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
|
+ RuleBlock: e.RuleBlock,
|
|
|
|
+ Dataging: qu.IntAll(doc["dataging"]),
|
|
}
|
|
}
|
|
if (j.Jsondata != nil || (*j.Jsondata) != nil) && (*j.Jsondata)["jsoncontent"] != nil {
|
|
if (j.Jsondata != nil || (*j.Jsondata) != nil) && (*j.Jsondata)["jsoncontent"] != nil {
|
|
delete((*j.Jsondata), "jsoncontent")
|
|
delete((*j.Jsondata), "jsoncontent")
|
|
@@ -396,7 +397,8 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
//遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
|
|
//遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
|
|
func file2text(doc *map[string]interface{}) {
|
|
func file2text(doc *map[string]interface{}) {
|
|
tmpstr := ""
|
|
tmpstr := ""
|
|
- if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
|
|
|
|
|
|
+ //if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
|
|
|
|
+ if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
|
|
for _, attachs := range attach_text {
|
|
for _, attachs := range attach_text {
|
|
if fileinfos, ok := attachs.(map[string]interface{}); ok {
|
|
if fileinfos, ok := attachs.(map[string]interface{}); ok {
|
|
for _, fileinfo := range fileinfos {
|
|
for _, fileinfo := range fileinfos {
|
|
@@ -423,6 +425,7 @@ func file2text(doc *map[string]interface{}) {
|
|
|
|
|
|
//抽取
|
|
//抽取
|
|
func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
|
|
func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
|
|
|
|
+
|
|
e.ExtractDetail(j, isSite, j.SpiderCode)
|
|
e.ExtractDetail(j, isSite, j.SpiderCode)
|
|
if jf != nil && jf.IsFile {
|
|
if jf != nil && jf.IsFile {
|
|
e.ExtractFile(jf, isSite, j.SpiderCode)
|
|
e.ExtractFile(jf, isSite, j.SpiderCode)
|
|
@@ -598,7 +601,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
if len(cfn) == 0 {
|
|
if len(cfn) == 0 {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
- data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
|
|
|
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content},j.SpiderCode)
|
|
if key == "budget" || key == "bidamount" {
|
|
if key == "budget" || key == "bidamount" {
|
|
if istrue, ok := data[len(data)-1].(bool); istrue && ok {
|
|
if istrue, ok := data[len(data)-1].(bool); istrue && ok {
|
|
j.Result[key][i].IsTrue = true
|
|
j.Result[key][i].IsTrue = true
|
|
@@ -696,7 +699,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
|
|
lockclear.Lock()
|
|
lockclear.Lock()
|
|
cfn := e.ClearFn[key]
|
|
cfn := e.ClearFn[key]
|
|
lockclear.Unlock()
|
|
lockclear.Unlock()
|
|
- data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
|
|
|
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content},j.SpiderCode)
|
|
v.Value = data[0]
|
|
v.Value = data[0]
|
|
//清理特殊符号
|
|
//清理特殊符号
|
|
lockclear.Lock()
|
|
lockclear.Lock()
|
|
@@ -1009,7 +1012,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
|
|
lock.Lock()
|
|
lock.Lock()
|
|
cfn := e.ClearFn[in.Field]
|
|
cfn := e.ClearFn[in.Field]
|
|
lock.Unlock()
|
|
lock.Unlock()
|
|
- data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
|
|
|
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content},j.SpiderCode)
|
|
if data[len(data)-1].(bool) {
|
|
if data[len(data)-1].(bool) {
|
|
j.BlockPackage[k].Budget = qu.Float64All(data[0])
|
|
j.BlockPackage[k].Budget = qu.Float64All(data[0])
|
|
j.BlockPackage[k].IsTrueBudget = true
|
|
j.BlockPackage[k].IsTrueBudget = true
|
|
@@ -1019,7 +1022,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
|
|
lock.Lock()
|
|
lock.Lock()
|
|
cfn := e.ClearFn[in.Field]
|
|
cfn := e.ClearFn[in.Field]
|
|
lock.Unlock()
|
|
lock.Unlock()
|
|
- data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
|
|
|
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content},j.SpiderCode)
|
|
if data[len(data)-1].(bool) {
|
|
if data[len(data)-1].(bool) {
|
|
j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
|
|
j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
|
|
j.BlockPackage[k].IsTrueBidamount = true
|
|
j.BlockPackage[k].IsTrueBidamount = true
|
|
@@ -1081,7 +1084,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
|
|
lock.Lock()
|
|
lock.Lock()
|
|
cfn := e.ClearFn[in.Field]
|
|
cfn := e.ClearFn[in.Field]
|
|
lock.Unlock()
|
|
lock.Unlock()
|
|
- data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
|
|
|
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{val, j.Content},j.SpiderCode)
|
|
if data[len(data)-1].(bool) {
|
|
if data[len(data)-1].(bool) {
|
|
j.BlockPackage[k].Budget = qu.Float64All(data[0])
|
|
j.BlockPackage[k].Budget = qu.Float64All(data[0])
|
|
j.BlockPackage[k].IsTrueBudget = true
|
|
j.BlockPackage[k].IsTrueBudget = true
|
|
@@ -1092,7 +1095,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
|
|
lock.Lock()
|
|
lock.Lock()
|
|
cfn := e.ClearFn[in.Field]
|
|
cfn := e.ClearFn[in.Field]
|
|
lock.Unlock()
|
|
lock.Unlock()
|
|
- data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
|
|
|
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{val, j.Content},j.SpiderCode)
|
|
if data[len(data)-1].(bool) {
|
|
if data[len(data)-1].(bool) {
|
|
j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
|
|
j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
|
|
j.BlockPackage[k].IsTrueBidamount = true
|
|
j.BlockPackage[k].IsTrueBidamount = true
|
|
@@ -1687,7 +1690,6 @@ var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:
|
|
//分析抽取结果并保存
|
|
//分析抽取结果并保存
|
|
func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
qu.Try(func() {
|
|
qu.Try(func() {
|
|
-
|
|
|
|
//重新取出清理过后的中标候选人
|
|
//重新取出清理过后的中标候选人
|
|
resetWinnerorder(j)
|
|
resetWinnerorder(j)
|
|
doc, result, _id := funcAnalysis(j, e)
|
|
doc, result, _id := funcAnalysis(j, e)
|
|
@@ -1711,9 +1713,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
if v.Score > -1 {
|
|
if v.Score > -1 {
|
|
tmp[v.Field] = v.Value
|
|
tmp[v.Field] = v.Value
|
|
break
|
|
break
|
|
- } else if v.Field == "projectname" {
|
|
|
|
- tmp[v.Field] = v.Value
|
|
|
|
- break
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -1794,7 +1793,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
if len(j.Winnerorder) > 0 { //候选人信息
|
|
if len(j.Winnerorder) > 0 { //候选人信息
|
|
for i, v := range j.Winnerorder {
|
|
for i, v := range j.Winnerorder {
|
|
if v["price"] != nil {
|
|
if v["price"] != nil {
|
|
- j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""})[0]
|
|
|
|
|
|
+ j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""},j.SpiderCode)[0]
|
|
}
|
|
}
|
|
}
|
|
}
|
|
tmp["winnerorder"] = j.Winnerorder
|
|
tmp["winnerorder"] = j.Winnerorder
|
|
@@ -1823,10 +1822,13 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
tmp["ffield"] = ffield
|
|
tmp["ffield"] = ffield
|
|
}
|
|
}
|
|
for k, v := range *doc {
|
|
for k, v := range *doc {
|
|
- //去重冗余字段
|
|
|
|
- if delFiled(k) {
|
|
|
|
- continue
|
|
|
|
|
|
+ if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
|
|
|
|
+ (*doc)[k] = []rune(qu.ObjToString(v))[:100000]
|
|
}
|
|
}
|
|
|
|
+ //去重冗余字段
|
|
|
|
+ //if delFiled(k) {
|
|
|
|
+ // continue
|
|
|
|
+ //}
|
|
if tmp[k] == nil {
|
|
if tmp[k] == nil {
|
|
tmp[k] = v
|
|
tmp[k] = v
|
|
}
|
|
}
|
|
@@ -1934,6 +1936,28 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
// }
|
|
// }
|
|
//tmp["extract_content"] = j.Content
|
|
//tmp["extract_content"] = j.Content
|
|
tmp["dataging"] = j.Dataging
|
|
tmp["dataging"] = j.Dataging
|
|
|
|
+
|
|
|
|
+ if attach_text, ok := (tmp)["new_attach_text"].(map[string]interface{}); ok {
|
|
|
|
+ //if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
|
|
|
|
+ for ai, attachs := range attach_text {
|
|
|
|
+ if fileinfos, ok := attachs.(map[string]interface{}); ok {
|
|
|
|
+ for fi, fileinfo := range fileinfos {
|
|
|
|
+ if ff, ok := fileinfo.(map[string]interface{}); ok {
|
|
|
|
+ attach_url := qu.ObjToString(ff["attach_url"])
|
|
|
|
+ if utf8.RuneCountInString(attach_url) > qu.IntAllDef(ju.Config["filelength"], 10000) {
|
|
|
|
+ (tmp)["new_attach_text"].(map[string]interface{})[ai].((map[string]interface{}))[fi].(map[string]interface{})["attach_url"] = "文本过长..."
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }//}budget bidamount
|
|
|
|
+ if bg,ok :=tmp["budget"].(float64);ok && bg>=500000000000{
|
|
|
|
+ delete(tmp,"budget")
|
|
|
|
+ }
|
|
|
|
+ if bg,ok :=tmp["bidamount"].(float64);ok && bg>=500000000000{
|
|
|
|
+ delete(tmp,"bidamount")
|
|
|
|
+ }
|
|
if e.TaskInfo.TestColl == "" {
|
|
if e.TaskInfo.TestColl == "" {
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
/* if len(e.SiteFields) <= 0 {
|
|
/* if len(e.SiteFields) <= 0 {
|
|
@@ -1988,7 +2012,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
tmp["result"] = result
|
|
tmp["result"] = result
|
|
- tmp["resultf"] = resultf
|
|
|
|
|
|
+ //tmp["resultf"] = resultf
|
|
b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
|
|
b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
|
|
if !b {
|
|
if !b {
|
|
log.Debug(e.TaskInfo.TestColl, _id)
|
|
log.Debug(e.TaskInfo.TestColl, _id)
|
|
@@ -2104,7 +2128,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
|
|
}
|
|
}
|
|
if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
|
|
if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
|
|
//jsondata清理
|
|
//jsondata清理
|
|
- clearJd(j.Jsondata, e)
|
|
|
|
|
|
+ clearJd(j.Jsondata, e,j.SpiderCode)
|
|
marshalbt, _ := json.Marshal(j.Jsondata)
|
|
marshalbt, _ := json.Marshal(j.Jsondata)
|
|
tmpjddata := make(map[string]interface{})
|
|
tmpjddata := make(map[string]interface{})
|
|
json.Unmarshal(marshalbt, &tmpjddata)
|
|
json.Unmarshal(marshalbt, &tmpjddata)
|
|
@@ -2118,7 +2142,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
|
|
if len(cfn) == 0 {
|
|
if len(cfn) == 0 {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
- newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""})
|
|
|
|
|
|
+ newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""},j.SpiderCode)
|
|
if tmpv.Value == newNum[0] {
|
|
if tmpv.Value == newNum[0] {
|
|
extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
|
|
extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
|
|
j.Result[jdkey] = append(j.Result[jdkey], extField)
|
|
j.Result[jdkey] = append(j.Result[jdkey], extField)
|
|
@@ -2321,7 +2345,7 @@ func resetWinnerorder(j *ju.Job) {
|
|
if maxlen > 0 {
|
|
if maxlen > 0 {
|
|
winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
|
|
winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
|
|
if j.Winnerorder[0]["price"] != nil {
|
|
if j.Winnerorder[0]["price"] != nil {
|
|
- tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""})
|
|
|
|
|
|
+ tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""},j.SpiderCode)
|
|
if tmpPrice[len(tmpPrice)-1].(bool) {
|
|
if tmpPrice[len(tmpPrice)-1].(bool) {
|
|
bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 0.5})
|
|
bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 0.5})
|
|
}
|
|
}
|