|
@@ -33,11 +33,22 @@ var (
|
|
|
ClearTaskList map[string]*ClearTask //清理任务列表
|
|
|
saveLimit = 100 //抽取日志批量保存
|
|
|
PageSize = 5000 //查询分页
|
|
|
- //Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
|
|
|
- Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1}`
|
|
|
- Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
+ Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
|
|
|
+ //Fields = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1}`
|
|
|
+ Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
|
|
|
)
|
|
|
|
|
|
+func closeDb(ext *ExtractTask) { // closeDb hands one session from each non-nil task DB handle (FDB, then TDB) to db.Mgo.Close; used via defer by the task entry points.
|
|
|
+ if ext.TaskInfo.FDB != nil { // FDB is skipped when it was never assigned
|
|
|
+ s := ext.TaskInfo.FDB.Get() // NOTE(review): presumably Get returns a session from FDB's pool - confirm it is not a fresh copy
|
|
|
+ db.Mgo.Close(s) // NOTE(review): confirm this releases FDB's underlying connections, not only the session just obtained
|
|
|
+ }
|
|
|
+ if ext.TaskInfo.TDB != nil { // TDB is skipped when it was never assigned (the test-task path visible here only sets FDB)
|
|
|
+ s := ext.TaskInfo.TDB.Get() // same Get-then-Close sequence as for FDB above
|
|
|
+ db.Mgo.Close(s)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
//启动测试抽取
|
|
|
func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
|
|
|
defer qu.Catch()
|
|
@@ -46,6 +57,7 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
|
|
|
ext.IsRun = true
|
|
|
ext.InitTestTaskInfo(resultcoll, trackcoll)
|
|
|
ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
|
|
|
+ defer closeDb(ext)
|
|
|
ext.InitSite()
|
|
|
ext.InitRulePres()
|
|
|
ext.InitRuleBacks(false)
|
|
@@ -130,6 +142,7 @@ func StartExtractTaskId(taskId string) bool {
|
|
|
}
|
|
|
ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
|
|
|
ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
|
|
|
+ defer closeDb(ext)
|
|
|
ext.InitSite()
|
|
|
ext.InitRulePres()
|
|
|
ext.InitRuleBacks(false)
|
|
@@ -192,6 +205,7 @@ func RunExtractTask(taskId string) {
|
|
|
ext := TaskList[taskId]
|
|
|
query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
|
|
|
count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
|
|
|
+ defer closeDb(ext)
|
|
|
pageNum := (count + PageSize - 1) / PageSize
|
|
|
limit := PageSize
|
|
|
if count < PageSize {
|
|
@@ -324,15 +338,15 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
|
|
|
Site: qu.ObjToString(doc["site"]),
|
|
|
//Domain: qu.ObjToString(doc["domain"]),
|
|
|
//Href: qu.ObjToString(doc["href"]),
|
|
|
- Title: qu.ObjToString(doc["title"]),
|
|
|
- Data: &doc,
|
|
|
- City: qu.ObjToString(doc["city"]),
|
|
|
- Province: qu.ObjToString(doc["area"]),
|
|
|
- Jsondata: toMap,
|
|
|
- Result: map[string][]*ju.ExtField{},
|
|
|
- BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
- RuleBlock: e.RuleBlock,
|
|
|
- Dataging: qu.IntAll(doc["dataging"]),
|
|
|
+ Title: qu.ObjToString(doc["title"]),
|
|
|
+ Data: &doc,
|
|
|
+ City: qu.ObjToString(doc["city"]),
|
|
|
+ Province: qu.ObjToString(doc["area"]),
|
|
|
+ Jsondata: toMap,
|
|
|
+ Result: map[string][]*ju.ExtField{},
|
|
|
+ BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
|
|
|
+ RuleBlock: e.RuleBlock,
|
|
|
+ Dataging: qu.IntAll(doc["dataging"]),
|
|
|
}
|
|
|
if (j.Jsondata != nil || (*j.Jsondata) != nil) && (*j.Jsondata)["jsoncontent"] != nil {
|
|
|
delete((*j.Jsondata), "jsoncontent")
|
|
@@ -601,7 +615,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
|
|
|
if len(cfn) == 0 {
|
|
|
continue
|
|
|
}
|
|
|
- data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content},j.SpiderCode)
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode)
|
|
|
if key == "budget" || key == "bidamount" {
|
|
|
if istrue, ok := data[len(data)-1].(bool); istrue && ok {
|
|
|
j.Result[key][i].IsTrue = true
|
|
@@ -699,7 +713,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
|
|
|
lockclear.Lock()
|
|
|
cfn := e.ClearFn[key]
|
|
|
lockclear.Unlock()
|
|
|
- data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content},j.SpiderCode)
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode)
|
|
|
v.Value = data[0]
|
|
|
//清理特殊符号
|
|
|
lockclear.Lock()
|
|
@@ -1012,7 +1026,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
|
|
|
lock.Lock()
|
|
|
cfn := e.ClearFn[in.Field]
|
|
|
lock.Unlock()
|
|
|
- data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content},j.SpiderCode)
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode)
|
|
|
if data[len(data)-1].(bool) {
|
|
|
j.BlockPackage[k].Budget = qu.Float64All(data[0])
|
|
|
j.BlockPackage[k].IsTrueBudget = true
|
|
@@ -1022,7 +1036,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
|
|
|
lock.Lock()
|
|
|
cfn := e.ClearFn[in.Field]
|
|
|
lock.Unlock()
|
|
|
- data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content},j.SpiderCode)
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode)
|
|
|
if data[len(data)-1].(bool) {
|
|
|
j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
|
|
|
j.BlockPackage[k].IsTrueBidamount = true
|
|
@@ -1084,7 +1098,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
|
|
|
lock.Lock()
|
|
|
cfn := e.ClearFn[in.Field]
|
|
|
lock.Unlock()
|
|
|
- data := clear.DoClearFn(cfn, []interface{}{val, j.Content},j.SpiderCode)
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode)
|
|
|
if data[len(data)-1].(bool) {
|
|
|
j.BlockPackage[k].Budget = qu.Float64All(data[0])
|
|
|
j.BlockPackage[k].IsTrueBudget = true
|
|
@@ -1095,7 +1109,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
|
|
|
lock.Lock()
|
|
|
cfn := e.ClearFn[in.Field]
|
|
|
lock.Unlock()
|
|
|
- data := clear.DoClearFn(cfn, []interface{}{val, j.Content},j.SpiderCode)
|
|
|
+ data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode)
|
|
|
if data[len(data)-1].(bool) {
|
|
|
j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
|
|
|
j.BlockPackage[k].IsTrueBidamount = true
|
|
@@ -1936,7 +1950,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
// }
|
|
|
//tmp["extract_content"] = j.Content
|
|
|
tmp["dataging"] = j.Dataging
|
|
|
-
|
|
|
if attach_text, ok := (tmp)["new_attach_text"].(map[string]interface{}); ok {
|
|
|
//if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
|
|
|
for ai, attachs := range attach_text {
|
|
@@ -1945,18 +1958,18 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
|
|
|
if ff, ok := fileinfo.(map[string]interface{}); ok {
|
|
|
attach_url := qu.ObjToString(ff["attach_url"])
|
|
|
if utf8.RuneCountInString(attach_url) > qu.IntAllDef(ju.Config["filelength"], 10000) {
|
|
|
- (tmp)["new_attach_text"].(map[string]interface{})[ai].((map[string]interface{}))[fi].(map[string]interface{})["attach_url"] = "文本过长..."
|
|
|
+ (tmp)["new_attach_text"].(map[string]interface{})[ai].((map[string]interface{}))[fi].(map[string]interface{})["attach_url"] = "文本过长..."
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- }//}budget bidamount
|
|
|
- if bg,ok :=tmp["budget"].(float64);ok && bg>=500000000000{
|
|
|
- delete(tmp,"budget")
|
|
|
+ } //}budget bidamount
|
|
|
+ if bg, ok := tmp["budget"].(float64); ok && bg >= 500000000000 {
|
|
|
+ delete(tmp, "budget")
|
|
|
}
|
|
|
- if bg,ok :=tmp["bidamount"].(float64);ok && bg>=500000000000{
|
|
|
- delete(tmp,"bidamount")
|
|
|
+ if bg, ok := tmp["bidamount"].(float64); ok && bg >= 500000000000 {
|
|
|
+ delete(tmp, "bidamount")
|
|
|
}
|
|
|
if e.TaskInfo.TestColl == "" {
|
|
|
if len(tmp) > 0 { //保存抽取结果
|
|
@@ -2128,7 +2141,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
|
|
|
}
|
|
|
if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
|
|
|
//jsondata清理
|
|
|
- clearJd(j.Jsondata, e,j.SpiderCode)
|
|
|
+ clearJd(j.Jsondata, e, j.SpiderCode)
|
|
|
marshalbt, _ := json.Marshal(j.Jsondata)
|
|
|
tmpjddata := make(map[string]interface{})
|
|
|
json.Unmarshal(marshalbt, &tmpjddata)
|
|
@@ -2142,7 +2155,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
|
|
|
if len(cfn) == 0 {
|
|
|
continue
|
|
|
}
|
|
|
- newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""},j.SpiderCode)
|
|
|
+ newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""}, j.SpiderCode)
|
|
|
if tmpv.Value == newNum[0] {
|
|
|
extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
|
|
|
j.Result[jdkey] = append(j.Result[jdkey], extField)
|
|
@@ -2345,7 +2358,7 @@ func resetWinnerorder(j *ju.Job) {
|
|
|
if maxlen > 0 {
|
|
|
winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
|
|
|
if j.Winnerorder[0]["price"] != nil {
|
|
|
- tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""},j.SpiderCode)
|
|
|
+ tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode)
|
|
|
if tmpPrice[len(tmpPrice)-1].(bool) {
|
|
|
bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 0.5})
|
|
|
}
|