fengweiqiang há 5 anos atrás
pai
commit
033f6890c5
3 ficheiros alterados com 27 adições e 26 exclusões
  1. 1 1
      src/jy/extract/exportask.go
  2. 23 22
      src/jy/extract/extract.go
  3. 3 3
      src/jy/extract/extractudp.go

+ 1 - 1
src/jy/extract/exportask.go

@@ -80,7 +80,7 @@ func extractAndExport(v string, t map[string]interface{}) {
 		}
 		var j, jf *ju.Job
 		var isSite bool
-		if e.IsFileField && v["projectinfo"] != nil {
+		if e.IsFileField && (v["projectinfo"] != nil ||v["attach_text"] != nil ){
 			v["isextFile"] = true
 			j, jf, isSite = e.PreInfo(v)
 		} else {

+ 23 - 22
src/jy/extract/extract.go

@@ -27,12 +27,12 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 100                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask //任务列表
+	ClearTaskList map[string]*ClearTask   //清理任务列表
+	saveLimit     = 100                   //抽取日志批量保存
+	PageSize      = 5000                  //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -98,7 +98,7 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
 			}
 			var j, jf *ju.Job
 			var isSite bool
-			if ext.IsFileField && v["projectinfo"] != nil {
+			if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
 				v["isextFile"] = true
 				j, jf, isSite = ext.PreInfo(v)
 			} else {
@@ -217,7 +217,7 @@ func RunExtractTask(taskId string) {
 			}
 			var j, jf *ju.Job
 			var isSite bool
-			if ext.IsFileField && v["projectinfo"] != nil {
+			if ext.IsFileField && (v["projectinfo"] != nil || v["attach_text"] != nil) {
 				v["isextFile"] = true
 				j, jf, isSite = ext.PreInfo(v)
 			} else {
@@ -337,20 +337,21 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	}
 	if isextFile {
 		jf = &ju.Job{
-			SourceMid:  qu.BsonIdToSId(doc["_id"]),
-			Category:   toptype,
-			Content:    qu.ObjToString(doc["detailfile"]),
-			SpiderCode: qu.ObjToString(doc["spidercode"]),
-			Site:       qu.ObjToString(doc["site"]),
-			Title:      qu.ObjToString(doc["title"]),
-			Data:       &doc,
-			City:       qu.ObjToString(doc["city"]),
-			Province:   qu.ObjToString(doc["area"]),
-			Jsondata:   toMap,
-			Result:     map[string][]*ju.ExtField{},
-			BuyerAddr:  qu.ObjToString(doc["buyeraddr"]),
-			RuleBlock:  e.RuleBlock,
-			IsFile:     isextFile,
+			SourceMid:      qu.BsonIdToSId(doc["_id"]),
+			Category:       toptype,
+			CategorySecond: subtype,
+			Content:        qu.ObjToString(doc["detailfile"]),
+			SpiderCode:     qu.ObjToString(doc["spidercode"]),
+			Site:           qu.ObjToString(doc["site"]),
+			Title:          qu.ObjToString(doc["title"]),
+			Data:           &doc,
+			City:           qu.ObjToString(doc["city"]),
+			Province:       qu.ObjToString(doc["area"]),
+			Jsondata:       toMap,
+			Result:         map[string][]*ju.ExtField{},
+			BuyerAddr:      qu.ObjToString(doc["buyeraddr"]),
+			RuleBlock:      e.RuleBlock,
+			IsFile:         isextFile,
 		}
 		if (jf.Jsondata != nil || (*jf.Jsondata) != nil) && (*jf.Jsondata)["jsoncontent"] != nil {
 			delete((*jf.Jsondata), "jsoncontent")

+ 3 - 3
src/jy/extract/extractudp.go

@@ -159,7 +159,7 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 				}
 				var j, jf *ju.Job
 				var isSite bool
-				if ext.IsFileField && v["projectinfo"] != nil {
+				if ext.IsFileField && (v["projectinfo"] != nil ||v["attach_text"] != nil ){
 					v["isextFile"] = true
 					j, jf,isSite = ext.PreInfo(v)
 				} else {
@@ -176,7 +176,7 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 				}
 				var j, jf *ju.Job
 				var isSite bool
-				if ext.IsFileField && v["projectinfo"] != nil {
+				if ext.IsFileField && (v["projectinfo"] != nil ||v["attach_text"] != nil ){
 					v["isextFile"] = true
 					j, jf,isSite = ext.PreInfo(v)
 				} else {
@@ -224,7 +224,7 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 				_id := qu.BsonIdToSId(v["_id"])
 				var j, jf *ju.Job
 				var isSite bool
-				if ext.IsFileField && v["projectinfo"] != nil {
+				if ext.IsFileField && (v["projectinfo"] != nil ||v["attach_text"] != nil ){
 					v["isextFile"] = true
 					j, jf,isSite = ext.PreInfo(v)
 				} else {