Forráskód Böngészése

敏感词数据抽取过滤

unknown 6 éve
szülő
commit
da99c7d327
3 módosított fájl, 27 hozzáadás és 1 törlés
  1. src/jy/extract/exportask.go (+5 −0)
  2. src/jy/extract/extract.go (+6 −1)
  3. src/jy/extract/extractudp.go (+16 −0)

+ 5 - 0
src/jy/extract/exportask.go

@@ -59,6 +59,11 @@ func extractAndExport(v string, t map[string]interface{}) {
 	limit := qu.IntAll(t["limit"])
 	list, _ := e.TaskInfo.DB.Find(e.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
 	for _, v := range *list {
+		if v["check_sensitive2"] != nil { //去除含敏感词数据
+			if v["check_sensitive2"].(string) != "" {
+				continue
+			}
+		}
 		j := PreInfo(v)
 		e.TaskInfo.ProcessPool <- true
 		go e.ExtractProcess(j)

+ 6 - 1
src/jy/extract/extract.go

@@ -27,7 +27,7 @@ var (
 	TaskList  map[string]*ExtractTask                //任务列表
 	saveLimit = 200                                  //抽取日志批量保存
 	PageSize  = 5000                                 //查询分页
-	Fields    = `{"title":1,"detail":1,"contenthtml":1,"href":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1}`
+	Fields    = `{"title":1,"detail":1,"contenthtml":1,"href":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"check_sensitive2":1}`
 )
 
 //启动测试抽取
@@ -148,6 +148,11 @@ func RunExtractTask(taskId string) {
 		log.Printf("page=%d,query=%v", i+1, query)
 		list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
 		for _, v := range *list {
+			if v["check_sensitive2"] != nil { //去除含敏感词数据
+				if v["check_sensitive2"].(string) != "" {
+					continue
+				}
+			}
 			//log.Println(v["_id"])
 			if !ext.IsRun {
 				break

+ 16 - 0
src/jy/extract/extractudp.go

@@ -98,6 +98,7 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 		ext.InitDFA()
 	}
 	//质量审核
+	ext.InitAuditFields()
 	ext.InitAuditRule()
 	ext.InitAuditClass()
 	ext.InitAuditRecogField()
@@ -135,6 +136,11 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 			if ext.TaskInfo.DB.Count(ext.TaskInfo.FromColl, query) > 0 {
 				list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
 				for _, v := range *list {
+					if v["check_sensitive2"] != nil { //去除含敏感词数据
+						if v["check_sensitive2"].(string) != "" {
+							continue
+						}
+					}
 					//log.Println(v["_id"])
 					j := PreInfo(v)
 					ext.TaskInfo.ProcessPool <- true
@@ -151,6 +157,11 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 			if ext.TaskInfo.DB.Count(ext.TaskInfo.FromColl+"_back", queryback) > 0 {
 				list2, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl+"_back", queryback, nil, Fields, false, 0, limit)
 				for _, v := range *list2 {
+					if v["check_sensitive2"] != nil { //去除含敏感词数据
+						if v["check_sensitive2"].(string) != "" {
+							continue
+						}
+					}
 					//log.Println(v["_id"])
 					j := PreInfo(v)
 					ext.TaskInfo.ProcessPool <- true
@@ -172,6 +183,11 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 		query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
 		list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, -1, -1)
 		for _, v := range *list {
+			if v["check_sensitive2"] != nil { //去除含敏感词数据
+				if v["check_sensitive2"].(string) != "" {
+					continue
+				}
+			}
 			//log.Println(v["_id"])
 			j := PreInfo(v)
 			ext.TaskInfo.ProcessPool <- true