瀏覽代碼

分布式抽取,中断抽取问题

zhangjinkun 6 年之前
父節點
當前提交
3cdab787b3
共有 5 個文件被更改,包括 47 次插入和 7 次删除
  1. 1 1
      src/jy/clear/specialsymbols.go
  2. 2 1
      src/jy/cluster/distributed.go
  3. 2 0
      src/jy/cluster/ssh.go
  4. 18 3
      src/jy/extract/extractudp.go
  5. 24 2
      src/specialsymbols.json

+ 1 - 1
src/jy/clear/specialsymbols.go

@@ -203,7 +203,7 @@ func RemoveAsymmetric(text string) string {
 		if AsyReg.MatchString(first) {
 			textRune = textRune[1:]
 		}
-		if AsyReg.MatchString(last) && len(text) > 0 {
+		if len(textRune) > 0 && AsyReg.MatchString(last) && len(text) > 0 {
 			textRune = textRune[:len(textRune)-1]
 		}
 		text = string(textRune)

+ 2 - 1
src/jy/cluster/distributed.go

@@ -64,6 +64,7 @@ func RunEcsTask() int {
 				log.Println(err)
 			} else {
 				num++
+				time.Sleep(2 * time.Second)
 				log.Println("分发任务", string(by))
 			}
 		}
@@ -105,7 +106,7 @@ func RangeIdsByDate(escnum int, start, edate time.Time) map[string][]string {
 				tmpnum := DB.Count(table, bson.M{"_id": bson.M{"$gte": tmpsid, "$lt": bson.NewObjectIdWithTime(end.Add(24 * 10 * time.Hour) /*连续10天无数据*/)}})
 				if tmpnum < 1 && table != "bidding" {
 					table = "bidding"
-					start = start.Add(-12 * time.Hour)
+					start = start.Add(-4 * time.Hour)
 					continue
 				}
 			} else {

+ 2 - 0
src/jy/cluster/ssh.go

@@ -47,6 +47,8 @@ func ssHConnect(user, password, host string, port int) (*ssh.Session, error) {
 var sshstr = `
 #!/bin/bash
 cd /opt
+kill -9 $(pidof extract_v3)
+rm -rf extract_v3*
 wget http://10.170.187.34:8300/upload/extract_v3.tgz
 tar -xzvf extract_v3.tgz
 cd /opt/extract_v3

+ 18 - 3
src/jy/extract/extractudp.go

@@ -116,15 +116,22 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 		limit = count
 	}
 	log.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
+	sidback := sid
 	//接着上次任务执行
 	startI := 0
 	if len(instanceId) > 0 {
 		esc, _ := db.Mgo.FindOne("ecs", `{"InstanceId":"`+instanceId[0]+`"}`)
 		startI = qu.IntAll((*esc)["pagecurrent"])
+		if qu.ObjToString((*esc)["lastId"]) != "" {
+			sid = qu.ObjToString((*esc)["lastId"])
+		}
+		if qu.ObjToString((*esc)["lastIdback"]) != "" {
+			sidback = qu.ObjToString((*esc)["lastIdback"])
+		}
 	}
-	sidback := sid
+
 	for i := startI; i < pageNum; i++ {
-		query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid)}}
+		query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
 		log.Printf("page=%d,query=%v", i+1, query)
 		if ext.TaskInfo.DB.Count(ext.TaskInfo.FromColl, query) > 0 {
 			list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
@@ -135,11 +142,15 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 				go ext.ExtractProcess(j)
 				sid = qu.BsonIdToSId(v["_id"])
 			}
+			db.Mgo.Update("ecs", `{"InstanceId":"`+instanceId[0]+`"}`,
+				map[string]interface{}{"$set": map[string]interface{}{
+					"lastId": sid,
+				}}, true, false)
 		}
 		queryback := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sidback)}}
 		log.Printf("page=%d,queryback=%v", i+1, queryback)
 		if ext.TaskInfo.DB.Count(ext.TaskInfo.FromColl+"_back", queryback) > 0 {
-			list2, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl+"_back", query, nil, Fields, false, 0, limit)
+			list2, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl+"_back", queryback, nil, Fields, false, 0, limit)
 			for _, v := range *list2 {
 				//log.Println(v["_id"])
 				j := PreInfo(v)
@@ -147,6 +158,10 @@ func ExtractByUdp(sid, eid string, instanceId ...string) {
 				go ext.ExtractProcess(j)
 				sidback = qu.BsonIdToSId(v["_id"])
 			}
+			db.Mgo.Update("ecs", `{"InstanceId":"`+instanceId[0]+`"}`,
+				map[string]interface{}{"$set": map[string]interface{}{
+					"lastIdback": sidback,
+				}}, true, false)
 		}
 		//分布式抽取进度
 		if len(instanceId) > 0 {

+ 24 - 2
src/specialsymbols.json

@@ -1,13 +1,20 @@
 {
 	"symmetric":{
 		"field":{
+		"projectname":true,
+		"projectcode":true,
+		"buyer":true,
+		"winner":true,
+		"agency":true
 		},
 		"symbol":[
 			["(",")"],
 			["\\[","\\]"],
 			["{","}"],
 			["{","}"],
-			["'","'"],
+			["'","'"],			
+			["‘","’"],
+			["“","”"],			
 			["\"","\""],
 			["【","】"],
 			["(",")"],
@@ -16,7 +23,14 @@
 	},
 	"asymmetric":{
 		"field":{
-			"projectcode":true
+		"projectname":true,
+		"projectcode":true,
+		"buyer":true,
+		"winner":true,
+		"agency":true,
+		"agency":true,
+		"buyertel":true,
+		"buyerperson":true
 		},
 		"symbol":[
 			":",
@@ -31,6 +45,14 @@
 	},
 	"messycode":{
 		"field":{
+		"projectname":true,
+		"projectcode":true,
+		"buyer":true,
+		"winner":true,
+		"agency":true,
+		"agency":true,
+		"buyertel":true,
+		"buyerperson":true
 		},
 		"symbol":[