zhangjinkun 6 лет назад
Родитель
Commit
c01625f14f
5 измененных файлов с 50 добавлено и 29 удалено
  1. 2 1
      src/config.json
  2. 36 20
      src/jy/extract/extract.go
  3. 10 5
      src/jy/extract/extractInit.go
  4. 0 1
      src/jy/extract/score.go
  5. 2 2
      src/main.go

+ 2 - 1
src/config.json

@@ -2,5 +2,6 @@
     "port": "9090",
     "mgodb": "192.168.3.207:27082",
     "dbsize": 5,
-    "dbname": "extract_kf"
+    "dbname": "extract_kf",
+    "fieldscore": true
 } 

+ 36 - 20
src/jy/extract/extract.go

@@ -25,8 +25,8 @@ var (
 	ExtLogs   map[*TaskInfo][]map[string]interface{} //抽取日志
 	TaskList  map[string]*ExtractTask                //任务列表
 	saveLimit = 200                                  //抽取日志批量保存
-
-	Fields = `{"title":1,"detail":1,"contenthtml":1,"href":1,"site":1,"spidercode":1,"toptype":1,"area":1,"city":1}`
+	PageSize  = 5000                                 //查询分页
+	Fields    = `{"title":1,"detail":1,"contenthtml":1,"href":1,"site":1,"spidercode":1,"toptype":1,"area":1,"city":1}`
 )
 
 //启动测试抽取
@@ -123,19 +123,29 @@ func StopExtractTaskId(taskId string) bool {
 func RunExtractTask(taskId string) {
 	ext := TaskList[taskId]
 	query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
-	list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, -1, -1)
-	for k, v := range *list {
-		log.Println(k, v["_id"])
+	count := ext.TaskInfo.DB.Count(ext.TaskInfo.FromColl, query)
+	pageNum := (count + PageSize - 1) / PageSize
+	log.Printf("count=%d,pageNum=%d,query=%v", count, pageNum, query)
+	for i := 0; i < pageNum; i++ {
+		query = bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
+		log.Printf("page=%d,query=%v", i+1, query)
+		list, _ := ext.TaskInfo.DB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, PageSize)
+		for _, v := range *list {
+			log.Println(v["_id"])
+			if !ext.IsRun {
+				break
+			}
+			j := PreInfo(v)
+			ext.TaskInfo.ProcessPool <- true
+			go ext.ExtractProcess(j)
+			ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
+		}
+		db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
 		if !ext.IsRun {
 			break
 		}
-		j := PreInfo(v)
-		ext.TaskInfo.ProcessPool <- true
-		go ext.ExtractProcess(j)
-		ext.TaskInfo.LastExtId = qu.BsonIdToSId(v["_id"])
 	}
 	//更新task.s_extlastid
-	db.Mgo.UpdateById("task", ext.Id, `{"$set":{"s_extlastid":"`+ext.TaskInfo.LastExtId+`"}}`)
 	time.AfterFunc(1*time.Minute, func() { RunExtractTask(taskId) })
 }
 
@@ -696,20 +706,26 @@ type FieldValue struct {
 //分析抽取结果并保存
 func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.ExtField, task *TaskInfo) {
 	_id := qu.BsonIdToSId((*doc)["_id"])
-	result = ScoreFields(result)
+	iscore, _ := ju.Config["fieldscore"].(bool)
+	if iscore { //打分
+		result = ScoreFields(result)
+	}
 	//结果排序
 	values := map[string][]*ju.SortObject{}
 	for key, val := range result {
 		fieldValue := map[string][]interface{}{}
-		//		for _, v := range val {
-		//			if fieldValue[fmt.Sprint(v.Value)] == nil {
-		//				fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
-		//			} else {
-		//				fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
-		//			}
-		//		}
-		for _, v := range val {
-			fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
+		if iscore { //走打分
+			for _, v := range val {
+				fieldValue[fmt.Sprint(v.Value)+v.Type] = []interface{}{v.Score, v.Value}
+			}
+		} else { //不走打分,按出现频次
+			for _, v := range val {
+				if fieldValue[fmt.Sprint(v.Value)] == nil {
+					fieldValue[fmt.Sprint(v.Value)] = []interface{}{0, v.Value}
+				} else {
+					fieldValue[fmt.Sprint(v.Value)][0] = qu.IntAll(fieldValue[fmt.Sprint(v.Value)][0]) + 1
+				}
+			}
 		}
 		objects := []*ju.SortObject{}
 		for k, v := range fieldValue {

+ 10 - 5
src/jy/extract/extractInit.go

@@ -124,6 +124,7 @@ func (e *ExtractTask) InitRulePres() {
 		}
 		if rinfo.IsLua {
 			rinfo.RuleText = v["s_luascript"].(string)
+			e.RulePres = append(e.RulePres, rinfo)
 		} else {
 			qu.Try(func() {
 				rinfo.RuleText = v["s_rule"].(string)
@@ -133,11 +134,11 @@ func (e *ExtractTask) InitRulePres() {
 				} else {
 					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
 				}
+				e.RulePres = append(e.RulePres, rinfo)
 			}, func(err interface{}) {
 				log.Println(rinfo.Code, rinfo.Field, err)
 			})
 		}
-		e.RulePres = append(e.RulePres, rinfo)
 	}
 }
 
@@ -153,6 +154,7 @@ func (e *ExtractTask) InitRuleBacks() {
 		}
 		if rinfo.IsLua {
 			rinfo.RuleText = v["s_luascript"].(string)
+			e.RuleBacks = append(e.RuleBacks, rinfo)
 		} else {
 			qu.Try(func() {
 				rinfo.RuleText = v["s_rule"].(string)
@@ -162,11 +164,11 @@ func (e *ExtractTask) InitRuleBacks() {
 				} else {
 					rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
 				}
+				e.RuleBacks = append(e.RuleBacks, rinfo)
 			}, func(err interface{}) {
 				log.Println(rinfo.Code, rinfo.Field, err)
 			})
 		}
-		e.RuleBacks = append(e.RuleBacks, rinfo)
 	}
 }
 
@@ -201,6 +203,7 @@ func (e *ExtractTask) InitRuleCore() {
 				}
 				if rinfo.IsLua {
 					rinfo.RuleText = v["s_luascript"].(string)
+					rulePres = append(rulePres, rinfo)
 				} else {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
@@ -210,11 +213,11 @@ func (e *ExtractTask) InitRuleCore() {
 						} else {
 							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
 						}
+						rulePres = append(rulePres, rinfo)
 					}, func(err interface{}) {
 						log.Println(rinfo.Code, rinfo.Field, err)
 					})
 				}
-				rulePres = append(rulePres, rinfo)
 			}
 			rcore.RulePres = rulePres
 
@@ -230,6 +233,7 @@ func (e *ExtractTask) InitRuleCore() {
 				}
 				if rinfo.IsLua {
 					rinfo.RuleText = v["s_luascript"].(string)
+					ruleBacks = append(ruleBacks, rinfo)
 				} else {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
@@ -239,11 +243,11 @@ func (e *ExtractTask) InitRuleCore() {
 						} else {
 							rinfo.RegPreBac = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Replace: ""}
 						}
+						ruleBacks = append(ruleBacks, rinfo)
 					}, func(err interface{}) {
 						log.Println(rinfo.Code, rinfo.Field, err)
 					})
 				}
-				ruleBacks = append(ruleBacks, rinfo)
 			}
 			rcore.RuleBacks = ruleBacks
 
@@ -264,6 +268,7 @@ func (e *ExtractTask) InitRuleCore() {
 					rinfo.RuleText = v["s_luascript"].(string)
 					//提取全部属性
 					rinfo.LFields = getALLFields()
+					ruleCores = append(ruleCores, rinfo)
 				} else {
 					qu.Try(func() {
 						rinfo.RuleText = v["s_rule"].(string)
@@ -283,11 +288,11 @@ func (e *ExtractTask) InitRuleCore() {
 						} else {
 							rinfo.RegCore = &ExtReg{Reg: regexp.MustCompile(tmp[0]), Bextract: false}
 						}
+						ruleCores = append(ruleCores, rinfo)
 					}, func(err interface{}) {
 						log.Println(rinfo.Code, rinfo.Field, err)
 					})
 				}
-				ruleCores = append(ruleCores, rinfo)
 			}
 			rcore.RuleCores = ruleCores
 			//

+ 0 - 1
src/jy/extract/score.go

@@ -29,7 +29,6 @@ func init() {
 			}
 		}
 	}
-	log.Println(SoreConfig["projectname"])
 }
 
 //结果打分

+ 2 - 2
src/main.go

@@ -10,7 +10,6 @@ import (
 	"jy/util"
 	"log"
 	qu "qfw/util"
-	"time"
 )
 
 func init() {
@@ -22,5 +21,6 @@ func main() {
 	go extract.Export()
 	go Router.Run(":" + qu.ObjToString(util.Config["port"]))
 	go log.Println("启动..", qu.ObjToString(util.Config["port"]))
-	time.Sleep(99999 * time.Hour)
+	lock := make(chan bool)
+	<-lock
 }