zhengkun 1 éve
szülő
commit
f555f21b38

+ 2 - 2
src/jy/extract/extract.go

@@ -230,8 +230,8 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	summary := qu.ObjToString(doc["summary"])
 	detail = CleanDetailText(qu.ObjToString(doc["detail"]), summary)
 	//调整采用detail抽取
-	if utf8.RuneCountInString(detail) >= 50000 {
-		detail = detail[:50000]
+	if utf8.RuneCountInString(detail) > 10000 {
+		detail = string(([]rune(detail))[:10000])
 	}
 	doc["detail"] = detail
 	isClearnMoney := !clearMoneyReg.MatchString(detail)

+ 2 - 2
src/jy/extract/extractcheck.go

@@ -17,8 +17,8 @@ func delFiled(k string) bool {
 
 // 检查字段-
 func checkFields(tmp map[string]interface{}, j_data map[string]interface{}) map[string]interface{} {
-	delete(tmp, "contenthtml")
-	delete(tmp, "detail")
+	//delete(tmp, "contenthtml")
+	//delete(tmp, "detail")
 	//剑鱼链接方便查阅
 	jyhref := fmt.Sprintf(JYUrl, qu.CommonEncodeArticle("content", qu.BsonIdToSId(tmp["_id"])))
 	tmp["jytest_href"] = jyhref

+ 2 - 1
src/jy/extract/extractsave.go

@@ -401,6 +401,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if e.IsExtractCity {
 			e.ExtractRegionInfo(j, jf, &tmp, false)
 			e.ExtractRegionClean(&tmp)
+			delete(tmp, "regions_log")
 		}
 		//品牌抽取
 		if ju.IsBrandGoods {
@@ -441,7 +442,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			}
 		}
 		//所有kv组成的字符串
-		assembleKVText(j, &tmp)
+		//assembleKVText(j, &tmp)
 		//检查字段
 		tmp["dataging"] = j.Dataging
 		tmp = checkFields(tmp, *j.Data)

+ 8 - 11
src/jy/extract/extractudp.go

@@ -370,15 +370,15 @@ func ExtractByUdpPre(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 		ext.BidTotal = 0
 	}
 	query := bson.M{"_id": bson.M{"$gt": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
-	count1 := ext.TaskInfo.FDB.Count("bidding_nomal", query)
-	count2 := ext.TaskInfo.FDB.Count("bidding_file", query)
+	count1 := ext.TaskInfo.FDB.Count("zktest_bidding_nomal", query)
+	count2 := ext.TaskInfo.FDB.Count("zktest_bidding_file", query)
 	log.Debug("待抽取数量:", count1+count2)
-	list1, _ := ext.TaskInfo.FDB.Find("bidding_nomal", query, nil, Fields, false, -1, -1)
-	list2, _ := ext.TaskInfo.FDB.Find("bidding_file", query, nil, Fields, false, -1, -1)
+	list1, _ := ext.TaskInfo.FDB.Find("zktest_bidding_nomal", query, nil, Fields, false, -1, -1)
+	list2, _ := ext.TaskInfo.FDB.Find("zktest_bidding_file", query, nil, Fields, false, -1, -1)
 	new_list := append(*list1, *list2...)
 	now_time := time.Now().Unix()
 	total := 0
-	wg_mgo := &sync.WaitGroup{}
+	wg_mgo := sync.WaitGroup{}
 	for _, v := range new_list {
 		if total%1000 == 0 {
 			log.Debug("cur index :", total, v["_id"])
@@ -389,16 +389,13 @@ func ExtractByUdpPre(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 		}
 		ext.TaskInfo.ProcessPool <- true
 		wg_mgo.Add(1)
-		go func(v map[string]interface{}) {
-			defer func() {
-				<-ext.TaskInfo.ProcessPool
-				wg_mgo.Done()
-			}()
+		go func(wg_mgo *sync.WaitGroup, v map[string]interface{}) {
+			defer wg_mgo.Done()
 			var j, jf *ju.Job
 			var isSite bool
 			j, _, isSite = ext.PreInfo(v)
 			ext.ExtractProcess(j, jf, isSite)
-		}(v)
+		}(&wg_mgo, v)
 	}
 	wg_mgo.Wait()
 	log.Debug("抽取完成:", total, ",耗时:", time.Now().Unix()-now_time)