فهرست منبع

正文小于50个字,有附件把附件内容加到正文

fengweiqiang 6 سال پیش
والد
کامیت
a63dd91ad7
5 فایل تغییر یافته به همراه 41 افزوده شده و 10 حذف شده
  1. 1 1
      src/config.json
  2. 11 0
      src/jy/extract/extract.go
  3. 22 5
      src/jy/extract/extractudp.go
  4. 6 3
      src/jy/extract/score.go
  5. 1 1
      src/main.go

+ 1 - 1
src/config.json

@@ -59,5 +59,5 @@
 	"isSaveTag":false,
     "tomail": "zhangjinkun@topnet.net.cn,chenmingzhu@topnet.net.cn,zhaolongyue@topnet.net.cn",
     "api": "http://10.171.112.160:19281/_send/_mail",
-    "DeleteInstanceTimeMinute":60
+    "deleteInstanceTimeHour":1
 } 

+ 11 - 0
src/jy/extract/extract.go

@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"github.com/PuerkitoBio/goquery"
 	"jy/clear"
 	db "jy/mongodbutil"
 	"jy/pretreated"
@@ -244,6 +245,16 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 	if isextFile {
 		file2text(&doc) //附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
 	}
+	//正文小于50个字,有附件把附件内容加到正文
+	tmpDeatil := detail
+	tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
+	if err == nil{
+		if utf8.RuneCountInString(strings.Trim(tmpdocument.Text()," ")) < 50{
+			if isextFile {
+				detail += qu.ObjToString(doc["detailfile"])
+			}
+		}
+	}
 	toptype := qu.ObjToString(doc["toptype"])
 	subtype := qu.ObjToString(doc["subtype"])
 	if qu.ObjToString(doc["type"]) == "bid" {

+ 22 - 5
src/jy/extract/extractudp.go

@@ -90,9 +90,15 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 	case mu.OP_NOOP: //下个节点回应
 		log.Debug(string(data))
 	case mu.OP_SEND_EMAIL:
-		log.Debug(data,ra.IP)
-		sendMail(string(data))
-		cluster.ModifyInstanceAutoReleaseTime(qu.ObjToString("InstanceId"),1)
+		log.Debug(data, ra.IP)
+		rep := make(map[string]interface{})
+		err := json.Unmarshal(data, &rep)
+		if err != nil {
+			log.Debug(err)
+		} else {
+			sendMail(string(data))
+			cluster.ModifyInstanceAutoReleaseTime(qu.ObjToString(rep["instanceId"]), qu.IntAll(ju.Config["deleteInstanceTimeHour"]))
+		}
 	}
 }
 func sendMail(content string) {
@@ -103,6 +109,7 @@ func sendMail(content string) {
 		log.Debug("邮件发送:", string(read), err)
 	}
 }
+
 var ext *ExtractTask
 
 //根据id区间抽取
@@ -226,12 +233,22 @@ func ExtractByUdp(sid, eid string, ra *net.UDPAddr, instanceId ...string) {
 					"pagecurrent": i + 1,
 				}}, true, false)
 		}
-		go Udpclient.WriteUdp([]byte(`分布式抽取完成,一小时后释放, sid:`+sid+`, eid:`+eid+`, count:`+fmt.Sprint(count)+`,index:`+fmt.Sprint(index)+`,bidtotal:`+fmt.Sprint(ext.BidTotal)+`,释放esc实例: `+instanceId[0]+`,`+instanceId[1]), mu.OP_SEND_EMAIL, ra)
+		des := make(map[string]interface{})
+		des["desc"]=`分布式抽取完成,一小时后释放, sid:`+sid+`, eid:`+eid+`, count:`+fmt.Sprint(count)+`,index:`+fmt.Sprint(index)+`,bidtotal:`+fmt.Sprint(ext.BidTotal)+`,释放esc实例: `+instanceId[0]+`,`+instanceId[1]
+		des["sid"] = sid
+		des["eid"] = eid
+		des["count"] = count
+		des["index"] = index
+		des["bidtotal"] = ext.BidTotal
+		des["instanceId"] = instanceId[0]
+		des["instanceIP"] = instanceId[1]
+		udpbytes, _ := json.Marshal(des)
+		go Udpclient.WriteUdp(udpbytes, mu.OP_SEND_EMAIL, ra)
 		log.Debug("抽取完成", "count:", count, "index:", index, "bidtotal:", ext.BidTotal)
 	} else { //普通抽取
 		query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(sid), "$lte": bson.ObjectIdHex(eid)}}
 		count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
-		log.Debug("查询条件为:",query,"查询条数:",count)
+		log.Debug("查询条件为:", query, "查询条数:", count)
 		pageNum := (count + PageSize - 1) / PageSize
 		limit := PageSize
 		if count < PageSize {

+ 6 - 3
src/jy/extract/score.go

@@ -110,9 +110,6 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 		}
 		locktag.Lock()
 		taglength := len(ftag[field])
-		if taglength == 0{
-			continue
-		}
 		locktag.Unlock()
 		for tmpsindex, tmpsvalue := range tmps {
 			//没有抽取到值,不打分
@@ -169,6 +166,9 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 			//kv权重打分
 			if fieldscore != nil { //指定抽取属性打分配置
 				if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
+					if taglength == 0{
+						continue
+					}
 					weightscore := ju.FloatFormat(float64(qu.Float64All(fieldscore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4)
 					tmps[tmpsindex].Score += weightscore
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})
@@ -177,6 +177,9 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 				}
 			} else {
 				if tmpsvalue.Type == "colon" || tmpsvalue.Type == "space" || tmpsvalue.Type == "table" {
+					if taglength == 0{
+						continue
+					}
 					weightscore := ju.FloatFormat(float64(qu.Float64All(CommonScore["kvweight"]))+float64(tmps[tmpsindex].Weight)/float64(taglength), 4)
 					tmps[tmpsindex].Score += weightscore
 					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: "kv权重打分", Code: "kv-weight", RuleText: describe, ScoreFrom: "fieldscore.json", Value: tmpsvalue.Value, Score: weightscore})

+ 1 - 1
src/main.go

@@ -11,7 +11,7 @@ import (
 	"jy/util"
 	qu "qfw/util"
 	//"qfw/util/elastic"
-	redis "qfw/util/redis"
+	"qfw/util/redis"
 
 	log "github.com/donnie4w/go-logger/logger"
 )