Jelajahi Sumber

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

apple 5 tahun lalu
induk
melakukan
c3280a0b08

+ 11 - 7
src/jy/extract/extract.go

@@ -26,12 +26,12 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut           = ju.NewCut()                          //获取正文并清理
-	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask                //任务列表
-	ClearTaskList map[string]*ClearTask                  //清理任务列表
-	saveLimit     = 100                                  //抽取日志批量保存
-	PageSize      = 5000                                 //查询分页
+	cut     = ju.NewCut()                          //获取正文并清理
+	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask          //任务列表
+	ClearTaskList map[string]*ClearTask            //清理任务列表
+	saveLimit     = 100                            //抽取日志批量保存
+	PageSize      = 5000                           //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -301,6 +301,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		CategorySecond: subtype,
 		Content:        qu.ObjToString(doc["detail"]),
 		SpiderCode:     qu.ObjToString(doc["spidercode"]),
+		Site:           qu.ObjToString(doc["site"]),
 		//Domain:     qu.ObjToString(doc["domain"]),
 		//Href:       qu.ObjToString(doc["href"]),
 		Title:     qu.ObjToString(doc["title"]),
@@ -318,6 +319,7 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 			Category:   toptype,
 			Content:    qu.ObjToString(doc["detailfile"]),
 			SpiderCode: qu.ObjToString(doc["spidercode"]),
+			Site:       qu.ObjToString(doc["site"]),
 			Title:      qu.ObjToString(doc["title"]),
 			Data:       &doc,
 			City:       qu.ObjToString(doc["city"]),
@@ -1523,6 +1525,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		auxinfo := auxInfo(j)
 		//从排序结果中取值
 		tmp := map[string]interface{}{} //抽取值
+		tmp["spidercode"] = j.SpiderCode
+		tmp["site"] = j.Site
 		tmp["jsondata"] = j.Jsondata
 		tmp["fieldall"] = auxinfo
 		for _, val := range result {
@@ -1871,7 +1875,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 {                            //reids未找到,执行规则匹配
+	if i == 0 { //redis未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 74 - 4
src/jy/extract/score_jsondata.go

@@ -39,10 +39,80 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				}
 				continue
 			}
-
-			extFields := make([]*util.ExtField, 0)
-			extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: strings.Trim(util2.ObjToString((*j.Jsondata)[v]), " "), Score: 0.1})
-			j.Result[v] = extFields
+			vv := strings.TrimSpace(util2.ObjToString((*j.Jsondata)[v]))
+			if vv == ""|| strings.Contains(vv,"详见公告"){
+				continue
+			}
+			lockscore.Lock()
+			scoreRule := SoreConfig[v]
+			lockscore.Unlock()
+			tmpExtField := &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: vv, Score: 0.1}
+			//1.长度打分
+			valueLen := utf8.RuneCountInString(fmt.Sprint(tmpExtField.Value))
+			if valueLen < 1 {
+				tmpExtField.Score = -5
+				continue
+			}
+			if valueLen > 100 {
+				tmpExtField.Score = -99
+			}
+			if lengths, ok := scoreRule["length"].([]interface{}); ok {
+				for _, tmp := range lengths {
+					if length, ok := tmp.(map[string]interface{}); ok {
+						if ranges, ok := length["range"].([]interface{}); ok {
+							gt := util2.IntAll(ranges[0])
+							lte := util2.IntAll(ranges[1])
+							if lte < 0 { //∞
+								lte = 999999
+							}
+							score := util2.Float64All(ranges[2])
+							if valueLen > gt && valueLen <= lte {
+								tmpExtField.Score += score
+								break
+							}
+						}
+					}
+				}
+			}
+			//2.负面词打分
+			if positions, ok := scoreRule["negativewords"].([]interface{}); ok {
+				for _, position := range positions {
+					if p, ok := position.(map[string]interface{}); ok {
+						util2.Try(func() {
+							if p["regexp"] != nil {
+								reg := p["regexp"].(*regexp.Regexp)
+								if reg.MatchString(util2.ObjToString(tmpExtField.Value)) {
+									tmpExtField.Score += util2.Float64All(p["score"])
+								}
+							}
+						}, func(err interface{}) {
+							log.Println(err)
+						})
+					}
+				}
+			}
+			//3.正面词打分
+			if positions, ok := scoreRule["positivewords"].([]interface{}); ok {
+				for _, position := range positions {
+					if p, ok := position.(map[string]interface{}); ok {
+						util2.Try(func() {
+							if p["regexp"] != nil {
+								reg := p["regexp"].(*regexp.Regexp)
+								if reg.MatchString(util2.ObjToString(tmpExtField.Value)) {
+									tmpExtField.Score += util2.Float64All(p["score"])
+								}
+							}
+						}, func(err interface{}) {
+							log.Println(err)
+						})
+					}
+				}
+			}
+			if tmpExtField.Score > 0{
+				extFields := make([]*util.ExtField, 0)
+				extFields = append(extFields,tmpExtField )
+				j.Result[v] = extFields
+			}
 			//AddExtLog("extract", j.SourceMid, nil, (*j.Jsondata)[v], &RegLuaInfo{  "JsonData_"+v, "", v, "", false, nil, nil}, e.TaskInfo) //抽取日志
 			//AddExtLog("clear", j.SourceMid, (*j.Jsondata)[v], (*j.Jsondata)[v], &RegLuaInfo{  "JsonData_"+v, "", v, "", false, nil, nil}, e.TaskInfo) //抽取日志
 			continue

+ 1 - 0
src/jy/util/article.go

@@ -12,6 +12,7 @@ type Job struct {
 	Content           string                            //正文
 	Title             string                            //标题
 	SpiderCode        string                            //爬虫代码
+	Site              string                            //站点
 	Domain            string                            //网站域名
 	Href              string                            //原文链接
 	City              string                            //城市

+ 20 - 2
src/res/fieldscore.json

@@ -75,7 +75,7 @@
             },
             {
                 "describe": "包含词",
-                "regstr": "(万元|本项目)",
+                "regstr": "(万元|本项目|详见公告)",
                 "score": -10
             },
             {
@@ -250,7 +250,13 @@
                 "score": 3
             }
         ],
-        "negativewords": [],
+        "negativewords": [
+            {
+                "describe": "不在展示",
+                "regstr": "(详见公告)",
+                "score": -10
+            }
+        ],
         "length": [
             {
                 "describe": "[gt,lte,score]",
@@ -297,6 +303,10 @@
                 "describe": "是数字",
                 "regstr": "^\\d*[×―—-\\-]*[\u3000\u2003\u00a0\\s]*\\d*$",
                 "score": -10
+            },  {
+                "describe": "包含负分",
+                "regstr": "(详见公告)",
+                "score": -10
             }
         ],
         "length": [
@@ -318,6 +328,10 @@
                 "describe": "出现中文汉字",
                 "regstr": "[\\u4e00-\\u9fa5]",
                 "score": -10
+            },  {
+                "describe": "包含负分",
+                "regstr": "(详见公告)",
+                "score": -10
             }
         ],
         "length": [
@@ -366,6 +380,10 @@
                 "describe": "是数字",
                 "regstr": "^\\d*[×―—-\\-]*[\u3000\u2003\u00a0\\s]*\\d*$",
                 "score": -10
+            },  {
+                "describe": "包含负分",
+                "regstr": "(详见公告)",
+                "score": -10
             }
         ],
         "length": [