maxiaoshan 5 years ago
parent
commit
54bdf0f5fd

+ 1 - 1
dataprocess/src/config.json

@@ -1,6 +1,6 @@
 {
     "port": "7000",
-    "mgodb": "192.168.3.207:27082",
+    "mgodb": "192.168.3.207:27092",
     "dbsize": 10,
     "dbname": "dataprocess"
 }

+ 6 - 6
dataprocess/src/front/front.go

@@ -5,7 +5,7 @@ import (
 	"strconv"
 	"time"
 
-	. "../util"
+	. "util"
 
 	"github.com/go-xweb/xweb"
 	"gopkg.in/mgo.v2/bson"
@@ -43,11 +43,11 @@ type Front struct {
 	roleDel        xweb.Mapper `xweb:"/front/role/edit/del"`    //权限编辑删除
 	roleSecondEdit xweb.Mapper `xweb:"/front/role/second/edit"` //二级权限编辑
 
-	logicManager	xweb.Mapper `xweb:"/front/logic"`			//逻辑管理
-	logicPre		xweb.Mapper	`xweb:"/front/logic/pre"`		//预处理逻辑
-	logicMatch		xweb.Mapper	`xweb:"/front/logic/match"`		//匹配逻辑
-	logicClean		xweb.Mapper	`xweb:"/front/logic/clean"`		//清洗
-	logicPreSav		xweb.Mapper	`xweb:"/front/logic/pre/save"`	//预处理逻辑保存
+	logicManager xweb.Mapper `xweb:"/front/logic"`          //逻辑管理
+	logicPre     xweb.Mapper `xweb:"/front/logic/pre"`      //预处理逻辑
+	logicMatch   xweb.Mapper `xweb:"/front/logic/match"`    //匹配逻辑
+	logicClean   xweb.Mapper `xweb:"/front/logic/clean"`    //清洗
+	logicPreSav  xweb.Mapper `xweb:"/front/logic/pre/save"` //预处理逻辑保存
 
 }
 

+ 1 - 2
dataprocess/src/front/logic.go

@@ -1,9 +1,9 @@
 package front
 
 import (
-	. "../util"
 	qu "qfw/util"
 	"time"
+	. "util"
 )
 
 func (f *Front) LogicManager() {
@@ -71,4 +71,3 @@ func (f *Front) LogicPreSav() {
 		"rep": b,
 	})
 }
-

+ 2 - 1
dataprocess/src/front/menu.go

@@ -4,7 +4,8 @@ import (
 	qu "qfw/util"
 	"time"
 
-	. "../util"
+	. "util"
+
 	"gopkg.in/mgo.v2/bson"
 )
 

+ 1 - 1
dataprocess/src/front/role.go

@@ -4,7 +4,7 @@ import (
 	"encoding/json"
 	qu "qfw/util"
 
-	. "../util"
+	. "util"
 
 	"gopkg.in/mgo.v2/bson"
 )

+ 1 - 1
dataprocess/src/logic/logic.go

@@ -3,7 +3,7 @@ package logic
 import (
 	qu "qfw/util"
 
-	. "../util"
+	. "util"
 
 	"github.com/go-xweb/xweb"
 )

+ 5 - 6
dataprocess/src/main.go

@@ -1,14 +1,13 @@
 package main
 
 import (
+	"front"
+	"logic"
+	"microservice"
 	qu "qfw/util"
+	"task"
 	"time"
-
-	"./front"
-	"./logic"
-	"./microservice"
-	"./task"
-	. "./util"
+	. "util"
 
 	"github.com/go-xweb/xweb"
 )

+ 1 - 1
dataprocess/src/microservice/microservice.go

@@ -3,7 +3,7 @@ package microservice
 import (
 	qu "qfw/util"
 
-	. "../util"
+	. "util"
 
 	"github.com/go-xweb/xweb"
 )

+ 1 - 1
dataprocess/src/task/task.go

@@ -5,7 +5,7 @@ import (
 	"strings"
 	"time"
 
-	. "../util"
+	. "util"
 
 	"github.com/go-xweb/xweb"
 	"gopkg.in/mgo.v2/bson"

+ 1 - 1
dataprocess/src/util/config.go

@@ -1,8 +1,8 @@
 package util
 
 import (
+	"qfw/mongodb"
 	qu "qfw/util"
-	"qfw/util/mongodb"
 )
 
 var (

+ 24 - 19
src/jy/extract/extract.go

@@ -27,12 +27,12 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut     = ju.NewCut()                          //获取正文并清理
-	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask //任务列表
-	ClearTaskList map[string]*ClearTask   //清理任务列表
-	saveLimit     = 100                   //抽取日志批量保存
-	PageSize      = 5000                  //查询分页
+	cut           = ju.NewCut()                          //获取正文并清理
+	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask                //任务列表
+	ClearTaskList map[string]*ClearTask                  //清理任务列表
+	saveLimit     = 100                                  //抽取日志批量保存
+	PageSize      = 5000                                 //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -529,7 +529,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 				// log.Debug("抽取-规则", tmp)
 				//抽取-后置规则
 				for _, v := range vc.RuleBacks {
-					ExtRegBack(j, v, e.TaskInfo)
+					ExtRegBack(j, v, e.TaskInfo, vc)
 				}
 				//kv规则
 				for _, v := range vc.KVRuleCores {
@@ -557,7 +557,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 					}
 					for i := 0; i < 3; i++ {
 						for _, v := range vc.RuleBacks {
-							ExtRegBack(j, v, e.TaskInfo)
+							ExtRegBack(j, v, e.TaskInfo, vc)
 						}
 					}
 				}
@@ -566,11 +566,11 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 		//全局后置规则
 		if isSite {
 			for _, v := range e.SiteRuleBacks {
-				ExtRegBack(j, v, e.TaskInfo)
+				ExtRegBack(j, v, e.TaskInfo, nil)
 			}
 		} else {
 			for _, v := range e.RuleBacks {
-				ExtRegBack(j, v, e.TaskInfo)
+				ExtRegBack(j, v, e.TaskInfo, nil)
 			}
 		}
 		//函数清理
@@ -671,7 +671,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
 				//抽取-后置规则
 				for _, v := range vc.RuleBacks {
 					if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
-						ExtRegBack(j, v, e.TaskInfo)
+						ExtRegBack(j, v, e.TaskInfo, vc)
 					}
 				}
 				// log.Debug("抽取-后置规则", tmp)
@@ -681,7 +681,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
 		//全局后置规则
 		for _, v := range e.RuleBacks {
 			if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
-				ExtRegBack(j, v, e.TaskInfo)
+				ExtRegBack(j, v, e.TaskInfo, nil)
 			}
 		}
 		//函数清理
@@ -1396,7 +1396,7 @@ func extRegCoreToResult(extfrom, text string, tag *map[string]string, j *ju.Job,
 }
 
 //后置过滤
-func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
+func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo, vc *RuleCore) {
 	defer qu.Catch()
 	if in.IsLua {
 		result := GetResultMapForLua(j)
@@ -1426,10 +1426,17 @@ func ExtRegBack(j *ju.Job, in *RegLuaInfo, t *TaskInfo) {
 	} else {
 		extinfo := map[string]interface{}{}
 		if in.Field != "" {
+			clearByTitle := false
+			if vc != nil && vc.ExtFrom == "title" && in.Field == "buyer" { //buyer从title抽取到的单独走title的清理
+				clearByTitle = true
+			}
 			if j.Result[in.Field] != nil {
 				tmp := j.Result[in.Field]
 				exts := []interface{}{}
 				for k, v := range tmp {
+					if clearByTitle && v.ExtFrom != "title" {
+						continue
+					}
 					//table抽取到的数据不清理
 					//					if v.Type == "table" && v.Field != "projectname" {
 					//						continue
@@ -2090,9 +2097,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 	if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
 		marshalbt, _ := json.Marshal(j.Jsondata)
 		tmpjddata := make(map[string]interface{})
-		json.Unmarshal(marshalbt,&tmpjddata)
-		//jsondata清理
-		clearJd(j.Jsondata)
+		json.Unmarshal(marshalbt, &tmpjddata)
 		for _, jdkey := range ju.JsonData {
 			if (*j.Jsondata)[jdkey] != nil && (*j.Jsondata)[jdkey] != "" && len(j.Result[jdkey]) >= 5 {
 				for tmpk, tmpv := range j.Result[jdkey][:5] {
@@ -2111,8 +2116,8 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 							delete((*j.Jsondata), jdkey)
 							break
 						}
-					}else {
-						if (*j.Jsondata)[jdkey] == tmpv.Value{
+					} else {
+						if (*j.Jsondata)[jdkey] == tmpv.Value {
 							extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: tmpv.Value, Score: 100}
 							j.Result[jdkey] = append(j.Result[jdkey], extField)
 							ju.Sort(j.Result[jdkey])
@@ -2123,7 +2128,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 				}
 			}
 		}
-		if len(*j.Jsondata)>0{
+		if len(*j.Jsondata) > 0 {
 			j.Result = JsonDataMergeProcessing(j, e)
 		}
 		j.Jsondata = &tmpjddata

+ 1 - 1
src/res/formattext.json

@@ -77,7 +77,7 @@
             "desc": ""
 		},
 		{
-			"reg": "([^((,,。、.;;::\r\n公司]{0,8})(联系人|地址)[::]([^\\s\u3000\u2003\u00a0,,]+?)(联系)?(电话(/传真)?|手机|传真|邮编)[::](.+)",
+			"reg": "([^((,,。、.;;::\r\n公司]{0,8})(联系人|地址)[::]([^\\s\u3000\u2003\u00a0,,]+?)(联系)?(电话(/传真)?|手机|传真|邮编)[::](.+)",
             "separator": "\n${1}${2}:${3}\n${1}${5}:${7}",
             "desc": ""
 		},