maxiaoshan 5 years ago
parent
commit
1c22742e42
4 changed files with 33 additions and 30 deletions
  1. 1 1
      src/config.json
  2. 30 27
      src/jy/extract/extract.go
  3. 1 1
      src/main_test.go
  4. 1 1
      src/res/fieldscore.json

+ 1 - 1
src/config.json

@@ -11,7 +11,7 @@
     "elasticPoolSize": 10,
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
-    "saveresult": true,
+    "saveresult": false,
     "fieldsfind": false,
     "qualityaudit": false,
     "saveblock": false,

+ 30 - 27
src/jy/extract/extract.go

@@ -26,12 +26,12 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut     = ju.NewCut()                          //获取正文并清理
-	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask          //任务列表
-	ClearTaskList map[string]*ClearTask            //清理任务列表
-	saveLimit     = 100                            //抽取日志批量保存
-	PageSize      = 5000                           //查询分页
+	cut           = ju.NewCut()                          //获取正文并清理
+	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask                //任务列表
+	ClearTaskList map[string]*ClearTask                  //清理任务列表
+	saveLimit     = 100                                  //抽取日志批量保存
+	PageSize      = 5000                                 //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -584,11 +584,11 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 				}
 				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
 				if key == "budget" || key == "bidamount" {
-						if istrue, ok := data[len(data)-1].(bool); istrue && ok {
-							j.Result[key][i].IsTrue = true
-						} else {
-							continue
-						}
+					if istrue, ok := data[len(data)-1].(bool); istrue && ok {
+						j.Result[key][i].IsTrue = true
+					} else {
+						continue
+					}
 				}
 				before, _ := v.Value.(string)
 				v.Value = data[0]
@@ -654,6 +654,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
 				if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
 					ExtRuleCore(tmp, e, vc, j, isSite)
 				}
+
 				// log.Debug("抽取-规则", tmp)
 
 				//抽取-后置规则
@@ -757,9 +758,6 @@ func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju
 					for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
 						if k == 0 {
 							tp = "colon"
-							//							for _, vv := range v.Kvs {
-							//								qu.Debug(vv.Key, vv.Value)
-							//							}
 						} else if k == 1 {
 							tp = "space"
 						} else if k == 2 {
@@ -1120,6 +1118,14 @@ func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kv
 				//				}
 			} else if k == 1 {
 				tp = "space"
+				//				for _, vv := range v.Kvs {
+				//					qu.Debug("space-kvs:", vv.Key, vv.Value)
+				//				}
+				//				for kkk, vv := range v.KvTags {
+				//					for _, vvv := range vv {
+				//						qu.Debug("space-tags", kkk, vvv.Key, vvv.Value)
+				//					}
+				//				}
 			} else if k == 2 {
 				tp = "table"
 				//				for _, vv := range v.Kvs {
@@ -1582,9 +1588,9 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				} else if v.Field == "projectname" {
 					tmp[v.Field] = v.Value
 					break
-				} else if v.Field == "bidamount"||v.Field =="budget"{
-					if v.IsTrue{
-						tmp[v.Field] =v.Value
+				} else if v.Field == "bidamount" || v.Field == "budget" {
+					if v.IsTrue {
+						tmp[v.Field] = v.Value
 						break
 					}
 				}
@@ -1699,6 +1705,10 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				}
 			}
 		}
+		//		fmt.Println("=============抽取结果================")
+		//		for k, v := range tmp {
+		//			qu.Debug(k, "---", v)
+		//		}
 		//tmp["extract_content"] = j.Content
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
@@ -1743,13 +1753,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				e.RWMutex.Unlock()
 			}
 		} else { //测试结果
-			//			fmt.Println("=============抽取结果================")
-			//			for k, v := range tmp {
-			//				qu.Debug(k, "---", v)
-			//			}
-			//			for field, _ := range e.Fields {
-			//				qu.Debug(field, "---", tmp[field])
-			//			}
 			delete(tmp, "_id")
 			if len(j.BlockPackage) > 0 { //分包详情
 				bs, _ := json.Marshal(j.BlockPackage)
@@ -1888,8 +1891,8 @@ func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
 					standardized = true
 				}
 			}
-			if field == "budget"||field == "bidamount"{
-				if !v.IsTrue{
+			if field == "budget" || field == "bidamount" {
+				if !v.IsTrue {
 					continue
 				}
 			}
@@ -1945,7 +1948,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 { //reids未找到,执行规则匹配
+	if i == 0 {                            //redis未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 1 - 1
src/main_test.go

@@ -28,7 +28,7 @@ func Test_han(t *testing.T) {
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27092", "extract_dev32")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")5c528686698414055c47b115
-	extract.StartExtractTestTask("5e103206234ddc34b406c5d1", "5df6e6a6e9d1f601e494b749", "1", "mxs_v1", "mxs_v1")
+	extract.StartExtractTestTask("5e103206234ddc34b406c5d1", "5df59ee3e9d1f601e46fc3f9", "1", "mxs_v1", "mxs_v1")
 	//extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5e17e00e85a9271abf0860a6", "1", "mxs_v1", "mxs_v1")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)

+ 1 - 1
src/res/fieldscore.json

@@ -395,7 +395,7 @@
                     -10
                 ]
             },
-			 {
+			{
                 "describe": "[gt,∞,score]",
                 "range": [
                     90,