Sfoglia il codice sorgente

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

fengweiqiang 5 anni fa
parent
commit
c7c7bbf67b
4 file modificati, con 25 aggiunte e 22 eliminazioni
  1. 1 1
      src/config.json
  2. 20 17
      src/jy/extract/extract.go
  3. 1 1
      src/main_test.go
  4. 3 3
      src/res/fieldscore.json

+ 1 - 1
src/config.json

@@ -11,7 +11,7 @@
     "elasticPoolSize": 10,
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
-    "saveresult": true,
+    "saveresult": false,
     "fieldsfind": false,
     "qualityaudit": false,
     "saveblock": false,

+ 20 - 17
src/jy/extract/extract.go

@@ -27,12 +27,12 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut     = ju.NewCut()                          //获取正文并清理
-	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask          //任务列表
-	ClearTaskList map[string]*ClearTask            //清理任务列表
-	saveLimit     = 100                            //抽取日志批量保存
-	PageSize      = 5000                           //查询分页
+	cut           = ju.NewCut()                          //获取正文并清理
+	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask                //任务列表
+	ClearTaskList map[string]*ClearTask                  //清理任务列表
+	saveLimit     = 100                                  //抽取日志批量保存
+	PageSize      = 5000                                 //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -662,6 +662,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
 				if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
 					ExtRuleCore(tmp, e, vc, j, isSite)
 				}
+
 				// log.Debug("抽取-规则", tmp)
 
 				//抽取-后置规则
@@ -765,9 +766,6 @@ func ExtRuleCore(doc map[string]interface{}, e *ExtractTask, vc *RuleCore, j *ju
 					for k, v := range []*ju.JobKv{bp.ColonKV, bp.SpaceKV, bp.TableKV} {
 						if k == 0 {
 							tp = "colon"
-							//							for _, vv := range v.Kvs {
-							//								qu.Debug(vv.Key, vv.Value)
-							//							}
 						} else if k == 1 {
 							tp = "space"
 						} else if k == 2 {
@@ -1135,6 +1133,14 @@ func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kv
 				//				}
 			} else if k == 1 {
 				tp = "space"
+				//				for _, vv := range v.Kvs {
+				//					qu.Debug("space-kvs:", vv.Key, vv.Value)
+				//				}
+				//				for kkk, vv := range v.KvTags {
+				//					for _, vvv := range vv {
+				//						qu.Debug("space-tags", kkk, vvv.Key, vvv.Value)
+				//					}
+				//				}
 			} else if k == 2 {
 				tp = "table"
 				//				for _, vv := range v.Kvs {
@@ -1769,6 +1775,10 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				}
 			}
 		}
+		//		fmt.Println("=============抽取结果================")
+		//		for k, v := range tmp {
+		//			qu.Debug(k, "---", v)
+		//		}
 		//tmp["extract_content"] = j.Content
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
@@ -1813,13 +1823,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				e.RWMutex.Unlock()
 			}
 		} else { //测试结果
-			//			fmt.Println("=============抽取结果================")
-			//			for k, v := range tmp {
-			//				qu.Debug(k, "---", v)
-			//			}
-			//			for field, _ := range e.Fields {
-			//				qu.Debug(field, "---", tmp[field])
-			//			}
 			delete(tmp, "_id")
 			if len(j.BlockPackage) > 0 { //分包详情
 				bs, _ := json.Marshal(j.BlockPackage)
@@ -2015,7 +2018,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 { //reids未找到,执行规则匹配
+	if i == 0 {                            //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 1 - 1
src/main_test.go

@@ -28,7 +28,7 @@ func Test_han(t *testing.T) {
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27092", "extract_dev32")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")5c528686698414055c47b115
-	extract.StartExtractTestTask("5e103206234ddc34b406c5d1", "5df6e6a6e9d1f601e494b749", "1", "mxs_v1", "mxs_v1")
+	extract.StartExtractTestTask("5e103206234ddc34b406c5d1", "5df59ee3e9d1f601e46fc3f9", "1", "mxs_v1", "mxs_v1")
 	//extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5e17e00e85a9271abf0860a6", "1", "mxs_v1", "mxs_v1")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)

+ 3 - 3
src/res/fieldscore.json

@@ -77,7 +77,7 @@
             },
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(的|招标|公示|公告|谈判|公开|通知|采购文件|交易中心)$",
+                "regstr": ".{2,100}(的|招标|公示|公告|谈判|公开|通知|采购文件|交易中心|\\d#)$",
                 "score": -5
             },
             {
@@ -402,7 +402,7 @@
                     -10
                 ]
             },
-			 {
+			{
                 "describe": "[gt,∞,score]",
                 "range": [
                     90,
@@ -537,7 +537,7 @@
         "negativewords": [
             {
                 "describe": "以什么开始的减分",
-                "regstr": "^[【|-]",
+                "regstr": "^[-]",
                 "score": -1
             },
             {