Prechádzať zdrojové kódy

删除部分注释，准备更新线上，备注

zhengkun pred 1 rokom
rodič
commit
6c844f1ffb
2 zmenené súbory, 13 pridaní a 46 odobraní
  1. 12 45
      src/jy/extract/extract.go
  2. 1 1
      src/jy/extract/extraxtmethod.go

+ 12 - 45
src/jy/extract/extract.go

@@ -79,9 +79,6 @@ func RunExtractTestTask(ext *ExtractTask, startId, num string) bool {
 		query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(startId)}}
 		list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, n)
 		for _, v := range *list {
-			//if qu.ObjToString(v["sensitive"]) != ""||ggtest.MatchString(qu.ObjToString(v[""])) { //去除含敏感词数据
-			//	continue
-			//}
 			if spidercode[qu.ObjToString(v["spidercode"])] { //临时开标记录
 				continue
 			}
@@ -191,9 +188,6 @@ func RunExtractTask(taskId string) {
 		list, _ := ext.TaskInfo.FDB.Find(ext.TaskInfo.FromColl, query, nil, Fields, false, 0, limit)
 		fmt.Printf("page=%d,query=%v", i+1, query, len(*list))
 		for _, v := range *list {
-			//if qu.ObjToString(v["sensitive"]) != "" { //去除含敏感词数据
-			//	continue
-			//}
 			//根据标题判断是否抽取
 			b := IsExtract("title", qu.ObjToString(v["title"]), "")
 			if !b {
@@ -244,9 +238,8 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	d2 := CleanDetailText(qu.ObjToString(doc["contenthtml"]), summary)
 	if len(d1) >= len(d2) || d2 == "" {
 		detail = d1
-	} else {
+	} else { //选用contenthtml有一种特殊情况与detail不一致,综合考虑选取逻辑
 		detail = d2
-		//选用contenthtml有一种特殊情况与detail不一致,综合考虑选取逻辑
 		if SelectDetailSourceText(d1, d2) {
 			detail = d1
 		}
@@ -278,7 +271,6 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		toptype, subtype = "招标", "招标" //暂时按照"招标"
 	}
 	toMap := qu.ObjToMap(doc["jsondata"])
-	//log.Debug("toMap", toMap)
 	if (*toMap) != nil {
 		if (*toMap)["extweight"] == nil {
 			(*toMap)["extweight"] = ju.Config["jsondata_extweight"]
@@ -390,7 +382,6 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
 					}
 				} else {
 					if tmpk == "winner" && j.Category == "招标" && j.CategorySecond != "单一" {
-						//log.Debug("不采用~招标类附件中标信息")
 						continue
 					}
 					j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
@@ -406,7 +397,6 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
 					}
 					if !isUsed {
 						if j.Category == "招标" && j.CategorySecond != "单一" {
-							//log.Debug("不采用~招标类附件中标信息~")
 							continue
 						}
 						j.Result[tmpk] = append(j.Result[tmpk], jf.Result[tmpk]...)
@@ -416,7 +406,7 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
 		}
 		if len(j.Winnerorder) == 0 && jf.Winnerorder != nil && len(jf.Winnerorder) > 0 {
 			if j.Category == "招标" && j.CategorySecond != "单一" {
-				//log.Debug("不采用~招标类附件中标信息~~")
+
 			} else {
 				j.Winnerorder = append(j.Winnerorder, jf.Winnerorder...)
 			}
@@ -451,9 +441,6 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
 				log.Debug("pretreated.AnalyStart.ExtractProcess", err, j.SourceMid)
 			})
 			e.ExtractDetail(tmpj, false, "")
-			//if jf != nil && jf.IsFile {
-			//	e.ExtractFile(jf, false, "")
-			//}
 			//合并数据
 			j.Block = append(j.Block, tmpj.Block...)
 			j.Winnerorder = append(j.Winnerorder, tmpj.Winnerorder...)
@@ -478,10 +465,6 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
 func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 	qu.Try(func() {
 		doc := *j.Data
-		//全局前置规则,结果覆盖doc属性
-		//for _, v := range e.RulePres {
-		//	doc = ExtRegPre(doc, j, v, e.TaskInfo)
-		//}
 		tmprules := map[string][]*RuleCore{}
 		lockrule.Lock()
 		//加载分类抽取配置
@@ -557,8 +540,6 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 				for _, v := range vc.KVRuleCores {
 					ExtRuleKV(j, v, e.TaskInfo)
 				}
-				// log.Debug("抽取-后置规则", tmp)
-
 				//项目名称未能抽取到,标题来凑
 				if vc.Field == "projectname" {
 					if vc.ExtFrom == "title" {
@@ -632,8 +613,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 				before, _ := v.Value.(string)
 				v.Value = data[0]
 				BeforeAddClearFnLog(strings.Join(cfn, ","), "函数清理"+strings.Join(cfn, ","), j.SourceMid, before, v.MatchType, v, e)
-				//添加行数清理的日志
-				//清理特殊符号
+				//添加行数清理的日志 , 清理特殊符号
 				lockclear.Lock()
 				if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
 					text := qu.ObjToString(v.Value)
@@ -651,8 +631,8 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 			}
 		}
 		PackageDetail(j, e, isSite, codeSite) //处理分包信息-去重
-		//		bs, _ := json.Marshal(j.Result)
-		//		 log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
+		//bs, _ := json.Marshal(j.Result)
+		//log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
 	}, func(err interface{}) {
 		log.Debug("ExtractProcess err", err, j.SourceMid)
 	})
@@ -660,12 +640,6 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
 	qu.Try(func() {
 		doc := *j.Data
-		//全局前置规则,结果覆盖doc属性
-		//		for _, v := range e.RulePres {
-		//			if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
-		//				doc = ExtRegPre(doc, j, v, e.TaskInfo)
-		//			}
-		//		}
 		//抽取规则
 		tmprules := map[string][]*RuleCore{}
 		lockrule.Lock()
@@ -687,30 +661,23 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
 					continue
 				}
 				//抽取-前置规则
-				//				for _, v := range vc.RulePres {
-				//					if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
-				//						tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
-				//					}
-				//				}
-				// log.Debug("抽取-前置规则", tmp)
-
+				//for _, v := range vc.RulePres {
+				//	if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
+				//		tmp = ExtRegPre(tmp, j, v, e.TaskInfo)
+				//	}
+				//}
 				//抽取-规则
 				if value, ok := e.FileFields.Load(vc.Field); ok && qu.IntAllDef(value, 1) > 0 {
 					ExtRuleCore(tmp, e, vc, j, isSite)
 				}
-
-				// log.Debug("抽取-规则", tmp)
-
 				//抽取-后置规则
 				for _, v := range vc.RuleBacks {
 					if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
 						ExtRegBack(j, v, e.TaskInfo, vc)
 					}
 				}
-				// log.Debug("抽取-后置规则", tmp)
 			}
 		}
-
 		//全局后置规则
 		for _, v := range e.RuleBacks {
 			if value, ok := e.FileFields.Load(v.Field); ok && qu.IntAllDef(value, 1) > 0 {
@@ -749,8 +716,8 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
 		}
 
 		PackageDetail(j, e, isSite, codeSite) //处理分包信息
-		//		bs, _ := json.Marshal(j.Result)
-		//		 log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
+		//bs, _ := json.Marshal(j.Result)
+		//log.Debug("抽取结果", j.Title, j.SourceMid, string(bs))
 	}, func(err interface{}) {
 		log.Debug("ExtractProcess err", err)
 	})

+ 1 - 1
src/jy/extract/extraxtmethod.go

@@ -30,7 +30,7 @@ var (
 	ClearTaskList                                map[string]*ClearTask                         //清理任务列表
 	saveLimit                                                                           = 100  //抽取日志批量保存
 	PageSize                                                                            = 5000 //查询分页
-	Fields                                                                              = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1,"month_tag":1}`
+	Fields                                                                              = `{"jyfb_data":1,"approvecode":1,"approvenumber":1,"projecttype":1,"approvestatus":1,"total_investment":1,"funds":1,"owner":1,"projectaddr":1,"projectperiod":1,"project_scale":1,"project_person":1,"project_phone":1,"project_startdate":1,"project_completedate":1,"construction_area":1,"floor_area":1,"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"review_experts":1,"purchasing":1}`
 	Fields2                                                                             = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 	NiJianField                                                                         = []string{
 		"string#approvecode",