wcj 6 سال پیش
والد
کامیت
c429e85d92
2 فایلهای تغییر یافته به همراه 19 افزوده شده و 24 حذف شده
  1. 16 15
      src/jy/extract/extract.go
  2. 3 9
      src/jy/pretreated/analytable.go

+ 16 - 15
src/jy/extract/extract.go

@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
-	"github.com/PuerkitoBio/goquery"
 	"jy/clear"
 	db "jy/mongodbutil"
 	"jy/pretreated"
@@ -18,6 +17,8 @@ import (
 	"time"
 	"unicode/utf8"
 
+	"github.com/PuerkitoBio/goquery"
+
 	log "github.com/donnie4w/go-logger/logger"
 	"gopkg.in/mgo.v2/bson"
 )
@@ -25,12 +26,12 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut     = ju.NewCut()                          //获取正文并清理
-	ExtLogs map[*TaskInfo][]map[string]interface{} //抽取日志
-	TaskList      map[string]*ExtractTask          //任务列表
-	ClearTaskList map[string]*ClearTask            //清理任务列表
-	saveLimit     = 100                            //抽取日志批量保存
-	PageSize      = 5000                           //查询分页
+	cut           = ju.NewCut()                          //获取正文并清理
+	ExtLogs       map[*TaskInfo][]map[string]interface{} //抽取日志
+	TaskList      map[string]*ExtractTask                //任务列表
+	ClearTaskList map[string]*ClearTask                  //清理任务列表
+	saveLimit     = 100                                  //抽取日志批量保存
+	PageSize      = 5000                                 //查询分页
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -248,8 +249,8 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job) {
 	//正文小于50个字,有附件把附件内容加到正文
 	tmpDeatil := detail
 	tmpdocument, err := goquery.NewDocumentFromReader(strings.NewReader(tmpDeatil))
-	if err == nil{
-		if utf8.RuneCountInString(strings.Trim(tmpdocument.Text()," ")) < 50{
+	if err == nil {
+		if utf8.RuneCountInString(strings.Trim(tmpdocument.Text(), " ")) < 50 {
 			if isextFile {
 				detail += qu.ObjToString(doc["detailfile"])
 			}
@@ -716,7 +717,7 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
 //lua脚本根据属性设置提取kv值
 func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) (map[string][]map[string]interface{}, bool) {
 	kvmap := map[string][]map[string]interface{}{}
-	if len(j.Winnerorder) > 0 {
+	if len(j.Winnerorder) > 1 {
 		if vc.Field == "bidamount" {
 			for _, v := range j.Winnerorder {
 				kvmap[vc.Field] = append(kvmap[vc.Field], map[string]interface{}{
@@ -1280,8 +1281,8 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			tmp["kvtext"] = kvtext.String()
 		}
 		if len(blocks) > 0 {
-			if blocksBytes, err := json.Marshal(blocks);err == nil{
-				if utf8.RuneCount(blocksBytes) < 100000{
+			if blocksBytes, err := json.Marshal(blocks); err == nil {
+				if utf8.RuneCount(blocksBytes) < 100000 {
 					tmp["blocks"] = string(blocksBytes)
 				}
 			}
@@ -1431,7 +1432,7 @@ func delFiled(k string) bool {
 	return k == "summary" || k == "detail" || k == "contenthtml" || k == "site" || k == "spidercode" || k == "projectinfo" || k == "jsondata"
 }
 
-func funcAnalysis(j *ju.Job,  e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
+func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[string][]*ju.ExtField, string) {
 	defer qu.Catch()
 	doc := j.Data
 	result := j.Result
@@ -1442,7 +1443,7 @@ func funcAnalysis(j *ju.Job,  e *ExtractTask) (*map[string]interface{}, map[stri
 	for _, val := range result {
 		ju.Sort(val)
 	}
-	j.Result = JsonDataMergeProcessing(j,e)
+	j.Result = JsonDataMergeProcessing(j, e)
 	return doc, result, _id
 }
 
@@ -1512,7 +1513,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 { //reids未找到,执行规则匹配
+	if i == 0 {                            //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 3 - 9
src/jy/pretreated/analytable.go

@@ -275,12 +275,6 @@ func (table *Table) KVFilter() {
 		if len(table.WinnerOrder) > 0 {
 			//中标候选人合并
 			winnerOrderEntity.Merge(table.WinnerOrder, winnerOrder)
-			if len(table.StandKV["中标单位"]) == 0 {
-				ent := table.WinnerOrder[0]["entname"]
-				if ent != nil {
-					table.StandKV["中标单位"] = append(table.StandKV["中标单位"], &u.Tag{Key: "中标单位", Value: qutil.ObjToString(ent), Weight: -25})
-				}
-			}
 		} else if !table.BPackage { //没有table.WinnerOrder也没有分包 将td中的WinnerOrder赋值给table.WinnerOrder
 			if len(winnerOrder) > 1 {
 				table.WinnerOrder = winnerOrder
@@ -1439,7 +1433,7 @@ func (table *Table) FindKV() {
 							}
 							if len(td.SortKV.Map) > 0 {
 								for _, tdv := range td.SortKV.Keys {
-									if  tdv == "" || td.SortKV.Map[tdv] == ""{ //value为空或者null不再添加到table.SortKV
+									if tdv == "" || td.SortKV.Map[tdv] == "" { //value为空或者null不再添加到table.SortKV
 										continue
 									}
 									table.SortKV.AddKey(tdv, td.SortKV.Map[tdv])
@@ -1999,7 +1993,7 @@ func (tn *Table) manyPackageProcessByIndex(index []string, standIndex_pos []int)
 				td := tn.GetTdByRCNo(tn.RowNum-1, 0)
 				if !td.BH && FindVal2_1.MatchString(td.Val) {
 					for _, v2 := range tn.SortKV.Keys {
-						tn.SortKV.AddKey(v2,[]string{tn.SortKV.Map[v2].(string)})
+						tn.SortKV.AddKey(v2, []string{tn.SortKV.Map[v2].(string)})
 					}
 				} else {
 					//没有处理成数组的情况下,继续调用正文查找分包的方法
@@ -3172,7 +3166,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 	for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
-		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
+		key = strings.Replace(key, "", "", -1)    //处理一个特殊的采购量 经上层处理空格后未处理掉
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*
 				{