Browse Source

金额及页面跳转

fengweiqiang 5 years ago
parent
commit
d83ad996df

BIN
src/github.com/shopspring.zip


+ 27 - 16
src/jy/clear/tonumber.go

@@ -3,6 +3,7 @@ package clear
 
 import (
 	"fmt"
+	"github.com/shopspring/decimal"
 	"qfw/util"
 	"regexp"
 	"strconv"
@@ -32,6 +33,7 @@ var NumChar = map[string]interface{}{
 var moneyUnit = map[string]float64{
 	"元": float64(1), "万": float64(10000), "亿": float64(100000000), "億": float64(100000000), //单位
 }
+var kxjsReg *regexp.Regexp
 
 func init() {
 	regOperator, _ = regexp.Compile(`[*|+|)*)]`)
@@ -44,10 +46,11 @@ func init() {
 	numCapitals, _ = regexp.Compile(`([〇|零|点|壹|贰|叁|肆|伍|陆|柒|捌|玖|拾|百|佰|千|仟|万|亿|億|元|圆|角|分|整|正]{4,40})`)
 
 	regQianw, _ = regexp.Compile(`\d{1,2}千万`)
+	kxjsReg = regexp.MustCompile("[0-9][E|e]{1}[-—+]{1}[0-9]{1,2}")
 }
 
 //转int
-func ObjToInt(data []interface{},spidercode ...string) []interface{} {
+func ObjToInt(data []interface{}, spidercode ...string) []interface{} {
 	tmp, err := strconv.Atoi(fmt.Sprint(data[0]))
 	if err != nil {
 		data[0] = 0
@@ -59,7 +62,7 @@ func ObjToInt(data []interface{},spidercode ...string) []interface{} {
 }
 
 //转float,精度小数点4位
-func ObjToFloat(data []interface{},spidercode ...string) []interface{} {
+func ObjToFloat(data []interface{}, spidercode ...string) []interface{} {
 	con := fmt.Sprint(data[0])
 	percent := strings.Contains(con, "%")
 	if percent {
@@ -81,7 +84,7 @@ func ObjToFloat(data []interface{},spidercode ...string) []interface{} {
 	}
 }
 
-func ChiToFloat(data []interface{},spidercode ...string) []interface{} {
+func ChiToFloat(data []interface{}, spidercode ...string) []interface{} {
 	tmp := ""
 	str := fmt.Sprint(data[0])
 	if strings.Contains(str, "百分之") {
@@ -105,10 +108,17 @@ func ChiToFloat(data []interface{},spidercode ...string) []interface{} {
 }
 
 //金额转换
-func ObjToMoney(data []interface{},spidercode ...string) []interface{} {
+func ObjToMoney(data []interface{}, spidercode ...string) []interface{} {
 	//isfindUnit := true
 	tmpstr := (data)[0]
 	totmpstr := util.ObjToString(tmpstr)
+	if kxjsReg.MatchString(totmpstr) {
+		fromString, err := decimal.NewFromString(totmpstr)
+		if err == nil {
+			totmpstr = fromString.String()
+			(data)[0] = totmpstr
+		}
+	}
 	if utf8.RuneCountInString(totmpstr) > 20 {
 		if numCapitals.MatchString(totmpstr) {
 			tmpstr = numCapitals.FindString(totmpstr)
@@ -145,7 +155,7 @@ func ObjToMoney(data []interface{},spidercode ...string) []interface{} {
 		return data
 	}
 	data = append(data, true)
-	return ClearMaxAmount(data,spidercode...)
+	return ClearMaxAmount(data, spidercode...)
 }
 
 //["中标金额","成交金额","合同金额","中标价","成交价","成交价格","中标(成交)金额","投标报价","中标标价","成交结果"]
@@ -271,7 +281,7 @@ func capitalMoney(data []interface{}) []interface{} {
 		str = str[0:index]
 		suffixUnit = float64(10000)
 	}
-	yy:=false
+	yy := false
 	moneyRegChar.ReplaceAllStringFunc(str, func(key string) string {
 		if key == "元" || key == "圆" || key == "点" {
 			ishaspoint = true
@@ -300,7 +310,7 @@ func capitalMoney(data []interface{}) []interface{} {
 					tmp = float64(0)
 				}
 				nodes = append(nodes, node*float64(v))
-				if v == 100000000{
+				if v == 100000000 {
 					yy = true
 				}
 				node = float64(0)
@@ -320,7 +330,7 @@ func capitalMoney(data []interface{}) []interface{} {
 	})
 	if yy {
 		nodes = append(nodes, node*suffixUnit, tmp)
-	}else {
+	} else {
 		nodes = append(nodes, node, tmp)
 	}
 	ret := float64(0)
@@ -329,7 +339,7 @@ func capitalMoney(data []interface{}) []interface{} {
 	}
 	if yy {
 		return []interface{}{(ret + decimals), data[1]}
-	}else {
+	} else {
 		return []interface{}{(ret + decimals) * suffixUnit, data[1]}
 	}
 }
@@ -353,7 +363,7 @@ func replaceString(con string, ret, rep []string) string {
 }
 
 //费率转小数
-func RateToFloat(con []interface{},spidercode ...string) []interface{} {
+func RateToFloat(con []interface{}, spidercode ...string) []interface{} {
 	tmp := fmt.Sprint(CutAllSpace(con)[0])
 	if strings.Contains(tmp, "%") || strings.Contains(tmp, "%") {
 		tmp = strings.Replace(tmp, "%", "", -1)
@@ -367,13 +377,13 @@ func RateToFloat(con []interface{},spidercode ...string) []interface{} {
 }
 
 //大于五千亿的过滤掉
-func ClearMaxAmount(data []interface{},spidercode ...string) []interface{} {
+func ClearMaxAmount(data []interface{}, spidercode ...string) []interface{} {
 	value, _ := data[0].(float64)
-	if len(spidercode) > 0{
-		if sp,ok:=moneyClearSpidercode[spidercode[0]];ok{
+	if len(spidercode) > 0 {
+		if sp, ok := moneyClearSpidercode[spidercode[0]]; ok {
 			maxmoney := util.Float64All(sp.(map[string]interface{})["maxmoney"])
 			divisor := util.Float64All(sp.(map[string]interface{})["divisor"])
-			if value>=maxmoney{
+			if value >= maxmoney {
 				value /= divisor
 				data[0] = value
 			}
@@ -385,8 +395,9 @@ func ClearMaxAmount(data []interface{},spidercode ...string) []interface{} {
 	}
 	return data
 }
+
 var moneyClearSpidercode map[string]interface{}
 
 func init() {
-	util.ReadConfig("res/moneyclear.json",&moneyClearSpidercode)
-}
+	util.ReadConfig("res/moneyclear.json", &moneyClearSpidercode)
+}

+ 41 - 28
src/jy/extract/extract.go

@@ -33,11 +33,22 @@ var (
 	ClearTaskList map[string]*ClearTask                  //清理任务列表
 	saveLimit     = 100                                  //抽取日志批量保存
 	PageSize      = 5000                                 //查询分页
-	//Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
-	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1}`
-	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
+	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
+	//Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1}`
+	Fields2 = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
 
+func closeDb(ext *ExtractTask) { // closeDb: for each of the task's FDB and TDB handles that is non-nil, calls Get() on it and hands the result to db.Mgo.Close.
+	if ext.TaskInfo.FDB != nil { // skip when no FDB handle was assigned to this task
+		s := ext.TaskInfo.FDB.Get() // NOTE(review): presumably Get checks a session out of the pool; confirm that closing it releases the pool itself and not just this one session
+		db.Mgo.Close(s)
+	}
+	if ext.TaskInfo.TDB != nil { // the test-task hunk visible in this diff assigns only FDB, so TDB may be nil here; verify against the full file
+		s := ext.TaskInfo.TDB.Get() // same Get-then-Close sequence as for FDB above
+		db.Mgo.Close(s)
+	}
+}
+
 //启动测试抽取
 func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bool {
 	defer qu.Catch()
@@ -46,6 +57,7 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
 	ext.IsRun = true
 	ext.InitTestTaskInfo(resultcoll, trackcoll)
 	ext.TaskInfo.FDB = db.MgoFactory(1, 3, 120, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
+	defer closeDb(ext)
 	ext.InitSite()
 	ext.InitRulePres()
 	ext.InitRuleBacks(false)
@@ -130,6 +142,7 @@ func StartExtractTaskId(taskId string) bool {
 	}
 	ext.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
 	ext.TaskInfo.TDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.ToDbAddr, ext.TaskInfo.ToDB)
+	defer closeDb(ext)
 	ext.InitSite()
 	ext.InitRulePres()
 	ext.InitRuleBacks(false)
@@ -192,6 +205,7 @@ func RunExtractTask(taskId string) {
 	ext := TaskList[taskId]
 	query := bson.M{"_id": bson.M{"$gte": bson.ObjectIdHex(ext.TaskInfo.LastExtId)}}
 	count := ext.TaskInfo.FDB.Count(ext.TaskInfo.FromColl, query)
+	defer closeDb(ext)
 	pageNum := (count + PageSize - 1) / PageSize
 	limit := PageSize
 	if count < PageSize {
@@ -324,15 +338,15 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		Site:           qu.ObjToString(doc["site"]),
 		//Domain:     qu.ObjToString(doc["domain"]),
 		//Href:       qu.ObjToString(doc["href"]),
-		Title:         qu.ObjToString(doc["title"]),
-		Data:          &doc,
-		City:          qu.ObjToString(doc["city"]),
-		Province:      qu.ObjToString(doc["area"]),
-		Jsondata:      toMap,
-		Result:        map[string][]*ju.ExtField{},
-		BuyerAddr:     qu.ObjToString(doc["buyeraddr"]),
-		RuleBlock:     e.RuleBlock,
-		Dataging:      qu.IntAll(doc["dataging"]),
+		Title:     qu.ObjToString(doc["title"]),
+		Data:      &doc,
+		City:      qu.ObjToString(doc["city"]),
+		Province:  qu.ObjToString(doc["area"]),
+		Jsondata:  toMap,
+		Result:    map[string][]*ju.ExtField{},
+		BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
+		RuleBlock: e.RuleBlock,
+		Dataging:  qu.IntAll(doc["dataging"]),
 	}
 	if (j.Jsondata != nil || (*j.Jsondata) != nil) && (*j.Jsondata)["jsoncontent"] != nil {
 		delete((*j.Jsondata), "jsoncontent")
@@ -601,7 +615,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 				if len(cfn) == 0 {
 					continue
 				}
-				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content},j.SpiderCode)
+				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode)
 				if key == "budget" || key == "bidamount" {
 					if istrue, ok := data[len(data)-1].(bool); istrue && ok {
 						j.Result[key][i].IsTrue = true
@@ -699,7 +713,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
 				lockclear.Lock()
 				cfn := e.ClearFn[key]
 				lockclear.Unlock()
-				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content},j.SpiderCode)
+				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content}, j.SpiderCode)
 				v.Value = data[0]
 				//清理特殊符号
 				lockclear.Lock()
@@ -1012,7 +1026,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 								lock.Lock()
 								cfn := e.ClearFn[in.Field]
 								lock.Unlock()
-								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content},j.SpiderCode)
+								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode)
 								if data[len(data)-1].(bool) {
 									j.BlockPackage[k].Budget = qu.Float64All(data[0])
 									j.BlockPackage[k].IsTrueBudget = true
@@ -1022,7 +1036,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 								lock.Lock()
 								cfn := e.ClearFn[in.Field]
 								lock.Unlock()
-								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content},j.SpiderCode)
+								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content}, j.SpiderCode)
 								if data[len(data)-1].(bool) {
 									j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
 									j.BlockPackage[k].IsTrueBidamount = true
@@ -1084,7 +1098,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 						lock.Lock()
 						cfn := e.ClearFn[in.Field]
 						lock.Unlock()
-						data := clear.DoClearFn(cfn, []interface{}{val, j.Content},j.SpiderCode)
+						data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode)
 						if data[len(data)-1].(bool) {
 							j.BlockPackage[k].Budget = qu.Float64All(data[0])
 							j.BlockPackage[k].IsTrueBudget = true
@@ -1095,7 +1109,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 						lock.Lock()
 						cfn := e.ClearFn[in.Field]
 						lock.Unlock()
-						data := clear.DoClearFn(cfn, []interface{}{val, j.Content},j.SpiderCode)
+						data := clear.DoClearFn(cfn, []interface{}{val, j.Content}, j.SpiderCode)
 						if data[len(data)-1].(bool) {
 							j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
 							j.BlockPackage[k].IsTrueBidamount = true
@@ -1936,7 +1950,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//		}
 		//tmp["extract_content"] = j.Content
 		tmp["dataging"] = j.Dataging
-		
 		if attach_text, ok := (tmp)["new_attach_text"].(map[string]interface{}); ok {
 			//if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
 			for ai, attachs := range attach_text {
@@ -1945,18 +1958,18 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 						if ff, ok := fileinfo.(map[string]interface{}); ok {
 							attach_url := qu.ObjToString(ff["attach_url"])
 							if utf8.RuneCountInString(attach_url) > qu.IntAllDef(ju.Config["filelength"], 10000) {
-								(tmp)["new_attach_text"].(map[string]interface{})[ai].((map[string]interface{}))[fi].(map[string]interface{})["attach_url"] =  "文本过长..."
+								(tmp)["new_attach_text"].(map[string]interface{})[ai].((map[string]interface{}))[fi].(map[string]interface{})["attach_url"] = "文本过长..."
 							}
 						}
 					}
 				}
 			}
-		}//}budget bidamount
-		if bg,ok :=tmp["budget"].(float64);ok && bg>=500000000000{
-			delete(tmp,"budget")
+		} //}budget bidamount
+		if bg, ok := tmp["budget"].(float64); ok && bg >= 500000000000 {
+			delete(tmp, "budget")
 		}
-		if bg,ok :=tmp["bidamount"].(float64);ok && bg>=500000000000{
-			delete(tmp,"bidamount")
+		if bg, ok := tmp["bidamount"].(float64); ok && bg >= 500000000000 {
+			delete(tmp, "bidamount")
 		}
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
@@ -2128,7 +2141,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 	}
 	if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
 		//jsondata清理
-		clearJd(j.Jsondata, e,j.SpiderCode)
+		clearJd(j.Jsondata, e, j.SpiderCode)
 		marshalbt, _ := json.Marshal(j.Jsondata)
 		tmpjddata := make(map[string]interface{})
 		json.Unmarshal(marshalbt, &tmpjddata)
@@ -2142,7 +2155,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 						if len(cfn) == 0 {
 							continue
 						}
-						newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""},j.SpiderCode)
+						newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""}, j.SpiderCode)
 						if tmpv.Value == newNum[0] {
 							extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
 							j.Result[jdkey] = append(j.Result[jdkey], extField)
@@ -2345,7 +2358,7 @@ func resetWinnerorder(j *ju.Job) {
 	if maxlen > 0 {
 		winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
 		if j.Winnerorder[0]["price"] != nil {
-			tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""},j.SpiderCode)
+			tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""}, j.SpiderCode)
 			if tmpPrice[len(tmpPrice)-1].(bool) {
 				bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 0.5})
 			}

+ 9 - 4
src/res/fieldscore.json

@@ -85,14 +85,19 @@
             },
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(的|招标|公示|公告|谈判|公开|通知|采购文件|交易中心|\\d#)$",
+                "regstr": ".{2,100}(的|招标|名称|公示|公告|谈判|公开|通知|采购文件|交易中心|\\d#)$",
                 "score": -5
             },
             {
-                "describe": "包含词",
-                "regstr": "(万元|本项目|详见公告|test)",
+                "describe": "包含词-10",
+                "regstr": "(万元|本项目|详见公告|test|[0-9]{1}、)",
                 "score": -10
             },
+            {
+                "describe": "包含词-5",
+                "regstr": "[.|,|,]",
+                "score": -5
+            },
             {
                 "describe": "纯数字字母标点",
                 "regstr": "^[0-9a-zA-Z-]*$",
@@ -105,7 +110,7 @@
             },
             {
                 "describe": "符合",
-                "regstr": "[,,.。!!]",
+                "regstr": "[。!!]",
                 "score": -10
             }
         ],

+ 1 - 1
src/res/formattext.json

@@ -77,7 +77,7 @@
             "desc": ""
 		},
 		{
-			"reg": "([^((,,。、.;;::\r\n公司局]{0,8})(联系人|地址)[::]([^\\s\u3000\u2003\u00a0,,]+?)(联系)?(电话(/传真)?|手机|传真|邮编)[::](.+)",
+			"reg": "([^((,,。、.;;::\r\n公司局政府卫生院]{0,8})(联系人|地址)[::]([^\\s\u3000\u2003\u00a0,,]+?)(联系)?(电话(/传真)?|手机|传真|邮编)[::](.+)",
             "separator": "\n${1}${2}:${3}\n${1}${5}:${7}",
             "desc": ""
 		},

+ 10 - 5
src/res/moneyclear.json

@@ -1,7 +1,12 @@
 {
-    "js_jsszbtbggfwpt_zhbhxrgs": {
-    	"descript":"金额除以10000",
-		"maxmoney":10000000000,
-		"divisor":10000
-    }
+  "js_jsszbtbggfwpt_zhbhxrgs": {
+    "descript": "金额除以10000",
+    "maxmoney": 10000000000,
+    "divisor": 10000
+  },
+  "js_jsjsgczbw_zbhxrgs_new": {
+    "descript": "金额除以10000",
+    "maxmoney": 10000000000,
+    "divisor": 10000
+  }
 }

+ 1 - 1
src/web/templates/admin/class.html

@@ -188,7 +188,7 @@
                 if(r.rep){
                     $("#userform")[0].reset();
                     $("#modal-info").modal("hide");
-                    ttable.ajax.reload();
+                    // ttable.ajax.reload();
                 }else{
                     alert("保存失败,可能是要添加的分类已存在");
                 }

+ 1 - 1
src/web/templates/admin/onetag.html

@@ -230,7 +230,7 @@ function saveeditdata(){
 			if(r.rep){
 				$("#editone-dataform")[0].reset();
 				$("#modal-info-editonetag").modal("hide");
-				ttableonetag.ajax.reload();
+				// ttableonetag.ajax.reload();
 			}else{
 				alert("保存失败");
 			}