Pārlūkot izejas kodu

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

apple 5 gadi atpakaļ
vecāks
revīzija
e71867da8f

+ 5 - 5
fullproject/src_v1/config.json

@@ -1,13 +1,13 @@
 {
     "loadStart": 0,
 	"validdays":150,
-    "statusdays": 7,
-	"mongodbServers": "192.168.3.166:27082",
+    "statusdays": 15,
+	"mongodbServers": "192.168.3.207:27092",
     "mongodbPoolSize": 10,
-    "mongodbName": "zhaolongyue",
+    "mongodbName": "extract_kf",
 	"hints":"publishtime_1",
-    "extractColl": "huawei_bidding_all_0110_v2",
-    "projectColl": "huawei_project_0113_v2",
+    "extractColl": "jh_info",
+    "projectColl": "jh_project",
     "backupFlag": false,
     "siteColl": "site",
     "thread": 1,

+ 1 - 0
fullproject/src_v1/init.go

@@ -325,6 +325,7 @@ type Site struct {
 	City     string `json:"city"`     //市
 	District string `json:"district"` //区、县
 	Domain   string `json:"domain"`   //地址
+	Status	 int	`json:"status"`	 //
 }
 
 //二分字符串查找

+ 2 - 1
fullproject/src_v1/load_data.go

@@ -115,7 +115,8 @@ func (p *ProjectTask) loadSite() {
 			select {
 			case tmp := <-pool:
 				n++
-				if tmp != nil {
+				//站点有效标记state
+				if tmp != nil && tmp.Status == 5 {
 					p.mapSite[tmp.Site] = tmp
 				}
 			case <-over:

+ 5 - 5
fullproject/src_v1/main.go

@@ -58,9 +58,9 @@ func DealSign() {
 	}
 }
 
-func main() {
+func mainT() {
 	//udp跑增量  id段   project
-	//udp跑全量			ql
+	//udp跑全量			qlT
 	//udp跑历史数据  信息id1,id2/或id段  ls
 	//udp强制合并  信息id1,id2,id3 [项目id] 不存在时新建  qzhb
 	//udp强制拆分  项目id,信息id1,id2          qzcf
@@ -77,9 +77,9 @@ func main() {
 }
 
 //测试组人员使用
-func mainT() {
-	sid = "5d18eca4a5cb26b9b7c7f587"
-	eid = "5e381b7650b5ea296ed16e51"
+func main() {
+	sid = "5c90370ca5cb26b9b72b3d0a"
+	eid = "5d3a88ffa5cb26b9b7755564"
 	//flag.StringVar(&sid, "sid", "", "开始id")
 	//flag.StringVar(&eid, "eid", "", "结束id")
 	//flag.Parse()

+ 2 - 4
fullproject/src_v1/project.go

@@ -5,14 +5,12 @@ import (
 	"log"
 	"time"
 
-	//	"log"
+	"go.mongodb.org/mongo-driver/bson"
+	"go.mongodb.org/mongo-driver/bson/primitive"
 	"math"
 	qu "qfw/util"
 	"sort"
 	"strings"
-	//"gopkg.in/mgo.v2/bson"
-	"go.mongodb.org/mongo-driver/bson"
-	"go.mongodb.org/mongo-driver/bson/primitive"
 )
 
 /**

+ 6 - 6
src/jy/clear/clear.go

@@ -7,7 +7,7 @@ import (
 )
 
 // clearfns is the registry of cleaning functions, keyed by function name.
-var clearfns = make(map[string]func(data []interface{}) []interface{})
+var clearfns = make(map[string]func(data []interface{},spidercode ... string) []interface{})
 var lock sync.RWMutex
 
 func init() {
@@ -33,20 +33,20 @@ func init() {
 }
 
 // BindFn registers the cleaning function fn under fnname in clearfns; the write is guarded by lock.
-func BindFn(fnname string, fn func(data []interface{}) []interface{}) {
+func BindFn(fnname string, fn func(data []interface{},spidercode ...string) []interface{}) {
 	lock.Lock()
 	clearfns[fnname] = fn
 	lock.Unlock()
 }
 
 // DoClearFn runs the named cleaning functions in order; a name with no registered function is skipped.
-func DoClearFn(clear []string, data []interface{}) []interface{} {
+func DoClearFn(clear []string, data []interface{},spidercode ...string) []interface{} {
 	if len(clear) == 0 {
 		return data
 	}
 	for _, fnname := range clear {
 		if v, ok := clearfns[fnname]; ok {
-			data = v(data)
+			data = v(data,spidercode...)
 		}
 	}
 	return data
@@ -55,13 +55,13 @@ func DoClearFn(clear []string, data []interface{}) []interface{} {
 // Extract phone numbers.
 var PhoneReg = regexp.MustCompile("((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,5})?|\\d{3,5}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{3,})?|\\d{3,4}\\*{3,4}\\d{3,4}|\\d{3,4}[\u3000\u2003\u00a0\\s]*\\d{4,5}[\u3000\u2003\u00a0\\s]*\\d{3,4}|(\\d{2,}[×―—-\\-])+\\d{2,}[×―—-\\-]+(\\d{3,})+|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
 
-func GetPhone(data []interface{}) []interface{} {
+// GetPhone replaces data[0] with the first PhoneReg match found in its
+// string form (the empty string when nothing matches). spidercode is unused.
+func GetPhone(data []interface{},spidercode ...string) []interface{} {
 	data[0] = PhoneReg.FindString(fmt.Sprint(data[0]))
 	return data
 }
 
 // ClearNumber removes every clearNum match from the string form of data[0]; spidercode is unused.
-func ClearNumber(data []interface{}) []interface{} {
+func ClearNumber(data []interface{},spidercode ...string) []interface{} {
 	data[0] = clearNum.ReplaceAllString(fmt.Sprint(data[0]), "")
 	return data
 }

+ 9 - 10
src/jy/clear/cutspace.go

@@ -17,11 +17,10 @@ var (
 )
 
 var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n", "\u0001"}
-
 func init() {
 	cutSpace, _ = regexp.Compile(`^[\s]*|[\s]*$`)
 	cutAllSpace, _ = regexp.Compile(`\s*`)
-	catSymbol, _ = regexp.Compile(`[]+`)
+	catSymbol, _ = regexp.Compile(`\\[\\]+`)
 	separateSymbol, _ = regexp.Compile("[\\s\u3000\u2003\u00a0\\n,,、/。|]")
 	placeReg, _ = regexp.Compile("^.*(公司|学(校)?|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|科|部|队|联合(会|体)|工作室)$")
 	clearNum, _ = regexp.Compile("[\\d-]+")
@@ -78,7 +77,7 @@ func CutLableStr(con string) string {
 }
 
 //清理开始、结尾的空白字符
-func CutSpace(data []interface{}) []interface{} {
+func CutSpace(data []interface{},spidercode ...string) []interface{} {
 	tmp := cutSpace.ReplaceAllString(strings.Replace(fmt.Sprint(data[0]), " ", " ", -1), "")
 	tmp = replaceSymbol(tmp, spaces)
 	//fmt.Println("cutspace", tmp)
@@ -87,7 +86,7 @@ func CutSpace(data []interface{}) []interface{} {
 }
 
 //清理所有空白符
-func CutAllSpace(data []interface{}) []interface{} {
+func CutAllSpace(data []interface{},spidercode ...string) []interface{} {
 	tmp := cutAllSpace.ReplaceAllString(fmt.Sprint(data[0]), "")
 	tmp = replaceSymbol(tmp, spaces)
 	data[0] = tmp
@@ -95,7 +94,7 @@ func CutAllSpace(data []interface{}) []interface{} {
 }
 
 //清理尾部符号
-func ClearEndSymblo(data []interface{}) []interface{} {
+func ClearEndSymblo(data []interface{},spidercode ...string) []interface{} {
 	text := fmt.Sprint(data[0])
 	for i := 0; i <= 2; i++ {
 		text = endSymblo.ReplaceAllString(text, "")
@@ -105,7 +104,7 @@ func ClearEndSymblo(data []interface{}) []interface{} {
 }
 
 //清理符号
-func CutSymbol(data []interface{}) []interface{} {
+func CutSymbol(data []interface{},spidercode ...string) []interface{} {
 	value := fmt.Sprint(CutSpace(data)[0])
 	symbol := ",,;;::'\"“”。.\\??、/+=\\_—\\-*&……\\^%$¥@!!`~·"
 	startSymbol := "^[" + ")\\)>》】\\]}}〕" + symbol + "]+"
@@ -119,7 +118,7 @@ func CutSymbol(data []interface{}) []interface{} {
 }
 
 // CutNotPrs handles symbols that appear unpaired by removing the content after them; it delegates to childCutNotPrs with a count of 1. spidercode is unused.
-func CutNotPrs(data []interface{}) []interface{} {
+func CutNotPrs(data []interface{},spidercode ...string) []interface{} {
 	return childCutNotPrs(data, 1)
 }
 
@@ -163,7 +162,7 @@ func childCutNotPrs(data []interface{}, count int) []interface{} {
 }
 
 //全部是汉字或者特殊符号的情况,清理掉
-func ClearAllWord(data []interface{}) []interface{} {
+func ClearAllWord(data []interface{},spidercode ...string) []interface{} {
 	value := fmt.Sprint(data[0])
 	reg := regexp.MustCompile("^[\u4e00-\u9fa5、,,.。??'\"“”‘’·~!@#¥$%…&*()()\\-—+=【】\\[\\]{}{}<>《》|\\/\\s]+$")
 	data[0] = reg.ReplaceAllString(value, "")
@@ -171,7 +170,7 @@ func ClearAllWord(data []interface{}) []interface{} {
 }
 
 //中文符号转英文
-func ChiToEng(data []interface{}) []interface{} {
+func ChiToEng(data []interface{},spidercode ...string) []interface{} {
 	value := fmt.Sprint(data[0])
 	startChars := []string{"(", "【", "{", "“", ")", "】", "}", "”"}
 	endChars := []string{"(", "[", "{", "\"", ")", "]", "}", "\""}
@@ -186,7 +185,7 @@ func ChiToEng(data []interface{}) []interface{} {
 	return data
 }
 
-func ClearBuyerPerson(data []interface{}) []interface{} {
+func ClearBuyerPerson(data []interface{},spidercode ...string) []interface{} {
 	value := fmt.Sprint(data[0])
 	//tmp := []string{}
 	if len([]rune(value)) > 4 { //名字默认最长4

+ 1 - 1
src/jy/clear/getratecurrency.go

@@ -18,7 +18,7 @@ func init() {
 }
 
 //获取币种
-func GetCurrency(data []interface{}) []interface{} {
+func GetCurrency(data []interface{},spidercode ...string) []interface{} {
 	val := "人民币"
 	currency.ReplaceAllStringFunc(fmt.Sprint(data[0]), func(key string) string {
 		v := encyitem[key]

+ 1 - 1
src/jy/clear/projectname.go

@@ -15,7 +15,7 @@ var clearSymbol = regexp.MustCompile(`["“”]`)
 var noclearNum = regexp2.MustCompile(`^(?!.+(?:标段|包|子项目|升级改造)[0-9123456789]{1,5})(.*)[0-9123456789]$`, regexp2.None)
 var mustHan = regexp.MustCompile(`[\p{Han}]+`) //项目名称必须包含汉子
 
-func ClearProjectName(data []interface{}) []interface{} {
+func ClearProjectName(data []interface{},spidercode ...string) []interface{} {
 	value := clearPreRegNameCode.ReplaceAllString(CutSpace(data)[0].(string), "$2")
 	value = clearEndRegNameCode.ReplaceAllString(value, "$1")
 	b := mustHan.MatchString(value)

+ 42 - 14
src/jy/clear/tonumber.go

@@ -47,7 +47,7 @@ func init() {
 }
 
 //转int
-func ObjToInt(data []interface{}) []interface{} {
+func ObjToInt(data []interface{},spidercode ...string) []interface{} {
 	tmp, err := strconv.Atoi(fmt.Sprint(data[0]))
 	if err != nil {
 		data[0] = 0
@@ -59,7 +59,7 @@ func ObjToInt(data []interface{}) []interface{} {
 }
 
 //转float,精度小数点4位
-func ObjToFloat(data []interface{}) []interface{} {
+func ObjToFloat(data []interface{},spidercode ...string) []interface{} {
 	con := fmt.Sprint(data[0])
 	percent := strings.Contains(con, "%")
 	if percent {
@@ -81,7 +81,7 @@ func ObjToFloat(data []interface{}) []interface{} {
 	}
 }
 
-func ChiToFloat(data []interface{}) []interface{} {
+func ChiToFloat(data []interface{},spidercode ...string) []interface{} {
 	tmp := ""
 	str := fmt.Sprint(data[0])
 	if strings.Contains(str, "百分之") {
@@ -89,7 +89,7 @@ func ChiToFloat(data []interface{}) []interface{} {
 		moneyRegChar.ReplaceAllStringFunc(str, func(key string) string {
 			if v, ok := moneyChar[key].(float64); ok {
 				tmp += strconv.FormatFloat(v, 'f', 0, 64)
-			}else if v, ok := moneyChar[key].(string); ok {
+			} else if v, ok := moneyChar[key].(string); ok {
 				tmp += v
 			}
 			return tmp
@@ -98,14 +98,14 @@ func ChiToFloat(data []interface{}) []interface{} {
 		if err != nil {
 			return data
 		}
-		return []interface{}{tmpF/100, data[1]}
-	}else {
+		return []interface{}{tmpF / 100, data[1]}
+	} else {
 		return data
 	}
 }
 
 //金额转换
-func ObjToMoney(data []interface{}) []interface{} {
+func ObjToMoney(data []interface{},spidercode ...string) []interface{} {
 	//isfindUnit := true
 	tmpstr := (data)[0]
 	totmpstr := util.ObjToString(tmpstr)
@@ -145,7 +145,7 @@ func ObjToMoney(data []interface{}) []interface{} {
 		return data
 	}
 	data = append(data, true)
-	return data
+	return ClearMaxAmount(data,spidercode...)
 }
 
 //["中标金额","成交金额","合同金额","中标价","成交价","成交价格","中标(成交)金额","投标报价","中标标价","成交结果"]
@@ -271,6 +271,7 @@ func capitalMoney(data []interface{}) []interface{} {
 		str = str[0:index]
 		suffixUnit = float64(10000)
 	}
+	yy:=false
 	moneyRegChar.ReplaceAllStringFunc(str, func(key string) string {
 		if key == "元" || key == "圆" || key == "点" {
 			ishaspoint = true
@@ -299,6 +300,9 @@ func capitalMoney(data []interface{}) []interface{} {
 					tmp = float64(0)
 				}
 				nodes = append(nodes, node*float64(v))
+				if v == 100000000{
+					yy = true
+				}
 				node = float64(0)
 			} else {
 				if v == 10 && tmp == 0 {
@@ -314,12 +318,20 @@ func capitalMoney(data []interface{}) []interface{} {
 		}
 		return ""
 	})
-	nodes = append(nodes, node, tmp)
+	if yy {
+		nodes = append(nodes, node*suffixUnit, tmp)
+	}else {
+		nodes = append(nodes, node, tmp)
+	}
 	ret := float64(0)
 	for _, v := range nodes {
 		ret += v
 	}
-	return []interface{}{(ret + decimals) * suffixUnit, data[1]}
+	if yy {
+		return []interface{}{(ret + decimals), data[1]}
+	}else {
+		return []interface{}{(ret + decimals) * suffixUnit, data[1]}
+	}
 }
 
 //过滤符号
@@ -341,7 +353,7 @@ func replaceString(con string, ret, rep []string) string {
 }
 
 //费率转小数
-func RateToFloat(con []interface{}) []interface{} {
+func RateToFloat(con []interface{},spidercode ...string) []interface{} {
 	tmp := fmt.Sprint(CutAllSpace(con)[0])
 	if strings.Contains(tmp, "%") || strings.Contains(tmp, "%") {
 		tmp = strings.Replace(tmp, "%", "", -1)
@@ -354,11 +366,27 @@ func RateToFloat(con []interface{}) []interface{} {
 	}
 }
 
-//大于一万亿的过滤掉
-func ClearMaxAmount(data []interface{}) []interface{} {
+// ClearMaxAmount discards implausibly large amounts (500 billion or more).
+//
+// data[0] is read as a float64 amount (any other type counts as 0). When a
+// spider code is passed and moneyClearSpidercode has a rule for it, an amount
+// at or above the rule's "maxmoney" is first divided by the rule's "divisor".
+// An amount that is still >= 500000000000 is reset to 0 and the trailing
+// element of data is set to false, because callers read data[len(data)-1] as
+// the "value is valid" flag.
+//
+// Empty data, a missing or malformed rule entry, and a zero divisor are
+// skipped instead of panicking or producing Inf/NaN.
+func ClearMaxAmount(data []interface{},spidercode ...string) []interface{} {
+	if len(data) == 0 {
+		return data
+	}
 	value, _ := data[0].(float64)
-	if value >= 1000000000000 {
+	if len(spidercode) > 0 {
+		// Checked assertion: a missing or non-object rule simply does not apply.
+		if sp, ok := moneyClearSpidercode[spidercode[0]].(map[string]interface{}); ok {
+			maxmoney := util.Float64All(sp["maxmoney"])
+			divisor := util.Float64All(sp["divisor"])
+			if divisor != 0 && value >= maxmoney {
+				value /= divisor
+				data[0] = value
+			}
+		}
+	}
+	if value >= 500000000000 {
 		data[0] = float64(0)
+		// The flag lives in the last slot; for two-element input that is data[1].
+		if len(data) > 1 {
+			data[len(data)-1] = false
+		}
 	}
 	return data
 }
+// moneyClearSpidercode holds the amount-correction rules read from
+// res/moneyclear.json; ClearMaxAmount looks entries up by spider code.
+var moneyClearSpidercode map[string]interface{}
+
+// init reads res/moneyclear.json into moneyClearSpidercode.
+func init() {
+	util.ReadConfig("res/moneyclear.json",&moneyClearSpidercode)
+}

+ 1 - 1
src/jy/clear/totimestamp.go

@@ -42,7 +42,7 @@ func init() {
 2006%01%02%15%04->时间戳
 2006%01%02%15%04%05->时间戳
 */
-func ObjToTimestamp(data []interface{}) []interface{} {
+func ObjToTimestamp(data []interface{},spidercode ...string) []interface{} {
 	tmp := fmt.Sprint(data[0])
 	//处理类似:二〇一五年十一月四日十五时
 	cht := regD.FindStringSubmatch(tmp)

+ 53 - 29
src/jy/extract/extract.go

@@ -33,7 +33,8 @@ var (
 	ClearTaskList map[string]*ClearTask                  //清理任务列表
 	saveLimit     = 100                                  //抽取日志批量保存
 	PageSize      = 5000                                 //查询分页
-	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
+	//Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
+	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
 
@@ -323,15 +324,15 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		Site:           qu.ObjToString(doc["site"]),
 		//Domain:     qu.ObjToString(doc["domain"]),
 		//Href:       qu.ObjToString(doc["href"]),
-		Title:     qu.ObjToString(doc["title"]),
-		Data:      &doc,
-		City:      qu.ObjToString(doc["city"]),
-		Province:  qu.ObjToString(doc["area"]),
-		Jsondata:  toMap,
-		Result:    map[string][]*ju.ExtField{},
-		BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
-		RuleBlock: e.RuleBlock,
-		Dataging:  qu.IntAll(doc["dataging"]),
+		Title:         qu.ObjToString(doc["title"]),
+		Data:          &doc,
+		City:          qu.ObjToString(doc["city"]),
+		Province:      qu.ObjToString(doc["area"]),
+		Jsondata:      toMap,
+		Result:        map[string][]*ju.ExtField{},
+		BuyerAddr:     qu.ObjToString(doc["buyeraddr"]),
+		RuleBlock:     e.RuleBlock,
+		Dataging:      qu.IntAll(doc["dataging"]),
 	}
 	if (j.Jsondata != nil || (*j.Jsondata) != nil) && (*j.Jsondata)["jsoncontent"] != nil {
 		delete((*j.Jsondata), "jsoncontent")
@@ -396,7 +397,8 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
 func file2text(doc *map[string]interface{}) {
 	tmpstr := ""
-	if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
+	//if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
+		if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
 		for _, attachs := range attach_text {
 			if fileinfos, ok := attachs.(map[string]interface{}); ok {
 				for _, fileinfo := range fileinfos {
@@ -423,6 +425,7 @@ func file2text(doc *map[string]interface{}) {
 
 //抽取
 func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
+	
 	e.ExtractDetail(j, isSite, j.SpiderCode)
 	if jf != nil && jf.IsFile {
 		e.ExtractFile(jf, isSite, j.SpiderCode)
@@ -598,7 +601,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 				if len(cfn) == 0 {
 					continue
 				}
-				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
+				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content},j.SpiderCode)
 				if key == "budget" || key == "bidamount" {
 					if istrue, ok := data[len(data)-1].(bool); istrue && ok {
 						j.Result[key][i].IsTrue = true
@@ -696,7 +699,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
 				lockclear.Lock()
 				cfn := e.ClearFn[key]
 				lockclear.Unlock()
-				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
+				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content},j.SpiderCode)
 				v.Value = data[0]
 				//清理特殊符号
 				lockclear.Lock()
@@ -1009,7 +1012,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 								lock.Lock()
 								cfn := e.ClearFn[in.Field]
 								lock.Unlock()
-								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
+								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content},j.SpiderCode)
 								if data[len(data)-1].(bool) {
 									j.BlockPackage[k].Budget = qu.Float64All(data[0])
 									j.BlockPackage[k].IsTrueBudget = true
@@ -1019,7 +1022,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 								lock.Lock()
 								cfn := e.ClearFn[in.Field]
 								lock.Unlock()
-								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
+								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content},j.SpiderCode)
 								if data[len(data)-1].(bool) {
 									j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
 									j.BlockPackage[k].IsTrueBidamount = true
@@ -1081,7 +1084,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 						lock.Lock()
 						cfn := e.ClearFn[in.Field]
 						lock.Unlock()
-						data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
+						data := clear.DoClearFn(cfn, []interface{}{val, j.Content},j.SpiderCode)
 						if data[len(data)-1].(bool) {
 							j.BlockPackage[k].Budget = qu.Float64All(data[0])
 							j.BlockPackage[k].IsTrueBudget = true
@@ -1092,7 +1095,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 						lock.Lock()
 						cfn := e.ClearFn[in.Field]
 						lock.Unlock()
-						data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
+						data := clear.DoClearFn(cfn, []interface{}{val, j.Content},j.SpiderCode)
 						if data[len(data)-1].(bool) {
 							j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
 							j.BlockPackage[k].IsTrueBidamount = true
@@ -1687,7 +1690,6 @@ var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:
 //分析抽取结果并保存
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	qu.Try(func() {
-		
 		//重新取出清理过后的中标候选人
 		resetWinnerorder(j)
 		doc, result, _id := funcAnalysis(j, e)
@@ -1711,9 +1713,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				if v.Score > -1 {
 					tmp[v.Field] = v.Value
 					break
-				} else if v.Field == "projectname" {
-					tmp[v.Field] = v.Value
-					break
 				}
 			}
 		}
@@ -1794,7 +1793,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if len(j.Winnerorder) > 0 { //候选人信息
 			for i, v := range j.Winnerorder {
 				if v["price"] != nil {
-					j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""})[0]
+					j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""},j.SpiderCode)[0]
 				}
 			}
 			tmp["winnerorder"] = j.Winnerorder
@@ -1823,10 +1822,13 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			tmp["ffield"] = ffield
 		}
 		for k, v := range *doc {
-			//去重冗余字段
-			if delFiled(k) {
-				continue
+			if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
+				(*doc)[k] = []rune(qu.ObjToString(v))[:100000]
 			}
+			//去重冗余字段
+			//if delFiled(k) {
+			//	continue
+			//}
 			if tmp[k] == nil {
 				tmp[k] = v
 			}
@@ -1934,6 +1936,28 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//		}
 		//tmp["extract_content"] = j.Content
 		tmp["dataging"] = j.Dataging
+		
+		if attach_text, ok := (tmp)["new_attach_text"].(map[string]interface{}); ok {
+			//if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
+			for ai, attachs := range attach_text {
+				if fileinfos, ok := attachs.(map[string]interface{}); ok {
+					for fi, fileinfo := range fileinfos {
+						if ff, ok := fileinfo.(map[string]interface{}); ok {
+							attach_url := qu.ObjToString(ff["attach_url"])
+							if utf8.RuneCountInString(attach_url) > qu.IntAllDef(ju.Config["filelength"], 10000) {
+								(tmp)["new_attach_text"].(map[string]interface{})[ai].((map[string]interface{}))[fi].(map[string]interface{})["attach_url"] =  "文本过长..."
+							}
+						}
+					}
+				}
+			}
+		}//}budget bidamount
+		if bg,ok :=tmp["budget"].(float64);ok && bg>=500000000000{
+			delete(tmp,"budget")
+		}
+		if bg,ok :=tmp["bidamount"].(float64);ok && bg>=500000000000{
+			delete(tmp,"bidamount")
+		}
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				/*	if len(e.SiteFields) <= 0 {
@@ -1988,7 +2012,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				}
 			}
 			tmp["result"] = result
-			tmp["resultf"] = resultf
+			//tmp["resultf"] = resultf
 			b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
 			if !b {
 				log.Debug(e.TaskInfo.TestColl, _id)
@@ -2104,7 +2128,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 	}
 	if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
 		//jsondata清理
-		clearJd(j.Jsondata, e)
+		clearJd(j.Jsondata, e,j.SpiderCode)
 		marshalbt, _ := json.Marshal(j.Jsondata)
 		tmpjddata := make(map[string]interface{})
 		json.Unmarshal(marshalbt, &tmpjddata)
@@ -2118,7 +2142,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 						if len(cfn) == 0 {
 							continue
 						}
-						newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""})
+						newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""},j.SpiderCode)
 						if tmpv.Value == newNum[0] {
 							extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
 							j.Result[jdkey] = append(j.Result[jdkey], extField)
@@ -2321,7 +2345,7 @@ func resetWinnerorder(j *ju.Job) {
 	if maxlen > 0 {
 		winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
 		if j.Winnerorder[0]["price"] != nil {
-			tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""})
+			tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""},j.SpiderCode)
 			if tmpPrice[len(tmpPrice)-1].(bool) {
 				bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 0.5})
 			}

+ 5 - 5
src/jy/extract/score_jsondata.go

@@ -24,7 +24,7 @@ var endOfPunctuationClrear = regexp.MustCompile("[,,.。??;;]+$")
 var keysClrear = regexp.MustCompile("(详见|公告|X|内文|某单位|某部|文件|\\*|暂无|?|\\?)")
 
 //jsondata清理
-func clearJd(jd *map[string]interface{}, e *ExtractTask) {
+func clearJd(jd *map[string]interface{}, e *ExtractTask,spiderCode string) {
 	for k, v := range *jd {
 		if k == "buyer" || k == "winner" || k == "agency" || k == "projectcode" || k == "projectname" {
 			vstring := util2.ObjToString(v)
@@ -37,7 +37,7 @@ func clearJd(jd *map[string]interface{}, e *ExtractTask) {
 			cfn := e.ClearFn[k]
 			lockclear.Unlock()
 			if len(cfn) > 0 {
-				data := clear.DoClearFn(cfn, []interface{}{vstring, ""})
+				data := clear.DoClearFn(cfn, []interface{}{vstring, ""},spiderCode)
 				lockclear.Lock()
 				if clear.AsyField[k] != nil || clear.SymField[k] != nil || clear.MesField[k] != nil {
 					vstring = clear.OtherClean(k, util2.ObjToString(data[0]))
@@ -85,7 +85,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				if len(cfn) == 0 {
 					continue
 				}
-				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""})
+				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""},j.SpiderCode)
 				//if util2.IntAll(newNum[0]) != 0 {
 				extFields := make([]*util.ExtField, 0)
 				extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: 0.1, IsTrue: newNum[len(newNum)-1].(bool)})
@@ -105,7 +105,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				if bt,ok :=(*j.Jsondata)[v].(float64);ok && bt>0{
 					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: bt, Score: 0.1})
 				}else {
-					newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""})
+					newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""},j.SpiderCode)
 					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: 0.1})
 				}
 				j.Result[v] = extFields
@@ -203,7 +203,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				if len(cfn) == 0 {
 					continue
 				}
-				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""})
+				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""},j.SpiderCode)
 				//if util2.IntAll(newNum[0]) != 0 {
 				extFields := make([]*util.ExtField, 0)
 				if jdextweight > 1 {

+ 6 - 0
src/main_test.go

@@ -3,6 +3,7 @@ package main
 import (
 	"fmt"
 	"jy/admin/track"
+	"jy/clear"
 	"jy/extract"
 	. "jy/mongodbutil"
 	"log"
@@ -133,3 +134,8 @@ func Test_buyer(t *testing.T) {
 		}
 	}
 }
+
+// Test_util1 is a smoke test: it runs clear.CutSymbol on a value that starts
+// with a run of hyphens and prints the result. It makes no assertions.
+func Test_util1(t *testing.T) {
+	data := clear.CutSymbol([]interface{}{"----------123123", "-----123123"})
+	fmt.Println(data)
+}

+ 3 - 3
src/res/fieldscore.json

@@ -248,8 +248,8 @@
         "negativewords": [
             {
                 "describe": "包含负分",
-                "regstr": "(定标|通知|异议|要求|代理|详细|test|意见|原因|具体|结果|负责|付款|附件|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
-                "score": -10
+                "regstr": "(我公司|定标|通知|异议|要求|代理|详细|test|意见|原因|具体|结果|负责|付款|附件|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
+                "score": -20
             },
 			{
                 "describe": "非结尾",
@@ -600,7 +600,7 @@
                 "score": -10
             },  {
                 "describe": "包含负分",
-                "regstr": "(详(见|情)|公告|test|招标人)",
+                "regstr": "(详(见|情)|公告|test|招标人|我公司)",
                 "score": -20
             }
         ],

+ 7 - 0
src/res/moneyclear.json

@@ -0,0 +1,7 @@
+{
+    "js_jsszbtbggfwpt_zhbhxrgs": {
+    	"descript":"金额除以10000",
+		"maxmoney":10000000000,
+		"divisor":10000
+    }
+}

+ 9 - 3
udpcreateindex/src/biddingall.go

@@ -100,15 +100,21 @@ func biddingAllTask(data []byte, mapInfo map[string]interface{}) {
 				tid := qutil.BsonIdToSId(tmp["_id"])
 				if cid == tid {
 					bnil = false
-					//更新bidding表,生成索引
+					//更新bidding表,生成索引;bidding表中modifyinfo中的字段不更新
+					modifyinfo := make(map[string]bool)
+					if tmpmodifyinfo, ok := tmp["modifyinfo"].(map[string]interface{}); ok && tmpmodifyinfo != nil {
+						for k, v := range tmpmodifyinfo {
+							modifyinfo[k] = v.(bool)
+						}
+					}
 					for _, k := range fields { //fields更新到mongo的字段
 						v1 := compare[k] //extract
 						v2 := tmp[k]     //bidding
 						if v2 == nil && v1 != nil {
 							update[k] = v1
-						} else if v2 != nil && v1 != nil {
+						} else if v2 != nil && v1 != nil && !modifyinfo[k] {
 							update[k] = v1
-						} else if v2 != nil && v1 == nil { //
+						} else if v2 != nil && v1 == nil && !modifyinfo[k] { //
 							if k == "s_subscopeclass" && del["subscopeclass"] == nil {
 								continue
 							} else if k == "s_topscopeclass" && del["topscopeclass"] == nil {

+ 1 - 1
udpcreateindex/src/bidingpurchasing.go

@@ -7,7 +7,7 @@ import (
 	"sync"
 	"unicode/utf8"
 
-	u "util"
+	u "./util"
 
 	"gopkg.in/mgo.v2/bson"
 )

+ 2 - 8
udpcreateindex/src/config.json

@@ -30,8 +30,8 @@
     },
     "bidding": {
         "db": "mxs",
-        "collect": "test",
-        "index": "bidding_v2",
+        "collect": "test1",
+        "index": "bidding_v1",
         "type": "bidding",
         "extractdb": "mxs",
         "extractcollect": "extract",
@@ -95,11 +95,5 @@
     "elastic": {
         "addr": "http://192.168.3.128:9800",
         "pool": 12
-    },
-    "elastic_other": {
-        "addr": "http://127.0.0.1:9800",
-        "pool": 12,
-        "index": "bidding_v2",
-        "type": "bidding"
     }
 }

+ 4 - 4
udpcreateindex/src/main.go

@@ -1,6 +1,7 @@
 package main
 
 import (
+	u "./util"
 	"encoding/json"
 	"log"
 	mu "mfw/util"
@@ -10,7 +11,6 @@ import (
 	"qfw/util/mongodb"
 	"strings"
 	"time"
-	u "util"
 )
 
 var (
@@ -40,8 +40,8 @@ var (
 
 func init() {
 	util.ReadConfig(&Sysconfig)
-	inits()
-	go checkMapJob()
+	//inits()
+	//go checkMapJob()
 	detailLength = util.IntAllDef(Sysconfig["detaillength"], 50000)
 	fileLength = util.IntAllDef(Sysconfig["filelength"], 50000)
 	updport, _ = Sysconfig["updport"].(string)
@@ -141,7 +141,7 @@ func init() {
 }
 
 func main() {
-	go task_index()
+	//go task_index()
 	//task_qyxyindex()
 	updport := Sysconfig["udpport"].(string)
 	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}

+ 1 - 1
udpcreateindex/src/util/ossclient.go

@@ -11,7 +11,7 @@ import (
 )
 
 var (
-	ossEndpoint        = "http://oss-cn-beijing-internal.aliyuncs.com" //正式环境用:oss-cn-beijing-internal.aliyuncs.com 测试:oss-cn-beijing.aliyuncs.com
+	ossEndpoint        = "http://oss-cn-beijing.aliyuncs.com" //正式环境用:oss-cn-beijing-internal.aliyuncs.com 测试:oss-cn-beijing.aliyuncs.com
 	ossAccessKeyId     = "LTAI4FvLSWN3Wz9F6dUxQGMR"
 	ossAccessKeySecret = "WnQpnNVEiRfZsz5hIqFSr0phayMo3U"
 	ossBucketName      = "topjy"

+ 4 - 4
udps/main.go

@@ -21,13 +21,13 @@ func main() {
 	flag.StringVar(&startDate, "start", "", "开始日期2006-01-02")
 	flag.StringVar(&endDate, "end", "", "结束日期2006-01-02")
 	flag.StringVar(&ip, "ip", "127.0.0.1", "ip")
-	flag.IntVar(&p, "p", 0, "端口")
+	flag.IntVar(&p, "p", 1483, "端口")
 	flag.IntVar(&tmptime, "tmptime", 0, "时间查询")
 	flag.StringVar(&tmpkey, "tmpkey", "", "时间字段")
-	flag.StringVar(&id1, "gtid", "", "gtid")
-	flag.StringVar(&id2, "lteid", "", "lteid")
+	flag.StringVar(&id1, "gtid", "5ed869759e628c5991636bca", "gtid")
+	flag.StringVar(&id2, "lteid", "5eec14bd4c87bb08581c03cb", "lteid")
 	flag.StringVar(&ids, "ids", "", "id1,id2")
-	flag.StringVar(&stype, "stype", "", "stype,传递类型")
+	flag.StringVar(&stype, "stype", "biddingall", "stype,传递类型")
 	flag.StringVar(&bkey, "bkey", "", "bkey,加上此参数表示不生关键词和摘要")
 	flag.StringVar(&q, "q", "", "q查询语句\"{'':''}\",有q就不要gtid,lteid")
 	flag.StringVar(&param, "param", "", "param,生信息发布或其他索引时用双引号套单引号\"{'mgoaddr':'','d':'','c':'','index':'','type':''}\"")