Browse Source

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

maxiaoshan 5 years ago
parent
commit
5e2cd1d331

+ 6 - 6
src/jy/clear/clear.go

@@ -7,7 +7,7 @@ import (
 )
 
 //方法清单
-var clearfns = make(map[string]func(data []interface{}) []interface{})
+var clearfns = make(map[string]func(data []interface{},spidercode ... string) []interface{})
 var lock sync.RWMutex
 
 func init() {
@@ -33,20 +33,20 @@ func init() {
 }
 
 //绑定清理方法
-func BindFn(fnname string, fn func(data []interface{}) []interface{}) {
+func BindFn(fnname string, fn func(data []interface{},spidercode ...string) []interface{}) {
 	lock.Lock()
 	clearfns[fnname] = fn
 	lock.Unlock()
 }
 
 //执行清理动作,如果调用的清理方法不存在,则不做处理
-func DoClearFn(clear []string, data []interface{}) []interface{} {
+func DoClearFn(clear []string, data []interface{},spidercode ...string) []interface{} {
 	if len(clear) == 0 {
 		return data
 	}
 	for _, fnname := range clear {
 		if v, ok := clearfns[fnname]; ok {
-			data = v(data)
+			data = v(data,spidercode...)
 		}
 	}
 	return data
@@ -55,13 +55,13 @@ func DoClearFn(clear []string, data []interface{}) []interface{} {
 //取手机号
 var PhoneReg = regexp.MustCompile("((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,5})?|\\d{3,5}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{3,})?|\\d{3,4}\\*{3,4}\\d{3,4}|\\d{3,4}[\u3000\u2003\u00a0\\s]*\\d{4,5}[\u3000\u2003\u00a0\\s]*\\d{3,4}|(\\d{2,}[×―—-\\-])+\\d{2,}[×―—-\\-]+(\\d{3,})+|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
 
-func GetPhone(data []interface{}) []interface{} {
+func GetPhone(data []interface{},spidercode ...string) []interface{} {
 	data[0] = PhoneReg.FindString(fmt.Sprint(data[0]))
 	return data
 }
 
 //去除数字
-func ClearNumber(data []interface{}) []interface{} {
+func ClearNumber(data []interface{},spidercode ...string) []interface{} {
 	data[0] = clearNum.ReplaceAllString(fmt.Sprint(data[0]), "")
 	return data
 }

+ 9 - 10
src/jy/clear/cutspace.go

@@ -17,11 +17,10 @@ var (
 )
 
 var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n", "\u0001"}
-
 func init() {
 	cutSpace, _ = regexp.Compile(`^[\s]*|[\s]*$`)
 	cutAllSpace, _ = regexp.Compile(`\s*`)
-	catSymbol, _ = regexp.Compile(`[]+`)
+	catSymbol, _ = regexp.Compile(`\\[\\]+`)
 	separateSymbol, _ = regexp.Compile("[\\s\u3000\u2003\u00a0\\n,,、/。|]")
 	placeReg, _ = regexp.Compile("^.*(公司|学(校)?|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|科|部|队|联合(会|体)|工作室)$")
 	clearNum, _ = regexp.Compile("[\\d-]+")
@@ -78,7 +77,7 @@ func CutLableStr(con string) string {
 }
 
 //清理开始、结尾的空白字符
-func CutSpace(data []interface{}) []interface{} {
+func CutSpace(data []interface{},spidercode ...string) []interface{} {
 	tmp := cutSpace.ReplaceAllString(strings.Replace(fmt.Sprint(data[0]), " ", " ", -1), "")
 	tmp = replaceSymbol(tmp, spaces)
 	//fmt.Println("cutspace", tmp)
@@ -87,7 +86,7 @@ func CutSpace(data []interface{}) []interface{} {
 }
 
 //清理所有空白符
-func CutAllSpace(data []interface{}) []interface{} {
+func CutAllSpace(data []interface{},spidercode ...string) []interface{} {
 	tmp := cutAllSpace.ReplaceAllString(fmt.Sprint(data[0]), "")
 	tmp = replaceSymbol(tmp, spaces)
 	data[0] = tmp
@@ -95,7 +94,7 @@ func CutAllSpace(data []interface{}) []interface{} {
 }
 
 //清理尾部符号
-func ClearEndSymblo(data []interface{}) []interface{} {
+func ClearEndSymblo(data []interface{},spidercode ...string) []interface{} {
 	text := fmt.Sprint(data[0])
 	for i := 0; i <= 2; i++ {
 		text = endSymblo.ReplaceAllString(text, "")
@@ -105,7 +104,7 @@ func ClearEndSymblo(data []interface{}) []interface{} {
 }
 
 //清理符号
-func CutSymbol(data []interface{}) []interface{} {
+func CutSymbol(data []interface{},spidercode ...string) []interface{} {
 	value := fmt.Sprint(CutSpace(data)[0])
 	symbol := ",,;;::'\"“”。.\\??、/+=\\_—\\-*&……\\^%$¥@!!`~·"
 	startSymbol := "^[" + ")\\)>》】\\]}}〕" + symbol + "]+"
@@ -119,7 +118,7 @@ func CutSymbol(data []interface{}) []interface{} {
 }
 
 //不成对出现的符号,把符号后面的内容清理掉
-func CutNotPrs(data []interface{}) []interface{} {
+func CutNotPrs(data []interface{},spidercode ...string) []interface{} {
 	return childCutNotPrs(data, 1)
 }
 
@@ -163,7 +162,7 @@ func childCutNotPrs(data []interface{}, count int) []interface{} {
 }
 
 //全部是汉字或者特殊符号的情况,清理掉
-func ClearAllWord(data []interface{}) []interface{} {
+func ClearAllWord(data []interface{},spidercode ...string) []interface{} {
 	value := fmt.Sprint(data[0])
 	reg := regexp.MustCompile("^[\u4e00-\u9fa5、,,.。??'\"“”‘’·~!@#¥$%…&*()()\\-—+=【】\\[\\]{}{}<>《》|\\/\\s]+$")
 	data[0] = reg.ReplaceAllString(value, "")
@@ -171,7 +170,7 @@ func ClearAllWord(data []interface{}) []interface{} {
 }
 
 //中文符号转英文
-func ChiToEng(data []interface{}) []interface{} {
+func ChiToEng(data []interface{},spidercode ...string) []interface{} {
 	value := fmt.Sprint(data[0])
 	startChars := []string{"(", "【", "{", "“", ")", "】", "}", "”"}
 	endChars := []string{"(", "[", "{", "\"", ")", "]", "}", "\""}
@@ -186,7 +185,7 @@ func ChiToEng(data []interface{}) []interface{} {
 	return data
 }
 
-func ClearBuyerPerson(data []interface{}) []interface{} {
+func ClearBuyerPerson(data []interface{},spidercode ...string) []interface{} {
 	value := fmt.Sprint(data[0])
 	//tmp := []string{}
 	if len([]rune(value)) > 4 { //名字默认最长4

+ 1 - 1
src/jy/clear/getratecurrency.go

@@ -18,7 +18,7 @@ func init() {
 }
 
 //获取币种
-func GetCurrency(data []interface{}) []interface{} {
+func GetCurrency(data []interface{},spidercode ...string) []interface{} {
 	val := "人民币"
 	currency.ReplaceAllStringFunc(fmt.Sprint(data[0]), func(key string) string {
 		v := encyitem[key]

+ 1 - 1
src/jy/clear/projectname.go

@@ -15,7 +15,7 @@ var clearSymbol = regexp.MustCompile(`["“”]`)
 var noclearNum = regexp2.MustCompile(`^(?!.+(?:标段|包|子项目|升级改造)[0-9123456789]{1,5})(.*)[0-9123456789]$`, regexp2.None)
 var mustHan = regexp.MustCompile(`[\p{Han}]+`) //项目名称必须包含汉子
 
-func ClearProjectName(data []interface{}) []interface{} {
+func ClearProjectName(data []interface{},spidercode ...string) []interface{} {
 	value := clearPreRegNameCode.ReplaceAllString(CutSpace(data)[0].(string), "$2")
 	value = clearEndRegNameCode.ReplaceAllString(value, "$1")
 	b := mustHan.MatchString(value)

+ 1 - 1
src/jy/clear/totimestamp.go

@@ -42,7 +42,7 @@ func init() {
 2006%01%02%15%04->时间戳
 2006%01%02%15%04%05->时间戳
 */
-func ObjToTimestamp(data []interface{}) []interface{} {
+func ObjToTimestamp(data []interface{},spidercode ...string) []interface{} {
 	tmp := fmt.Sprint(data[0])
 	//处理类似:二〇一五年十一月四日十五时
 	cht := regD.FindStringSubmatch(tmp)

+ 53 - 29
src/jy/extract/extract.go

@@ -33,7 +33,8 @@ var (
 	ClearTaskList map[string]*ClearTask                  //清理任务列表
 	saveLimit     = 100                                  //抽取日志批量保存
 	PageSize      = 5000                                 //查询分页
-	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
+	//Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1}`
+	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1,"infoformat":1,"attach_text":1,"dataging":1,"new_attach_text":1,"createtime":1,"currency":1,"id":1,"company_email":1,"buyerclass":1,"tagname":1,"company_phone":1,"appid":1,"industry":1,"projectscope":1,"item":1,"s_subscopeclass":1,"matchkey":1,"jybxhref":1,"legal_person":1,"matchtype":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
 
@@ -323,15 +324,15 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		Site:           qu.ObjToString(doc["site"]),
 		//Domain:     qu.ObjToString(doc["domain"]),
 		//Href:       qu.ObjToString(doc["href"]),
-		Title:     qu.ObjToString(doc["title"]),
-		Data:      &doc,
-		City:      qu.ObjToString(doc["city"]),
-		Province:  qu.ObjToString(doc["area"]),
-		Jsondata:  toMap,
-		Result:    map[string][]*ju.ExtField{},
-		BuyerAddr: qu.ObjToString(doc["buyeraddr"]),
-		RuleBlock: e.RuleBlock,
-		Dataging:  qu.IntAll(doc["dataging"]),
+		Title:         qu.ObjToString(doc["title"]),
+		Data:          &doc,
+		City:          qu.ObjToString(doc["city"]),
+		Province:      qu.ObjToString(doc["area"]),
+		Jsondata:      toMap,
+		Result:        map[string][]*ju.ExtField{},
+		BuyerAddr:     qu.ObjToString(doc["buyeraddr"]),
+		RuleBlock:     e.RuleBlock,
+		Dataging:      qu.IntAll(doc["dataging"]),
 	}
 	if (j.Jsondata != nil || (*j.Jsondata) != nil) && (*j.Jsondata)["jsoncontent"] != nil {
 		delete((*j.Jsondata), "jsoncontent")
@@ -396,7 +397,8 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 //遍历附件字段内容,拼接在一起;附件文本堆一起(后期可以考虑,分开处理),方法里修改了doc["detailfile"]结果
 func file2text(doc *map[string]interface{}) {
 	tmpstr := ""
-	if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
+	//if attach_text, ok := (*doc)["new_attach_text"].(map[string]interface{}); ok {
+		if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
 		for _, attachs := range attach_text {
 			if fileinfos, ok := attachs.(map[string]interface{}); ok {
 				for _, fileinfo := range fileinfos {
@@ -423,6 +425,7 @@ func file2text(doc *map[string]interface{}) {
 
 //抽取
 func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
+	
 	e.ExtractDetail(j, isSite, j.SpiderCode)
 	if jf != nil && jf.IsFile {
 		e.ExtractFile(jf, isSite, j.SpiderCode)
@@ -598,7 +601,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 				if len(cfn) == 0 {
 					continue
 				}
-				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
+				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content},j.SpiderCode)
 				if key == "budget" || key == "bidamount" {
 					if istrue, ok := data[len(data)-1].(bool); istrue && ok {
 						j.Result[key][i].IsTrue = true
@@ -696,7 +699,7 @@ func (e *ExtractTask) ExtractFile(j *ju.Job, isSite bool, codeSite string) {
 				lockclear.Lock()
 				cfn := e.ClearFn[key]
 				lockclear.Unlock()
-				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
+				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content},j.SpiderCode)
 				v.Value = data[0]
 				//清理特殊符号
 				lockclear.Lock()
@@ -1009,7 +1012,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 								lock.Lock()
 								cfn := e.ClearFn[in.Field]
 								lock.Unlock()
-								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
+								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content},j.SpiderCode)
 								if data[len(data)-1].(bool) {
 									j.BlockPackage[k].Budget = qu.Float64All(data[0])
 									j.BlockPackage[k].IsTrueBudget = true
@@ -1019,7 +1022,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 								lock.Lock()
 								cfn := e.ClearFn[in.Field]
 								lock.Unlock()
-								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content})
+								data := clear.DoClearFn(cfn, []interface{}{strings.TrimSpace(rep[in.Field+"_"+fmt.Sprint(i)]), j.Content},j.SpiderCode)
 								if data[len(data)-1].(bool) {
 									j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
 									j.BlockPackage[k].IsTrueBidamount = true
@@ -1081,7 +1084,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 						lock.Lock()
 						cfn := e.ClearFn[in.Field]
 						lock.Unlock()
-						data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
+						data := clear.DoClearFn(cfn, []interface{}{val, j.Content},j.SpiderCode)
 						if data[len(data)-1].(bool) {
 							j.BlockPackage[k].Budget = qu.Float64All(data[0])
 							j.BlockPackage[k].IsTrueBudget = true
@@ -1092,7 +1095,7 @@ func ExtRuleCoreByPkgReg(j *ju.Job, in *RegLuaInfo, e *ExtractTask) {
 						lock.Lock()
 						cfn := e.ClearFn[in.Field]
 						lock.Unlock()
-						data := clear.DoClearFn(cfn, []interface{}{val, j.Content})
+						data := clear.DoClearFn(cfn, []interface{}{val, j.Content},j.SpiderCode)
 						if data[len(data)-1].(bool) {
 							j.BlockPackage[k].Bidamount = qu.Float64All(data[0])
 							j.BlockPackage[k].IsTrueBidamount = true
@@ -1687,7 +1690,6 @@ var clearWinnerReg = regexp.MustCompile("名称|施工|拟定供应商名称|:
 //分析抽取结果并保存
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	qu.Try(func() {
-		
 		//重新取出清理过后的中标候选人
 		resetWinnerorder(j)
 		doc, result, _id := funcAnalysis(j, e)
@@ -1711,9 +1713,6 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				if v.Score > -1 {
 					tmp[v.Field] = v.Value
 					break
-				} else if v.Field == "projectname" {
-					tmp[v.Field] = v.Value
-					break
 				}
 			}
 		}
@@ -1794,7 +1793,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		if len(j.Winnerorder) > 0 { //候选人信息
 			for i, v := range j.Winnerorder {
 				if v["price"] != nil {
-					j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""})[0]
+					j.Winnerorder[i]["price"] = clear.ObjToMoney([]interface{}{v["price"], ""},j.SpiderCode)[0]
 				}
 			}
 			tmp["winnerorder"] = j.Winnerorder
@@ -1823,10 +1822,13 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			tmp["ffield"] = ffield
 		}
 		for k, v := range *doc {
-			//去重冗余字段
-			if delFiled(k) {
-				continue
+			if utf8.RuneCountInString(qu.ObjToString(v)) > 100000 {
+				(*doc)[k] = []rune(qu.ObjToString(v))[:100000]
 			}
+			//去重冗余字段
+			//if delFiled(k) {
+			//	continue
+			//}
 			if tmp[k] == nil {
 				tmp[k] = v
 			}
@@ -1934,6 +1936,28 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//		}
 		//tmp["extract_content"] = j.Content
 		tmp["dataging"] = j.Dataging
+		
+		if attach_text, ok := (tmp)["new_attach_text"].(map[string]interface{}); ok {
+			//if attach_text, ok := (*doc)["attach_text"].(map[string]interface{}); ok {
+			for ai, attachs := range attach_text {
+				if fileinfos, ok := attachs.(map[string]interface{}); ok {
+					for fi, fileinfo := range fileinfos {
+						if ff, ok := fileinfo.(map[string]interface{}); ok {
+							attach_url := qu.ObjToString(ff["attach_url"])
+							if utf8.RuneCountInString(attach_url) > qu.IntAllDef(ju.Config["filelength"], 10000) {
+								(tmp)["new_attach_text"].(map[string]interface{})[ai].((map[string]interface{}))[fi].(map[string]interface{})["attach_url"] =  "文本过长..."
+							}
+						}
+					}
+				}
+			}
+		}//}budget bidamount
+		if bg,ok :=tmp["budget"].(float64);ok && bg>=500000000000{
+			delete(tmp,"budget")
+		}
+		if bg,ok :=tmp["bidamount"].(float64);ok && bg>=500000000000{
+			delete(tmp,"bidamount")
+		}
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				/*	if len(e.SiteFields) <= 0 {
@@ -1988,7 +2012,7 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				}
 			}
 			tmp["result"] = result
-			tmp["resultf"] = resultf
+			//tmp["resultf"] = resultf
 			b := db.Mgo.Update(e.TaskInfo.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
 			if !b {
 				log.Debug(e.TaskInfo.TestColl, _id)
@@ -2104,7 +2128,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 	}
 	if !(len(j.Result) <= 0 || j.Jsondata == nil || len(*j.Jsondata) <= 0) {
 		//jsondata清理
-		clearJd(j.Jsondata, e)
+		clearJd(j.Jsondata, e,j.SpiderCode)
 		marshalbt, _ := json.Marshal(j.Jsondata)
 		tmpjddata := make(map[string]interface{})
 		json.Unmarshal(marshalbt, &tmpjddata)
@@ -2118,7 +2142,7 @@ func funcAnalysis(j *ju.Job, e *ExtractTask) (*map[string]interface{}, map[strin
 						if len(cfn) == 0 {
 							continue
 						}
-						newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""})
+						newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[jdkey], ""},j.SpiderCode)
 						if tmpv.Value == newNum[0] {
 							extField := &ju.ExtField{Code: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), Field: jdkey, ExtFrom: "JsonData_" + jdkey + "_j.Result" + fmt.Sprint(tmpk), SourceValue: (*j.Jsondata)[jdkey], Value: newNum[0], Score: 100, IsTrue: newNum[len(newNum)-1].(bool)}
 							j.Result[jdkey] = append(j.Result[jdkey], extField)
@@ -2321,7 +2345,7 @@ func resetWinnerorder(j *ju.Job) {
 	if maxlen > 0 {
 		winners = append(winners, &ju.ExtField{Code: "winnerorder", Field: "winner", ExtFrom: "j.Winnerorder", Value: j.Winnerorder[0]["entname"], Score: 0.5})
 		if j.Winnerorder[0]["price"] != nil {
-			tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""})
+			tmpPrice := clear.ObjToMoney([]interface{}{j.Winnerorder[0]["price"], ""},j.SpiderCode)
 			if tmpPrice[len(tmpPrice)-1].(bool) {
 				bidamounts = append(bidamounts, &ju.ExtField{Code: "winnerorder", Field: "bidamount", ExtFrom: "j.Winnerorder", SourceValue: j.Winnerorder[0]["price"], Value: tmpPrice[0], Score: 0.5})
 			}

+ 2 - 0
src/jy/extract/isextract.go

@@ -23,6 +23,8 @@ func init() {
 
 func IsExtract(filed, title, content string) bool {
 	defer qu.Catch()
+	//临时的,抽取所有
+	return true
 	b := true
 	if N_extract[filed] != nil {
 		nregs := N_extract[filed]

+ 5 - 5
src/jy/extract/score_jsondata.go

@@ -24,7 +24,7 @@ var endOfPunctuationClrear = regexp.MustCompile("[,,.。??;;]+$")
 var keysClrear = regexp.MustCompile("(详见|公告|X|内文|某单位|某部|文件|\\*|暂无|?|\\?)")
 
 //jsondata清理
-func clearJd(jd *map[string]interface{}, e *ExtractTask) {
+func clearJd(jd *map[string]interface{}, e *ExtractTask,spiderCode string) {
 	for k, v := range *jd {
 		if k == "buyer" || k == "winner" || k == "agency" || k == "projectcode" || k == "projectname" {
 			vstring := util2.ObjToString(v)
@@ -37,7 +37,7 @@ func clearJd(jd *map[string]interface{}, e *ExtractTask) {
 			cfn := e.ClearFn[k]
 			lockclear.Unlock()
 			if len(cfn) > 0 {
-				data := clear.DoClearFn(cfn, []interface{}{vstring, ""})
+				data := clear.DoClearFn(cfn, []interface{}{vstring, ""},spiderCode)
 				lockclear.Lock()
 				if clear.AsyField[k] != nil || clear.SymField[k] != nil || clear.MesField[k] != nil {
 					vstring = clear.OtherClean(k, util2.ObjToString(data[0]))
@@ -85,7 +85,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				if len(cfn) == 0 {
 					continue
 				}
-				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""})
+				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""},j.SpiderCode)
 				//if util2.IntAll(newNum[0]) != 0 {
 				extFields := make([]*util.ExtField, 0)
 				extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: 0.1, IsTrue: newNum[len(newNum)-1].(bool)})
@@ -105,7 +105,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				if bt,ok :=(*j.Jsondata)[v].(float64);ok && bt>0{
 					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: bt, Score: 0.1})
 				}else {
-					newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""})
+					newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""},j.SpiderCode)
 					extFields = append(extFields, &util.ExtField{Code: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), Field: v, ExtFrom: "JsonData_" + v + "_" + fmt.Sprint(jdextweight), SourceValue: (*j.Jsondata)[v], Value: newNum[0], Score: 0.1})
 				}
 				j.Result[v] = extFields
@@ -203,7 +203,7 @@ func JsonDataMergeProcessing(j *util.Job, e *ExtractTask) map[string][]*util.Ext
 				if len(cfn) == 0 {
 					continue
 				}
-				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""})
+				newNum := clear.DoClearFn(cfn, []interface{}{(*j.Jsondata)[v], ""},j.SpiderCode)
 				//if util2.IntAll(newNum[0]) != 0 {
 				extFields := make([]*util.ExtField, 0)
 				if jdextweight > 1 {

+ 6 - 6
src/jy/pretreated/division.go

@@ -58,13 +58,13 @@ var (
 	regStrWrap         = regexp.MustCompile("分包名称[::]")
 	regBZJWarap        = regexp.MustCompile("(每标段|保证金.*|标示|标[\\d一二三四五六七八九十]+室|型号[::]+[\\d]*包|每包[0-9]*元|包/[袋|箱]|标志|享受一包服务|一包一投|上包|标线|国标|第[\\d一二三四五六七八九十]+标室|[\\d一二三四五六七八九十]包密封|(^一包|商务|资格|价格标(每包内含相应文件正副本))|[未|不]+划分标段)")
 	regFJWarap         = regexp.MustCompile("[a-zA-Z0-9](包|标段).*.(pdf|PDF|docx|doc|DOCX|DOC|swf|SWF)")
-	regAZWarap         = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包|[x]*项目[x]*标段|张/包|纸[\\d]*包|/*[\\d]+包|相机包)")
+	regAZWarap         = regexp.MustCompile("(标[a-zA-Z]取值|标段划分|标液|分包个数|物资[\\d一二三四五六七八九十]?包|[x]*项目[x]*标段|张\\/包|纸[\\d]*包|\\*[\\d]+包|相机包)")
 	replSerial         = regexp.MustCompile("(\r\n|^)([\\d一二三四五六七八九十][、..::,])+\\d")
 	moreColonReg       = regexp.MustCompile("[::]+")
 	regFilter          = regexp.MustCompile("等$")
 	pkgFilter          = regexp.MustCompile("第[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ](子|合同|分|施工|监理)?(标段?|包|合同段|标包)|(子|合同|分|施工|监理)?[标|包]+[段|号]+")
-	indexTile          = regexp.MustCompile("[0-9.]{2,3}[\\s\u4e00-\u9fa5]{2,8}[::]+") //小标题
-	indexTile2         = regexp.MustCompile("[\\s\u4e00-\u9fa5]{2,8}")
+	indexTile          = regexp.MustCompile(`[0-9.]{2,3}[^包标段][\s\\u4e00-\\u9fa5]{2,8}[::]+`) //小标题
+	indexTile2         = regexp.MustCompile(`[\s\\u4e00-\\u9fa5]{2,8}[::]\n`)
 	regReplAllSpace2   = regexp.MustCompile("[\u3000\u2003\u00a0\\s0-9.::、\\(\\)]+")
 	confusion          = map[string]string{
 		"参与": "canyu",
@@ -735,6 +735,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 	startEndMap := map[int]int{}
 	pkgIndexMap := map[string][]int{}
 	indexPkgMap := map[int]string{}
+	
 	//小标题
 	titleindexs := indexTile.FindAllStringIndex(con, -1)
 	if len(titleindexs) == 0 {
@@ -765,9 +766,9 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			headkey := con[key[4]:key[5]]
 			headkey = regReplAllSpace.ReplaceAllString(headkey, "")
 			if !regDivision.MatchString(headkey) {
-				headkey += ""
+				headkey += ":"
 			}
-			headkey = moreColonReg.ReplaceAllString(headkey, "")
+			headkey = moreColonReg.ReplaceAllString(headkey, ":")
 			colonIndexs := regDivision.FindAllStringIndex(headkey, -1)
 			if len(colonIndexs) > 1 {
 				headkey = headkey[colonIndexs[len(colonIndexs)-2][1]:colonIndexs[len(colonIndexs)-1][1]]
@@ -836,7 +837,6 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			if indexKeyStringMap[iv] != "" {
 				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {
 				headKey = indexKeyStringMap[iv]
-				text = indexKeyStringMap[iv] + "  " + text
 				//}
 				for _, pkgIndexMap_v := range pkgIndexMap[bv[0]] {
 					delete(indexKeyStringMap, pkgIndexMap_v)

+ 1 - 1
src/jy/pretreated/multipackage.go

@@ -21,7 +21,7 @@ var (
 	//替换容易混淆的词
 	PreCon1 = regexp.MustCompile("(\\d+\\.?)+万?元")
 	//提取分包标识
-	MultiReg = regexp.MustCompile("[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)#?((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|分|合同|分|施工|监理)?(标|包件?)(段|号|项)?)[     ]*((\\d[.])+\\d|[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)[::]?")
+	MultiReg = regexp.MustCompile("(([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-])+(包|标段))[::]?|(?:^|\\n)([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+(包|标段))|[第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)#?((子|合同|分|施工|监理)?(标段?|合同段|标包))|((子|分|合同|分|施工|监理)?(标|包件?)(段|号|项|组)?)[     ]*((\\d[.])+\\d|[一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ\\-]+)[::]?|(子项目[0-9]+)")
 	//匹配到的包格式分类统计
 	keyregs = []map[*regexp.Regexp]int{
 		map[*regexp.Regexp]int{

+ 1 - 1
src/jy/util/article.go

@@ -40,7 +40,7 @@ type Job struct {
 	SimAreaScore      map[string]float64                //简称province得分
 	SimCityScore      map[string]float64                //简称city得分
 	SimDistrictScore  map[string]float64                //简称district得分
-	Dataging int
+	Dataging          int
 }
 
 type ExtField struct {

+ 3 - 3
src/res/fieldscore.json

@@ -248,8 +248,8 @@
         "negativewords": [
             {
                 "describe": "包含负分",
-                "regstr": "(定标|通知|异议|要求|代理|详细|test|意见|原因|具体|结果|负责|付款|附件|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
-                "score": -10
+                "regstr": "(我公司|定标|通知|异议|要求|代理|详细|test|意见|原因|具体|结果|负责|付款|附件|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
+                "score": -20
             },
 			{
                 "describe": "非结尾",
@@ -600,7 +600,7 @@
                 "score": -10
             },  {
                 "describe": "包含负分",
-                "regstr": "(详(见|情)|公告|test|招标人)",
+                "regstr": "(详(见|情)|公告|test|招标人|我公司)",
                 "score": -20
             }
         ],

+ 7 - 0
src/res/moneyclear.json

@@ -0,0 +1,7 @@
+{
+    "js_jsszbtbggfwpt_zhbhxrgs": {
+    	"descript":"金额除以10000",
+		"maxmoney":10000000000,
+		"divisor":10000
+    }
+}

+ 3 - 3
udpfilterdup/src/config.json

@@ -5,10 +5,10 @@
         "addr": "192.168.3.207:27092",
         "pool": 5,
         "db": "extract_kf",
-        "extract": "zk_xiufu_test01",
-        "extract_back": "zk_xiufu_test01",
+        "extract": "zk_task_test",
+        "extract_back": "zk_task_test",
         "site": {
-            "dbname": "extract_kf",
+            "dbname": "zhaolongyue",
             "coll": "site"
         }
     },

+ 10 - 5
udpfilterdup/src/datamap.go

@@ -53,7 +53,7 @@ type datamap struct {
 	keys   map[string]bool
 }
 
-func TimedTaskDatamap(days int,lasttime int64) *datamap {
+func TimedTaskDatamap(days int,lasttime int64 ,coll string) *datamap {
 	log.Println("数据池开始重新构建")
 	datelimit = qutil.Float64All(days * 86400)
 	dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{}, []string{},map[string]bool{}}
@@ -69,11 +69,12 @@ func TimedTaskDatamap(days int,lasttime int64) *datamap {
 		"$lt": lasttime,
 	}}
 	log.Println("query", query)
-	it := sess.DB(mgo.DbName).C(extract_back).Find(query).Sort("-publishtime").Iter()
+	it := sess.DB(mgo.DbName).C(coll).Find(query).Sort("-publishtime").Iter()
 	n, continuSum := 0, 0
 	for tmp := make(map[string]interface{}); it.Next(&tmp); n++ {
 		//qutil.IntAll(tmp["dataging"]) == 1
-		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 {
+		if qutil.IntAll(tmp["repeat"]) == 1 || qutil.IntAll(tmp["repeat"]) == -1 ||
+			qutil.IntAll(tmp["dataging"]) == 1 {
 
 		} else {
 			pt := tmp["publishtime"]
@@ -113,13 +114,14 @@ func TimedTaskDatamap(days int,lasttime int64) *datamap {
 	}
 
 
-	log.Printf("数据池构建完成:%d秒,%d个\n", int(time.Now().Unix())-start, n)
+	log.Printf("数据池构建完成:%d秒,%d个\n", int(time.Now().Unix())-start, n)
 
 
 	return dm
 }
 
 
+
 func NewDatamap(days int, lastid string) *datamap {
 	datelimit = qutil.Float64All(days * 86400 * 2)
 	dm := &datamap{sync.Mutex{}, days, map[string][]*Info{}, []string{},[]string{}, map[string]bool{}}
@@ -614,6 +616,9 @@ func dealWithSpecialWordNumber(info*Info,v*Info) int {
 
  //快速低质量数据判重
 func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
+	if !isTheSameDay(v.publishtime,info.publishtime) {
+		return false,reason
+	}
 	//首先判定是否为低质量数据    info目标数据
 	if info.agency==v.agency&&info.title!=""&&
 		info.title==v.title &&
@@ -635,7 +640,7 @@ func fastLowQualityHeavy(v *Info, info *Info, reason string) (bool, string) {
 			isValue++
 		}
 		if isValue==0 {
-			reason = reason + "---低质量-要素均为空标题包含关系"
+			reason = reason + "---低质量-要素均为空-标题包含关系"
 			return true, reason
 		}else if isValue==1 {
 			isMeet := false

+ 79 - 32
udpfilterdup/src/main.go

@@ -107,6 +107,7 @@ func init() {
 }
 
 func main() {
+
 	go checkMapJob()
 	updport := Sysconfig["udpport"].(string)
 	udpclient = mu.UdpClient{Local: updport, BufSize: 1024}
@@ -135,8 +136,8 @@ func mainT() {
 		5e933b1a50b5ea296ef0e839
 		*/
 		//IdType = true
-		sid = "5eca4d52511b120337790325"
-		eid = "5eca4d55511b120337790329"
+		sid = "5ee1d3d59e628c599167adf1"
+		eid = "5eea4291801f744d045c3169"
 		log.Println("正常判重测试开始")
 		log.Println(sid, "---", eid)
 		mapinfo := map[string]interface{}{}
@@ -229,31 +230,36 @@ func task(data []byte, mapInfo map[string]interface{}) {
 			log.Println("current:", n, tmp["_id"], "repeateN:", repeateN)
 		}
 
-		if util.IntAll(tmp["repeat"]) == 1 || util.IntAll(tmp["repeat"]) == -1||
-			util.IntAll(tmp["dataging"]) == 1 ||util.IntAll(tmp["sourcewebsite"]) == 1{
+		source := util.ObjToMap(tmp["jsondata"])
+		if util.IntAll((*source)["sourcewebsite"]) == 1 {
+			repeateN++
+			updateExtract = append(updateExtract, []map[string]interface{}{
+				map[string]interface{}{
+					"_id": tmp["_id"],
+				},
+				map[string]interface{}{
+					"$set": map[string]interface{}{
+						"repeat": 1,
+						"repeat_reason": "sourcewebsite为1,重复",
+					},
+				},
+			})
+			if len(updateExtract) >= 200 {
+				mgo.UpSertBulk(extract, updateExtract...)
+				updateExtract = [][]map[string]interface{}{}
+			}
+
+
 			tmp = make(map[string]interface{})
+			continue
+		}
+
+		if util.IntAll(tmp["repeat"]) == 1 || util.IntAll(tmp["repeat"]) == -1||
+			util.IntAll(tmp["dataging"]) == 1 {
 			if util.IntAll(tmp["repeat"]) == 1 {
 				repeateN++
 			}
-			if util.IntAll(tmp["sourcewebsite"]) == 1 {
-				repeateN++
-				updateExtract = append(updateExtract, []map[string]interface{}{
-					map[string]interface{}{
-						"_id": tmp["_id"],
-					},
-					map[string]interface{}{
-						"$set": map[string]interface{}{
-							"repeat": 1,
-							"repeat_reason": "sourcewebsite为1,重复",
-						},
-					},
-				})
-				if len(updateExtract) >= 200 {
-					mgo.UpSertBulk(extract, updateExtract...)
-					updateExtract = [][]map[string]interface{}{}
-				}
-
-			}
+			tmp = make(map[string]interface{})
 			continue
 		}
 		pool <- true
@@ -476,15 +482,15 @@ func task(data []byte, mapInfo map[string]interface{}) {
 func timedTaskDay() {
 	log.Println("部署定时任务")
 	c := cron.New()
-	c.AddFunc("0 0 1 * * ?", func() { movedata() })      //每天凌晨1点执行一次
-	c.AddFunc("0 0 */4 * * ?", func() { timedTaskOnce() }) //每天凌晨2点执行一次
+	//c.AddFunc("0 0 */4 * * ?", func() { movedata() })
+	c.AddFunc("0 0 */4 * * ?", func() { timedTaskOnce() })
 	c.Start()
-	//timedTaskOnce()
 }
 func timedTaskOnce() {
-
-	log.Println("开始一次定时任务")
 	defer util.Catch()
+	log.Println("开始一次迁移任务")
+	movedata()
+	log.Println("开始一次任务判重")
 	//当前时间-8   -4 小时
 	now := time.Now()
 	log.Println(now)
@@ -513,7 +519,9 @@ func timedTaskOnce() {
 		if num%10000 == 0 {
 			log.Println("正序遍历:", num)
 		}
-		if util.IntAll(tmp["sourcewebsite"]) == 1 {
+
+		source := util.ObjToMap(tmp["jsondata"])
+		if util.IntAll((*source)["sourcewebsite"]) == 1 {
 			updateExtract = append(updateExtract, []map[string]interface{}{
 				map[string]interface{}{
 					"_id": tmp["_id"],
@@ -530,8 +538,14 @@ func timedTaskOnce() {
 				mgo.UpSertBulk(extract, updateExtract...)
 				updateExtract = [][]map[string]interface{}{}
 			}
+
+
+			tmp = make(map[string]interface{})
 			continue
 		}
+
+
+
 		//取-符合-发布时间X年内的数据
 		if util.IntAll(tmp["dataging"]) == 1 {
 			pubtime := util.Int64All(tmp["publishtime"])
@@ -602,10 +616,13 @@ func timedTaskOnce() {
 	log.Println("本地构建分组完成:",len(pendAllArr),"组","测试-总计数量:",testNum)
 
 	n, repeateN := 0, 0
-	for k,v:=range pendAllArr {
+	for k,v:=range pendAllArr { //每组结束更新一波数据
 		//构建当前组的数据池
 		log.Println("构建第",k,"组---(数据池)")
-		DM = TimedTaskDatamap(dupdays, util.Int64All(v[0]["publishtime"]))
+		//当前组的第一个发布时间
+		first_pt :=util.Int64All(v[0]["publishtime"])
+		coll :=extract_back
+		DM = TimedTaskDatamap(dupdays, first_pt,coll)
 		log.Println("开始遍历判重第",k,"组  共计数量:",len(v))
 		n = n+len(v)
 		log.Println("统计目前总数量:",n,"重复数量:",repeateN)
@@ -667,6 +684,13 @@ func timedTaskOnce() {
 				updateExtract = [][]map[string]interface{}{}
 			}
 		}
+
+		//每组数据结束-更新数据
+		if len(updateExtract) > 0 {
+			mgo.UpSertBulk(extract, updateExtract...)
+			updateExtract = [][]map[string]interface{}{}
+		}
+
 	}
 
 
@@ -699,6 +723,27 @@ func timedTaskOnce() {
 		}
 	}
 }
+//判断是否在周期天内
+func isTaskTimeCycle(pt int64) bool {
+
+	year, month, day := time.Now().Date()
+	predur_pt:=time.Date(year, month, day, 0, 0, 0, 0, time.Local).Add(-time.Duration(dupdays) * 24 * time.Hour).Unix()
+	log.Println(predur_pt)
+
+	if pt >= predur_pt {
+		return true
+	}else  {
+		return false
+	}
+
+}
+
+
+
+
+
+
+
 
 //合并字段-并更新merge字段的值
 func mergeDataFields(source *Info, info *Info) (*Info, []int64, bool) {
@@ -1025,9 +1070,11 @@ func movedata() {
 	sess := mgo.GetMgoConn()
 	defer mgo.DestoryMongoConn(sess)
 	year, month, day := time.Now().Date()
+	now:=time.Now()
+	move_time := time.Date(now.Year(), now.Month(), now.Day(), now.Hour()-8, 0, 0, 0, time.Local).Unix()
 	q := map[string]interface{}{
 		"comeintime": map[string]interface{}{
-			"$lt": time.Date(year, month, day, 0, 0, 0, 0, time.Local).Add(-time.Duration(dupdays) * 24 * time.Hour).Unix(),
+			"$lt": move_time,
 		},
 	}
 	log.Println(q)

+ 1 - 1
udpfilterdup/src/mgo.go

@@ -144,7 +144,7 @@ func (m *MongodbSim) InitPool() {
 	opts := options.Client()
 	opts.SetConnectTimeout(3 * time.Second)
 	opts.ApplyURI("mongodb://" + m.MongodbAddr)
-	opts.SetMaxPoolSize(uint16(m.Size))
+	opts.SetMaxPoolSize(uint64(m.Size))
 	m.pool = make(chan bool, m.Size)
 	opts.SetMaxConnIdleTime(2 * time.Hour)
 	m.Ctx, _ = context.WithTimeout(context.Background(), 99999*time.Hour)