浏览代码

buyerperson优化

maxiaoshan 6 年之前
父节点
当前提交
24ad29d273

+ 1 - 1
src/jy/clear/clear.go

@@ -50,7 +50,7 @@ func DoClearFn(clear []string, data []interface{}) []interface{} {
 }
 
 //取手机号
-var PhoneReg = regexp.MustCompile("((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,4}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{4})?|\\d{3,4}\\*{3,4}\\d{3,4}|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
+var PhoneReg = regexp.MustCompile("((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,5}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{3,})?|\\d{3,4}\\*{3,4}\\d{3,4}|\\d{3,4}[\u3000\u2003\u00a0\\s]*\\d{4,5}[\u3000\u2003\u00a0\\s]*\\d{3,4}|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
 
 func GetPhone(data []interface{}) []interface{} {
 	data[0] = PhoneReg.FindString(fmt.Sprint(data[0]))

+ 31 - 10
src/jy/clear/cutspace.go

@@ -6,17 +6,22 @@ import (
 	"strings"
 )
 
-var cutSpace *regexp.Regexp
-var cutAllSpace *regexp.Regexp
-var catSymbol *regexp.Regexp
-var separateSymbol *regexp.Regexp
+var (
+	cutSpace       *regexp.Regexp
+	cutAllSpace    *regexp.Regexp
+	catSymbol      *regexp.Regexp
+	separateSymbol *regexp.Regexp
+	placeReg       *regexp.Regexp
+)
+
 var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n"}
 
 func init() {
 	cutSpace, _ = regexp.Compile(`^\s*|\s*$`)
 	cutAllSpace, _ = regexp.Compile(`\s*`)
 	catSymbol, _ = regexp.Compile(`[]+`)
-	separateSymbol, _ = regexp.Compile("[\\s\u3000\u2003\u00a0\\n、.,,.。、|]")
+	separateSymbol, _ = regexp.Compile("[\\s\u3000\u2003\u00a0\\n,,、/|]")
+	placeReg, _ = regexp.Compile("^.*(公司|学(校)?|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室)$")
 }
 
 var LableStr = "&?(amp|nbsp|#8266);?|(<).*?(>?)"
@@ -169,16 +174,32 @@ func ChiToEng(data []interface{}) []interface{} {
 
 func ClearBuyerPerson(data []interface{}) []interface{} {
 	value := fmt.Sprint(data[0])
-	tmp := []string{}
+	//tmp := []string{}
 	if len([]rune(value)) > 4 { //名字默认最长4
+		tmp := ""
 		valuearr := separateSymbol.Split(value, -1)
-		for _, v := range valuearr {
-			if len([]rune(v)) <= 4 { //长度大于4认为不是名字
-				tmp = append(tmp, v)
+		length := len(valuearr)
+		for i, v := range valuearr {
+			if v == "" {
+				continue
+			}
+			if i == 0 && placeReg.MatchString(v) {
+				if length == 1 {
+					tmp = tmp + v
+				} else {
+					tmp = tmp + v + "-"
+				}
+			} else if len([]rune(v)) <= 4 {
+				if i+1 != length {
+					tmp = tmp + v + ","
+				} else {
+					tmp = tmp + v
+				}
 			}
 		}
-		data[0] = strings.Join(tmp, ",")
+		data[0] = tmp
 	} else {
+		value = separateSymbol.ReplaceAllString(value, "")
 		data[0] = value
 	}
 	return data

+ 1 - 1
src/jy/clear/specialsymbols.go

@@ -371,7 +371,7 @@ func AnotherRemoveStart(text []rune) []rune {
 				text = text[surplusMax-nb+1:]
 				nb = surplusMax + 1
 				//非对称符号清理后判断剩余部分长度小于6,取删除部分
-				if len(CommonWords.ReplaceAllString(string(text), "")) < 6 { //{"_id":ObjectId("5d42425ba5cb26b9b7850640")}
+				if len([]rune(CommonWords.ReplaceAllString(string(text), ""))) < 6 { //{"_id":ObjectId("5d42425ba5cb26b9b7850640")}
 					result := beforedel[:deleteindex]
 					return result
 				}

+ 1 - 4
src/jy/extract/extract.go

@@ -457,10 +457,7 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job) {
 				if clear.AsyField[key] != nil || clear.SymField[key] != nil || clear.MesField[key] != nil {
 					text := qu.ObjToString(v.Value)
 					before = text
-					text = clear.OtherClean(key, text)
-					if text != "" {
-						v.Value = text
-					}
+					v.Value = clear.OtherClean(key, text)
 					BeforeAddClearFnLog(v.Type, "特殊符号清理", j.SourceMid, before, v.MatchType, v, e)
 				}
 				//AddClearFnLog("clearfn", j.SourceMid, v.Value, extinfo, v.Code, "函数清理", key, e.TaskInfo)

+ 3 - 3
src/jy/pretreated/analytable.go

@@ -96,8 +96,8 @@ var (
 	ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|名称|(征求意见|报名审核购买)?((联系人?(及|和)?|办公)?((电话([//]传真)?|手机)(号码)?|邮箱(地址)?|(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|(项目)?经办)人)|采购方代表")
 	ContactInfoMustReg  = regexp.MustCompile("^(" + ContactInfoVagueReg.String() + ")$")
 	ContactType         = map[string]*regexp.Regexp{
-		"采购单位": regexp.MustCompile("(采购(项目.{2}|服务)?|比选|询价|发布人?|甲|招标(服务)?|建设|委托|发包|业主|使用|谈判|本招标项目经办|征求意见联系|项目实施)(人|单位|部门|机构|机关|(执行)?方)|(项目|建(库|设))单位|招标人信息|采购中心地址|业主|收料人|采购部"),
-		"代理机构": regexp.MustCompile("(代理|受托).{0,2}(人|方|单位|公司|机构)|招标机构|采购代理"),
+		"采购单位": regexp.MustCompile("(^采购(项目.{2}|服务)?|比选|询价|甲|招标(服务)?|建设|委托|发包|业主|使用|谈判|本招标项目经办|征求意见联系|项目实施)(人|单位|部门|机构|机关|(执行)?方$)|(项目|建(库|设))单位|招标人信息|采购中心地址|业主|收料人|采购部"),
+		"代理机构": regexp.MustCompile("(代理|受托|集中采购).{0,2}(人|方|单位|公司|机构)|招标机构|采购代理"),
 	}
 	ContactBuyerPersonFilterReg = regexp.MustCompile("(管理局)$")
 	MultipleValueSplitReg       = regexp.MustCompile("[,,、\\s\u3000\u2003\u00a0]")
@@ -2541,7 +2541,7 @@ L:
 				td_v := td_kv.Value
 				td_k := FilterContactKey(td_kv.Key) //带括号()[]的采购单位,代理机构处理
 				td_k_length := len([]rune(td_k))
-				if td_k_length < 3 || td_k_length > 15 {
+				if td_k_length < 2 || td_k_length > 15 {
 					continue
 				}
 				//都为正序查询

+ 50 - 33
src/jy/pretreated/colonkv.go

@@ -263,7 +263,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	//处理联系人信息
 	var indexMap map[int]string
 	var matchMap map[string]map[string]bool
-	hasMatch := make(map[string]bool)
+	//hasMatch := make(map[string]bool)
 	if contactFormat == nil || title != "" {
 		indexMap = map[int]string{}
 		matchMap = map[string]map[string]bool{}
@@ -330,7 +330,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	//		}
 	//	}
 	startIndex := 0
-	prevKey := ""
+	//prevKey := ""
 	index, notmatchCount, allMatchCount := 0, 0, 0
 	weightMap := map[string]map[string]interface{}{}     //权重
 	mapIndexInKvs := map[string]map[string]interface{}{} //map在数组总的索引位置
@@ -405,7 +405,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 				}
 				isAddToMatchMap := false
 				addToMatchMapKey := ""
-				if ContactInfoVagueReg.MatchString(k) {
+				if ContactInfoVagueReg.MatchString(k) { //判断是不是电话、地址。。。
 					isAddToMatchMap = true
 					if matchMap[ct_k] == nil {
 						matchMap[ct_k] = map[string]bool{}
@@ -450,7 +450,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 							weightMap[ct_k][ct_k] = weightVal.(int)
 							matchMap[ct_k] = map[string]bool{}
 							isBreak = false
-							prevKey = ""
+							//prevKey = ""
 						}
 					}
 				}
@@ -463,14 +463,14 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 					continue
 				}
 				isBreak = false
-				//				if index != 0 || notmatchCount != 0 {
-				//					startIndex = 0
-				//					indexMap = map[int]string{}
-				//				}
+				if index != 0 || notmatchCount != 0 {
+					startIndex = 0
+					indexMap = map[int]string{}
+				}
 				if startIndex == 0 {
 					indexMap = map[int]string{}
 				}
-				prevKey = ""
+				//prevKey = ""
 				startIndex++
 				indexMap[startIndex] = ct_k
 				isContinue = true
@@ -514,28 +514,24 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			continue
 		}
 		isBreak = false
-
-		//		if again > 0 {
-		//			if again%2 == 1 {
-		//				index = 1
-		//			} else {
-		//				index = 2
-		//			}
-		//		} else {
-		//			if prevKey != k {
-		//				prevKey = k
-		//				index = 1
-		//			} else if prevKey == k {
-		//				index++
-		//			}
+		//		if prevKey != k && !hasMatch[k] {
+		//			prevKey = k
+		//			index = 1
+		//		} else if index < 2 {
+		//			index++
+		//		}
+		//		if prevKey != k {
+		//			prevKey = k
+		//			index = 1
+		//		} else if prevKey == k {
+		//			index++
 		//		}
-		if prevKey != k && !hasMatch[k] {
-			prevKey = k
+		if startIndex == 0 || startIndex%2 == 1 {
 			index = 1
-		} else if index < 2 {
-			index++
+		} else if startIndex%2 == 0 {
+			index = 2
 		}
-		hasMatch[k] = true
+		//hasMatch[k] = true
 		//过滤值
 		if filterValue.MatchString(v) {
 			continue
@@ -682,9 +678,30 @@ func HasOrderContactType(text string) []string {
 func GetKVAll(content, title string, contactFormat *ContactFormat, from int) *JobKv {
 	content = formatText(content, "kv")
 	m1Kvs, _ := colonkvEntity.entrance(content, title, contactFormat, from)
+	//	for _, kvs := range m1Kvs {
+	//		qutil.Debug(kvs.Key, kvs.Value)
+	//	}
 	kvTags := GetKvTags(m1Kvs, title, nil)
+	//	for k, kvs := range kvTags {
+	//		qutil.Debug("kkkkk--", k)
+	//		for _, kv := range kvs {
+	//			qutil.Debug(kv.Key, kv.Value)
+	//		}
+	//	}
 	m2Kvs, m2KvTags := GetKvFromtxt(content, title, from)
+	//	for k, kvs := range m2KvTags {
+	//		qutil.Debug("kkkkk--", k)
+	//		for _, kv := range kvs {
+	//			qutil.Debug(kv.Key, kv.Value)
+	//		}
+	//	}
 	MergeKvTags(kvTags, m2KvTags)
+	//	for k, kvs := range kvTags {
+	//		qutil.Debug("kkkkk--", k)
+	//		for _, kv := range kvs {
+	//			qutil.Debug(kv.Key, kv.Value)
+	//		}
+	//	}
 	return &JobKv{
 		Kvs:    m1Kvs,
 		Kvs_2:  m2Kvs,
@@ -779,13 +796,13 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string) map[string][]*Tag {
 				} else if nextval != "" && utf8.RuneCountInString(nextval) < 30 {
 					if strings.Contains(nextval, ":") || strings.Contains(nextval, ":") {
 						if len(strings.Split(nextval, ":")) > 1 || len(strings.Split(nextval, ":")) > 1 {
-							tmpnextval := ""
-							tmpnextval = strings.Split(nextval, ":")[0]
-							tmpnextval = strings.Split(nextval, ":")[0]
-							if strings.TrimSpace(tmpnextval) == "" {
+							//tmpnextval := ""
+							nextval = strings.Split(nextval, ":")[0]
+							nextval = strings.Split(nextval, ":")[0]
+							if strings.TrimSpace(nextval) == "" {
 								continue
 							}
-							if GetAppointTags(tmpnextval, tagdbs).Len() > 0 || GetAppointTags(k, tagdbs).Len() > 0 {
+							if GetAppointTags(nextval, tagdbs).Len() > 0 || GetAppointTags(k, tagdbs).Len() > 0 {
 								continue
 							}
 						}

+ 1 - 1
src/jy/util/article.go

@@ -34,7 +34,7 @@ type Job struct {
 	IsFile         bool                              //有附件
 	AreaScore      map[string]int                    //province得分
 	CityScore      map[string]int                    //city得分
-	DistrictScore  map[string]int                    //istrict得分
+	DistrictScore  map[string]int                    //district得分
 }
 
 type ExtField struct {

+ 2 - 1
src/main_blocktest.go

@@ -49,7 +49,7 @@ func all() {
 }
 func one() {
 	m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw")
-	d, _ := m.FindById("bidding", "5d41cddba5cb26b9b7ea6631", extract.Fields)
+	d, _ := m.FindById("bidding", "5d4260f7a5cb26b9b7ea8c63", extract.Fields)
 	com(*d)
 }
 func com(doc map[string]interface{}) {
@@ -115,6 +115,7 @@ func com(doc map[string]interface{}) {
 	e.TaskInfo.ProcessPool <- true
 	pretreated.AnalyStart(j)
 	e.ExtractProcess(j, nil)
+
 	log.Println("=============块信息================")
 	for _, v := range j.Block {
 		log.Println("----", v.Title, v.Titles, "----")

+ 4 - 12
src/main_test.go

@@ -40,18 +40,10 @@ func Test_extractcity(t *testing.T) {
 	extract.FindBuyer()
 }
 func Test_reg(t *testing.T) {
-	value := `名字1名字斯蒂fasd`
-	separateSymbol, _ := regexp.Compile("[\\s\u3000\u2003\u00a0\\n、.,,.。、]")
-	tmp := []string{}
-	if len([]rune(value)) > 4 { //名字默认最长4
-		valuearr := separateSymbol.Split(value, -1)
-		for _, v := range valuearr {
-			if len([]rune(v)) < 4 { //长度大于4认为不是名字
-				tmp = append(tmp, v)
-			}
-		}
-	}
-	log.Println(strings.Join(tmp, ","))
+	reg1, _ := regexp.Compile("((([((]\\d{3,4}[))])?(\\d{6,12}([×―—-\\-]+\\d{3,4})?|\\d{3,5}[×―—-\\-]+[\u3000\u2003\u00a0\\s]*\\d{6,12}([×―—-\\-]+\\d{3,})?|(\\d{2}[×―—-\\-])+\\d{8}[×―—-\\-](\\d{3}[、])+)(转\\d{3,4})?[或/、,,;;\u3000\u2003\u00a0\\s]*)+(\\d{3,})?)")
+	log.Println("---", reg1.FindAllString("05939-5365001(兰陵县芦柞镇人民政府)", -1))
+	reg2, _ := regexp.Compile("^\\d*[×―—-\\-]*[\u3000\u2003\u00a0\\s]*\\d*$")
+	log.Println("---", reg2.MatchString("张女士/"))
 }
 
 func Test_reg1(t *testing.T) {

+ 5 - 0
src/res/fieldscore.json

@@ -266,6 +266,11 @@
                 "describe": "出现符号",
                 "regstr": "[*]",
                 "score": -10
+            },
+			{
+                "describe": "是数字",
+                "regstr": "^\\d*[×―—-\\-]*[\u3000\u2003\u00a0\\s]*\\d*$",
+                "score": -10
             }
         ],
         "length": [

+ 2 - 1
src/res/specialsymbols.json

@@ -4,7 +4,8 @@
             "projectname": true,
             "buyer": true,
             "winner": true,
-            "agency": true
+            "agency": true,
+			"buyerperson":true
         },
         "symbol": [
 			{