소스 검색

项目编号抽取

fengweiqiang 6 년 전
부모
커밋
489783f23a
5개의 변경된 파일, 28개의 추가 그리고 16개의 삭제
  1. 3 3
      src/jy/extract/extract.go
  2. 7 5
      src/jy/pretreated/analystep.go
  3. 2 1
      src/jy/pretreated/analytable.go
  4. 7 3
      src/jy/pretreated/tablev2.go
  5. 9 4
      src/res/fieldscore.json

+ 3 - 3
src/jy/extract/extract.go

@@ -749,7 +749,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 		}
 		for _, bl := range j.Block {
 			//冒号kv
-			if bl.ColonKV != nil && len(bl.ColonKV.Kvs) > 0 {
+			if bl.ColonKV != nil {
 				kvs := bl.ColonKV.Kvs
 				kvs2 := bl.ColonKV.Kvs_2
 				// log.Debug("ColonKV1", kvs)
@@ -832,7 +832,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 				}
 			}
 			//空格kv
-			if bl.SpaceKV != nil && len(bl.SpaceKV.Kvs) > 0 {
+			if bl.SpaceKV != nil  {
 				kvs := bl.SpaceKV.Kvs
 				// log.Debug("SpaceKV", kvs)
 				for _, tag := range tags {
@@ -876,7 +876,7 @@ func getKvByLuaFields(extfrom string, j *ju.Job, in *RegLuaInfo, t map[string][]
 				}
 			}
 			//表格kv
-			if bl.TableKV != nil && len(bl.TableKV.Kv) > 0 {
+			if bl.TableKV != nil  {
 				tkv := bl.TableKV
 				// log.Debug("tkv", tkv)
 				for k, v := range tkv.Kv {

+ 7 - 5
src/jy/pretreated/analystep.go

@@ -104,20 +104,23 @@ func FindProjectCode(newCon string, job *util.Job) {
 	var proCode string
 	proCode = projectcodeReg.FindString(newCon)
 	blCode := &util.Block{}
+	blCode.Text = proCode
 	if proCode != "" {
 		ckv := GetKVAll(proCode, job.Title, nil, 1)
-		blCode.Text = proCode
 		blCode.ColonKV = ckv
 		job.Block = append(job.Block, blCode)
 	}else if proCode = projectcodeReg2.FindString(newCon);proCode !=""{
 		ckv := GetKVAll(proCode, job.Title, nil, 1)
-		blCode.Text = proCode
 		blCode.ColonKV = ckv
 		job.Block = append(job.Block, blCode)
-	}else if proCode = jsonReg.FindString(newCon);proCode != ""{
+	}else if proCode = projectcodeReg3.FindString(newCon) ;proCode !=""{
+		ckv := GetKVAll(proCode, job.Title, nil, 1)
+		blCode.ColonKV = ckv
+		job.Block = append(job.Block, blCode)
+	}
+	if proCode = jsonReg.FindString(newCon);proCode != ""{
 		jsonMap := make(map[string]string)
 		json.Unmarshal([]byte(proCode),&jsonMap)
-		blCode.Text = proCode
 		jobKv := util.NewJobKv()
 		for k,v := range jsonMap{
 			tmpkv := new(util.Kv)
@@ -130,7 +133,6 @@ func FindProjectCode(newCon string, job *util.Job) {
 		blCode.ColonKV = jobKv
 		job.Block = append(job.Block, blCode)
 	}
-
 }
 
 //分析table解析结果

+ 2 - 1
src/jy/pretreated/analytable.go

@@ -108,7 +108,8 @@ var (
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
 	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(编号|项目编号|标段编号){1}(:|:)(.){4,30}()|\)|\])`)
-	projectcodeReg2             = regexp.MustCompile(`((?:^|\n)编号|项目编号|标段编号){1}(:|:)(.){4,30}[0-9]`)
+	projectcodeReg2             = regexp.MustCompile(`((?:^|\n)编号|项目编号|标段编号){1}(:|:)(.){4,30}[0-9a-zA-Z]`)
+	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")
 	jsonReg                     = regexp.MustCompile(`\{.+:[^}]*\} `) //  \{".*\":\".+\"}
 	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
 )

+ 7 - 3
src/jy/pretreated/tablev2.go

@@ -163,6 +163,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 	for k,v :=range sKV.Kv{
 		td.SortKV.AddKey(k,v)
 	}
+	//抽取不到走正则抽
 	proCode := projectcodeReg.FindString(text)
 	if proCode != "" {
 		ckv := GetKVAll(proCode, "", nil, 1)
@@ -174,7 +175,8 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table) *TD {
 		for k,v :=range ckv.Kv{
 			td.SortKV.AddKey(k,v)
 		}
-	}else if proCode = jsonReg.FindString(text);proCode != ""{
+	}
+	if proCode = jsonReg.FindString(text);proCode != ""{
 		jsonMap := make(map[string]string)
 		json.Unmarshal([]byte(proCode),&jsonMap)
 		for k,v := range jsonMap{
@@ -433,8 +435,10 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) {
 			}
 		*/
 
-		td.SortKV = FindKv(td.Val, "", 2)
-
+		fSortKV := FindKv(td.Val, "", 2)
+		for k,v := range fSortKV.Map{
+			td.SortKV.AddKey(k,v)
+		}
 		//		td.LeftNode.Val
 		//		for _, vvv := range *td.TR {
 		//			u.Debug(">>>>>")

+ 9 - 4
src/res/fieldscore.json

@@ -185,17 +185,22 @@
             {
                 "describe": "全为中文汉字或符号",
                 "regstr": "^[\\u4e00-\\u9fa5()()【】\\[\\],,。、::《》]+$",
-                "score": -10
+                "score": -2
             },
             {
-                "describe": "以个汉字以上结束",
-                "regstr": "[\\u4e00-\\u9fa5]{2,}$",
-                "score": -10
+                "describe": "以个汉字以上结束",
+                "regstr": "[\\u4e00-\\u9fa5]{1,}$",
+                "score": -1
             },
             {
                 "describe": "包含负分",
                 "regstr": "(勘察|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天|[,,。、::“”‘’\"])",
                 "score": -10
+            },
+            {
+                "describe": "标段编号匹配-2",
+                "regstr": "/.{2}",
+                "score": -2
             }
         ],
         "length": [