Ver Fonte

td title

fengweiqiang há 6 anos
pai
commit
5672ccc7d9
2 ficheiros alterados com 34 adições e 0 exclusões
  1. 21 0
      src/jy/util/clearHtml.go
  2. 13 0
      src/res/fieldscore.json

+ 21 - 0
src/jy/util/clearHtml.go

@@ -1,8 +1,11 @@
 package util
 
 import (
+	"github.com/PuerkitoBio/goquery"
+	"log"
 	"regexp"
 	"strings"
+	"unicode/utf8"
 )
 
 //
@@ -60,6 +63,24 @@ func (c *Cut) ClearHtml(src string) string {
 	//清理input
 	src = c.hiddentag.ReplaceAllString(src, "")
 	src = c.inputag.ReplaceAllString(src, "$2")
+	document, err := goquery.NewDocumentFromReader(strings.NewReader(src))
+	if err == nil {
+		if tmpstr,err := document.Each(func(i int, sel *goquery.Selection) {
+			sel.Find("td").Each(func(i int, selection *goquery.Selection) {
+				val, b := selection.Attr("title")
+				if b && strings.Trim(val, " ") != "" {
+					tmpstr := strings.TrimFunc(selection.Text(), func(r rune) bool {
+						return r == 9|| r == 32
+					})
+					if utf8.RuneCountInString(strings.Trim(tmpstr, " ")) < utf8.RuneCountInString(strings.Trim(val, " ")) {
+						selection.SetText(strings.Trim(val, " "))
+					}
+				}
+			})
+		}).Html();err == nil{
+			src = tmpstr
+		}
+	}
 	//换结束标签
 	src = c.tag.ReplaceAllStringFunc(src, func(tmp string) string {
 		tmp = strings.Replace(tmp, " ", "", -1)

+ 13 - 0
src/res/fieldscore.json

@@ -26,6 +26,14 @@
                 "winnerorder": 3,
                 "kvweight": 1
             },
+            "projectcode": {
+                "table": 3,
+                "colon": 3,
+                "space": 3,
+                "regexp": 3,
+                "winnerorder": 3,
+                "kvweight": 1
+            },
             "buyertel": {
                 "table": 3,
                 "colon": 3,
@@ -403,6 +411,11 @@
                 "describe": "-结束没有抽取完",
                 "regstr": "-$",
                 "score": -1
+            },
+            {
+                "describe": "开始到结束连续字符-0.5",
+                "regstr": "^[A-Za-z0-9]{3,9}$",
+                "score": -0.5
             }
         ],
         "length": [