Sfoglia il codice sorgente

调整detail抽取

zhengkun 1 anno fa
parent
commit
832ae8a2c9
2 file modificati con 22 aggiunte e 20 eliminazioni
  1. 14 13
      src/jy/extract/extract.go
  2. 8 7
      src/jy/pretreated/analyrepair.go

+ 14 - 13
src/jy/extract/extract.go

@@ -2,6 +2,8 @@ package extract
 
 import (
 	"fmt"
+	log "github.com/donnie4w/go-logger/logger"
+	"gopkg.in/mgo.v2/bson"
 	"jy/clear"
 	db "jy/mongodbutil"
 	"jy/pretreated"
@@ -12,9 +14,6 @@ import (
 	"strings"
 	"time"
 	"unicode/utf8"
-
-	log "github.com/donnie4w/go-logger/logger"
-	"gopkg.in/mgo.v2/bson"
 )
 
 // 启动测试抽取-、、、、结果追踪
@@ -234,16 +233,18 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 	}
 	detail := ""
 	summary := qu.ObjToString(doc["summary"])
-	d1 := CleanDetailText(qu.ObjToString(doc["detail"]), summary)
-	d2 := CleanDetailText(qu.ObjToString(doc["contenthtml"]), summary)
-	if len(d1) >= len(d2) || d2 == "" {
-		detail = d1
-	} else { //选用contenthtml有一种特殊情况与detail不一致,综合考虑选取逻辑
-		detail = d2
-		if SelectDetailSourceText(d1, d2) {
-			detail = d1
-		}
-	}
+	detail = CleanDetailText(qu.ObjToString(doc["detail"]), summary)
+	//d1 := CleanDetailText(qu.ObjToString(doc["detail"]), summary)
+	//d2 := CleanDetailText(qu.ObjToString(doc["contenthtml"]), summary)
+	//if len(d1) >= len(d2) || d2 == "" {
+	//	detail = d1
+	//} else { //选用contenthtml有一种特殊情况与detail不一致,综合考虑选取逻辑
+	//	detail = d2
+	//	if SelectDetailSourceText(d1, d2) {
+	//		detail = d1
+	//	}
+	//}
+	//调整采用detail抽取
 	if utf8.RuneCountInString(detail) >= 100000 {
 		detail = detail[:100000]
 	}

+ 8 - 7
src/jy/pretreated/analyrepair.go

@@ -7,20 +7,21 @@ import (
 	"strings"
 )
 
-/**
+/*
+*
 之前爬虫过来的数据对table表格的抓取异常问题
 查找并修正不规则表格的字符串,只对全文做处理,块内的表格不需要修正
-**/
+*
+*/
 var thbf = regexp.MustCompile("(?i)</?t(head|body|foot)>")
 
-//需要保留thead
+// 需要保留thead
 var saveThead = regexp.MustCompile("(?is)<thead>(.+?)</thead>")
 var clearpkg = regexp.MustCompile("(标示|标识)")
 var clearMoneyReg1 = regexp.MustCompile("(总成交金额:[0-9.]+)[\\s ]+([((]?万元[))]?)")
 
 func RepairCon(con string) string {
 	con = clearpkg.ReplaceAllString(con, "")
-
 	con = clearMoneyReg1.ReplaceAllString(con, "$1$2")
 	res := saveThead.FindAllStringSubmatch(con, 1)
 	th := ""
@@ -47,7 +48,7 @@ func RepairCon(con string) string {
 	return con
 }
 
-//修复表格
+// 修复表格
 func findpos(con string, iLen, start int) (newcon string) {
 	defer qutil.Catch()
 	n := len(con)
@@ -133,7 +134,7 @@ func findpos(con string, iLen, start int) (newcon string) {
 	return
 }
 
-//td的值里面有一个包,并且没有冒号kv
+// td的值里面有一个包,并且没有冒号kv
 func isHasOnePkgAndNoKv(v1 string) (bool, string) {
 	v1s := FindVal_1.FindAllString(v1, -1)
 	colonCount := len(regDivision.FindAllString(v1, -1))
@@ -146,7 +147,7 @@ func isHasOnePkgAndNoKv(v1 string) (bool, string) {
 	return false, v1
 }
 
-//替换分包中混淆的词
+// 替换分包中混淆的词
 func replPkgConfusion(v1 string) string {
 	v1 = PreReg.ReplaceAllString(v1, "")
 	v1 = PreReg1.ReplaceAllString(v1, "")