maxiaoshan 6 år sedan
förälder
incheckning
b3a223cc82

+ 38 - 39
src/jy/extract/extract.go

@@ -722,53 +722,52 @@ func ExtRuleCoreByReg(extfrom string, doc map[string]interface{}, j *ju.Job, in
 // getKvByLuaFields collects the tagged KV candidates for the field a Lua rule extracts.
 // It walks vc.LFields (tag name -> field), keeps only the entries whose field
 // equals vc.Field, gathers the matching values from the job's blocks with
 // extractFromKv, records the result through AddExtLog and returns the map,
 // which is keyed by field name.
 func getKvByLuaFields(vc *RuleCore, j *ju.Job, et *ExtractTask) map[string][]map[string]interface{} {
 	kvmap := map[string][]map[string]interface{}{}
-	blocks := []*ju.Block{}
-	for _, bl := range j.Block {
-		if len(bl.Block) > 0 {
-			blocks = append(blocks, bl.Block...)
-		} else {
-			blocks = append(blocks, bl)
-		}
-	}
 	for fieldname, field := range vc.LFields {
 		if field != vc.Field {
 			continue
 		}
-		for _, bl := range blocks {
-			tp := ""
-			for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
-				if k == 0 {
-					tp = "colon"
-				} else if k == 1 {
-					tp = "space"
-				} else if k == 2 {
-					tp = "table"
-				}
-				if v == nil || v.KvTags == nil {
-					continue
-				}
-				for _, vv := range v.KvTags[fieldname] {
-					text := ju.TrimLRSpace(vv.Value, "")
-					if text != "" {
-						kvmap[field] = append(kvmap[field], map[string]interface{}{
-							"code":        "CL_" + vv.Key,
-							"field":       field,
-							"ruletext":    vv.Key,
-							"extfrom":     vc.ExtFrom,
-							"sourcevalue": text,
-							"value":       text,
-							"type":        tp,
-							"matchtype":   "tag_string",
-							"blocktag":    bl.Classify,
-							"weight":      vv.Weight,
-						})
-					}
+		// Top-level blocks are scanned first; extractFromKv descends into child blocks itself.
+		extractFromKv(field, fieldname, j.Block, vc, kvmap)
+	}
+	AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) // extraction log
+	return kvmap
+}
+
+// extractFromKv appends to kvmap[field] one candidate per non-empty value tagged
+// with fieldname in the given blocks. For every block it looks at ColonKV,
+// SpaceKV and TableKV in that order and records which one a value came from in
+// the candidate's "type" entry ("colon", "space" or "table"). Values are trimmed
+// with ju.TrimLRSpace before the emptiness test. kvmap is modified in place.
+func extractFromKv(field, fieldname string, blocks []*ju.Block, vc *RuleCore, kvmap map[string][]map[string]interface{}) {
+	for _, bl := range blocks {
+		tp := ""
+		for k, v := range []*ju.JobKv{bl.ColonKV, bl.SpaceKV, bl.TableKV} {
+			if k == 0 {
+				tp = "colon"
+			} else if k == 1 {
+				tp = "space"
+			} else if k == 2 {
+				tp = "table"
+			}
+			if v == nil || v.KvTags == nil {
+				continue
+			}
+			for _, vv := range v.KvTags[fieldname] {
+				text := ju.TrimLRSpace(vv.Value, "")
+				if text != "" {
+					kvmap[field] = append(kvmap[field], map[string]interface{}{
+						"code":        "CL_" + vv.Key,
+						"field":       field,
+						"ruletext":    vv.Key,
+						"extfrom":     vc.ExtFrom,
+						"sourcevalue": text,
+						"value":       text,
+						"type":        tp,
+						"matchtype":   "tag_string",
+						"blocktag":    bl.Classify,
+						"weight":      vv.Weight,
+					})
 				}
 			}
 		}
+		// Descend into the child blocks only while nothing has been collected for
+		// field at all; the test is on the accumulated result, not on this block alone.
+		if len(kvmap[field]) == 0 {
+			extractFromKv(field, fieldname, bl.Block, vc, kvmap)
+		}
 	}
-	AddExtLog("extract", j.SourceMid, nil, kvmap, &RegLuaInfo{Field: vc.Field}, et.TaskInfo) // extraction log
-	return kvmap
 }
 
 //正则提取结果

+ 0 - 2
src/jy/extract/score.go

@@ -41,7 +41,6 @@ func init() {
 			CommonScore[k] = qu.Float64All(v)
 		}
 	}
-	log.Println(CommonScore)
 	//指定抽取属性打分配置
 	if tmp, ok := SoreConfig["extractype"]["fields"].(map[string]interface{}); ok {
 		FieldsScore = map[string]map[string]float64{}
@@ -55,7 +54,6 @@ func init() {
 			FieldsScore[key] = fieldscore
 		}
 	}
-	log.Println(FieldsScore)
 	//实例化正则
 	for _, tmp := range SoreConfig {
 		//log.Println(tmp)

+ 14 - 8
src/jy/pretreated/analystep.go

@@ -7,6 +7,7 @@ import (
 	"encoding/json"
 	"jy/util"
 	//"log"
+	"unicode/utf8"
 	"strings"
 
 	"github.com/PuerkitoBio/goquery"
@@ -30,7 +31,7 @@ func AnalyStart(job *util.Job) {
 		}
 	}
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
-	if len(blockArrays) > 0 {                                                //有分块
+	if len(blockArrays) > 0 { //有分块
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
@@ -70,8 +71,9 @@ func AnalyStart(job *util.Job) {
 			//从正文里面找分包
 			job.BlockPackage = FindPackageFromText(job.Title, newCon)
 		}
-		FindProjectCode(newCon, job) //匹配项目编号
 		bl.Text = HtmlToText(con)
+		//log.Println(bl.Text)
+		FindProjectCode(bl.Text, job) //匹配项目编号
 		//调用kv解析
 		bl.ColonKV = GetKVAll(bl.Text, "", nil, 1)
 		bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil)
@@ -80,7 +82,6 @@ func AnalyStart(job *util.Job) {
 			bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1)
 			job.Winnerorder = bl.Winnerorder
 		}
-		//log.Println(bl.Text)
 		job.Block = append(job.Block, bl)
 	}
 }
@@ -107,18 +108,23 @@ func FindProjectCode(newCon string, job *util.Job) {
 	}
 	var proCode string
 	blCode := &util.Block{}
-	if newCon = projectcodeRegAll.FindString(newCon); newCon != "" { //项目名称项目编号一起的
+	if newConTMP := projectcodeRegAll.FindString(newCon); newConTMP != "" { //项目名称项目编号一起的
 		//5d424bdfa5cb26b9b7ac7a85
 		//5d425a48a5cb26b9b7df5fec
 		//5d425506a5cb26b9b7cd2c3c
-		splitStr := strings.Split(newCon, " ")
+		splitStr := strings.Split(newConTMP, " ")
 		if len(splitStr) >= 2 {
-			newCon = "项目编号:" + splitStr[len(splitStr)-1]
+			if utf8.RuneCountInString(splitStr[len(splitStr)-1]) > 5 {
+				newCon = "项目编号:" + splitStr[len(splitStr)-1]
+			} else if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
+				//5d4253f3a5cb26b9b7ca2662
+				newCon = "项目编号:" + tmpstr
+			}
 		} else if len(splitStr) == 1 {
 			if tmpstr := projectcodeRegAll2.FindString(splitStr[0]); tmpstr != "" {
 				newCon = "项目编号:" + tmpstr
-			}else if strings.Contains(newCon,"、"){
-				tmpstrs :=strings.Split(newCon,"、")
+			} else if strings.Contains(newConTMP, "、") {
+				tmpstrs := strings.Split(newCon, "、")
 				newCon = "项目编号:" + tmpstrs[len(tmpstrs)-1]
 			}
 		}

+ 2 - 2
src/jy/pretreated/analytable.go

@@ -107,8 +107,8 @@ var (
 	underline                   = regexp.MustCompile("_+$")
 	iswinnertabletag            = regexp.MustCompile("(中标|候选人|成交|结果)")
 	nswinnertabletag            = regexp.MustCompile("[评得分估]+")
-	projectcodeRegAll           = regexp.MustCompile(`(采购项目|项目)名称及[项目]?编号[:|:]?.*[\n]?`)
-	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]")
+	projectcodeRegAll           = regexp.MustCompile(`(采购)?项目名称及(项目)?编号[:|:]?.*[\n]?`)
+	projectcodeRegAll2          = regexp.MustCompile("[((].{4,30}[))]?")
 	projectcodeReg              = regexp.MustCompile(`((|\(|\[){1}(^([\s]?编号)|项目编号|标段编号|招标编号){1}(:|:)(.){4,30}()|\)|\])`)
 	projectcodeReg2             = regexp.MustCompile(`(^([\s]?编号)|项目编号){1}(:|:)(.{4,39})[0-9a-zA-Z)号]`)
 	projectcodeReg3             = regexp.MustCompile("(^询价单编号[A-Za-z0-9/-]*|公告编号[A-Za-z0-9/-]*)")

+ 20 - 3
src/jy/pretreated/colonkv.go

@@ -203,17 +203,34 @@ func (ce *ColonkvEntity) blockTitleKV(title, key string) string {
 
 //根据配置文件中的规则,格式化正文
 func formatText(content, key string) string {
-	segment := DivideSegment(content)
+	segments := make([]*Segment, 0)
+	if key == "all" {
+		segments = DivideSegmentHtml(content)
+	} else if key == "kv" {
+		segments = DivideSegment(content)
+		//log.Println("清理前:\n",content)
+	}
 	newCon := ""
-	for _, v := range segment {
-		if v.Index > len(segment)-3 {
+	for _, v := range segments {
+		if v.Index > len(segments)-3 {
 			if regexp.MustCompile("上一篇(.+)下一篇").MatchString(v.Text) {
 				break
 			}
 		}
+		if key == "kv" && utf8.RuneCountInString(v.Text)>=1{
+			//log.Println("清理前:",v.Text, []rune(v.Text)[len([]rune(v.Text))-1])
+			v.Text = strings.TrimRightFunc(v.Text, func(r rune) bool {
+				return r == 19968 || r == 20108 || r == 19977 ||
+					r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061
+			})
+			//log.Println("清理前后",v.Text)
+		}
 		newCon += v.Text + "\n"
 	}
 	content = regEndWrap.ReplaceAllString(newCon, "")
+	//if key == "kv"{
+	//	log.Println("清理前后\n",content)
+	//}
 	for _, v := range FormatTextMap[key] {
 		reg, _ := v["reg"].(*regexp.Regexp)
 		separator, isString := v["separator"].(string)

+ 66 - 6
src/jy/pretreated/division.go

@@ -1,6 +1,7 @@
 package pretreated
 
 import (
+	"fmt"
 	"jy/util"
 	qutil "qfw/util"
 	"regexp"
@@ -65,6 +66,24 @@ var (
 	*/
 	regPackageFilter  = regexp.MustCompile("([第]?([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)((子|合同|分|施工|监理)?(标段?|包|合同段|标包))|((子|合同|分|施工|监理)?(标|包)(段|号)?)[  \u3000\u2003\u00a0]*([一二三四五六七八九十0-9A-Za-zⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)).+[\r\n]?<table>")
 	filterPkgTitleKey = regexp.MustCompile("结果[::]?$")
+	// xuhao lists the "<numeral>_<delimiter>" rune-code pairs that DivideSegment
+	// accepts as an ordinal marker and therefore splits on.
+	// Numerals: 一(19968) 二(20108) 三(19977) 四(22235) 五(20116) 六(20845)
+	// 七(19971) 八(20843) 九(20061); delimiters: 、(12289) and .(46).
+	// Every numeral is paired with both delimiters; 八. and 九、 were missing before.
+	xuhao = map[string]bool{
+		"19968_12289": true,
+		"19968_46":    true,
+		"20108_12289": true,
+		"20108_46":    true,
+		"19977_12289": true,
+		"19977_46":    true,
+		"22235_12289": true,
+		"22235_46":    true,
+		"20116_12289": true,
+		"20116_46":    true,
+		"20845_12289": true,
+		"20845_46":    true,
+		"19971_12289": true,
+		"19971_46":    true,
+		"20843_12289": true,
+		"20843_46":    true,
+		"20061_12289": true,
+		"20061_46":    true,
+	}
 )
 
 //分块
@@ -409,11 +428,52 @@ func appendWarpStop(text string) string {
 	}
 	return text
 }
-
+// DivideSegmentHtml splits txt into segments at line feeds and carriage returns.
+// Pieces that are at most one byte long are dropped; every remaining piece
+// becomes a Segment whose Index is its 1-based position among the kept pieces.
+func DivideSegmentHtml(txt string) []*util.Segment {
+	isLineBreak := func(r rune) bool {
+		switch r {
+		case '\n', '\r':
+			return true
+		}
+		return false
+	}
+	kept := make([]*util.Segment, 0)
+	for _, line := range strings.FieldsFunc(txt, isLineBreak) {
+		// Filter out blank remnants (a lone space or any piece of one byte or less).
+		if line == " " || len(line) <= 1 {
+			continue
+		}
+		kept = append(kept, &util.Segment{Index: len(kept) + 1, Text: line})
+	}
+	return kept
+}
 //分段
 func DivideSegment(txt string) []*util.Segment {
 	//先分段
+	tmpstr := ""
 	_segs := strings.FieldsFunc(txt, func(r rune) bool {
+		if r == 19968 || r == 20108 || r == 19977 || r == 12289 || r == 46 ||
+			r == 22235 || r == 20116 || r == 20845 || r == 19971 || r == 20843 || r == 20061 {
+			if tmpstr == "" {
+				tmpstr += fmt.Sprint(r)
+				return false
+			} else if strings.Contains(tmpstr, "_") {
+				tmpstr = ""
+				tmpstr += fmt.Sprint(r)
+				return false
+			} else if tmpstr == fmt.Sprint(r) {
+				if r == 46 || r == 12289{
+					tmpstr = ""
+				}
+				return false
+			}
+			tmpstr += "_" + fmt.Sprint(r)
+			if xuhao[tmpstr] {
+				return true
+			}
+		}
+		tmpstr= ""
 		return r == 10 || r == 13
 	})
 	//再去除空行
@@ -528,7 +588,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			return false, ""
 		}
 		//
-		is := regexp.MustCompile(v[0]+"[::]*").FindAllStringIndex(con, -1)
+		is := regexp.MustCompile(v[0] + "[::]*").FindAllStringIndex(con, -1)
 		for _, sv := range is {
 			appendWarpIndex = append(appendWarpIndex, sv[0])
 		}
@@ -568,13 +628,13 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			indexPkgMap[sv[0]] = v[0]
 		}
 		//key在包前面,并且在一行的开头
-		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)"+pgflag).FindAllStringSubmatchIndex(con, -1)
+		keys := regexp.MustCompile("([\r\n]|^)([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::\\s\u3000\u2003\u00a0]+.*?)" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		if len(keys) == 0 {
 			//key在包前面,并且key以冒号结尾
-			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()([\u4e00-\u9fa5]{2,30}?([((].{1,8}?[))])?[::]+[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		if len(keys) == 0 {
-			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])"+pgflag).FindAllStringSubmatchIndex(con, -1)
+			keys = regexp.MustCompile("()注[::]([\u4e00-\u9fa5]{2,8}?([((].{1,8}?[))])?[\\s\u3000\u2003\u00a0]*[\r\n])" + pgflag).FindAllStringSubmatchIndex(con, -1)
 		}
 		for _, key := range keys {
 			startEndMap[key[5]] = key[4]
@@ -628,7 +688,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 			}
 			index := util.PackageNumberConvert(bk)
 			//去掉前缀,空格必须要加,分kv的时候要用
-			text = regexp.MustCompile(bv[0]+"[::]*").ReplaceAllString(text, "")
+			text = regexp.MustCompile(bv[0] + "[::]*").ReplaceAllString(text, "")
 			headKey := ""
 			if indexKeyStringMap[iv] != "" {
 				//if !filterPkgTitleKey.MatchString(indexKeyStringMap[iv]) {

+ 6 - 1
src/jy/pretreated/tablev2.go

@@ -396,7 +396,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable bool) {
 		if len(td.SortKV.Keys) > 0 {
 			//td.KVDirect = 3 //不当头也不当值,忽略
 			if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) {
-				td.Val = td.SortKV.Keys[0]
+				td.Val, _ = td.SortKV.Map[td.SortKV.Keys[0]].(string)
 				td.BH = true
 			}
 		} else if !bsontable {
@@ -877,6 +877,11 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
 // HtmlToText parses con as HTML and returns the text content of the document.
 // Before the text is taken, a line feed (rune 10) is inserted after every <tr>
 // element, presumably so that table rows end up on separate lines.
 // NOTE(review): the parse error is discarded; confirm doc2 cannot be nil here.
 func HtmlToText(con string) string {
 	doc2, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
+	//log.Println(doc2.Html())
+	doc2.Find("tr").Each(func(i int, selection *goquery.Selection) {
+		selection.AfterHtml(string(rune(10)))
+	})
+	//log.Println(doc2.Html())
 	return doc2.Text()
 }
 

+ 13 - 4
src/jy/pretreated/winnerorder.go

@@ -26,7 +26,7 @@ var (
 	numberReg         = regexp.MustCompile("[一二三四五六七八九十0-9]+")
 	numberReg2        = regexp.MustCompile("[\\d一二三四五六七八九十.,,]+")
 	thisNumberReg     = regexp.MustCompile("第" + numberReg.String())
-	winnerReg1        = regexp.MustCompile("(^|[^为])(【?(推荐)?第[一二三四五六七八九十1-9]+(合格|名|包|标段)?】?((候|侯)选)?(备选|成交|中(标|选))人?([((]成交[))])?((候|侯)选|排序)?(人(单位)?|供应商|单位|机构)(名称)?为?)($|[^,;;。,])")
+	winnerReg1        = regexp.MustCompile("(^|[^为])(【?(推荐)?第[一二三四五六七八九十1-9]+(合格|名|包|标段)?】?((候|侯)选)?(入围|备选|成交|中(标|选))人?([((]成交[))])?((候|侯)选|排序)?(人(单位)?|供[货]商|单位|机构)(名称)?为?)($|[^,;;。,])")
 	winnerReg2        = regexp.MustCompile("(排名第[一二三四五六七八九十1-9]+|第[一二三四五六七八九十1-9]+(候|侯)选人)")
 	//winnerReg2     = regexp.MustCompile("(第[一二三四五六七八九十1-9]+(候|侯)选人)")
 	winnerReg3     = regexp.MustCompile("(第[一二三四五六七八九十1-9]+名)")
@@ -43,6 +43,7 @@ var (
 	findCandidate  = regexp.MustCompile("(^.{5,}(公司|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体))|工作室)")
 	findCandidate2 = regexp.MustCompile("(^.{5,}(公司|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室)$)")
 	clearSpace1    = regexp.MustCompile("([((][\\d一二三四五六七八九十][))][\\s\u3000\u2003\u00a0\\t]*|<[^>].+?>)")
+	offerReg       = regexp.MustCompile("(中标|磋商|投标|报|单|成交)总?(价|金额)")
 )
 
 /*
@@ -227,10 +228,18 @@ func (wo *WinnerOrderEntity) findByReg(content string, blocks []string, reg_2 *r
 				object["type"] = 1
 			}
 		} else { //中标金额
-			kvTags := GetKvTags([]*util.Kv{&util.Kv{Key: k, Value: v}}, "", []string{"中标金额"})
+			findOfferFlag := false
+			if offerReg.MatchString(k) {
+				findOfferFlag = true
+			} else {
+				kvTags := GetKvTags([]*util.Kv{&util.Kv{Key: k, Value: v}}, "", []string{"中标金额"})
+				if len(kvTags["中标金额"]) > 0 {
+					findOfferFlag = true
+				}
+			}
 			//找到了中标金额
-			if len(kvTags["中标金额"]) > 0 && object["entname"] != nil {
-				val := wo.clear("中标金额", kvTags["中标金额"][0].Value)
+			if findOfferFlag && object["entname"] != nil {
+				val := wo.clear("中标金额", v)
 				if val != nil {
 					object["price"] = val
 				}

+ 14 - 2
src/main_blocktest.go

@@ -17,6 +17,8 @@ var f *os.File
 var m = map[string]bool{}
 
 func main12() {
+	//winnerorder()
+	//return
 	//log.Println(pretreated.ProcTitle("以上公告内容如有变动将在相关网络媒体上另行通知凡购买本招标文件的单位必须就此采购项目的相关事宜详细咨询否则参与投标即被视为已经充分了解了招标方的需求中标后承担该文件范围内的所有要求投标前如对招标文件存有疑问请在投标截止日期前三个工作日以实名制书面文件向我公司询问否则视为接受已报名购买招标文件的投标商未递交投标文件或虽递交投标文件但未参加开标大会的投标商不得再参加该项目的采购活动"))
 	//return
 	//f, _ = os.OpenFile("./title.txt", os.O_RDWR|os.O_CREATE, 777)
@@ -152,8 +154,7 @@ func com(doc map[string]interface{}) {
 	}
 	log.Println("=============抽取结果 result================")
 	for k, v := range set["result"].(map[string][]*ju.ExtField) {
-		break
-		if k != "budget" {
+		if k != "winner" {
 			continue
 		}
 		for _, vv := range v {
@@ -241,3 +242,14 @@ func GetDetail(doc map[string]interface{}) (detail string) {
 	}
 	return detail
 }
+// winnerorder is a manual debugging helper: it runs WinnerOrderEntity.Find on a
+// hard-coded sample announcement and logs whatever Find returns.
+func winnerorder() {
+	text := `评审专家名单:
+吴殿波、韩屹、孙胜进、郑丹、李海波
+ 
+中标标的名称、规格型号、数量、单价、服务要求:
+2019年沈阳惠涌供热有限责任公司、沈阳圣达热力供暖有限责任公司、沈阳惠盛供热有限责任公司PE管保温
+第一入围供货商:沈阳曲暖鼎盛保温安装有限公司 、总单价:11.833300
+第二入围供货商:沈阳国盛防腐保温有限公司、总单价:11.102100
+第三入围供货商:沈阳泰豪管材有限公司、总单价:13.258100`
+	log.Println((&pretreated.WinnerOrderEntity{}).Find(text, true, 1))
+}

+ 6 - 1
src/res/fieldscore.json

@@ -122,7 +122,7 @@
         "positivewords": [
             {
                 "describe": "以*结尾",
-                "regstr": ".{2,100}(委员会|办公室|学校|幼儿园|动物园|管理站|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|初中|集团|银行|[大中小]学|院|厂|店|所|队|社|室|厅|段|会|场)$",
+                "regstr": ".{2,100}(委员会|办公室|学校|幼儿园|动物园|管理站|图书馆|殡仪馆|博物馆|基地|青年宫|少年宫|艺术宫|电视台|协会|政府|初中|集团|银行|[大中小]学|院|厂|店|所|队|社|室|厅|段|场)$",
                 "score": 3
             }
         ],
@@ -136,6 +136,11 @@
                 "describe": "包含负分不再展示",
                 "regstr": "(详见|提出|面向|施工)",
                 "score": -50
+            },
+            {
+                "describe": "一个字或者两个字不再显示",
+                "regstr": "^[\\s]*[\\u4e00-\\u9fa5]{1,2}[\\s]*$",
+                "score": -50
             }
         ],
         "length": [

+ 4 - 4
versioncomparison/config.json

@@ -1,9 +1,9 @@
 {
-    "premgo": "192.168.3.207:27081",
-    "predb": "qfw",
+    "premgo": "192.168.3.207:27082",
+    "predb": "extract_kf",
     "prec": "result_v3",
-    "newmgo": "192.168.3.207:27081",
-    "newdb": "extract_v3",
+    "newmgo": "192.168.3.207:27082",
+    "newdb": "extract_kf",
     "newc": "result_data",
     "fields": [
         "projectname",

+ 124 - 2
versioncomparison/main.go

@@ -23,6 +23,7 @@ var (
 	Sid, Eid    string
 	Fields      []string
 	FieldsQuery string
+	Url         = "https://www.jianyu360.com/article/content/%s.html"
 )
 
 type Compare struct {
@@ -59,7 +60,9 @@ func init() {
 func main() {
 	getVersionData()
 	createXlsx()
+	//biaozhucompare()
 }
+
 func createXlsx() {
 	xf, err := xlsx.OpenFile("template.xlsx")
 	if err != nil {
@@ -87,7 +90,7 @@ func createXlsx() {
 		}
 	}
 	//生成信息sheet
-	url := "https://www.jianyu360.com/article/content/%s.html"
+
 	for _, field := range Fields {
 		sh, _ := xf.AddSheet(field)
 		rowh := sh.AddRow()
@@ -102,7 +105,7 @@ func createXlsx() {
 				row.AddCell().SetString(k)
 				row.AddCell().SetString(v.PreVal)
 				row.AddCell().SetString(v.NewVal)
-				row.AddCell().SetString(fmt.Sprintf(url, qu.CommonEncodeArticle("content", v.Id)))
+				row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", v.Id)))
 			}
 		}
 	}
@@ -184,3 +187,122 @@ func getVersionData() {
 		Compares[k] = cp
 	}
 }
+
+// BidData pairs a record id with the field values that biaozhucompare compares.
+type BidData struct {
+	id  string                 // string form of the record's _id
+	key map[string]interface{} // field name -> value
+}
+// BidCom accumulates the comparison result of one field in biaozhucompare.
+type BidCom struct {
+	Val []int                    // two counters: Val[0] equal values, Val[1] differing values
+	Ids []map[string]interface{} // one entry per difference: "id", "ext" (extracted), "bz" (annotated)
+}
+
+// biaozhucompare reports how often extracted values agree with annotated ones.
+//
+// Extracted records are read from "bid_v3" and annotated records from
+// "bid_biaozhuid" (both through Newmgo) and paired by _id. For every extracted
+// field that is non-empty (non-empty string form or a positive number) the
+// value is compared with the annotated one. Per-field equal/different counts
+// go to the "统计" sheet, every difference to a sheet named after the field,
+// and the workbook is saved as ext_bz.xlsx.
+func biaozhucompare() {
+	// The second result of Find is discarded as before; a nil result pointer is
+	// treated as "nothing loaded" instead of being dereferenced.
+	exts, _ := Newmgo.Find("bid_v3", `{}`, `{"_id":1}`, nil, false, -1, -1)
+	if exts == nil {
+		log.Println("biaozhucompare: no data loaded from bid_v3")
+		return
+	}
+	extDatas := make([]BidData, 0, len(*exts))
+	for _, v := range *exts {
+		extDatas = append(extDatas, BidData{
+			id: qu.BsonIdToSId(v["_id"]),
+			key: map[string]interface{}{
+				"projectname": v["projectname"],
+				"projectcode": v["projectcode"],
+				"buyer":       v["buyer"],
+				"budget":      qu.Float64All(v["budget"]),
+				"bidamount":   qu.Float64All(v["bidamount"]),
+				"agency":      v["agency"],
+				"buyerperson": v["buyerperson"],
+				"buyertel":    v["buyertel"],
+			},
+		})
+	}
+	log.Println("exts ok")
+
+	bzs, _ := Newmgo.Find("bid_biaozhuid", `{}`, `{"_id":1}`, nil, false, -1, -1)
+	if bzs == nil {
+		log.Println("biaozhucompare: no data loaded from bid_biaozhuid")
+		return
+	}
+	// Index the annotated records by id so each extracted record needs one lookup
+	// instead of a scan over all annotations.
+	bzByID := make(map[string]BidData, len(*bzs))
+	for _, v := range *bzs {
+		id := qu.BsonIdToSId(v["_id"])
+		if _, seen := bzByID[id]; seen {
+			// Keep the first record per id, as the former first-match scan did.
+			continue
+		}
+		bidamount := float64(0)
+		// An empty bigprice array used to panic on the [0] index.
+		if bigprices, ok := v["bigprice"].([]interface{}); ok && len(bigprices) > 0 {
+			bidamount = qu.Float64All(bigprices[0])
+		}
+		bzByID[id] = BidData{
+			id: id,
+			key: map[string]interface{}{
+				"projectname": qu.ObjToString(v["projectname"]),
+				"projectcode": qu.ObjToString(v["projectcode"]),           // earlier source: t_bidno, falling back to b_projectno
+				"buyer":       qu.ObjToString(v["buyer"]),                 // earlier source: t_buyer, falling back to b_buyer
+				"budget":      qu.Float64All(qu.ObjToString(v["budget"])), // earlier source: t_budget
+				"bidamount":   bidamount,
+				"agency":      qu.ObjToString(v["agency"]),
+				"buyerperson": qu.ObjToString(v["buyerperson"]),
+				"buyertel":    qu.ObjToString(v["buyertel"]),
+			},
+		}
+	}
+	log.Println("bzs ok")
+
+	bcoms := map[string]*BidCom{}
+	for _, ext := range extDatas {
+		bz, ok := bzByID[ext.id]
+		if !ok {
+			continue
+		}
+		for key, val := range ext.key {
+			// Fields the extraction left empty are not counted at all.
+			if qu.ObjToString(val) == "" && !(qu.Float64All(val) > 0) {
+				continue
+			}
+			bcom := bcoms[key]
+			if bcom == nil {
+				bcom = &BidCom{
+					Val: []int{0, 0},
+					Ids: []map[string]interface{}{},
+				}
+				bcoms[key] = bcom
+			}
+			if val == bz.key[key] {
+				bcom.Val[0]++
+				continue
+			}
+			bcom.Val[1]++
+			bcom.Ids = append(bcom.Ids, map[string]interface{}{
+				"id":  ext.id,
+				"ext": val,
+				"bz":  bz.key[key],
+			})
+		}
+	}
+
+	xl := xlsx.NewFile()
+	sh, err := xl.AddSheet("统计")
+	if err != nil {
+		log.Println("biaozhucompare: add summary sheet:", err)
+		return
+	}
+	h := sh.AddRow()
+	h.AddCell().SetString("field")
+	h.AddCell().SetString("相同")
+	h.AddCell().SetString("不同")
+	for field, bcom := range bcoms {
+		row := sh.AddRow()
+		row.AddCell().SetString(field)
+		row.AddCell().SetInt(bcom.Val[0])
+		row.AddCell().SetInt(bcom.Val[1])
+		log.Println(field, bcom.Val)
+		ksh, err := xl.AddSheet(field)
+		if err != nil {
+			// The summary row is kept; only the detail sheet is skipped.
+			log.Println("biaozhucompare: add sheet", field, err)
+			continue
+		}
+		rh := ksh.AddRow()
+		rh.AddCell().SetString("id")
+		rh.AddCell().SetString("标注")
+		rh.AddCell().SetString("抽取")
+		rh.AddCell().SetString("url")
+		for _, diff := range bcom.Ids {
+			rw := ksh.AddRow()
+			rw.AddCell().SetString(qu.ObjToString(diff["id"]))
+			rw.AddCell().SetString(fmt.Sprint(diff["bz"]))
+			rw.AddCell().SetString(fmt.Sprint(diff["ext"]))
+			rw.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", qu.ObjToString(diff["id"]))))
+		}
+	}
+	if err := xl.Save("ext_bz.xlsx"); err != nil {
+		log.Println("biaozhucompare: save ext_bz.xlsx:", err)
+	}
+}