Selaa lähdekoodia

冲突Merge branch 'dev3.2' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.2

# Conflicts:
#	src/jy/pretreated/tablev2.go
fengweiqiang 6 vuotta sitten
vanhempi
commit
2b7f23fe3c

+ 6 - 6
src/jy/clear/tonumber.go

@@ -208,11 +208,11 @@ func capitalMoney(data []interface{}) []interface{} {
 	if len(strmatch) > 0 {
 		str = strmatch[0][0]
 	}
-	//修正单位类似:捌万伍仟肆佰捌拾贰万元整
-	if strings.Contains(str, "万元") {
-		str = strings.Replace(str, "万元", "#B#", -1)
-		str = strings.Replace(str, "万", "亿", -1)
-		str = strings.Replace(str, "#B#", "万元", -1)
+	suffixUnit := float64(1)
+	if strings.HasSuffix(str, "万") || strings.HasSuffix(str, "万元") || strings.HasSuffix(str, "万元整") {
+		index := strings.LastIndex(str, "万")
+		str = str[0:index]
+		suffixUnit = float64(10000)
 	}
 	moneyRegChar.ReplaceAllStringFunc(str, func(key string) string {
 		if key == "元" || key == "圆" || key == "点" {
@@ -262,7 +262,7 @@ func capitalMoney(data []interface{}) []interface{} {
 	for _, v := range nodes {
 		ret += v
 	}
-	return []interface{}{ret + decimals, data[1]}
+	return []interface{}{(ret + decimals) * suffixUnit, data[1]}
 }
 
 //过滤符号

+ 45 - 1
src/jy/extract/extract.go

@@ -1334,13 +1334,57 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 // otherNeedSave saves KvTag keys weighted ju.RetainKvWeight and every block's NotClassifyTitles into the tag collection.
 // Record fields: val, type, times, firstid, createtime (original note: used to judge the field — TODO confirm); result is not read in this body.
 func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
+	now := time.Now().Unix() // Unix seconds, reused as createtime for every record written by this call
 	coll := e.TaskInfo.TestColl
 	if coll == "" {
 		coll = "extract_tag_result"
 	} else {
 		coll += "_tag"
 	}
-	//for _,v := range j.ColonKV
+	datas := []map[string]interface{}{}
+	kv := map[string]int{} // KvTag key -> number of times it was seen with RetainKvWeight
+	for _, v := range j.Block {
+		// Count RetainKvWeight keys across this block's colon, table and space KV sets; nil sets are skipped.
+		for _, vv := range []*ju.JobKv{v.ColonKV, v.TableKV, v.SpaceKV} {
+			if vv == nil || vv.KvTag == nil {
+				continue
+			}
+			for kkk, vvv := range vv.KvTag {
+				if vvv.Weight == ju.RetainKvWeight {
+					kv[kkk] = kv[kkk] + 1
+				}
+			}
+		}
+		for _, vv := range v.NotClassifyTitles {
+			datas = append(datas, map[string]interface{}{
+				"val":        vv,
+				"times":      0,
+				"type":       "block",
+				"firstid":    j.SourceMid,
+				"createtime": now,
+			})
+			if len(datas) == 200 { // flush in batches of 200 records
+				db.Mgo.SaveBulk(coll, datas...)
+				datas = []map[string]interface{}{}
+			}
+		}
+	}
+	for k, v := range kv {
+		datas = append(datas, map[string]interface{}{
+			"val":        k,
+			"times":      v,
+			"type":       "kv",
+			"firstid":    j.SourceMid,
+			"createtime": now,
+		})
+		if len(datas) == 200 { // same 200-record batch size as above
+			db.Mgo.SaveBulk(coll, datas...)
+			datas = []map[string]interface{}{}
+		}
+	}
+	if len(datas) > 0 { // write whatever is left after the last full batch
+		db.Mgo.SaveBulk(coll, datas...)
+	}
 }
 
 func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {

+ 6 - 2
src/jy/pretreated/analystep.go

@@ -30,7 +30,7 @@ func AnalyStart(job *util.Job) {
 		}
 	}
 	blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock) //分块
-	if len(blockArrays) > 0 { //有分块
+	if len(blockArrays) > 0 {                                                //有分块
 		//从块里面找分包
 		job.BlockPackage = FindPackageFromBlocks(&blockArrays, job.Title) //从块里面找分包
 		for _, bl := range blockArrays {
@@ -164,7 +164,11 @@ func processTableResult(tabres *TableResult, block *util.Block, job *util.Job) {
 	for k, v := range tabres.SortKVWeight {
 		kvIndex[k] = v
 	}
-	block.TableKV = &util.JobKv{Kv: kv, KvIndex: kvIndex}
+	KvTag := map[string]*util.Tag{}
+	for k, _ := range tabres.SortKV.NotTagKey {
+		KvTag[k] = &util.Tag{Weight: util.RetainKvWeight}
+	}
+	block.TableKV = &util.JobKv{Kv: kv, KvIndex: kvIndex, KvTag: KvTag}
 
 	//分包
 	tablePackage := map[string]*util.BlockPackage{}

+ 19 - 7
src/jy/pretreated/analytable.go

@@ -125,8 +125,8 @@ func IsHide(g *goquery.Selection) (b bool) {
 
 //对表格的key进行标准化处理,多个k相同时,出现覆盖问题
 //待扩展,暂不支持正则标签库
-func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (k1 []string, weight []int, v1, returntag string, b bool) {
-	k1 = []string{}
+func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (k1, k2 []string, weight []int, v1, returntag string, b bool) {
+	k1, k2 = []string{}, []string{}
 	weight = []int{}
 	tk := k
 	if sv, sok := v.(string); sok { //取KV
@@ -177,6 +177,8 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (k1 []string,
 				returntag = "中标情况"
 			}
 			b = true
+		} else {
+			k2 = append(k2, k)
 		}
 	}
 	//对上一步没有取到标准化key的进一步处理
@@ -227,7 +229,7 @@ func (table *Table) KVFilter() {
 		v := table.SortKV.Map[k]
 		if _, ok := v.(string); ok { //table.SortKV.Value为字符串,匹配抽取关键词table.SortKV.Key,匹配到添加k,v到table.StandKV,table.StandKVWeight
 			k = regSpliteSegment.ReplaceAllString(regReplAllSpace.ReplaceAllString(k, ""), "")
-			k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v) //对key标准化处理,没有找到会走中标
+			k1, n_k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v) //对key标准化处理,没有找到会走中标
 			//qutil.Debug(k, v, k1, w1, v1, tag, b)
 			if b {
 				//降低冒号值的权重
@@ -257,6 +259,9 @@ func (table *Table) KVFilter() {
 					table.StandKVWeight[k] = 0
 				}
 			}
+			for _, n_k2 := range n_k1 {
+				table.SortKV.NotTagKey[n_k2] = true
+			}
 		} else {
 			//u.Debug(k, v, "---------")
 			as.AddKey(k, v)
@@ -473,7 +478,7 @@ func (table *Table) sortKVArr(as *SortMap, winnertag bool) {
 					}
 				}
 			}
-			k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v)
+			k1, n_k1, w1, v1, tag, b := CommonDataAnaly(k, table.Tag, table.Desc, v)
 			if b {
 				if tag != "" && table.Tag == "" {
 					table.Tag = tag
@@ -489,6 +494,10 @@ func (table *Table) sortKVArr(as *SortMap, winnertag bool) {
 					//					}
 					//				}
 				}
+			} else {
+				for _, n_k2 := range n_k1 {
+					table.SortKV.NotTagKey[n_k2] = true
+				}
 			}
 		}
 	}
@@ -845,6 +854,9 @@ func (tn *Table) AnalyTables(contactFormat *u.ContactFormat) []*Table {
 					table.TableResult.SortKVWeight[k] = table.StandKVWeight[k]
 				}
 			}
+			for k, v := range table.SortKV.NotTagKey {
+				table.TableResult.SortKV.NotTagKey[k] = v
+			}
 			//u.Debug(str)
 		}
 	}
@@ -2079,7 +2091,7 @@ func (tn *Table) manyPackageProcessByIndex(index []string, standIndex_pos []int)
 			}
 		} else if val, bvs := v1.(string); bvs && len(index) == 1 {
 			//删除子包的kv
-			k1tags, _, _, _, _ := CommonDataAnaly(k1, "", "", val)
+			k1tags, _, _, _, _, _ := CommonDataAnaly(k1, "", "", val)
 			if len(k1tags) > 0 && regexp.MustCompile("^(项目|开标|采购单位|招标机构)").MatchString(k1tags[0]) { //(k1tags[0].Value == "采购单位" || k1tags[0].Value == "项目编号")) {
 				//log.Println("remove", k1, val)
 				tn.SortKV.RemoveKey(k1)
@@ -2301,7 +2313,7 @@ func (tn *Table) assemblePackage(k1, v1, key string) {
 		bp.TableKV = u.NewJobKv()
 	}
 	if v1 != "" {
-		k2, w1, v2, _, bf := CommonDataAnaly(k1, "中标情况", "", v1) //匹配抽取关键词
+		k2, _, w1, v2, _, bf := CommonDataAnaly(k1, "中标情况", "", v1) //匹配抽取关键词
 		if bf {
 			for pos, k3 := range k2 {
 				if bp.TableKV.Kv != nil && bp.TableKV.KvTag[k3] != nil && (bp.TableKV.Kv[k3] == "" || w1[pos] > bp.TableKV.KvTag[k3].Weight) {
@@ -3169,7 +3181,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 	for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
-		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
+		key = strings.Replace(key, "", "", -1)    //处理一个特殊的采购量 经上层处理空格后未处理掉
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*
 				{

+ 15 - 13
src/jy/pretreated/tablev2.go

@@ -22,13 +22,13 @@ type TableResult struct {
 	Itype          int         //1全文 2是块
 	BlockTag       string      //块标签
 	Html           string
-	Tabs           []*Table             //子表集合,子表中包含标准化kv或原始kv
+	Tabs           []*Table           //子表集合,子表中包含标准化kv或原始kv
 	GoqueryTabs    *goquery.Selection //goquery对象
-	TableSize      int                  //子表的个数0,1,n
-	IsMultiPackage bool                 //是否有子包
-	PackageMap     *SortMap             //子包对象的sortmap,含标准化过的
-	SortKV         *SortMap             //全局KVmap值,标准化处理过的
-	SortKVWeight   map[string]int       //全局KVmap值,标准化处理过的
+	TableSize      int                //子表的个数0,1,n
+	IsMultiPackage bool               //是否有子包
+	PackageMap     *SortMap           //子包对象的sortmap,含标准化过的
+	SortKV         *SortMap           //全局KVmap值,标准化处理过的
+	SortKVWeight   map[string]int     //全局KVmap值,标准化处理过的
 	WinnerOrder    []map[string]interface{}
 	BrandData      [][]map[string]string //品牌抽取结果
 	HasKey         int                   //有key
@@ -644,18 +644,20 @@ func (t *Table) InsertTR(tr *TR) {
 
 // SortMap is a map that supports ordering: callers range over Keys and look each value up in Map.
 type SortMap struct {
-	Index map[string]int
-	Keys  []string
-	Map   map[string]interface{}
-	Lock  sync.Mutex
+	Index     map[string]int // presumably key -> position in Keys; TODO confirm against AddKey/RemoveKey
+	Keys      []string
+	Map       map[string]interface{}
+	Lock      sync.Mutex
+	NotTagKey map[string]bool // set of keys returned as k2 by CommonDataAnaly (filled in KVFilter and sortKVArr)
 }
 
 // NewSortMap returns a *SortMap whose Index, Keys, Map and NotTagKey are initialized empty; Lock keeps its zero value.
 func NewSortMap() *SortMap {
 	return &SortMap{
-		Index: map[string]int{},
-		Keys:  []string{},
-		Map:   map[string]interface{}{},
+		Index:     map[string]int{},
+		Keys:      []string{},
+		Map:       map[string]interface{}{},
+		NotTagKey: map[string]bool{}, // non-nil so callers can assign NotTagKey[k] = true without a nil-map panic
 	}
 }