wcj 6 жил өмнө
parent
commit
6d4577219c

+ 39 - 39
src/jy/admin/version.go

@@ -375,36 +375,36 @@ func init() {
 	})
 	Admin.POST("/version/blockclassify_info_save", func(c *gin.Context) {
 		status := 0
-		_id, _ := c.GetPostForm("_id")
 		name, _ := c.GetPostForm("name")
 		code, _ := c.GetPostForm("code")
-		if _id != "" {
-			if Mgo.UpdateById("block_classify_info", _id, bson.M{
-				"$set": bson.M{
-					"l_updatetime": time.Now().Unix(),
-					"name":         name,
-					"code":         code,
+		pid, _ := c.GetPostForm("pid")
+		if Mgo.Count("block_classify_info", map[string]interface{}{
+			"delete": false,
+			"$or": []map[string]interface{}{
+				map[string]interface{}{
+					"name": name,
 				},
-			}) {
-				status = 1
-			}
+				map[string]interface{}{
+					"code": code,
+				},
+			},
+		}) == 1 {
+			status = -1
 		} else {
-			vid, _ := c.GetPostForm("vid")
-			pid, _ := c.GetPostForm("pid")
-			unix := time.Now().Unix()
-			if Mgo.Count("block_classify_info", map[string]interface{}{
-				"delete": false,
-				"$or": []map[string]interface{}{
-					map[string]interface{}{
-						"name": name,
+			_id, _ := c.GetPostForm("_id")
+			if _id != "" {
+				if Mgo.UpdateById("block_classify_info", _id, bson.M{
+					"$set": bson.M{
+						"l_updatetime": time.Now().Unix(),
+						"name":         name,
+						"code":         code,
 					},
-					map[string]interface{}{
-						"code": code,
-					},
-				},
-			}) == 1 {
-				status = -1
+				}) {
+					status = 1
+				}
 			} else {
+				vid, _ := c.GetPostForm("vid")
+				unix := time.Now().Unix()
 				if Mgo.Save("block_classify_info", bson.M{
 					"delete":       false,
 					"name":         name,
@@ -444,24 +444,24 @@ func init() {
 	})
 	Admin.POST("/version/blockclassify_tag_save", func(c *gin.Context) {
 		status := 0
-		_id, _ := c.GetPostForm("_id")
 		name, _ := c.GetPostForm("name")
-		if _id != "" {
-			if Mgo.UpdateById("block_classify_tag", _id, bson.M{
-				"$set": bson.M{
-					"l_updatetime": time.Now().Unix(),
-					"name":         name,
-				},
-			}) {
-				status = 1
-			}
+		pid, _ := c.GetPostForm("pid")
+		if Mgo.Count("block_classify_tag", map[string]interface{}{"delete": false, "name": name, "pid": pid}) == 1 {
+			status = -1
 		} else {
-			vid, _ := c.GetPostForm("vid")
-			pid, _ := c.GetPostForm("pid")
-			unix := time.Now().Unix()
-			if Mgo.Count("block_classify_tag", map[string]interface{}{"delete": false, "name": name, "pid": pid}) == 1 {
-				status = -1
+			_id, _ := c.GetPostForm("_id")
+			if _id != "" {
+				if Mgo.UpdateById("block_classify_tag", _id, bson.M{
+					"$set": bson.M{
+						"l_updatetime": time.Now().Unix(),
+						"name":         name,
+					},
+				}) {
+					status = 1
+				}
 			} else {
+				vid, _ := c.GetPostForm("vid")
+				unix := time.Now().Unix()
 				if Mgo.Save("block_classify_tag", bson.M{
 					"delete":       false,
 					"name":         name,

+ 55 - 13
src/jy/extract/extract.go

@@ -23,13 +23,13 @@ import (
 )
 
 var (
-	lock    sync.RWMutex
-	cut     = ju.NewCut()                          // extracts the body text and cleans it
-	ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
-	TaskList      map[string]*ExtractTask          // task list
-	ClearTaskList map[string]*ClearTask            // clear-task list
-	saveLimit     = 200                            // batch size for saving extraction logs
-	PageSize      = 5000                           // query page size
+	lock          sync.RWMutex
+	cut           = ju.NewCut()                          // extracts the body text and cleans it
+	ExtLogs       map[*TaskInfo][]map[string]interface{} // extraction logs
+	TaskList      map[string]*ExtractTask                // task list
+	ClearTaskList map[string]*ClearTask                  // clear-task list
+	saveLimit     = 200                                  // batch size for saving extraction logs
+	PageSize      = 5000                                 // query page size
 	Fields        = `{"title":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -683,7 +683,7 @@ func ExtRegCore(extfrom string, doc map[string]interface{}, j *ju.Job, in *RegLu
 				}
 				if tmps, ok := v.([]map[string]interface{}); ok {
 					for _, tmp := range tmps {
-						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"],}
+						field := &ju.ExtField{Field: k, Code: qu.ObjToString(tmp["code"]), RuleText: qu.ObjToString(tmp["ruletext"]), Type: qu.ObjToString(tmp["type"]), MatchType: qu.ObjToString(tmp["matchtype"]), ExtFrom: extfrom, Value: tmp["value"]}
 						if extfrom == "title" {
 							field.Score = 4
 						}
@@ -1204,6 +1204,7 @@ type FieldValue struct {
 func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 	qu.Try(func() {
 		doc, result, _id, values := funcAnalysis(j)
+		go otherNeedSave(j, result, e)
 		//从排序结果中取值
 		tmp := map[string]interface{}{} //抽取值
 		for key, val := range values {
@@ -1278,10 +1279,12 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 			}
 			// log.Debug("============", j.HasBrand, j.HasGoods, j.HasKey, j.HasTable, j.BrandData)
 		}
-		//分包和标签
-		if ju.Config["saveblock"].(bool) {
-			blocks := make([]ju.BlockAndTag, 0)
-			for _, v := range j.Block {
+		//所有kv组成的字符串
+		var kvtext bytes.Buffer
+		blocks := make([]ju.BlockAndTag, 0)
+		for _, v := range j.Block {
+			//分包和标签
+			if ju.Config["saveblock"].(bool) {
 				xx, _ := json.Marshal(v)
 				tmpblock := new(ju.TmpBlock)
 				err := json.Unmarshal(xx, &tmpblock)
@@ -1294,8 +1297,33 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				}
 				blocks = append(blocks, ju.BlockAndTag{v.Tag, tmpblock})
 			}
+			//把所有kv组装成一个字符串,存库
+			for ck, cv := range v.ColonKV.Kv {
+				kvtext.WriteString(ck)
+				kvtext.WriteString(":")
+				kvtext.WriteString(cv)
+				kvtext.WriteString(" ")
+			}
+			for sk, sv := range v.SpaceKV.Kv {
+				kvtext.WriteString(sk)
+				kvtext.WriteString(":")
+				kvtext.WriteString(sv)
+				kvtext.WriteString(" ")
+			}
+			for tk, tv := range v.TableKV.Kv {
+				kvtext.WriteString(tk)
+				kvtext.WriteString(":")
+				kvtext.WriteString(tv)
+				kvtext.WriteString(" ")
+			}
+		}
+		if kvtext.Len() > 0 {
+			tmp["kvtext"] = kvtext.String()
+		}
+		if len(blocks) > 0 {
 			tmp["blocks"] = blocks
 		}
+		tmp["extract_content"] = j.Content
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
 				for field, _ := range e.Fields {
@@ -1343,6 +1371,20 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		log.Debug("AnalysisSaveResult err", err)
 	})
 }
+
+// otherNeedSave is a placeholder for saving auxiliary tag data: any new tag found
+// on kv pairs, tables or blocks should be stored (val, type, times, firstid, createtime,
+// judged field). For now it only derives the target collection name and writes nothing.
+func otherNeedSave(j *ju.Job, result map[string][]*ju.ExtField, e *ExtractTask) {
+	coll := e.TaskInfo.TestColl // test collection name, if one is configured
+	if coll == "" {
+		coll = "extract_tag_result" // default collection when no test collection is set
+	} else {
+		coll += "_tag" // otherwise use the test collection name with a "_tag" suffix
+	}
+	//for _,v := range j.ColonKV
+}
+
 func rangeBlockToJson(j *ju.Block, tmpblock ju.TmpBlock) (b *ju.TmpBlock) {
 	if j == nil {
 		return nil
@@ -1479,7 +1521,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 { //reids未找到,执行规则匹配
+	if i == 0 {                            //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 9 - 13
src/jy/extract/extractInit.go

@@ -1243,7 +1243,12 @@ func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
 	tag_map := map[string]ju.Tags{}
 	for _, v := range *classify_tag {
 		pid := qu.ObjToString(v["pid"])
-		tag_map[pid] = append(tag_map[pid], &ju.Tag{Value: qu.ObjToString(v["name"])})
+		name := qu.ObjToString(v["name"])
+		tag := &ju.Tag{Value: name}
+		if strings.HasPrefix(name, "reg__") { // a "reg__" prefix marks the rest of the name as a regular expression
+			tag.TagReg = regexp.MustCompile(strings.TrimPrefix(name, "reg__")) // TrimPrefix removes exactly the marker; TrimLeft takes a character set and would also strip leading r/e/g/_ from the pattern
+		}
+		tag_map[pid] = append(tag_map[pid], tag)
 	}
 	//
 	info_map := map[string][]*ju.NameCode{}
@@ -1252,14 +1257,8 @@ func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
 		pid := qu.ObjToString(v["pid"])
 		_id := qu.BsonIdToSId(v["_id"])
 		name := qu.ObjToString(v["name"])
-		info_tag[name] = &ju.TagFile{
-			Name:  name,
-			Items: tag_map[_id],
-		}
-		info_map[pid] = append(info_map[pid], &ju.NameCode{
-			Name: name,
-			Code: qu.ObjToString(v["code"]),
-		})
+		info_tag[name] = &ju.TagFile{Name: name, Items: tag_map[_id]}
+		info_map[pid] = append(info_map[pid], &ju.NameCode{Name: name, Code: qu.ObjToString(v["code"])})
 	}
 	classify_map := map[string][]*ju.NameCode{}
 	for _, v := range *classify {
@@ -1271,8 +1270,5 @@ func (e *ExtractTask) InitBlockClassify() *ju.BlockClassify {
 			classify_map[vv] = append(classify_map[vv], info_map[_id]...)
 		}
 	}
-	return &ju.BlockClassify{
-		Type:     classify_map,
-		Classify: info_tag,
-	}
+	return &ju.BlockClassify{Type: classify_map, Classify: info_tag}
 }

+ 13 - 1
src/jy/pretreated/colonkv.go

@@ -4,6 +4,7 @@ package pretreated
 import (
 	"jy/clear"
 	. "jy/util"
+	jutil "jy/util"
 	qutil "qfw/util"
 	"regexp"
 	"sort"
@@ -200,6 +201,17 @@ func (ce *ColonkvEntity) blockTitleKV(title, key string) string {
 
 //根据配置文件中的规则,格式化正文
 func formatText(content, key string) string {
+	segment := DivideSegment(content)
+	newCon := ""
+	for _, v := range segment {
+		if v.Index > len(segment)-3 {
+			if regexp.MustCompile("上一篇(.+)下一篇").MatchString(v.Text) {
+				break
+			}
+		}
+		newCon += v.Text + "\n"
+	}
+	content = regEndWrap.ReplaceAllString(newCon, "")
 	for _, v := range FormatTextMap[key] {
 		reg, _ := v["reg"].(*regexp.Regexp)
 		separator, isString := v["separator"].(string)
@@ -681,7 +693,7 @@ func KvTagsToKV(findkvs []*Kv, title string, tagdbs []string, from int) (map[str
 			//Debug(key)
 			//continue
 			//由跳过修改为保留
-			tags = []*Tag{&Tag{k, -100, nil}}
+			tags = []*Tag{&Tag{k, jutil.RetainKvWeight, nil}}
 		}
 		for _, tk := range tags {
 			//分包过来给kv打标签的时候,只取第一个,后面的不覆盖

+ 4 - 1
src/jy/pretreated/division.go

@@ -246,7 +246,7 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock) ([]*ut
 				titles = append(titles, sv)
 			}
 		}
-		block.Classify = ruleBlock.Classify.GetClassify(tp, titles)
+		block.Classify, block.NotClassifyTitles = ruleBlock.Classify.GetClassify(tp, titles)
 		tagsToBlocks(blocks, block)
 		//log.Println(index, sv, splitTitles)
 		//log.Println(blockText)
@@ -256,6 +256,9 @@ func DivideBlock(tp, content string, from int, ruleBlock *util.RuleBlock) ([]*ut
 	if len(blocks) > 0 {
 		//头
 		if headBlock != nil {
+			if tp == "招标" {
+				headBlock.Classify = map[string]bool{"bidcondition": true}
+			}
 			returnBlocks = append(returnBlocks, headBlock)
 		}
 		//中间块

+ 16 - 34
src/jy/util/article.go

@@ -71,21 +71,22 @@ type RuleBlock struct {
 
 // Block describes one block of text.
 type Block struct {
-	Tags        []Tags                   // tags applied to the block; can serve as the basis for data extraction
-	Title       string                   // block title
-	Titles      []string                 // the block titles obtained after splitting the title
-	Index       int                      // block index
-	Text        string                   // block content
-	Start       int                      // start index
-	End         int                      // end index
-	ColonKV     *JobKv                   // colon kv (the KV values parsed out)
-	TableKV     *JobKv                   // table kv (the KV values parsed out)
-	SpaceKV     *JobKv                   // space kv (the KV values parsed out)
-	BPackage    *BlockPackage            // sub-package information
-	Tag         map[string]bool          // block tags
-	Block       []*Block                 // child blocks
-	Classify    map[string]bool          // block classification
-	Winnerorder []map[string]interface{} // ranking of the winning-bid candidates found in the block
+	Tags              []Tags                   // tags applied to the block; can serve as the basis for data extraction
+	Title             string                   // block title
+	Titles            []string                 // the block titles obtained after splitting the title
+	NotClassifyTitles []string                 // titles that did not fall into any classification
+	Index             int                      // block index
+	Text              string                   // block content
+	Start             int                      // start index
+	End               int                      // end index
+	ColonKV           *JobKv                   // colon kv (the KV values parsed out)
+	TableKV           *JobKv                   // table kv (the KV values parsed out)
+	SpaceKV           *JobKv                   // space kv (the KV values parsed out)
+	BPackage          *BlockPackage            // sub-package information
+	Tag               map[string]bool          // block tags
+	Block             []*Block                 // child blocks
+	Classify          map[string]bool          // block classification
+	Winnerorder       []map[string]interface{} // ranking of the winning-bid candidates found in the block
 }
 
 //块
@@ -173,22 +174,3 @@ type NameCode struct {
 	Name string
 	Code string
 }
-
-type BlockClassify struct {
-	Type     map[string][]*NameCode
-	Classify map[string]*TagFile
-}
-
-func (b *BlockClassify) GetClassify(tp string, src []string) map[string]bool {
-	m := map[string]bool{}
-	for _, v := range src {
-		v = TrimLRAll(v, "")
-		for _, vv := range b.Type[tp] {
-			if ok, _ := b.Classify[vv.Name].Match(v); ok {
-				m[vv.Code] = true
-				break
-			}
-		}
-	}
-	return m
-}

+ 27 - 0
src/jy/util/blockclassify.go

@@ -0,0 +1,27 @@
+package util
+
+type BlockClassify struct { // lookup tables read by GetClassify
+	Type     map[string][]*NameCode // key is the tp passed to GetClassify; value is the name/code entries tried for it
+	Classify map[string]*TagFile    // key is a NameCode.Name; value is the TagFile whose Match is run on a title
+}
+
+// GetClassify runs every title in src (after TrimLRAll) against the entries of
+// b.Type[tp], in order, and stops at the first entry whose TagFile matches.
+// It returns the set of matched codes and the titles that matched no entry.
+func (b *BlockClassify) GetClassify(tp string, src []string) (map[string]bool, []string) {
+	m := map[string]bool{}
+	array := []string{} // non-nil on purpose: an empty result stays an empty slice
+titles:
+	for _, v := range src {
+		v = TrimLRAll(v, "")
+		for _, vv := range b.Type[tp] {
+			// NOTE(review): a Name missing from b.Classify yields a nil *TagFile here — confirm Match tolerates that.
+			if ok, _ := b.Classify[vv.Name].Match(v); ok {
+				m[vv.Code] = true
+				continue titles
+			}
+		}
+		array = append(array, v)
+	}
+	return m, array
+}

+ 3 - 1
src/jy/util/tagmatch.go

@@ -88,7 +88,9 @@ func binarysearch(arr Tags, begin, end int, key, types string) int {
 			return binarysearch(arr, mid+1, end, key, types)
 		}
 	} else {
-		if arr[mid].Value == key {
+		if arr[mid].TagReg != nil && arr[mid].TagReg.MatchString(key) {
+			return mid
+		} else if arr[mid].Value == key {
 			return mid
 		} else if arr[mid].Value > key {
 			return binarysearch(arr, begin, mid-1, key, types)

+ 4 - 3
src/jy/util/util.go

@@ -21,9 +21,10 @@ var BrandRules map[string]map[string]string
 var GoodsConfig []string
 var BrandConfig []string
 
-var GoodsGet *DFA     //商品
-var BrandGet *DFA     //品牌
-var IsBrandGoods bool //是否开启品牌抽取
+var GoodsGet *DFA           //商品
+var BrandGet *DFA           //品牌
+var IsBrandGoods bool       //是否开启品牌抽取
+var RetainKvWeight = -99999 //没有标准化的kv的权重
 
 func init() {
 	syncint = make(chan bool, 1)

+ 1 - 1
src/web/templates/admin/blockclassify_tag.html

@@ -12,7 +12,7 @@
 		</h1>
 		<ol class="breadcrumb">
 		  <li><a href="/admin/version"><i class="fa fa-dashboard"></i>抽取版本</a></li>
-		  <li class="active"><a href="/admin/version/blockinfo?id={{.vid}}">分块配置</a></li>
+		  <li class="active"><a href="/admin/version/blockinfo?vid={{.vid}}">分块配置</a></li>
 		</ol>
     </section>
   <!-- Main content -->