Procházet zdrojové kódy

抽取分包信息

zhangjinkun před 6 roky
rodič
revize
e362b4b43e

+ 1 - 0
src/config.json

@@ -3,5 +3,6 @@
     "mgodb": "192.168.3.207:27082",
     "dbsize": 5,
     "dbname": "extract_kf",
+    "saveresult": true,
     "fieldscore": true
 } 

+ 0 - 1
src/jy/admin/buyermanager/buyermanage.go

@@ -1,4 +1,3 @@
-//采购单位管理,验证规则放在版本上,验证不是过滤。中标单位也一样
 package buyermanager
 
 import (

+ 114 - 0
src/jy/extract/extpackage.go

@@ -0,0 +1,114 @@
+// extpackage
+package extract
+
+import (
+	"jy/clear"
+	ju "jy/util"
+	qu "qfw/util"
+)
+
+// PackageDetail builds the per-package extraction result of a job.
+//
+// For every entry of j.BlockPackage it creates a result map holding the
+// package's text, origin, type and winner order, then fills in one value per
+// field of e.Tag. The tags of a field are tried in order; for each tag the
+// package's TableKV, ColonKV and SpaceKV are consulted in that order and the
+// first hit wins. When the job has exactly one package, every hit is also
+// appended to j.Result as an ExtField with score 0. The collected maps are
+// stored in j.PackageInfo.
+//
+// Finally every value found in j.PackageInfo is passed through
+// clear.DoClearFn together with j.Content, using the entry of e.ClearFn
+// registered under the value's key.
+func PackageDetail(j *ju.Job, e *ExtractTask) {
+	packagenum := len(j.BlockPackage)
+	if packagenum > 0 {
+		packageResult := make(map[string]map[string]interface{}, packagenum)
+		for pkName, pkg := range j.BlockPackage {
+			if pkg == nil {
+				// Nothing to read from; skip instead of dereferencing a nil package.
+				continue
+			}
+			sonJobResult := map[string]interface{}{
+				"text":        pkg.Text,
+				"origin":      pkg.Origin,
+				"type":        pkg.Type,
+				"winnerorder": pkg.WinnerOrder,
+			}
+			// record stores a hit in the package result and, for single-package
+			// jobs only, mirrors it into the job-level result.
+			record := func(field, kvType string, val interface{}) {
+				sonJobResult[field] = val
+				if packagenum != 1 {
+					return
+				}
+				if j.Result == nil {
+					// Writing to a nil map panics.
+					j.Result = map[string][]*ju.ExtField{}
+				}
+				j.Result[field] = append(j.Result[field], &ju.ExtField{
+					Field:     field,
+					Code:      "package",
+					RuleText:  "package",
+					Type:      kvType,
+					MatchType: "tag_string",
+					ExtFrom:   "package",
+					Value:     val,
+					Score:     0,
+				})
+			}
+			for k, tags := range e.Tag {
+				for _, tag := range tags {
+					// Direct lookups replace scanning each map for an equal key.
+					if pkg.TableKV != nil {
+						if val, ok := pkg.TableKV.Kv[tag.Key]; ok {
+							record(k, "table", val)
+							break
+						}
+					}
+					if pkg.ColonKV != nil {
+						if val, ok := pkg.ColonKV.Kv[tag.Key]; ok {
+							record(k, "colon", val)
+							break
+						}
+					}
+					if pkg.SpaceKV != nil {
+						if val, ok := pkg.SpaceKV.Kv[tag.Key]; ok {
+							record(k, "space", val)
+							break
+						}
+					}
+				}
+			}
+			// If a winner-candidate ranking exists, the first candidate's entity
+			// name and price fill in a missing winner / bid amount. They also
+			// override the extracted values when the package is not marked
+			// accurate and that candidate is ranked number one.
+			if len(pkg.WinnerOrder) > 0 {
+				first := pkg.WinnerOrder[0]
+				topRanked := !pkg.Accuracy && qu.Int64All(first["sort"]) == 1
+				if qu.ObjToString(sonJobResult["winner"]) == "" || (topRanked && qu.ObjToString(first["entname"]) != "") {
+					sonJobResult["winner"] = first["entname"]
+				}
+				if qu.Float64All(sonJobResult["bidamount"]) == 0 || (topRanked && qu.Float64All(first["price"]) > 0) {
+					sonJobResult["bidamount"] = first["price"]
+				}
+			}
+			packageResult[pkName] = sonJobResult
+		}
+		if len(packageResult) > 0 {
+			j.PackageInfo = packageResult
+		}
+	}
+	// Clean every package value with the functions registered for its key.
+	for _, packageResult := range j.PackageInfo {
+		for key, val := range packageResult {
+			data := clear.DoClearFn(e.ClearFn[key], []interface{}{val, j.Content})
+			if len(data) > 0 {
+				// Keep the previous value when cleaning returns nothing.
+				packageResult[key] = data[0]
+			}
+		}
+	}
+}

+ 19 - 12
src/jy/extract/extract.go

@@ -1,8 +1,6 @@
 package extract
 
 import (
-	"encoding/json"
-	//"encoding/json"
 	"fmt"
 	"jy/clear"
 	db "jy/mongodbutil"
@@ -204,17 +202,16 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 	//		log.Println(bl.ColonKV.Kv)
 	//	}
 	//	for k, v := range j.BlockPackage {
-	//		bs, _ := json.Marshal(v.TableKV)
-	//		log.Println(k, string(bs), v.WinnerOrder)
+	//		//bs, _ := json.Marshal(v.TableKV)
+	//		log.Println(k, v.WinnerOrder)
 	//	}
+	//log.Println("Winnerorder", j.Winnerorder)
 	qu.Try(func() {
 		doc := *j.Data
 		//全局前置规则,结果覆盖doc属性
 		for _, v := range e.RulePres {
 			doc = ExtRegPre(doc, j, v, e.TaskInfo)
 		}
-		//log.Println("全局前置规则", doc)
-
 		//抽取规则
 		for _, vc := range e.RuleCores {
 			tmp := ju.DeepCopy(doc).(map[string]interface{})
@@ -239,7 +236,6 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 				ExtRegBack(j, v, e.TaskInfo)
 			}
 			//log.Println("抽取-后置规则", tmp)
-
 		}
 		//全局后置规则
 		for _, v := range e.RuleBacks {
@@ -252,10 +248,11 @@ func (e *ExtractTask) ExtractProcess(j *ju.Job) {
 				v.Value = data[0]
 			}
 		}
-		bs, _ := json.Marshal(j.Result)
-		log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
+		PackageDetail(j, e) //处理分包信息
+		//		bs, _ := json.Marshal(j.Result)
+		//		log.Println("抽取结果", j.Title, j.SourceMid, string(bs))
 		//分析抽取结果并保存 todo
-		AnalysisSaveResult(j.Data, j.Result, e.TaskInfo)
+		AnalysisSaveResult(j, e.TaskInfo)
 
 	}, func(err interface{}) {
 		log.Println(err)
@@ -704,7 +701,9 @@ type FieldValue struct {
 }
 
 //分析抽取结果并保存
-func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.ExtField, task *TaskInfo) {
+func AnalysisSaveResult(j *ju.Job, task *TaskInfo) {
+	doc := j.Data
+	result := j.Result
 	_id := qu.BsonIdToSId((*doc)["_id"])
 	iscore, _ := ju.Config["fieldscore"].(bool)
 	if iscore { //打分
@@ -750,6 +749,12 @@ func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.Ext
 		}
 	}
 	resulttmp["result"] = result
+	if len(j.BlockPackage) > 0 { //分包详情
+		resulttmp["epackage"] = j.BlockPackage
+	}
+	if len(j.PackageInfo) > 0 { //分包信息
+		resulttmp["package"] = j.PackageInfo
+	}
 	for k, v := range *doc {
 		if resulttmp[k] == nil { //&& (k != "detail" || k != "contenthtml") {
 			resulttmp[k] = v
@@ -766,7 +771,9 @@ func AnalysisSaveResult(doc *map[string]interface{}, result map[string][]*ju.Ext
 		if len(tmp) > 0 { //保存抽取结果
 			task.DB.Update(task.SaveColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": tmp}, true, false)
 		}
-		db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": resulttmp}, true, false)
+		if b, ok := ju.Config["saveresult"].(bool); ok && b {
+			db.Mgo.Update("extract_result", `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": resulttmp}, true, false)
+		}
 	} else { //测试结果
 		db.Mgo.Update(task.TestColl, `{"_id":"`+_id+`"}`, map[string]interface{}{"$set": resulttmp}, true, false)
 	}

+ 51 - 0
src/jy/extract/isextract.go

@@ -0,0 +1,51 @@
+// isextract.go
+//标题、全文正则匹配,判断是否继续抽取
+package extract
+
+import (
+	"log"
+	"regexp"
+
+	qu "qfw/util"
+)
+
+var N_extractMap map[string][]string // raw pattern strings per key, filled by init from ./res/isextract.json
+var N_extract map[string][]*regexp.Regexp // patterns of N_extractMap compiled by init, under the same keys
+
+// init reads the pattern lists from ./res/isextract.json into N_extractMap
+// and compiles them into N_extract under the same keys. Patterns that do not
+// compile are logged and left out.
+func init() {
+	qu.ReadConfig("./res/isextract.json", &N_extractMap)
+	N_extract = make(map[string][]*regexp.Regexp, len(N_extractMap))
+	for key, patterns := range N_extractMap {
+		for _, pattern := range patterns {
+			re, err := regexp.Compile(pattern)
+			if err != nil {
+				// Storing the nil result would panic as soon as the entry is
+				// used for matching, so report the pattern and skip it.
+				log.Println("isextract.json: invalid pattern", key, pattern, err)
+				continue
+			}
+			N_extract[key] = append(N_extract[key], re)
+		}
+	}
+}
+
+// IsExtract reports whether extraction of the given field should go ahead.
+//
+// It returns false as soon as one of the regular expressions registered in
+// N_extract for the field matches the title, and true otherwise, including
+// when nothing is registered for the field. The content argument is not used
+// at present because the full-text check below is disabled.
+func IsExtract(filed, title, content string) bool {
+	for _, re := range N_extract[filed] {
+		// Nil entries are skipped: calling a method on a nil *regexp.Regexp panics.
+		// MatchString answers "is there any match" without collecting every match.
+		if re != nil && re.MatchString(title) {
+			return false
+		}
+	}
+	/*
+		Disabled full-text check, kept for reference:
+
+		for _, re := range N_extract["filter"] {
+			content = re.ReplaceAllString(content, "")
+		}
+		for _, re := range N_extract[filed] {
+			if re.MatchString(content) {
+				return false
+			}
+		}
+	*/
+	return true
+}

+ 2 - 2
src/jy/pretreated/analystep.go

@@ -42,7 +42,7 @@ func AnalyStart(job *util.Job) {
 					}
 					bl.Text = t
 					bl.ColonKV = GetKVAll(t, bl.Title, 1)
-					bl.SpaceKV = spacekvEntity.entrance(t, bl.Title)
+					bl.SpaceKV = SspacekvEntity.Entrance(t, bl.Title)
 				}
 			}
 			//块中再查找表格(块,处理完把值赋到块)
@@ -76,7 +76,7 @@ func AnalyStart(job *util.Job) {
 		}
 		//调用kv解析
 		bl.ColonKV = GetKVAll(newCon, "", 1)
-		bl.SpaceKV = spacekvEntity.entrance(newCon, "")
+		bl.SpaceKV = SspacekvEntity.Entrance(newCon, "")
 		job.Block = append(job.Block, bl)
 	}
 }

+ 6 - 22
src/jy/pretreated/analytable.go

@@ -139,7 +139,6 @@ func CommonDataAnaly(k, tabletag, tabledesc string, v interface{}) (k1 []string,
 	if len(res) > 0 {
 		b = true
 		for _, t1 := range res {
-			//u.Debug(k, k1, t1.Value, t1.Weight)
 			k1 = append(k1, t1.Value)
 			weight = append(weight, t1.Weight)
 		}
@@ -527,7 +526,6 @@ func AnalyTableV2(tabs []*goquery.Selection, toptype, blockTag, con string, ityp
 	}
 	//解析表格集
 	tabres.Analy()
-	//u.Debug(tabres.SortKV.Map)
 	return
 }
 
@@ -1284,8 +1282,6 @@ func (table *Table) FindKV() {
 				continue
 			}
 			for _, td := range tr.TDs {
-				//u.Debug(td.Val, td.BH, td.StartRow, td.StartCol)
-				//u.Debug(td.BH, td.Val)
 				/**
 				rt := table.StartAndEndRation[fmtkey("r", td.StartCol, td.EndCol)]
 				if rt != nil {
@@ -1400,6 +1396,7 @@ func (table *Table) FindKV() {
 			}
 		}
 	}
+	//log.Println("FindKV", table.SortKV.Map)
 }
 
 //获取中标人顺序
@@ -1456,9 +1453,8 @@ func GetBidSort(str string, n int) int {
 //查找每一个单元格的表头,调用FindNear
 func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 	near := table.FindNear(td, direct)
-	//u.Debug(td.Val, near, direct)
 	//	if near != nil {
-	//		u.Debug(td.Val, near.Val)
+	//		log.Println("td", near.Val, td.Val)
 	//	}
 	if near != nil && near.BH && (near.KeyDirect == vdirect || near.KeyDirect == 0) && (near.KVDirect == direct || near.KVDirect == 0) && near.KVDirect < 3 {
 		near.KVDirect = direct
@@ -1473,7 +1469,6 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 		if val != nil {
 			curpos := table.SortKV.Index[key]
 			thistr := table.kTD[curpos]
-			//u.Debug(curpos, key, thistr == near)
 			if thistr != near {
 				near.Val += "_"
 				for table.SortKV.Map[near.Val] != nil {
@@ -1559,7 +1554,7 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 			tkey := fmtkey("k", near.TR.RowPos, near.ColPos)
 			table.SortKV.ReplaceKey(key, val, tkey)
 		} else {
-			//u.Debug(near.Val, td.Val, val)
+			//log.Println("AddKey", near.Val, td.Val, val)
 			table.SortKV.AddKey(key, val)
 			pos := table.SortKV.Index[key]
 			if barr {
@@ -1582,16 +1577,10 @@ func (table *Table) FindTdVal(td *TD, direct, vdirect int) (b bool) {
 				}
 				table.kTD[pos] = near
 			}
-
 		}
 		b = true
 	}
-	//	 else {
-	//		u.Debug(direct, near == nil, td.Val, td.StartRow, td.EndRow, td.StartCol)
-	//		if near != nil {
-	//			u.Debug(near.Val, near.BH, near.StartRow, near.EndRow, near.KVDirect, near.KeyDirect)
-	//		}
-	//	}
+	//log.Println("map", b, table.SortKV.Map)
 	return
 }
 
@@ -1656,9 +1645,6 @@ func (tn *Table) CheckMultiPackageByTable() (b bool, index []string) {
 	//是数组且能找到标段之类的提示
 	arr_count := 0
 	key_index := -1
-	//	for k, v := range tn.SortKV.Map["成交供应商"].([]string) {
-	//		u.Debug(k, v)
-	//	}
 	hasPkgTd := map[string]bool{}
 	for in, k := range tn.SortKV.Keys {
 		if excludeKey.MatchString(BracketsTextReg.ReplaceAllString(k, "")) {
@@ -1817,8 +1803,6 @@ func (tn *Table) CheckMultiPackageByTable() (b bool, index []string) {
 			}
 		}
 	}
-	//	u.Debug(tn.SortKV.Keys)
-	//	u.Debug(tn.SortKV.Map)
 	//	u.Debug(index)
 	//过滤重复及标准化!
 	standIndex := []string{}
@@ -1930,14 +1914,14 @@ func (tn *Table) CheckMultiPackageByTable() (b bool, index []string) {
 				} else if val, bvs := v1.(string); bvs && len(index) == 1 {
 					//删除子包的kv
 					k1tags, _, _, _, _ := CommonDataAnaly(k1, "", "", val)
-					//k1tags := u.GetTags(k1)
 					if !(len(k1tags) > 0 && regexp.MustCompile("^(项目|开标|采购单位|招标机构)").MatchString(k1tags[0])) { //(k1tags[0].Value == "采购单位" || k1tags[0].Value == "项目编号")) {
-						//u.Debug(k1, val)
+						//log.Println("remove", k1, val)
 						tn.assemblePackage(k1, val, index[0])
 						tn.SortKV.RemoveKey(k1)
 					}
 					//u.Debug("----==2==-------", k1)
 				}
+
 			}
 		}
 	} else {

+ 3 - 3
src/jy/pretreated/colonkv.go

@@ -102,7 +102,7 @@ func (ce *ColonkvEntity) getColonKv(con, title string, from int) []*Kv {
 		con = RemoveWarpOfTdVal(con)
 	}
 	findkvs := []*Kv{}
-	lines := spacekvEntity.getLines(con)
+	lines := SspacekvEntity.getLines(con)
 	for index, line := range lines {
 		res := regKV.FindAllStringSubmatch(line, -1)
 		if len(res) > 0 {
@@ -158,12 +158,12 @@ func (ce *ColonkvEntity) getColonKv(con, title string, from int) []*Kv {
 //冒号kv和空格kv结合
 func (ce *ColonkvEntity) getColonSpaceKV(con string) []*Kv {
 	con = colonkvEntity.processText(con)
-	lines := spacekvEntity.getLines(con)
+	lines := SspacekvEntity.getLines(con)
 	kvMaps := []*Kv{}
 	for _, line := range lines {
 		kvs := colonkvEntity.getColonKv(line, "", 1)
 		if len(kvs) == 0 {
-			kv := spacekvEntity.divideKV(line)
+			kv := SspacekvEntity.divideKV(line)
 			if kv != nil {
 				kvMaps = append(kvMaps, kv...)
 			}

+ 4 - 4
src/jy/pretreated/division.go

@@ -273,7 +273,7 @@ func DivideBlock(content string, from int) ([]*util.Block, int) {
 		//解析kv
 		newText := TextAfterRemoveTable(bl.Text)
 		bl.ColonKV = GetKVAll(newText, bl.Title, from)
-		bl.SpaceKV = spacekvEntity.entrance(newText, bl.Title)
+		bl.SpaceKV = SspacekvEntity.Entrance(newText, bl.Title)
 		//正则抽取的时候有时需要匹配换行或者句号,这里在解析完kv之后,在块结尾添加换行和句号
 		bl.Text = appendWarpStop(bl.Text)
 	}
@@ -439,7 +439,7 @@ func FindPackageFromBlocks(blocks *[]*util.Block, title string) (blockPackage ma
 		if ok && false {
 			v.Text = surplusText
 			v.ColonKV = GetKVAll(surplusText, v.Title, 1)
-			v.SpaceKV = spacekvEntity.entrance(surplusText, v.Title)
+			v.SpaceKV = SspacekvEntity.Entrance(surplusText, v.Title)
 		}
 	}
 	return
@@ -608,7 +608,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 					(*blockPackage)[index].ColonKV.Kv[kv_k] = kv_v
 				}
 				//合并空格kv
-				spaceJobKv := spacekvEntity.entrance(text, "")
+				spaceJobKv := SspacekvEntity.Entrance(text, "")
 				for kv_k, kv_v := range spaceJobKv.Kv {
 					if kv_v == "" {
 						continue
@@ -637,7 +637,7 @@ func divisionPackageChild(blockPackage *map[string]*util.BlockPackage, content,
 					}
 				}
 				newBpkg.ColonKV = finalKv
-				newBpkg.SpaceKV = spacekvEntity.entrance(text, "")
+				newBpkg.SpaceKV = SspacekvEntity.Entrance(text, "")
 				(*blockPackage)[index] = newBpkg
 			}
 		}

+ 2 - 2
src/jy/pretreated/spacekv.go

@@ -10,13 +10,13 @@ import (
 type SpacekvEntity struct{}
 
 var (
-	spacekvEntity   = &SpacekvEntity{}
+	SspacekvEntity  = &SpacekvEntity{}
 	filterLine      = regexp.MustCompile("[::,,。??'\"“”‘’·~!…+=|&*#$【】]")
 	filterSpaceKey  = regexp.MustCompile("[((][^((]+[))]")
 	excludeSpaceKey = regexp.MustCompile("[.、�\\[【{{〔<《\\]】}}〕>》]")
 )
 
-func (se *SpacekvEntity) entrance(text, title string) *util.JobKv {
+func (se *SpacekvEntity) Entrance(text, title string) *util.JobKv {
 	lines := se.getLines(text)
 	kvMaps := []*util.Kv{}
 	for _, line := range lines {

+ 1 - 1
src/jy/pretreated/winnerorder.go

@@ -136,7 +136,7 @@ func (wo *WinnerOrderEntity) getText(text string, blocks []string, reg_2 *regexp
 		text = reg_2.ReplaceAllString(text, "\n$1")
 	}
 	text = regReplWrapSpace.ReplaceAllString(text, "")
-	lines := spacekvEntity.getLines(text)
+	lines := SspacekvEntity.getLines(text)
 	text = ""
 	for k, v := range lines {
 		v = strings.TrimSpace(v)

+ 16 - 15
src/jy/util/article.go

@@ -2,21 +2,22 @@ package util
 
 //
 type Job struct {
-	SourceMid    string                   //数据源的MongoId
-	Category     string                   //类别
-	Content      string                   //正文
-	Title        string                   //标题
-	SpiderCode   string                   //爬虫代码
-	Domain       string                   //网站域名
-	Href         string                   //原文链接
-	City         string                   //城市
-	Province     string                   //省份
-	Data         *map[string]interface{}  //数据库源数据
-	Block        []*Block                 //分块
-	Result       map[string][]*ExtField   //结果
-	BuyerAddr    string                   //采购单位地址
-	BlockPackage map[string]*BlockPackage //块中的分包
-	Winnerorder  []map[string]interface{} //中标候选人排序
+	SourceMid    string                            //数据源的MongoId
+	Category     string                            //类别
+	Content      string                            //正文
+	Title        string                            //标题
+	SpiderCode   string                            //爬虫代码
+	Domain       string                            //网站域名
+	Href         string                            //原文链接
+	City         string                            //城市
+	Province     string                            //省份
+	Data         *map[string]interface{}           //数据库源数据
+	Block        []*Block                          //分块
+	Result       map[string][]*ExtField            //结果
+	BuyerAddr    string                            //采购单位地址
+	BlockPackage map[string]*BlockPackage          //块中的分包
+	Winnerorder  []map[string]interface{}          //中标候选人排序
+	PackageInfo  map[string]map[string]interface{} //分包信息
 }
 
 type ExtField struct {

+ 5 - 0
src/jy/util/tagmatch.go

@@ -7,8 +7,11 @@ import (
 	"regexp"
 	"sort"
 	"strings"
+	"sync"
 )
 
+var lock sync.Mutex
+
 //单条tag
 type Tag struct {
 	Value  string //
@@ -131,6 +134,7 @@ func GetAppointTags(src string, array []string) Tags {
 			m[v] = true
 		}
 	}
+	lock.Lock()
 	for k, v := range TagdbTable {
 		if len(m) > 0 && !m[k] {
 			continue
@@ -143,6 +147,7 @@ func GetAppointTags(src string, array []string) Tags {
 			})
 		}
 	}
+	lock.Unlock()
 	//sort.Sort(ret)
 	return ret
 }

+ 11 - 0
src/res/isextract.json

@@ -0,0 +1,11 @@
+{
+    "bidamount": [
+        "(不足三家|废标流标|废标公告|流标公告|ppp项目|PPP项目)"
+    ],
+    "winner": [
+        "(不足三家|废标流标|废标公告|流标公告|ppp项目|PPP项目)"
+    ],
+	"filter":[
+		"(?i)(PPP项目[::]否|非PPP项目)"
+	]
+}

+ 1 - 1
src/web/templates/admin/com_memu.html

@@ -33,7 +33,7 @@
             </span>
           	</a>
           	<ul class="treeview-menu">
-	            <li><a href="/admin/rule/pre"><i class="fa fa-circle-o"></i>菜单</a></li>
+	            <li><a href="/admin/buyermanager/list"><i class="fa fa-circle-o"></i>采购单位审核</a></li>
 			</ul>
         </li>
 		<li class="treeview">