瀏覽代碼

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

apple 5 年之前
父節點
當前提交
2c8d30ca14

+ 3 - 3
src/config.json

@@ -3,18 +3,18 @@
     "mgodb": "192.168.3.207:27082",
     "dbsize": 10,
     "dbname": "extract_kf",
-    "redis": "buyer=127.0.0.1:6379,winner=127.0.0.1:6379,agency=127.0.0.1:6379",
+    "redis": "buyer=192.168.3.207:1679,winner=192.168.3.207:1679,agency=192.168.3.207:1679",
     "elasticsearch": "http://192.168.3.11:9800",
     "elasticPoolSize": 30,
     "mergetable": "projectset",
     "mergetablealias": "projectset_v1",
-    "saveresult": false,
+    "saveresult": true,
     "qualityaudit": false,
     "saveblock": false,
     "filelength": 100000,
     "iscltlog": false,
     "brandgoods": false,
-    "udptaskid": "5dc92520d0fcef1a582f869d",
+    "udptaskid": "5cdd3025698414032c8322b1",
     "udpport": "1484",
     "nextNode": [
         {

+ 7 - 0
src/jy/clear/clear.go

@@ -27,6 +27,7 @@ func init() {
 	BindFn("getPhone", GetPhone)                 //取手机号
 	BindFn("chiToEng", ChiToEng)                 //中文符号转英文
 	BindFn("clearBuyerPerson", ClearBuyerPerson) //处理较长采购联系人
+	BindFn("clearNumber", ClearNumber)           //一般用于处理抽取联系人后带有电话的情况
 }
 
 //绑定清理方法
@@ -56,3 +57,9 @@ func GetPhone(data []interface{}) []interface{} {
 	data[0] = PhoneReg.FindString(fmt.Sprint(data[0]))
 	return data
 }
+
+// ClearNumber strips every digit run (clearNum) from fmt.Sprint(data[0]), stores the result back in data[0] and returns data; data[0] is indexed unconditionally, so an empty slice panics.
+func ClearNumber(data []interface{}) []interface{} {
+	data[0] = clearNum.ReplaceAllString(fmt.Sprint(data[0]), "")
+	return data
+}

+ 2 - 0
src/jy/clear/cutspace.go

@@ -12,6 +12,7 @@ var (
 	catSymbol      *regexp.Regexp
 	separateSymbol *regexp.Regexp
 	placeReg       *regexp.Regexp
+	clearNum       *regexp.Regexp
 )
 
 var spaces = []string{"\u3000", "\u2003", "\u00a0", "\t", "\r", "\n"}
@@ -22,6 +23,7 @@ func init() {
 	catSymbol, _ = regexp.Compile(`[]+`)
 	separateSymbol, _ = regexp.Compile("[\\s\u3000\u2003\u00a0\\n,,、/。|]")
 	placeReg, _ = regexp.Compile("^.*(公司|学(校)?|集团|单位|机构|企业|厂|场|院|所|店|中心|市|局|站|城|处|行|部|队|联合(会|体)|工作室)$")
+	clearNum, _ = regexp.Compile("\\d+")
 }
 
 var LableStr = "&?(amp|nbsp|#8266);?|(<).*?(>?)"

+ 1 - 1
src/jy/clear/specialsymbols.go

@@ -226,7 +226,7 @@ func RemoveAsy(text string) string {
 		if AsyReg.MatchString(first) {
 			textRune = textRune[1:]
 		}
-		if len(textRune) > 0 && AsyReg.MatchString(last) && len(text) > 0 {
+		if len(textRune) > 0 && AsyReg.MatchString(last) {
 			textRune = textRune[:len(textRune)-1]
 		}
 		text = string(textRune)

+ 7 - 15
src/jy/extract/extract.go

@@ -47,16 +47,12 @@ func StartExtractTestTask(taskId, startId, num, resultcoll, trackcoll string) bo
 	ext.InitSite()
 	ext.InitRulePres()
 	ext.InitRuleBacks(false)
-	ext.InitRuleBacks(true)
 	ext.InitRuleCore(false)
-	ext.InitRuleCore(true)
 	ext.InitPkgCore()
 	ext.InitBlockRule()
 	ext.InfoTypeList()
 	ext.InitTag(false)
-	ext.InitTag(true)
 	ext.InitClearFn(false)
-	ext.InitClearFn(true)
 	if ext.IsExtractCity { //版本上控制是否开始城市抽取
 		//初始化城市DFA信息
 		ext.InitCityInfo()
@@ -129,16 +125,12 @@ func StartExtractTaskId(taskId string) bool {
 	ext.InitSite()
 	ext.InitRulePres()
 	ext.InitRuleBacks(false)
-	ext.InitRuleBacks(true)
 	ext.InitRuleCore(false)
-	ext.InitRuleCore(true)
 	ext.InitPkgCore()
 	ext.InitBlockRule()
 	ext.InfoTypeList()
 	ext.InitTag(false)
-	ext.InitTag(true)
 	ext.InitClearFn(false)
-	ext.InitClearFn(true)
 	if ext.IsExtractCity { //版本上控制是否开始城市抽取
 		//初始化城市DFA信息
 		//ext.InitCityDFA()
@@ -290,12 +282,12 @@ func (e *ExtractTask) PreInfo(doc map[string]interface{}) (j, jf *ju.Job, isSite
 		subtype = "all"
 	}
 	toMap := qu.ObjToMap(doc["jsondata"])
-	//	log.Debug("toMap", toMap)
-	//	if toMap != nil {
-	//		if (*toMap)["extweight"] == nil {
-	//			(*toMap)["extweight"] = ju.Config["jsondata_extweight"]
-	//		}
-	//	}
+	//log.Debug("toMap", toMap)
+	if toMap != nil {
+		if (*toMap)["extweight"] == nil {
+			(*toMap)["extweight"] = ju.Config["jsondata_extweight"]
+		}
+	}
 	j = &ju.Job{
 		SourceMid:      qu.BsonIdToSId(doc["_id"]),
 		Category:       toptype,
@@ -441,7 +433,7 @@ func (e *ExtractTask) ExtractProcess(j, jf *ju.Job, isSite bool) {
 		}
 	}
 
-	//分析抽取结果并保存 todo
+	//分析抽取结果并保存
 	AnalysisSaveResult(j, jf, e)
 	<-e.TaskInfo.ProcessPool
 }

+ 9 - 8
src/jy/extract/extractInit.go

@@ -2,7 +2,6 @@
 package extract
 
 import (
-	"gopkg.in/mgo.v2/bson"
 	db "jy/mongodbutil"
 	ju "jy/util"
 	qu "qfw/util"
@@ -13,6 +12,8 @@ import (
 	"sync"
 	"time"
 
+	"gopkg.in/mgo.v2/bson"
+
 	log "github.com/donnie4w/go-logger/logger"
 	"github.com/go-ego/gse"
 )
@@ -86,18 +87,18 @@ type ExtractTask struct {
 	ResultChanel chan bool //抽取结果详情
 	sync.RWMutex
 	ResultArr [][]map[string]interface {
-	}                   //抽取结果详情
+	} //抽取结果详情
 	BidChanel chan bool //抽取结果
 	BidArr    [][]map[string]interface {
-	}                   //抽取结果
-	BidTotal int        //结果数量
+	} //抽取结果
+	BidTotal int //结果数量
 
 	RecogFieldMap map[string]map[string]interface {
-	}                    //识别字段
+	} //识别字段
 	FidClassMap map[string][]map[string]interface {
-	}                    //分类
+	} //分类
 	CidRuleMap map[string][]map[string]interface {
-	}                    //规则
+	} //规则
 	AuditFields []string //需要审核的字段名称
 
 	SiteCityMap          map[string]*SiteCity //站点对应的省市区
@@ -1612,7 +1613,7 @@ func (e *ExtractTask) InitAuditRule() {
 func (e *ExtractTask) InitAuditFields() {
 	if len(e.AuditFields) == 0 {
 		v, _ := db.Mgo.FindOne("version", `{"isuse":true,"delete":false}`) //查找当前使用版本
-		if v != nil && len(*v) > 0 { //查找当前使用版本中属性配置需要审核的字段
+		if v != nil && len(*v) > 0 {                                       //查找当前使用版本中属性配置需要审核的字段
 			vid := qu.BsonIdToSId((*v)["_id"])
 			query := map[string]interface{}{
 				"isaudit": true,

+ 46 - 20
src/jy/pretreated/analytable.go

@@ -94,11 +94,12 @@ var (
 	projectnameReg = regexp.MustCompile("((公开)?招标)*[((第]*[一二三四五六七八九十a-zA-Z0-9]+(标段|包|标|段)[))]*$")
 	MhSpilt        = regexp.MustCompile("[::]")
 	//识别采购单位联系人、联系电话、代理机构联系人、联系电话
-	ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|名称|(征求意见|报名审核购买)?((联系人?(及|和)?|办公|单位)?((电话([//]传真|及手机)?|手机)(号码)?|邮箱(地址)?|(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|经办)人)|采购方代表")
+	ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|(征求意见|报名审核购买)?((联系人?(及|和)?|办公|单位)?(((联系)?(电话|方式)([//]传真|及手机)?|手机)(号码)?|邮箱(地址)?|(详细)?(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|经办)人)|采购方代表")
 	ContactInfoMustReg  = regexp.MustCompile("^(" + ContactInfoVagueReg.String() + ")$")
 	ContactType         = map[string]*regexp.Regexp{
 		"采购单位": regexp.MustCompile("(采购(项目.{2}|服务)?|比选|询价|招标(服务)?|甲|建设|委托|发包|业主|使用|谈判|本招标项目经办|征求意见联系|项目实施)(人|单位|部门|机构|机关|(执行)?方$)|(项目|建(库|设))单位|招标人信息|采购中心地址|业主|收料人|采购部"),
 		"代理机构": regexp.MustCompile("(代理|受托|集中采购).{0,2}(人|方|单位|公司|机构)|招标机构|采购代理"),
+		"中标单位": regexp.MustCompile("拟?(中标|成交|中选|供(货|应))[^候选]{0,2}(人|方|单位|公司|(服务|供应)?商|企业)"),
 	}
 	ContactBuyerPersonFilterReg = regexp.MustCompile("(管理局)$")
 	MultipleValueSplitReg       = regexp.MustCompile("[,,、\\s\u3000\u2003\u00a0]")
@@ -110,7 +111,7 @@ var (
 	jsonReg                     = regexp.MustCompile(`\{.+:[^}]*\} `) //  \{".*\":\".+\"}
 	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
 	winnerOrderAndBidResult     = regexp.MustCompile("((中标)?候选人|(中标|评标)结果)")
-	WinnerOrderStr = regexp.MustCompile(`(集团|公司|学校|中心|家具城|门诊|\[大中小\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行)$`)
+	WinnerOrderStr              = regexp.MustCompile(`(集团|公司|学校|中心|家具城|门诊|\[大中小\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行)$`)
 )
 
 //在解析时,判断表格元素是否隐藏
@@ -415,11 +416,11 @@ func (table *Table) sortKVArr(as *SortMap, isSite bool, codeSite string) {
 
 										}
 									}
-								}else if kv == "预算"{
-									if strings.Contains(k,"万元"){
+								} else if kv == "预算" {
+									if strings.Contains(k, "万元") {
 										for vsk, vsv := range vs {
-											if !strings.Contains(vsv,"万元"){
-												vs[vsk] = vsv+"万元"
+											if !strings.Contains(vsv, "万元") {
+												vs[vsk] = vsv + "万元"
 											}
 										}
 									}
@@ -460,8 +461,8 @@ func (table *Table) sortKVArr(as *SortMap, isSite bool, codeSite string) {
 			}
 			for kk, vv := range kvTags {
 				if vsss, ok := v.([]string); ok {
-					if len(vv) > 0{
-						for _,vvvvvv := range vsss{
+					if len(vv) > 0 {
+						for _, vvvvvv := range vsss {
 							tmp := u.Tag{}
 							tmp.Weight = vv[0].Weight
 							tmp.Key = vv[0].Key
@@ -1992,17 +1993,17 @@ func (tn *Table) CheckMultiPackageByTable(isSite bool, codeSite string) (b bool,
 			for nk, v := range index {
 				if tn.BlockPackage.Map[v] == nil {
 					kv := u.NewJobKv()
-					for tnk,tnv := range tn.StandKV{
-						if nk >= len(tnv){
+					for tnk, tnv := range tn.StandKV {
+						if nk >= len(tnv) {
 							continue
 						}
 						kv.KvTags[tnk] = append(kv.KvTags[tnk], tnv[nk])
 					}
 					//kv.KvTags = tn.StandKV
 					bp := &u.BlockPackage{}
-					bp.Index = v                  //序号 (转换后编号,只有数字或字母)
-					bp.Origin = oldIndex[nk]      //包的原始值
-					bp.TableKV = kv               //table kv (分出的对应的KV值)
+					bp.Index = v             //序号 (转换后编号,只有数字或字母)
+					bp.Origin = oldIndex[nk] //包的原始值
+					bp.TableKV = kv          //table kv (分出的对应的KV值)
 					bp.Text = tn.Html
 					tn.BlockPackage.AddKey(v, bp) //table子包数组
 				}
@@ -2524,7 +2525,7 @@ func (tn *Table) TdContactFormat(contactFormat *u.ContactFormat, isSite bool, co
 	reCreate := false
 	matchCount := 0
 	contactTypeTagMap := map[string]map[string][]interface{}{}
-	//u.Debug(mustMatchFirst, indexMap, matchMap)
+	//qutil.Debug("============================", mustMatchFirst, indexMap, matchMap)
 	notMatchTrCount := 0
 	allAscFind := true //开启正序查询
 	//涉及变量allAscFind,indexMap
@@ -2536,23 +2537,30 @@ func (tn *Table) TdContactFormat(contactFormat *u.ContactFormat, isSite bool, co
 		for _, tr := range tn.TRs {
 			for td_index, td := range tr.TDs {
 				thisTdKvs := tn.tdkv(td) //获取td冒号kv
+				//qutil.Debug(td.Val, len(thisTdKvs))
+				//				for _, v := range thisTdKvs {
+				//					qutil.Debug(v.Key, v.Value)
+				//				}
 				if len(thisTdKvs) != 1 {
 					continue
 				}
 				//1.处理带括号的()[]【】采购单位,代理机构;2.识别采购单位联系人、联系电话、代理机构联系人、联系电话
 				goOnFunc, isContinue, td_k := tn.tdKV(thisTdKvs[0].Key, &matchPrevFlag, &isCanAddToIndexMap, &indexMap, "LS")
+				//qutil.Debug("goOnFunc---", goOnFunc, "isContinue---", isContinue, "indexMap---", indexMap, "isCanAddToIndexMap---", isCanAddToIndexMap)
 				if !goOnFunc {
 					break LS
 				}
 				if isContinue {
 					continue
 				}
-				//采购单位,代理机构
+				//采购单位,代理机构,中标单位
+				//qutil.Debug("td_k---", td_k, HasOrderContactType(td_k))
 				for _, k := range HasOrderContactType(td_k) {
-					if !ContactType[k].MatchString(td_k) { //不是采购单位,代理机构跳过
+					if !ContactType[k].MatchString(td_k) { //不是采购单位,代理机构,中标单位跳过
 						continue
 					}
 					if len(indexMap) == 0 {
+						//qutil.Debug("isCanAddToIndexMap---", isCanAddToIndexMap, "prevCanAddToIndexMap---", prevCanAddToIndexMap, len(tr.TDs))
 						if isCanAddToIndexMap || (prevCanAddToIndexMap && len(tr.TDs) == 1) {
 							myPrevTdVal := ""
 							if td_index-2 >= 0 {
@@ -2578,6 +2586,8 @@ func (tn *Table) TdContactFormat(contactFormat *u.ContactFormat, isSite bool, co
 		}
 	}
 	//////
+	//qutil.Debug("indexMap-------------------------", indexMap)
+	//indexMap = map[int]string{}
 L:
 	for tr_index, tr := range tn.TRs {
 		thisTrHasMatch := false
@@ -2591,19 +2601,33 @@ L:
 				thisTdKvs = tn.tdkv(td) //获取冒号kv
 			}
 			tdAscFind := true //开启td正序查询
+			//qutil.Debug("---", td.Val, len(thisTdKvs), len(indexMap))
 			if len(thisTdKvs) == 0 {
 				continue
 			} else if allAscFind && len(thisTdKvs) >= 3 && len(indexMap) == 0 {
 				//采购人在联系人、电话后面的处理
 				tdAscFind = tn.hasIndexMap(thisTdKvs, &indexMap, tdAscFind)
 			}
+			//qutil.Debug(len(thisTdKvs), len(tr.TDs))
+			if len(thisTdKvs) >= 2 && len(tr.TDs) == 1 { //td中包含多个kv值 5d6b2aa2a5cb26b9b73e79d2
+				tmpIndexMap := map[int]string{}
+				start := 0
+				for _, td_kv := range thisTdKvs {
+					//qutil.Debug(td_kv.Key)
+					for _, k := range HasOrderContactType(td_kv.Key) {
+						tmpIndexMap[start] = k
+						start++
+					}
+				}
+				indexMap = tmpIndexMap
+			}
 			prevKey := ""
 			oldIndexMapLength := len(indexMap)
 			thidTdIndex := td_index
 			//notmatchCount := 0
 			kvTitle := ""
+			//qutil.Debug("indexMap++++++++++++++++++", indexMap, oldIndexMapLength)
 			for _, td_kv := range thisTdKvs {
-				//u.Debug(td_kv.Key, td_kv.Value, td_kv.Title)
 				iscontinue := false
 				td_v := td_kv.Value
 				td_k := FilterContactKey(td_kv.Key) //带括号()[]的采购单位,代理机构处理
@@ -2612,6 +2636,7 @@ L:
 					continue
 				}
 				//都为正序查询
+				//qutil.Debug("td_k+++", td_k, "td_v+++", td_v, "allAscFind+++", allAscFind, "tdAscFind+++", tdAscFind)
 				if allAscFind && tdAscFind {
 					//都为正序查询处理
 					matchCount, weightMap, matchMap, thisTrHasMatch, indexMap, iscontinue, reCreate, thidTdIndex = tn.asdFind(td_k, matchCount, weightMap, matchMap, td, thisTrHasMatch, td_kv, indexMap, iscontinue, reCreate, thidTdIndex, isSite, codeSite)
@@ -2626,7 +2651,6 @@ L:
 					indexMap = map[int]string{}
 				}
 				kvTitle = td_kv.Title
-				//u.Debug(indexMap, td_k, td_v, matchMap)
 				if td_k_length < 2 || td_k_length > 10 {
 					continue
 				}
@@ -2663,11 +2687,13 @@ L:
 					}
 					//u.Debug(indexMap, td_k, td_v, matchMap, index, modle)
 					//myContactType
+
 					myContactType := indexMap[index]
 					if myContactType == "" && len(indexMap) == 1 {
 						_, onlyContactType := u.FirstKeyValueInMap(indexMap)
 						myContactType, _ = onlyContactType.(string)
 					}
+					//qutil.Debug("indexMap+++", indexMap, "index+++", index, "myContactType+++", myContactType)
 					if myContactType == "" {
 						continue
 					}
@@ -2687,7 +2713,7 @@ L:
 					modle(thisTdKvs, td, myContactType, td_k, td_v, &contactTypeTagMap, tn, &weightMap, tr_index, td_index, isSite, codeSite)
 				}
 			}
-			//u.Debug(td.SortKV.Map)
+			//qutil.Debug("map===", td.SortKV.Map)
 		}
 		if allAscFind && !thisTrHasMatch {
 			notMatchTrCount++
@@ -3217,7 +3243,7 @@ func initLineMapLineMapArr(table *Table) (lineMapArr map[string]*SortMap, lineMa
 	for _, key := range table.SortKV.Keys { //遍历table.SortKV.Keys而不是直接遍历table.SortKV.Map是为了得到table头的顺序
 		val := table.SortKV.Map[key]
 		key = regReplAllSpace.ReplaceAllString(key, "")
-		key = strings.Replace(key, "", "", -1) //处理一个特殊的采购量 经上层处理空格后未处理掉
+		key = strings.Replace(key, "", "", -1)    //处理一个特殊的采购量 经上层处理空格后未处理掉
 		if realTypeVal, ok := val.([]string); ok { //val为数组 {"数量":["1","2","3"]}
 			/*
 				{

+ 57 - 22
src/jy/pretreated/colonkv.go

@@ -67,10 +67,10 @@ func (ce *ColonkvEntity) divisionMoreKV(con string) string {
 }
 
 //获取冒号kv入口
-func (ce *ColonkvEntity) entrance(con, title string, contactFormat *ContactFormat, from int,isSite bool,codeSite string) ([]*Kv, map[string]string) {
+func (ce *ColonkvEntity) entrance(con, title string, contactFormat *ContactFormat, from int, isSite bool, codeSite string) ([]*Kv, map[string]string) {
 	kvs := ce.GetKvs(con, title, from)
 	if from == 1 {
-		FormatContactKv(&kvs, title, nil, contactFormat,isSite,codeSite)
+		FormatContactKv(&kvs, title, nil, contactFormat, isSite, codeSite)
 	}
 	kv := map[string]string{}
 	for _, v := range kvs {
@@ -163,14 +163,14 @@ func (ce *ColonkvEntity) getColonKv(con, title string, from int) []*Kv {
 }
 
 //冒号kv和空格kv结合
-func (ce *ColonkvEntity) getColonSpaceKV(con string,isSite bool,codeSite string) []*Kv {
+func (ce *ColonkvEntity) getColonSpaceKV(con string, isSite bool, codeSite string) []*Kv {
 	con = colonkvEntity.processText(con)
 	lines := SspacekvEntity.getLines(con)
 	kvMaps := []*Kv{}
 	for _, line := range lines {
 		kvs := colonkvEntity.getColonKv(line, "", 1)
 		if len(kvs) == 0 {
-			kv := SspacekvEntity.divideKV(line,isSite,codeSite)
+			kv := SspacekvEntity.divideKV(line, isSite, codeSite)
 			if kv != nil {
 				kvMaps = append(kvMaps, kv...)
 			}
@@ -276,7 +276,7 @@ func IsContactKvHandle(value string, m map[string]bool) bool {
 
 //kv关于联系人信息的处理
 //采购人>集中采购机构
-func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *ContactFormat,isSite bool,codeSite string) {
+func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *ContactFormat, isSite bool, codeSite string) {
 	////////////////////////////
 	//处理联系人信息
 	var indexMap map[int]string
@@ -294,14 +294,19 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	ascFind := true
 	ascFindFlag := len(indexMap) == 0 && buyers == nil
 	//采购人在联系人、电话后面的处理
-	isCanAddToIndexMap := false
+	//qutil.Debug("indexMap---", indexMap)
+	//qutil.Debug("ascFind---", ascFind, "ascFindFlag---", ascFindFlag, "isCanAddToIndex---", isCanAddToIndexMap)
 	for _, kv := range *kvs {
-		k := FilterContactKey(kv.Key)
+		isCanAddToIndexMap := false
+		k := FilterContactKey(kv.Key) //过滤key
+		//qutil.Debug(k, "---", kv.Value)
 		k_length := len([]rune(k))
 		if k_length < 2 || k_length > 15 {
 			continue
 		}
-		isContinue := ContactInfoMustReg.MatchString(k)
+		isContinue := ContactInfoMustReg.MatchString(k) //精确匹配 邮编、电话、联系人等
+		//qutil.Debug("isContinue---", isContinue, ContactInfoVagueReg.MatchString(k), IsMapHasValue(k, ContactType), ascFindFlag)
+		//qutil.Debug((isContinue || (ContactInfoVagueReg.MatchString(k) && IsMapHasValue(k, ContactType))) && ascFindFlag)
 		if (isContinue || (ContactInfoVagueReg.MatchString(k) && IsMapHasValue(k, ContactType))) && ascFindFlag {
 			if len(indexMap) > 0 {
 				ascFind = true
@@ -311,30 +316,39 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			isCanAddToIndexMap = true
 		}
 		n := 1
+		//qutil.Debug("isCanAddToIndexMap---", isCanAddToIndexMap, "ascFind---", ascFind, "ascFindFlag---", ascFindFlag, "indexMap---", indexMap)
+		//qutil.Debug(" HasOrderContactType(k)---", HasOrderContactType(k))
 		for _, ct_k := range HasOrderContactType(k) {
+			//qutil.Debug("ct_k---", ct_k, !ContactType[ct_k].MatchString(k))
 			if !ContactType[ct_k].MatchString(k) {
 				continue
 			}
 			totalIndexMap[ct_k] = true
+			//qutil.Debug(isContinue, !ascFindFlag, totalIndexMap)
 			/////////////////////////////
 			if isContinue || !ascFindFlag {
 				continue
 			}
-			//			if isCanAddToIndexMap && len(indexMap) == 0 {
+			//qutil.Debug("isCanAddToIndexMap---", isCanAddToIndexMap)
 			if isCanAddToIndexMap {
 				indexMap[n] = ct_k
 				n++
 				ascFind = false
 			}
+			//qutil.Debug(n, ascFind, indexMap)
 		}
+		//qutil.Debug("indexMap---", indexMap)
 	}
+	//qutil.Debug("indexMap1---", indexMap)
 	mustMatchFirst := len(indexMap) > 0 //第一个必须匹配上
 	titleMatch := false
+	//qutil.Debug("title---", title, ContactTypeTitleMatch(title))
 	if titleMatchType := ContactTypeTitleMatch(title); titleMatchType != "" {
 		titleMatch = true
 		mustMatchFirst = false
 		indexMap = map[int]string{1: titleMatchType}
 	}
+	//qutil.Debug("titleMatch---", titleMatch, "mustMatchFirst---", mustMatchFirst, "indexMap---", indexMap)
 	//	if titleMatchType := ContactTypeTitleMatch(title); len(titleMatchType) != 0 {
 	//		titleMatch = true
 	//		mustMatchFirst = false
@@ -351,6 +365,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	//			Debug("bbbbbbbbbb", kv.Key, kv.Value)
 	//		}
 	//	}
+	//qutil.Debug("=========================================================")
 	startIndex := 0
 	prevKey := ""
 	index, tmpindex, notmatchCount, allMatchCount := 0, 0, 0, 0
@@ -360,6 +375,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 	copy(kvsTemp, *kvs)
 	//again := 0
 	ishad := false
+	afterWinner := false
 	for kv_index, kv := range *kvs {
 		isBreak := true
 		v := strings.TrimSpace(kv.Value)
@@ -367,6 +383,12 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 		isContinue := false
 		k := FilterContactKey(kv.Key)
 		k_length := len([]rune(k))
+		//3.4新增winnerperson和winnertel抽取
+		if indexMap[1] == "中标单位" && ContactInfoMustReg.MatchString(k) { //中标后是否出现电话、联系人、地址等信息
+			//qutil.Debug("kkkkkk:", k, indexMap)
+			afterWinner = true
+		}
+		//qutil.Debug(kv.Key, "++++++++++", kv.Value, buyers != nil, ascFind, isContinue)
 		if buyers != nil {
 			for _, buyer := range buyers {
 				if buyer == "" {
@@ -415,13 +437,16 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 				}
 			}
 		} else if ascFind {
+			//qutil.Debug("HasOrderContactType(k)+++", HasOrderContactType(k))
 			for _, ct_k := range HasOrderContactType(k) {
 				ishad = false
+				//qutil.Debug("ct_k+++", ct_k, "ishad+++", ishad)
 				//again++
 				if k_length < 3 || k_length > 15 {
 					isBreak = false
 					continue
 				}
+				//qutil.Debug("+++", !ContactType[ct_k].MatchString(k))
 				if !ContactType[ct_k].MatchString(k) {
 					continue
 				}
@@ -436,7 +461,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 						matchMap[ct_k] = map[string]bool{}
 					}
 					if !strings.HasSuffix(k, "方式") {
-						kvTags := GetKvTags([]*Kv{&Kv{Key: k, Value: v}}, "", BuyerContacts,isSite,codeSite)
+						kvTags := GetKvTags([]*Kv{&Kv{Key: k, Value: v}}, "", BuyerContacts, isSite, codeSite)
 						if len(kvTags) == 1 {
 							tagVal, weightVal := FirstKeyValueInMap(kvTags)
 							if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) {
@@ -468,7 +493,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 					}
 				}
 				if ct_k == "采购单位" { //打标签,权重高的重新覆盖
-					kvTags := GetKvTags([]*Kv{&Kv{Key: k, Value: v}}, "", []string{"采购单位"},isSite,codeSite)
+					kvTags := GetKvTags([]*Kv{&Kv{Key: k, Value: v}}, "", []string{"采购单位"}, isSite, codeSite)
 					tagVal, weightVal := FirstKeyValueInMap(kvTags)
 					if tagVal == ct_k {
 						if weightMap[ct_k][ct_k] == nil || (weightVal != nil && weightVal.(int) > weightMap[ct_k][ct_k].(int)) {
@@ -503,6 +528,13 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 				isContinue = true
 			}
 		}
+		//qutil.Debug(len(indexMap), !afterWinner)
+		if len(indexMap) == 2 && !afterWinner { //处理同时出现winner、buyer、agency 5d6b2aa2a5cb26b9b73e79d3
+			//qutil.Debug("+++++++++++++++++++")
+			delete(indexMap, 1)
+			indexMap = map[int]string{1: indexMap[2]}
+		}
+		//qutil.Debug("isContinue+++", isContinue, indexMap)
 		if isContinue {
 			continue
 		}
@@ -557,6 +589,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 				index++
 			}
 		}
+		//qutil.Debug("index+++", index, "prevKey+++", prevKey, "indexmap+++", indexMap)
 		//		if startIndex == 0 || startIndex%2 == 1 || index == 0 {
 		//			index = 1
 		//		} else if startIndex%2 == 0 {
@@ -569,6 +602,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 			continue
 		}
 		myContactType := indexMap[index]
+		//qutil.Debug("myContactType+++", myContactType)
 		if myContactType == "" {
 			continue
 		}
@@ -595,7 +629,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 		allMatchCount++
 		delete(totalIndexMap, myContactType)
 		if !strings.HasSuffix(k, "方式") {
-			kvTags := GetKvTags([]*Kv{&Kv{Key: myContactType + k, Value: v}}, "", BuyerContacts,isSite,codeSite)
+			kvTags := GetKvTags([]*Kv{&Kv{Key: myContactType + k, Value: v}}, "", BuyerContacts, isSite, codeSite)
 			if len(kvTags) == 1 {
 				tagVal, _ := FirstKeyValueInMap(kvTags)
 				if tagVal == "采购单位联系人" && ContactBuyerPersonFilterReg.MatchString(v) {
@@ -619,6 +653,7 @@ func FormatContactKv(kvs *[]*Kv, title string, buyers []string, contactFormat *C
 		kvTemp := *kv
 		kvTemp.Key = myContactType + k
 		kvTemp.Value = v
+		//qutil.Debug(kvTemp.Key, "----------------", kvTemp.Value)
 		(*kvs)[kv_index] = &kvTemp
 		if ascFind && isBreak && len(indexMap) > 0 {
 			break
@@ -707,20 +742,20 @@ func HasOrderContactType(text string) []string {
 
 //两种冒号kv结合到一起
 //from 1--全文 2--table td 3--table td解析采购单位联系人 4--分包
-func GetKVAll(content, title string, contactFormat *ContactFormat, from int,isSite bool,codeSite string) *JobKv {
+func GetKVAll(content, title string, contactFormat *ContactFormat, from int, isSite bool, codeSite string) *JobKv {
 	content = formatText(content, "kv")
-	m1Kvs, _ := colonkvEntity.entrance(content, title, contactFormat, from,isSite,codeSite)
+	m1Kvs, _ := colonkvEntity.entrance(content, title, contactFormat, from, isSite, codeSite)
 	//	for _, kvs := range m1Kvs {
 	//		qutil.Debug(kvs.Key, kvs.Value)
 	//	}
-	kvTags := GetKvTags(m1Kvs, title, nil,isSite,codeSite)
+	kvTags := GetKvTags(m1Kvs, title, nil, isSite, codeSite)
 	//	for k, kvs := range kvTags {
 	//		qutil.Debug("kkkkk--", k)
 	//		for _, kv := range kvs {
 	//			qutil.Debug(kv.Key, kv.Value)
 	//		}
 	//	}
-	m2Kvs, m2KvTags := GetKvFromtxt(content, title, from,isSite,codeSite)
+	m2Kvs, m2KvTags := GetKvFromtxt(content, title, from, isSite, codeSite)
 	//	for k, kvs := range m2KvTags {
 	//		qutil.Debug("kkkkk--", k)
 	//		for _, kv := range kvs {
@@ -774,7 +809,7 @@ func PrintKvTags(kvTags map[string][]*Tag) {
 }
 
 //KVTags转kv
-func GetKvTags(findkvs []*Kv, title string, tagdbs []string,isSite bool,codeSite string) map[string][]*Tag {
+func GetKvTags(findkvs []*Kv, title string, tagdbs []string, isSite bool, codeSite string) map[string][]*Tag {
 	kvTags := map[string][]*Tag{}
 	if title != "" && BlockTagMap[title] {
 		kvTags[title] = append(kvTags[title], &Tag{title, title, 0, nil, false})
@@ -792,17 +827,17 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string,isSite bool,codeSite
 		}
 		key = colonkvEntity.blockTitleKV(title, key)
 		//先用新的key
-		tags := GetAppointTags(key, tagdbs,isSite,codeSite) //找标签库
+		tags := GetAppointTags(key, tagdbs, isSite, codeSite) //找标签库
 		if len(tags) == 0 && len(key) < 10 && len(title) > 0 && len(title) < 15 {
 			key = title + key
-			tags = GetAppointTags(key, tagdbs,isSite,codeSite)
+			tags = GetAppointTags(key, tagdbs, isSite, codeSite)
 		}
 		//再用老的key
 		if len(tags) == 0 && k != key {
-			tags = GetAppointTags(k, tagdbs,isSite,codeSite)
+			tags = GetAppointTags(k, tagdbs, isSite, codeSite)
 			if len(tags) == 0 && len(k) < 10 && len(title) > 0 && len(title) < 15 {
 				k = title + k
-				tags = GetAppointTags(k, tagdbs,isSite,codeSite)
+				tags = GetAppointTags(k, tagdbs, isSite, codeSite)
 				if len(tags) > 0 {
 					key = k
 				}
@@ -824,7 +859,7 @@ func GetKvTags(findkvs []*Kv, title string, tagdbs []string,isSite bool,codeSite
 							if strings.TrimSpace(nextval) == "" {
 								continue
 							}
-							if GetAppointTags(nextval, tagdbs,isSite,codeSite).Len() > 0 || GetAppointTags(k, tagdbs,isSite,codeSite).Len() > 0 {
+							if GetAppointTags(nextval, tagdbs, isSite, codeSite).Len() > 0 || GetAppointTags(k, tagdbs, isSite, codeSite).Len() > 0 {
 								continue
 							}
 						}

+ 12 - 15
src/main_blocktest.go

@@ -16,7 +16,7 @@ import (
 var f *os.File
 var m = map[string]bool{}
 
-func main12() {
+func main1() {
 	//winnerorder()
 	//return
 	//log.Println(pretreated.ProcTitle("以上公告内容如有变动将在相关网络媒体上另行通知凡购买本招标文件的单位必须就此采购项目的相关事宜详细咨询否则参与投标即被视为已经充分了解了招标方的需求中标后承担该文件范围内的所有要求投标前如对招标文件存有疑问请在投标截止日期前三个工作日以实名制书面文件向我公司询问否则视为接受已报名购买招标文件的投标商未递交投标文件或虽递交投标文件但未参加开标大会的投标商不得再参加该项目的采购活动"))
@@ -51,7 +51,7 @@ func all() {
 }
 func one() {
 	m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw")
-	d, _ := m.FindById("bidding", "5d424df7a5cb26b9b7b61fde", extract.Fields)
+	d, _ := m.FindById("bidding", "5d6b2aa2a5cb26b9b73e79d2", extract.Fields)
 	com(*d)
 }
 func com(doc map[string]interface{}) {
@@ -67,12 +67,11 @@ func com(doc map[string]interface{}) {
 	}
 	e := &extract.ExtractTask{
 		TaskInfo: &extract.TaskInfo{
-			Version:     "V3.1.2",
-			VersionId:   "5cdd1c70e138234848c1d703",
+			Version:     "v3.6",
+			VersionId:   "5cdd3025698414032c8322b1",
 			ProcessPool: make(chan bool, 1),
 		},
 	}
-
 	e.Id = qu.ObjToString(ju.Config["udptaskid"])
 	e.InitTaskInfo()
 	//d.TaskInfo.FDB = db.MgoFactory(3, 5, 600, ext.TaskInfo.FromDbAddr, ext.TaskInfo.FromDB)
@@ -80,15 +79,11 @@ func com(doc map[string]interface{}) {
 	e.InitSite()
 	e.InitRulePres()
 	e.InitRuleBacks(false)
-	e.InitRuleBacks(true)
 	e.InitRuleCore(false)
-	e.InitRuleCore(true)
 	e.InitBlockRule()
 	e.InitPkgCore()
 	e.InitTag(false)
-	e.InitTag(true)
 	e.InitClearFn(false)
-	e.InitClearFn(true)
 	if e.IsExtractCity { //版本上控制是否开始城市抽取
 		//初始化城市DFA信息
 		e.InitCityDFA()
@@ -121,8 +116,8 @@ func com(doc map[string]interface{}) {
 		RuleBlock: e.RuleBlock,
 	}
 	e.TaskInfo.ProcessPool <- true
-	pretreated.AnalyStart(j,false,"")
-	e.ExtractProcess(j, nil,false)
+	pretreated.AnalyStart(j, false, "")
+	e.ExtractProcess(j, nil, false)
 
 	log.Println("=============块信息================")
 	for _, v := range j.Block {
@@ -152,13 +147,15 @@ func com(doc map[string]interface{}) {
 		//log.Println("Tag", v.Tag)
 	}
 	log.Println("=============抽取结果================")
+	log.Println(e.ResultArr)
 	set := (e.ResultArr[0][1]["$set"]).(map[string]interface{})
 	for k, v := range set {
-		if k == "budget" || k == "bidamount" || k == "winner" || k == "amount" || k == "projectname" || k == "projectcode" || k == "buyer" || k == "buyerperson" || k == "buyertel" || k == "agency" {
-			log.Println(k, "---", v)
-		}
+		//if k == "budget" || k == "bidamount" || k == "winner" || k == "amount" || k == "projectname" || k == "projectcode" || k == "buyer" || k == "buyerperson" || k == "buyertel" || k == "agency" {
+		log.Println(k, "---", v)
+		//}
 	}
 	log.Println("=============抽取结果 result================")
+	return
 	for k, v := range set["result"].(map[string][]*ju.ExtField) {
 		if k != "winner" {
 			continue
@@ -257,5 +254,5 @@ func winnerorder() {
 第一入围供货商:沈阳曲暖鼎盛保温安装有限公司 、总单价:11.833300
 第二入围供货商:沈阳国盛防腐保温有限公司、总单价:11.102100
 第三入围供货商:沈阳泰豪管材有限公司、总单价:13.258100`
-	log.Println((&pretreated.WinnerOrderEntity{}).Find(text, true, 1,false,""))
+	log.Println((&pretreated.WinnerOrderEntity{}).Find(text, true, 1, false, ""))
 }

+ 1 - 1
src/main_test.go

@@ -28,7 +28,7 @@ func Test_han(t *testing.T) {
 func Test_task(t *testing.T) {
 	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27082", "extract_kf")
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")5c528686698414055c47b115
-	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "58369e4161a0721f1583247a", "1", "mxs_v1", "mxs_v1")
+	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5d6b2aa2a5cb26b9b73e79d3", "1", "mxs_v1", "mxs_v1")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second)
 }

+ 79 - 1
src/res/fieldscore.json

@@ -347,7 +347,85 @@
             }
         ]
     },
-    "projectcode": {
+    "winnerperson": {
+        "type": "string",
+        "positivewords": [
+            {
+                "describe": "以*结尾",
+                "regstr": ".{2,100}(工|老师|经理|女士|先生|主任|科长)$",
+                "score": 3
+            }
+        ],
+        "negativewords": [
+            {
+                "describe": "出现符号",
+                "regstr": "[*]",
+                "score": -10
+            },
+			{
+                "describe": "是数字",
+                "regstr": "^\\d*[×―—-\\-]*[\u3000\u2003\u00a0\\s]*\\d*$",
+                "score": -10
+            }
+        ],
+        "length": [
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    1,
+                    -5
+                ]
+            },
+			{
+                "describe": "[gt,lte,score]",
+                "range": [
+                    1,
+                    4,
+                    10
+                ]
+            }
+        ]
+    },
+	"winnertel": {
+        "type": "string",
+        "positivewords": [],
+        "negativewords": [
+            {
+                "describe": "出现中文汉字",
+                "regstr": "[\\u4e00-\\u9fa5]",
+                "score": -10
+            }
+        ],
+        "length": [
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    6,
+                    -5
+                ]
+            },
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    6,
+                    14,
+                    3
+                ]
+            },
+            {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    14,
+                    -1,
+                    -1
+                ]
+            }
+        ]
+    },
+    
+	"projectcode": {
         "type": "string",
         "positivewords": [
             {

+ 5 - 2
src/res/specialsymbols.json

@@ -80,10 +80,13 @@
             "buyer": true,
             "winner": true,
             "agency": true,
-            "agency": true,
             "buyertel": true,
             "buyerperson": true,
-			"buyerzipcode":true
+			"buyerzipcode": true,
+			"agencytel": true,
+			"agencyperson": true,
+			"winnertel": true,
+			"winnerperson": true
         },
         "symbol": [
             ":",

+ 1 - 1
src/web/templates/admin/clear.html

@@ -102,7 +102,7 @@ menuActive("version")
 var field = {{.field}};
 var _id = "";
 //var clearArr = ["cutspace","cutallspace","cutSymbol","cutNotPrs","clearAllWord","clearMaxAmount","clearProjectName","toint","tofloat","totimestamp","tomoney","getcurrency","getrate","getPhone","rateToFloat"]; 
-var clearMap = {"中文符号转英文":"chiToEng","去除首尾空格":"cutspace","去除所有空格":"cutallspace","清理符号":"cutSymbol","清理不成对符号后面的内容":"cutNotPrs","清理全部是汉字或者特殊符号的情况":"clearAllWord","过滤大于1万亿":"clearMaxAmount","清理项目名称":"clearProjectName","转int":"toint","转float":"tofloat","转时间戳":"totimestamp","转换金额":"tomoney","获取币种":"getcurrency","获取汇率":"getrate","取手机号":"getPhone","费率转小数":"rateToFloat","处理较长采购联系人":"clearBuyerPerson"}
+var clearMap = {"中文符号转英文":"chiToEng","去除首尾空格":"cutspace","去除所有空格":"cutallspace","清理符号":"cutSymbol","清理不成对符号后面的内容":"cutNotPrs","清理全部是汉字或者特殊符号的情况":"clearAllWord","过滤大于1万亿":"clearMaxAmount","清理项目名称":"clearProjectName","转int":"toint","转float":"tofloat","转时间戳":"totimestamp","转换金额":"tomoney","获取币种":"getcurrency","获取汇率":"getrate","取手机号":"getPhone","清理数字":"clearNumber","费率转小数":"rateToFloat","处理较长采购联系人":"clearBuyerPerson"}
 $(function () {
 	ttableclear=$('#clearTable').DataTable({
 		"lengthChange": false,

+ 1 - 1
src/web/templates/admin/version.html

@@ -162,7 +162,7 @@ $(function () {
 		"columns": [
 				{ "data": "version","width":"5%"},
 			{ "data": "s_username","width":"5%"},
-			{ "data": "s_descript","width":"25%"},
+			{ "data": "s_descript","width":"21%"},
 			{ "data": "isuse","width":"7%",render:function(val,a,row){
 				tmp=""
 				if(val){