浏览代码

Merge branch 'dev3.4' of http://192.168.3.207:10080/qmx/jy-data-extract into dev3.4

apple 5 年之前
父节点
当前提交
20189f16eb

+ 8 - 1
src/jy/clear/tonumber.go

@@ -98,10 +98,17 @@ func ObjToMoney(data []interface{}) []interface{} {
 			f = f * 10000
 		}
 	}
+	if f == 0 && !moneyUnitRegBool.MatchString(fmt.Sprint(data[0])) {
+		data = append(data, false)
+		return data
+	}
+	data = append(data, true)
 	data[0] = f
 	return data
 }
-
+// moneyUnitRegBool matches text that spells out a zero amount, optionally preceded by one of the labels ["中标金额","成交金额","合同金额","中标价","成交价","成交价格","中标(成交)金额","投标报价","中标标价","成交结果"] and a single separator character.
+// Zero spellings listed in the pattern: ["0元","零元","0.0万元","¥0元"], plus a bare "0". NOTE(review): the pattern is unanchored and the bare "0" alternative lets any text containing "0" match; the "." in "0.0万元" is also unescaped — confirm this is intended.
+var moneyUnitRegBool = regexp.MustCompile(`(中标金额|成交金额|合同金额|中标价|成交价|成交价格|中标\(成交\)金额|投标报价|中标标价|成交结果)?[::\s]?(0元|零元|0.0万元|¥0元|0)+`)
 //数字金额转换
 func numMoney(data []interface{}) ([]interface{}, bool) {
 	tmp := fmt.Sprintf("%f",data[0])

+ 17 - 2
src/jy/extract/extpackage.go

@@ -7,7 +7,7 @@ import (
 	"log"
 	qu "qfw/util"
 	"reflect"
-	"regexp"
+	regexp "regexp"
 	"sort"
 )
 
@@ -60,6 +60,13 @@ func kvparse(p *ju.JobKv, e *ExtractTask, sonJobResult *map[string]interface{},
 					cfn := e.ClearFn["budget"]
 					lock.Unlock()
 					data := clear.DoClearFn(cfn, []interface{}{pv[0].Value, ""})
+					if data[0] ==0{
+						if istrue,ok:= data[len(data)-1].(bool);istrue&&ok{
+							(*sonJobResult)["budget"] = data[0]
+						}else {
+							continue
+						}
+					}
 					(*sonJobResult)["budget"] = data[0]
 					continue
 				}
@@ -68,10 +75,18 @@ func kvparse(p *ju.JobKv, e *ExtractTask, sonJobResult *map[string]interface{},
 					cfn := e.ClearFn["budget"]
 					lock.Unlock()
 					data := clear.DoClearFn(cfn, []interface{}{pv[0].Value, ""})
+					if istrue,ok:= data[len(data)-1].(bool);istrue&&ok{
+						(*sonJobResult)["budget"] = data[0]
+					}else {
+						continue
+					}
 					(*sonJobResult)["bidamount"] = data[0]
 					continue
 				}
 				if ((*sonJobResult)["winner"] == nil || (*sonJobResult)["winner"] == "") && tags[0].Key == "中标单位" {
+					if winnerorderNotReg.MatchString(pv[0].Value){
+						continue
+					}
 					(*sonJobResult)["winner"] = pv[0].Value
 				}
 
@@ -82,7 +97,7 @@ func kvparse(p *ju.JobKv, e *ExtractTask, sonJobResult *map[string]interface{},
 		}
 	}
 }
-
+var winnerorderNotReg =regexp.MustCompile(`(附件|否决原因|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\d[\s]{0,10}(\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})`) // winnerorderNotReg flags "中标单位" values that kvparse must not store as the winner (kvparse skips the value when this matches). NOTE(review): "候选" appears twice in the alternation; the pattern presumably mirrors the winner "negativewords" rule in fieldscore.json — keep the two in sync.
 //处理分包信息
 func PackageDetail(j *ju.Job, e *ExtractTask, isSite bool, codeSite string) {
 	qu.Try(func() {

+ 38 - 22
src/jy/extract/extract.go

@@ -26,12 +26,12 @@ import (
 var (
 	lock, lockrule, lockclear, locktag, blocktag sync.RWMutex
 
-	cut           = ju.NewCut()                          // extracts the body text and cleans it
-	ExtLogs       map[*TaskInfo][]map[string]interface{} // extraction logs
-	TaskList      map[string]*ExtractTask                // task list
-	ClearTaskList map[string]*ClearTask                  // cleanup task list
-	saveLimit     = 100                                  // batch size for saving extraction logs
-	PageSize      = 5000                                 // query page size
+	cut     = ju.NewCut()                          // extracts the body text and cleans it
+	ExtLogs map[*TaskInfo][]map[string]interface{} // extraction logs
+	TaskList      map[string]*ExtractTask          // task list
+	ClearTaskList map[string]*ClearTask            // cleanup task list
+	saveLimit     = 100                            // batch size for saving extraction logs
+	PageSize      = 5000                           // query page size
 	Fields        = `{"title":1,"summary":1,"detail":1,"contenthtml":1,"site":1,"spidercode":1,"toptype":1,"subtype":1,"bidstatus":1,"area":1,"city":1,"comeintime":1,"publishtime":1,"sensitive":1,"projectinfo":1,"jsondata":1,"href":1}`
 	Fields2       = `{"budget":1,"bidamount":1,"title":1,"projectname":1,"winner":1}`
 )
@@ -568,9 +568,8 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 			}
 		}
 		//函数清理
-
 		for key, val := range j.Result {
-			for _, v := range val {
+			for i, v := range val {
 				//qu.Debug(key, v.Value)
 				lockclear.Lock()
 				var cfn = []string{}
@@ -584,6 +583,13 @@ func (e *ExtractTask) ExtractDetail(j *ju.Job, isSite bool, codeSite string) {
 					continue
 				}
 				data := clear.DoClearFn(cfn, []interface{}{v.Value, j.Content})
+				if key == "budget" || key == "bidamount" {
+						if istrue, ok := data[len(data)-1].(bool); istrue && ok {
+							j.Result[key][i].IsTrue = true
+						} else {
+							continue
+						}
+				}
 				before, _ := v.Value.(string)
 				v.Value = data[0]
 				BeforeAddClearFnLog(v.Type, "函数清理", j.SourceMid, before, v.MatchType, v, e)
@@ -1576,6 +1582,11 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 				} else if v.Field == "projectname" {
 					tmp[v.Field] = v.Value
 					break
+				} else if v.Field == "bidamount"||v.Field =="budget"{
+					if v.IsTrue{
+						tmp[v.Field] =v.Value
+						break
+					}
 				}
 			}
 		}
@@ -1691,19 +1702,19 @@ func AnalysisSaveResult(j, jf *ju.Job, e *ExtractTask) {
 		//tmp["extract_content"] = j.Content
 		if e.TaskInfo.TestColl == "" {
 			if len(tmp) > 0 { //保存抽取结果
-			/*	if len(e.SiteFields) <= 0 {
-					//for field, _ := range e.Fields {
-					//	if tmp[field] == nil &&  {
-					//		tmp[field] = "" //覆盖之前版本数据
-					//	}
-					//}
-				} else {
-					//for field, _ := range e.SiteFields {
-					//	if tmp[field] == nil &&{
-					//		tmp[field] = "" //覆盖之前版本数据
-					//	}
-					//}
-				}*/
+				/*	if len(e.SiteFields) <= 0 {
+						//for field, _ := range e.Fields {
+						//	if tmp[field] == nil &&  {
+						//		tmp[field] = "" //覆盖之前版本数据
+						//	}
+						//}
+					} else {
+						//for field, _ := range e.SiteFields {
+						//	if tmp[field] == nil &&{
+						//		tmp[field] = "" //覆盖之前版本数据
+						//	}
+						//}
+					}*/
 				tmp["repeat"] = 0
 				tmparr := []map[string]interface{}{
 					map[string]interface{}{
@@ -1877,6 +1888,11 @@ func auxInfo(j *ju.Job) map[string][]map[string]interface{} {
 					standardized = true
 				}
 			}
+			if field == "budget"||field == "bidamount"{
+				if !v.IsTrue{
+					continue
+				}
+			}
 			sfield := map[string]interface{}{
 				"val":          v.Value,
 				"type":         v.Type,
@@ -1929,7 +1945,7 @@ func (e *ExtractTask) QualityAudit(resulttmp map[string]interface{}) {
 func (e *ExtractTask) RedisMatch(field, fv string, val map[string]interface{}) {
 	defer qu.Catch()
 	i := redis.GetInt(field, field+"_"+fv) //查找redis
-	if i == 0 {                            //reids未找到,执行规则匹配
+	if i == 0 { //reids未找到,执行规则匹配
 		val[field+"_isredis"] = false
 		e.RuleMatch(field, fv, val) //规则匹配
 	} else { //redis找到,打标识存库

+ 18 - 4
src/jy/extract/score.go

@@ -5,6 +5,7 @@ import (
 	"fmt"
 	ju "jy/util"
 	"log"
+	"os"
 	qu "qfw/util"
 	"regexp"
 	"strconv"
@@ -28,6 +29,9 @@ func init() {
 	qu.ReadConfig("./res/tagscoredesc.json", &TagConfigDesc)
 	qu.ReadConfig("./res/tagscore.json", &TagConfig)
 	qu.ReadConfig("./res/fieldscore.json", &SoreConfig)
+	if SoreConfig == nil { //配置出错,强退
+		os.Exit(0)
+	}
 	if repeat, ok := SoreConfig["other"]["repeat"].(map[string]interface{}); ok {
 		RepeatScore = qu.Float64All(repeat["score"])
 	}
@@ -114,9 +118,19 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 		for tmpsindex, tmpsvalue := range tmps {
 			//没有抽取到值,不打分
 			if string_value := fmt.Sprint(tmpsvalue.Value); string_value == "" || string_value == "0" || string_value == "<nil>" {
-				tmps[tmpsindex].Score = -10
-				tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
-				continue
+				if field == "budget" || field == "bidamount" {
+					if tmpsvalue.IsTrue {
+						//continue
+					} else {
+						tmps[tmpsindex].Score = -10
+						tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
+						continue
+					}
+				} else {
+					tmps[tmpsindex].Score = -10
+					tmps[tmpsindex].ScoreItem = append(tmps[tmpsindex].ScoreItem, &ju.ScoreItem{Des: `value结果为空直接-10分`, Code: field, Value: tmpsvalue.Value, Score: -10})
+					continue
+				}
 			}
 			lockscore.Lock()
 			describe := qu.ObjToString(SoreConfig["extractype"]["describe"])
@@ -270,7 +284,7 @@ func ScoreFields(j *ju.Job, ftag map[string][]*Tag) map[string][]*ju.ExtField {
 				max := qu.IntAll(scoreRule["max"])
 				val := qu.IntAll(tmpsvalue.Value)
 				scores, _ := scoreRule["score"].([]interface{})
-				if len(scores) < 3 || val == 0 {
+				if len(scores) < 3 || val < 0 {
 					continue
 				}
 				if val < min && 0 < val {

+ 2 - 1
src/jy/pretreated/analykv.go

@@ -350,12 +350,13 @@ func keydetail(k, v string, m *SortMap, tag string, pos int, strs [][]string, ma
 				相关竞价人对成交结果有异议的,可自本公告发布之日起三日内书面提出。
 				联系方式:卢明珠 0871-66136373
 			*/
-			if doubtMap[pos-1] { //当识别到中标、采购、代理标签后,对其后的联系人、电话等信息判断是否属于该标签
+			if doubtMap[pos-1] && len(m.Map) == 1 { //当识别到中标、采购、代理标签后,对其后的联系人、电话等信息判断是否属于该标签
 				goto L
 			}
 			num := 0
 			bf := false
 			for i := len(m.Keys) - 1; i > -1; i-- {
+				//u.Debug("k", k)
 				num++
 				if from == 1 && !ContactType["代理机构"].MatchString(k) && ContactType["代理机构"].MatchString(m.Keys[i]) && !IsContactKvHandle(k, matchMap["代理机构"]) {
 					matchMap["代理机构"][k] = true

+ 12 - 12
src/jy/pretreated/analytable.go

@@ -96,7 +96,7 @@ var (
 	projectnameReg = regexp.MustCompile("((公开)?招标)*[((第]*[一二三四五六七八九十a-zA-Z0-9]+(标段|包|标|段)[))]*$")
 	MhSpilt        = regexp.MustCompile("[::]")
 	//识别采购单位联系人、联系电话、代理机构联系人、联系电话
-	ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|名称|(征求意见|报名审核购买)?((联系人?(及|和)?|办公|单位)?((电话([//]传真|及手机)?|手机)(号码)?|邮箱(地址)?|(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|经办)人)|采购方代表")
+	ContactInfoVagueReg = regexp.MustCompile("邮政编码|邮编|名称|(征求意见|报名审核购买)?((联系人?(及|和)?|办公|单位)?(((联系)?(电话|方式)([//]传真|及手机)?|手机)(号码)?|邮箱(地址)?|(详细)?(地(址|点)))|(联系|收料)(人(姓名)?|方式)|传真|电子邮件|(主要负责|项目(负责|联系)|经办)人)|采购方代表")
 	ContactInfoMustReg  = regexp.MustCompile("^(" + ContactInfoVagueReg.String() + ")$")
 	ContactType         = map[string]*regexp.Regexp{
 		"采购单位": regexp.MustCompile("(采购(项目.{2}|服务)?|比选|询价|招标(服务)?|甲|建设|委托|发包|业主|使用|谈判|本招标项目经办|征求意见联系|项目实施)(人|单位|部门|机构|机关|(执行)?方$)|(项目|建(库|设))单位|招标人信息|采购中心(地址)?|业主|收料人|采购部"),
@@ -114,7 +114,7 @@ var (
 	regHz                       = regexp.MustCompile("[\u4e00-\u9fa5]")
 	winnerOrderAndBidResult     = regexp.MustCompile("((中标)?候选人|(中标|评标)结果)")
 	WinnerOrderStr              = regexp.MustCompile(`(集团|公司|学校|中心|家具城|门诊|\[大中小\]学|部|院|局|厂|店|所|队|社|室|厅|段|会|场|行)$`)
-	DoubtReg                    = regexp.MustCompile("((|交易)中心|有(疑问|质疑|异议|意见)|(书面)?提出|不再受理|投诉|质疑|书面形式|监督|交易中心|公示期(限)?|招标|采购)")
+	DoubtReg                    = regexp.MustCompile("(我中心|有(疑问|质疑|异议|意见)|(书面)?提出|不再受理|投诉|质疑|书面形式|监督|公示期(限)?)")
 )
 
 //在解析时,判断表格元素是否隐藏
@@ -244,10 +244,10 @@ func (table *Table) KVFilter(isSite bool, codeSite string) {
 			MergeKvTags(table.StandKV, kvTags)
 		} else {
 			//u.Debug(k, v, "---------")
-			if strings.Contains(k,"总价"){
-				if vvvv,ok := v.([]string);ok && len(vvvv)>0{
+			if strings.Contains(k, "总价") {
+				if vvvv, ok := v.([]string); ok && len(vvvv) > 0 {
 					as.RemoveKey("报价")
-					as.AddKey(k,vvvv[len(vvvv)-1])
+					as.AddKey(k, vvvv[len(vvvv)-1])
 					continue
 				}
 			}
@@ -477,15 +477,15 @@ func (table *Table) sortKVArr(as *SortMap, isSite bool, codeSite string) {
 							tmp.Weight = vv[0].Weight
 							tmp.Key = vv[0].Key
 							tmp.IsInvalid = vv[0].IsInvalid
-							if kk == "单品报价"||kk == "中标金额"||kk == "预算"{
-								if strings.Contains(k,"万"){
-									tmp.Value = vvvvvv+"万"
-								}else if strings.Contains(k,"亿"){
-									tmp.Value = vvvvvv+"亿"
-								}else {
+							if kk == "单品报价" || kk == "中标金额" || kk == "预算" {
+								if strings.Contains(k, "万") {
+									tmp.Value = vvvvvv + "万"
+								} else if strings.Contains(k, "亿") {
+									tmp.Value = vvvvvv + "亿"
+								} else {
 									tmp.Value = vvvvvv
 								}
-							}else {
+							} else {
 								tmp.Value = vvvvvv
 							}
 							table.StandKV[kk] = append(table.StandKV[kk], &tmp)

+ 1 - 0
src/jy/util/article.go

@@ -55,6 +55,7 @@ type ExtField struct {
 	ScoreItem   []*ScoreItem      //打分项
 	Weight      int               //权重值
 	ValRepeat   int               //结果值重复次数,打分参考
+	IsTrue      bool              //针对金额0是否有效的值,其他字段不参考
 }
 
 //打分项

+ 2 - 2
src/main.go

@@ -13,10 +13,10 @@ import (
 	_ "net/http/pprof"
 	qu "qfw/util"
 
-	log "github.com/donnie4w/go-logger/logger"
 	"qfw/util/elastic"
 	"qfw/util/redis"
 
+	log "github.com/donnie4w/go-logger/logger"
 )
 
 func init() {
@@ -42,7 +42,7 @@ func init() {
 	//	log.Fatal("ElasticClient err:", err)
 	//} else {
 	//	util.ElasticClient = eClient
-		util.ElasticClientIndex = qu.ObjToString(util.Config["elasticsearch_index"])
+	util.ElasticClientIndex = qu.ObjToString(util.Config["elasticsearch_index"])
 	util.ElasticClientType = qu.ObjToString(util.Config["elasticsearch_type"])
 	util.ElasticClientDB = qu.ObjToString(util.Config["winner_enterprise"])
 	//}

+ 1 - 1
src/main_blocktest.go

@@ -51,7 +51,7 @@ func all() {
 }
 func one() { // one loads a single "bidding" document by a hard-coded id and passes it to com.
 	m := mongodbutil.MgoFactory(3, 3, 120, "192.168.3.207:27081", "qfw")
-	d, _ := m.FindById("bidding", "59e47b5a40d2d9bbe82296bf", extract.Fields)
+	d, _ := m.FindById("bidding", "5e17dfcb50b5ea296ec93aea", extract.Fields) // extract.Fields is passed as the third argument, presumably the field projection; the second return value is discarded
 	com(*d) // NOTE(review): d is dereferenced without a nil check — confirm FindById never returns nil here
 }
 func com(doc map[string]interface{}) {

+ 3 - 3
src/main_test.go

@@ -26,10 +26,10 @@ func Test_han(t *testing.T) {
 	os.Exit(0)
 }
 func Test_task(t *testing.T) { // Test_task is a manual driver: it points Mgo at a development database and starts one extraction test task with hard-coded ids.
-	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27092", "extract_kf")
+	Mgo = MgoFactory(1, 3, 120, "192.168.3.207:27092", "extract_dev32") // this commit switches the database name from extract_kf to extract_dev32
 	//extract.StartExtractTaskId("5b8f804025e29a290415aee1")5c528686698414055c47b115
-	//extract.StartExtractTestTask("5e103206234ddc34b406c5d1", "59e47b5a40d2d9bbe82296bf", "1", "result_mxs", "result_mxs")
-	extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5e17e00e85a9271abf0860a6", "1", "result_mxs", "result_mxs")
+	extract.StartExtractTestTask("5e103206234ddc34b406c5d1", "5df6e6a6e9d1f601e494b749", "1", "mxs_v1", "mxs_v1") // the other StartExtractTestTask calls are kept commented out as alternative hard-coded inputs
+	//extract.StartExtractTestTask("5cdd3025698414032c8322b1", "5e17e00e85a9271abf0860a6", "1", "mxs_v1", "mxs_v1")
 	//extract.StartExtractTestTask("5c3d75c96984142998eb00e1", "5c2a3d28a5cb26b9b76144dd", "100", "mxs_v3", "mxs_v3")
 	time.Sleep(5 * time.Second) // presumably leaves time for work started by the task to finish before the test returns — TODO confirm
 }

+ 48 - 4
src/res/fieldscore.json

@@ -200,7 +200,7 @@
         "negativewords": [
             {
                 "describe": "包含负分",
-                "regstr": "(附件|否决原因|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
+                "regstr": "(附件|否决原因|候选|招标失败|注册表|交易中心|序号内容|不足|公告|变更|采购|招标|废标|废止|流标|中标|投标|评标|开标|供应商|金额|万元|元整|预算|报价|单价|第(\\d|一|二|三|四|五)(名|包)|排名|候选|确定|标段|(标|一|二|三|四|五)包|中选|成交|包号|(A|B|C|D|E|F|G)包|地址|详情|要求|推荐|名称|评审|得分|合同|平方米|公示期|结果|备注|说明|单位|代表|委托|工作日|营业(执|期)|通过|代码|电话|联系|条件|合理|费率|以上|以下|拟定|为|注:|\\d[\\s]{0,10}(\\.|元|包|米|平米|平方米|吨|辆|千克|克|毫克|毫升|公升|套|件|瓶|箱|只|台|年|月|日|天|号)|(:|:|;|;|?|¥|\\*|%)|^[a-zA-Z0-9-]{5,100}|^[a-zA-Z0-9-]{1,100}$|[a-zA-Z0-9-]{10,100})",
                 "score": -10
             },
 			{
@@ -361,7 +361,51 @@
             }
         ]
     },
-    "winnerperson": {
+    "buyeraddr": {
+        "type": "string",
+		"positivewords": [],
+        "negativewords": [
+            {
+                "describe": "出现符号",
+                "regstr": "[*]",
+                "score": -10
+            },
+			{
+                "describe": "是数字",
+                "regstr": "^\\d*[×―—-\\-]*[\u3000\u2003\u00a0\\s]*\\d*$",
+                "score": -10
+            },
+			{
+                "describe": "出现日期",
+                "regstr": "(\\d)+(年|月|日)+",
+                "score": -10
+            },
+			{
+                "describe": "包含负分",
+                "regstr": "(详见公告)",
+                "score": -10
+            }
+        ],
+        "length": [
+            {
+                "describe": "[gt,lte,score]",
+                "range": [
+                    0,
+                    1,
+                    -10
+                ]
+            },
+			 {
+                "describe": "[gt,∞,score]",
+                "range": [
+                    90,
+                    -1,
+                    -10
+                ]
+            }
+        ]
+    },
+	"winnerperson": {
         "type": "string",
         "positivewords": [
             {
@@ -445,7 +489,7 @@
                 "range": [
                     14,
                     -1,
-                    -1
+                    -10
                 ]
             }
         ]
@@ -557,7 +601,7 @@
     "bidamount": {
         "type": "float",
         "describe": "min>val:1,min<=val<=max:3,max<val:1",
-        "min": 1000,
+        "min": -0.999,
         "max": 1000000000,
         "score": [
             -3,

+ 0 - 8
udpcreateindex/src/config.json

@@ -16,14 +16,6 @@
         "index": "winner",
         "type": "winner"
     },
-	"winnerenterprise":{
-		"addr":"172.17.145.163:27082",
-		"db":"extract_v3",
-		"collect":"winner_enterprise",
-		"size":6,
-		"index":"winner2",
-		"type":"winner_enterprise"
-	},
     "buyer": {
         "db": "qfw",
         "collect": "buyer",

+ 0 - 15
udpcreateindex/src/main.go

@@ -16,7 +16,6 @@ var (
 	Sysconfig                                                      map[string]interface{} //配置文件
 	mgo                                                            *mongodb.MongodbSim    //mongodb操作对象
 	extractmgo                                                     *mongodb.MongodbSim    //mongodb操作对象
-	winnerentermgo                                                 *mongodb.MongodbSim    //mongodb操作对象
 	udpclient                                                      mu.UdpClient           //udp对象
 	updport                                                        string
 	winner, winnerenterprise, bidding, biddingback, project, buyer map[string]interface{}
@@ -62,12 +61,6 @@ func init() {
 		extractmgo.InitPool()
 	}
 
-	winnerentermgo = &mongodb.MongodbSim{
-		MongodbAddr: winnerenterprise["addr"].(string),
-		Size:        util.IntAllDef(winnerenterprise["size"], 5),
-		DbName:      winnerenterprise["db"].(string),
-	}
-	winnerentermgo.InitPool()
 	econf := Sysconfig["elastic"].(map[string]interface{})
 	elastic.InitElasticSize(econf["addr"].(string), util.IntAllDef(econf["pool"], 5))
 	if bidding["indexfields"] != nil {
@@ -124,14 +117,6 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 					}()
 					winnerTask(data, mapInfo)
 				}()
-			case "winner_enterprise":
-				pool <- true
-				go func() {
-					defer func() {
-						<-pool
-					}()
-					winnerEnterPriseTask(data, mapInfo)
-				}()
 			case "bidding": //实时+udp调用,可选择是否生成关键词, 一次性最大20万
 				pool <- true
 				go func() {

+ 0 - 74
udpcreateindex/src/winnerenterpriseindex.go

@@ -1,74 +0,0 @@
-package main
-
-import (
-	"log"
-	"qfw/util"
-	elastic "qfw/util/elastic"
-	"sync"
-
-	"gopkg.in/mgo.v2/bson"
-)
-
-func winnerEnterPriseTask(data []byte, mapInfo map[string]interface{}) {
-	defer util.Catch()
-	q, _ := mapInfo["query"].(map[string]interface{})
-	if q == nil {
-		q = map[string]interface{}{
-			"_id": bson.M{
-				"$gt":  util.StringTOBsonId(mapInfo["gtid"].(string)),
-				"$lte": util.StringTOBsonId(mapInfo["lteid"].(string)),
-			},
-		}
-	}
-	log.Println("++++++++++++++++++++++")
-	session := winnerentermgo.GetMgoConn(1800)
-	defer winnerentermgo.DestoryMongoConn(session)
-	c, _ := winnerenterprise["collect"].(string)
-	db, _ := winnerenterprise["db"].(string)
-	index, _ := winnerenterprise["index"].(string)
-	itype, _ := winnerenterprise["type"].(string)
-	log.Println("index===", index, "itype===", itype)
-	count, _ := session.DB(db).C(c).Find(&q).Count()
-	savepool := make(chan bool, 10)
-	UpdatesLock := sync.Mutex{}
-	log.Println("查询语句:", q, "同步总数:", count, "elastic库:", index)
-	query := session.DB(db).C(c).Find(q).Select(bson.M{"alias": 0, "tmp_id": 0}).Iter()
-
-	tmp := []map[string]interface{}{}
-	tmp = append(tmp, map[string]interface{}{
-		"test": "test",
-	})
-
-	elastic.BulkSave(index, itype, &tmp, true)
-	arrEs := []map[string]interface{}{}
-	var n int
-	for tmp := make(map[string]interface{}); query.Next(tmp); n++ {
-		//go IS.Add("winner")
-		log.Println("tmp=========", tmp)
-		UpdatesLock.Lock()
-		arrEs = append(arrEs, tmp)
-		if len(arrEs) > savesizei {
-			tmps := arrEs
-			savepool <- true
-			go func(tmpn []map[string]interface{}) {
-				defer func() {
-					<-savepool
-				}()
-				elastic.BulkSave(index, itype, &tmpn, true)
-			}(tmps)
-			arrEs = []map[string]interface{}{}
-		}
-		UpdatesLock.Unlock()
-		if n%1000 == 0 {
-			log.Println("current:", n, util.BsonIdToSId(tmp["_id"]))
-		}
-		tmp = make(map[string]interface{})
-	}
-	UpdatesLock.Lock()
-	if len(arrEs) > 0 {
-		tmpn := arrEs
-		elastic.BulkSave(index, itype, &tmpn, true)
-	}
-	UpdatesLock.Unlock()
-	log.Println(mapInfo, "create winner_enterprise index...over", n)
-}

+ 2 - 2
versioncomparison/config.json

@@ -1,10 +1,10 @@
 {
     "premgo": "192.168.3.207:27092",
     "predb": "extract_kf",
-    "prec": "datainfo_dev3.2",
+    "prec": "demo_data3.2",
     "newmgo": "192.168.3.207:27092",
     "newdb": "extract_kf",
-    "newc": "datainfo_dev3.4",
+    "newc": "demo_data3.4",
     "fields": [
         "projectname",
         "projectcode",

+ 128 - 0
versioncomparison/demo_data.go

@@ -0,0 +1,128 @@
+// demodata
+package main
+
+import (
+	"jy/mongodbutil"
+	"log"
+	"qfw/util"
+)
+
+// sitenums maps each of 30 source sites to its target sample count; with the 10 information categories this gives about 5000 sample records in total.
+var sitenums = map[string]int{
+	"中国政府采购网":              1006,
+	"中国招标投标公共服务平台":         674,
+	"国家公共资源交易平台":           371,
+	"广东省政府采购网":             282,
+	"机电产品招标投标电子交易平台":       189,
+	"中国山东政府采购网":            183,
+	"中国华能集团公司":             177,
+	"湖北省政府采购网":             173,
+	"基建云采购":                166,
+	"浙江政府采购网":              141,
+	"中国大唐集团公司电子商务平台":       140,
+	"中国华电集团公司电子商务平台":       129,
+	"中国电力设备信息网":            122,
+	"安徽省政府采购网":             116,
+	"中国电信阳光采购网":            100,
+	"河北省政府采购网":             98,
+	"四川政府采购":               92,
+	"安徽省招标投标信息网":           85,
+	"中国山西政府采购网":            74,
+	"浙江省公共资源交易服务平台":        73,
+	"新疆维吾尔自治区政府采购网":        73,
+	"阿里巴巴大企业采购平台":          70,
+	"中国冶金科工集团有限公司采购电子商务平台": 64,
+	"广东省电子化采购执行平台":         62,
+	"河南省政府采购网":             60,
+	"国家能源e购":               57,
+	"铁路物资采购与招商平台":          57,
+	"兵器工业集团公司采购电子商务平台":     57,
+	"安徽合肥公共资源交易中心":         55,
+	"河北省公共资源交易信息网":         55,
+}
+
+var bidtypesP = map[string]float64{ // bidtypesP is the fraction of a site's target count assigned to each bid subtype; mainT multiplies it by the site's count. The ten fractions add up to 1.
+	"招标": 0.25,
+	"邀标": 0.03,
+	"询价": 0.10,
+	"竞谈": 0.06,
+	"单一": 0.03,
+	"竞价": 0.03,
+	"合同": 0.09,
+	"验收": 0.03,
+	"中标": 0.23,
+	"成交": 0.15,
+}
+
+var bidtypes = map[string]bool{ // bidtypes is the set of subtype values that are sampled; mainT skips documents whose subtype is not a key here.
+	"招标": true, "邀标": true, "询价": true, "竞价": true, "竞谈": true,
+	"单一": true, "中标": true, "成交": true, "合同": true, "验收": true,
+}
+var sitebidtypesNum = map[string]int{} // sitebidtypesNum holds the remaining quota per "site-subtype" key; mainT fills it and decrements it as documents are copied.
+
+func mainT() { // mainT copies a per-site, per-subtype sample of "bidding" documents into the "demo_data" collection and logs the resulting counts.
+	ext := mongodbutil.MgoFactory(1, 3, 120, "192.168.3.207:27092", "extract_kf") // destination connection (database "extract_kf")
+	//	ls, _ := ext.Find("demo_data", nil, nil, nil, false, -1, -1)
+	//	bidnum := map[string]int{}
+	//	for _, v := range *ls {
+	//		subtype, _ := v["subtype"].(string)
+	//		bidnum[subtype] += 1
+	//	}
+	//	for k, v := range bidnum {
+	//		log.Println(k, v)
+	//	}
+	//	os.Exit(0)
+	var total = 0 // sum of every per-site, per-subtype quota
+	for site, num := range sitenums {
+		tt := 0 // sum of this site's quotas
+		for stype, _ := range bidtypes {
+			n := bidtypesP[stype] * float64(num) // this subtype's share of the site's target count
+			sitebidtypesNum[site+"-"+stype] = int(n) // int() truncates, so a site's quotas can add up to less than num
+			total += int(n)
+			tt += int(n)
+			log.Println(site+"-"+stype, int(n))
+		}
+		log.Println(site, tt)
+	}
+	log.Println(total)
+
+	db := mongodbutil.MgoFactory(1, 3, 120, "192.168.3.207:27083", "qfw") // source connection (database "qfw")
+	it := db.Get().DB("qfw").C("bidding").Find(map[string]interface{}{ // scan bidding documents whose _id lies strictly between the two ids below
+		"_id": map[string]interface{}{
+			"$gt": util.StringTOBsonId("5df507000000000000000000"),
+			"$lt": util.StringTOBsonId("5e0a1f000000000000000000"),
+		},
+	}).Iter()
+	index := 0 // documents scanned so far
+	datanum := 0 // documents copied so far
+	for tmp := make(map[string]interface{}); it.Next(&tmp); index++ {
+		if index%1000 == 0 {
+			log.Println(index, datanum) // progress line every 1000 scanned documents
+		}
+		site, _ := tmp["site"].(string)
+		subtype, _ := tmp["subtype"].(string)
+		if !bidtypes[subtype] { // subtype is not one of the sampled categories
+			continue
+		}
+		tp := site + "-" + subtype
+		if sitebidtypesNum[tp] > 0 { // quota left for this site/subtype pair: consume one
+			sitebidtypesNum[tp] -= 1
+		} else { // quota exhausted, or the site is not in sitenums
+			continue
+		}
+		datanum++
+		ext.Update("demo_data", map[string]interface{}{"_id": tmp["_id"]}, tmp, true, false) // presumably an upsert keyed by _id — confirm the meaning of the two boolean flags in mongodbutil
+		tmp = map[string]interface{}{} // fresh map for the next document; NOTE(review): the continue paths above skip this reset — verify Next copes with a reused map
+	}
+
+	log.Println("datanum", datanum)
+	ls, _ := ext.Find("demo_data", nil, nil, nil, false, -1, -1) // read the sample back; NOTE(review): the second return value is ignored and ls is dereferenced below without a nil check
+	snum := map[string]int{} // documents per site in demo_data
+	for _, v := range *ls {
+		site, _ := v["site"].(string)
+		snum[site] += 1
+	}
+	for k, v := range snum {
+		log.Println(k, v)
+	}
+}

+ 38 - 3
versioncomparison/main.go

@@ -40,8 +40,8 @@ type Data struct {
 }
 
 func init() {
-	flag.StringVar(&Sid, "sid", "5e17deb150b5ea296ec939d3", "开始id")
-	flag.StringVar(&Eid, "eid", "5e17e1e685a9271abf08616d", "结束id")
+	flag.StringVar(&Sid, "sid", "5df5071ce9d1f601e495fa54", "开始id")
+	flag.StringVar(&Eid, "eid", "5e09c05f0cf41612e0626abc", "结束id")
 	flag.Parse()
 	qu.ReadConfig(&SysConfig)
 	Premgo = mongodbutil.MgoFactory(1, 3, 120, qu.ObjToString(SysConfig["premgo"]), qu.ObjToString(SysConfig["predb"]))
@@ -97,8 +97,8 @@ func createXlsx() {
 			sh.Rows[k] = row
 		}
 	}
+	var idsall = map[string]bool{}
 	//生成信息sheet
-
 	for _, field := range Fields {
 		sh, _ := xf.AddSheet(field)
 		rowh := sh.AddRow()
@@ -114,9 +114,44 @@ func createXlsx() {
 				row.AddCell().SetString(v.PreVal)
 				row.AddCell().SetString(v.NewVal)
 				row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", v.Id)))
+				idsall[v.Id] = true
+			}
+		}
+	}
+	log.Println("不同数据总量", len(idsall))
+	//生全量信息不同部分
+	shall, _ := xf.AddSheet("全量数据(不同部分)")
+	rowh := shall.AddRow()
+	rowh.AddCell().SetString("id")
+	for _, v := range Fields {
+		rowh.AddCell().SetString("preval_" + v)
+		rowh.AddCell().SetString("newval_" + v)
+	}
+	rowh.AddCell().SetString("url")
+	i := 0
+	for k, _ := range idsall {
+		i++
+		row := shall.AddRow()
+		row.AddCell().SetString(k)
+		for _, field := range Fields {
+			tmp := FieldData[field]
+			v := tmp[k]
+			if v != nil {
+				if v.NewVal != v.PreVal {
+					row.AddCell().SetString(v.PreVal)
+					row.AddCell().SetString(v.NewVal)
+				} else {
+					row.AddCell().SetString("")
+					row.AddCell().SetString("")
+				}
+			} else {
+				row.AddCell().SetString("")
+				row.AddCell().SetString("")
 			}
 		}
+		row.AddCell().SetString(fmt.Sprintf(Url, qu.CommonEncodeArticle("content", k)))
 	}
+	log.Println("数据处理完成,正在生成文件")
 	err = xf.Save("result.xlsx")
 	if err != nil {
 		log.Println("保存xlsx失败:", err)

二进制
versioncomparison/template.xlsx