123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230 |
- package main
- import (
- "go.uber.org/zap"
- util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
- "jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
- "regexp"
- "sync"
- )
- var (
- MgoB *mongodb.MongodbSim
- //标题匹配关键词
- //titleMatchRegStr = `(开业|新员工|员工培训|正式投用|成立新公司|图书馆正式开馆|项目顺利竣工|学校新建|投产运营|启动运营|竣工验收|正式投用|建成|竣工交付|项目竣工|工程开工|正式开工|全面完工|项目投产|工程竣工|揭牌成立|正式投用|新签合同|施工完成|封顶|正式竣工|新员工|项目开工|建设总包项目|建设项目|办公大楼搬迁|选址|设立分支机构|开新店|门店拓展|项目规划启动会|获批变更|项目顺利推进)`
- //titleMatchRegStr2 = `(重点项目|新动作|项目签约|助力企业新发展|捐赠|项目移交|投资|打造|落户|收购|合并|签约|合作|数字经济|数字化|人员调整|签约.+项目)`
- //titleMatchRegStr3 = `(培训)`
- //titleOutRegStr = `(习近平|十八大|十九大|二十大|端午节|中秋节|建军节|建党节|诈骗|主题教育|招标|中标|成交|询价|磋商|竞价|招租|采购|马拉松|救人义举|安全生产月|防汛抗旱|社区|爱心|台风|比选公告|教育活动|消费|老党员|开展.*活动|举办.*活动|整改整治|新华|会见|等奖|获奖|进校园|反诈|百姓|专项整治工作|保障|日报|成功举办|新征程|获评|审查|可行性研究|接受.+查|一带一路|故事|荣获|候选人|宣传|专题|慰问|圆满完成|竞争性磋商|政治|思想|人民网|央广网|系统升级|网站维护|比选|入围.+名单|获.+奖|演习|新纪录|刷新.+记录|创新高|巡视|表彰|培训班|考察|涉嫌|减持|卖出|流出|致辞|买入.+股|粮食安全)`
- //titleOutRegStr2 = `(新加披|美国|英国|波黑|竞赛|领导拜访|会议|会谈|进博会|工作座谈)`
- //titleOutRegStr3 = `(技能|专项|业务)`
- //
- //detailOutRegStr = `(责任编辑|会议指出|通讯员|报道|通讯.+报道|文章来源|.+会在.+成功举办|理财|端午节|中秋节|建军节|建党节|大会期间|致辞)`
- titleMatchRegStr string
- titleOutRegStr string
- detailOutRegStr string
- )
- func main() {
- titleMatchRegStr = GF.Cron.TitleMatch
- titleOutRegStr = GF.Cron.TitleOut
- detailOutRegStr = GF.Cron.DetailOut
- dealBidding()
- select {}
- }
- //dealBidding 处理标讯数据
- func dealBidding() {
- sess := MgoB.GetMgoConn()
- defer MgoB.DestoryMongoConn(sess)
- // 指定对应的时间格式
- //layout := "2006-01-02 15:04:05"
- // 获取当前时间
- //now := time.Now()
- var q interface{}
- //targetTime := time.Date(now.Year(), now.Month(), now.Day()+GF.Cron.Start, 04, 20, 0, 0, now.Location())
- //targetTime := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, now.Location())
- //todayTime := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, now.Location())
- if GF.Cron.Start != "" {
- if GF.Cron.End != "" {
- q = map[string]interface{}{
- "createtime": map[string]interface{}{
- "$gt": GF.Cron.Start,
- "$lte": GF.Cron.End,
- },
- }
- } else {
- q = map[string]interface{}{
- "createtime": map[string]interface{}{
- "$gt": GF.Cron.Start,
- },
- }
- }
- } else {
- q = nil
- }
- log.Info("dealBidding", zap.Any("q", q))
- query := sess.DB(GF.MongoB.DB).C(GF.MongoB.Coll).Find(q).Select(map[string]interface{}{
- "title": 1, "detail": 1}).Iter()
- count := 0
- ch := make(chan bool, 10)
- wg := &sync.WaitGroup{}
- for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
- if count%100 == 0 {
- log.Info("dealBidding", zap.Int("current", count))
- }
- ch <- true
- wg.Add(1)
- go func(tmp map[string]interface{}) {
- defer func() {
- <-ch
- wg.Done()
- }()
- tagData(tmp)
- }(tmp)
- tmp = map[string]interface{}{}
- }
- wg.Wait()
- log.Info("dealBidding", zap.Int("over ", count))
- }
- func tagData(tmp map[string]interface{}) {
- var label int
- var labelField string
- var labelWords []string
- var update = make(map[string]interface{}, 0)
- title := util.ObjToString(tmp["title"])
- detail := util.ObjToString(tmp["detail"])
- titleOutReg := regexp.MustCompile(titleOutRegStr)
- titleOutMatchs := titleOutReg.FindAllString(title, -1)
- //标题排除词
- if len(titleOutMatchs) > 0 {
- label = 2
- labelWords = titleOutMatchs
- labelField = "title_out"
- } else {
- //内容排除词
- detailOutReg := regexp.MustCompile(detailOutRegStr)
- detailOutMatchs := detailOutReg.FindAllString(detail, -1)
- if len(detailOutMatchs) > 0 {
- label = 2
- labelWords = detailOutMatchs
- labelField = "detail_out"
- } else {
- titleMatchReg := regexp.MustCompile(titleMatchRegStr)
- titleMatchs := titleMatchReg.FindAllString(title, -1)
- if len(titleMatchs) > 0 {
- label = 1
- labelWords = titleMatchs
- labelField = "title_match"
- } else {
- label = 3
- }
- }
- }
- update["label"] = label
- update["label_field"] = labelField
- update["label_words"] = labelWords
- where := map[string]interface{}{
- "_id": tmp["_id"],
- }
- MgoB.Update(GF.MongoB.Coll, where, map[string]interface{}{"$set": update}, true, false)
- }
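// tagData2 labels data; it is a disabled variant of tagData that uses the tiered titleMatchRegStr2/3 and titleOutRegStr2/3 rules, kept commented out for reference.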
- //func tagData2(tmp map[string]interface{}) {
- // var label int
- // var labelField string
- // var labelWords []string
- // var update = make(map[string]interface{}, 0)
- // title := util.ObjToString(tmp["title"])
- // detail := util.ObjToString(tmp["detail"])
- //
- // titleMatchReg := regexp.MustCompile(titleMatchRegStr)
- // titleMatchReg2 := regexp.MustCompile(titleMatchRegStr2)
- // titleMatchReg3 := regexp.MustCompile(titleMatchRegStr3)
- // titleOutReg := regexp.MustCompile(titleOutRegStr)
- // titleOutReg2 := regexp.MustCompile(titleOutRegStr2)
- // titleOutReg3 := regexp.MustCompile(titleOutRegStr3)
- // detailOutReg := regexp.MustCompile(detailOutRegStr)
- // //
- // titleMatchs := titleMatchReg.FindAllString(title, -1)
- // titleMatchs2 := titleMatchReg2.FindAllString(title, -1)
- // titleMatchs3 := titleMatchReg3.FindAllString(title, -1)
- // //
- // titleOutMatchs := titleOutReg.FindAllString(title, -1)
- // titleOutMatchs2 := titleOutReg2.FindAllString(title, -1)
- // titleOutMatchs3 := titleOutReg3.FindAllString(title, -1)
- //
- // if len(titleMatchs) > 0 {
- // label = 1
- // labelWords = titleMatchs
- // labelField = "title_match"
- // } else if len(titleMatchs2) > 0 {
- // //标题规则2 匹配
- // if len(titleOutMatchs2) == 0 {
- // label = 1
- // labelWords = titleMatchs2
- // labelField = "title_match"
- // } else {
- // label = 2
- // labelField = "title_out2"
- // labelWords = titleOutMatchs2
- // }
- // } else if len(titleMatchs3) > 0 {
- // if len(titleOutMatchs3) == 0 {
- // label = 1
- // labelWords = titleMatchs2
- // labelField = "title_match3"
- // } else {
- // label = 2
- // labelField = "title_out3"
- // labelWords = titleOutMatchs3
- // }
- // } else {
- // //标题排除词
- // if len(titleOutMatchs) > 0 {
- // label = 2
- // labelField = "title_out"
- // labelWords = titleOutMatchs
- // } else {
- // //内容匹配
- // detailOutMatchs := detailOutReg.FindAllString(detail, -1)
- // if len(detailOutMatchs) > 0 {
- // label = 2
- // labelField = "detail_out"
- // labelWords = detailOutMatchs
- // } else {
- // label = 3
- // }
- // }
- // }
- //
- // update["label"] = label
- // update["label_field"] = labelField
- // update["label_words"] = labelWords
- // where := map[string]interface{}{
- // "_id": tmp["_id"],
- // }
- //
- // MgoB.Update(GF.MongoB.Coll, where, map[string]interface{}{"$set": update}, true, false)
- //
- //}
|