123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626 |
- package main
- import (
- util "app.yhyue.com/data_processing/common_utils"
- "app.yhyue.com/data_processing/common_utils/log"
- "app.yhyue.com/data_processing/common_utils/mongodb"
- "app.yhyue.com/data_processing/common_utils/udp"
- "encoding/json"
- "esindex/config"
- "esindex/oss"
- "go.mongodb.org/mongo-driver/bson"
- "go.uber.org/zap"
- "reflect"
- "regexp"
- "strconv"
- "strings"
- "sync"
- "time"
- )
// Regular expressions shared by the indexing helpers in this file.
var (
	// TimeV1 matches text that ends with a four-digit year, optionally
	// followed by "年" or "."; group 1 is the year.
	TimeV1 = regexp.MustCompile("(\\d{4})[年.]?$")
	// TimeV2 matches text that ends with a year and a one- or two-digit month;
	// group 1 is the year plus its optional separator, group 2 the month plus
	// its optional separator.
	TimeV2 = regexp.MustCompile("(\\d{4}[年.\\-/]?)(\\d{1,2}[月.\\-/]?$)")
	// TimeClear matches the date separators 年 月 / . and -.
	// Inside a character class "|" is a literal pipe, not alternation, so the
	// pipes that used to be listed here have been dropped.
	TimeClear = regexp.MustCompile("[年月/.\\-]")
	// filterSpace matches HTML tags and whitespace, including the ideographic
	// space, the em space and the no-break space.
	filterSpace = regexp.MustCompile("<[^>]*?>|[\\s\u3000\u2003\u00a0]")
	// date1 matches a full date between 2000 and 2029 written with 年/月/日 or
	// with "-", "/" or "." separators. The literal "|" has been removed from
	// the separator classes so strings such as "2021|3|5" no longer match.
	date1 = regexp.MustCompile("20[0-2][0-9][年\\-/.][0-9]{1,2}[月\\-/.][0-9]{1,2}[日]?")
	// HtmlReg matches a single HTML tag.
	HtmlReg = regexp.MustCompile("<[^>]+>")
)
- func biddingTask(mapInfo map[string]interface{}) {
- defer util.Catch()
- stype := util.ObjToString(mapInfo["stype"])
- if stype == "bidding" {
- uq := bson.M{"gtid": bson.M{"$gte": util.ObjToString(mapInfo["gtid"])},
- "lteid": bson.M{"$lte": util.ObjToString(mapInfo["lteid"])}}
- MgoB.Update("bidding_processing_ids", uq, bson.M{"$set": bson.M{"dataprocess": 8, "updatetime": time.Now().Unix()}}, false, true)
- }
- q, _ := mapInfo["query"].(map[string]interface{})
- if q == nil {
- q = map[string]interface{}{
- "_id": map[string]interface{}{
- "$gt": mongodb.StringTOBsonId(mapInfo["gtid"].(string)),
- "$lte": mongodb.StringTOBsonId(mapInfo["lteid"].(string)),
- },
- }
- }
- ch := make(chan bool, 10)
- wg := &sync.WaitGroup{}
- //bidding库
- biddingConn := MgoB.GetMgoConn()
- count, _ := biddingConn.DB(MgoB.DbName).C(config.Conf.DB.MongoB.Coll).Find(&q).Count()
- log.Info("bidding表", zap.Int64("同步总数:", count))
- it := biddingConn.DB(config.Conf.DB.MongoB.Dbname).C(config.Conf.DB.MongoB.Coll).Find(&q).Select(map[string]interface{}{
- "contenthtml": 0,
- }).Iter()
- c1, index := 0, 0
- var indexLock sync.Mutex
- for tmp := make(map[string]interface{}); it.Next(tmp); c1++ {
- if c1%1000 == 0 {
- log.Info("biddingTask", zap.Int("current:", c1))
- log.Info("biddingAllTask", zap.Any("current:_id =>", tmp["_id"]))
- }
- ch <- true
- wg.Add(1)
- go func(tmp map[string]interface{}) {
- defer func() {
- <-ch
- wg.Done()
- }()
- if sensitive := util.ObjToString(tmp["sensitive"]); sensitive == "测试" { //bidding中有敏感词,不生索引
- tmp = make(map[string]interface{})
- return
- }
- //只针对增量数据处理;全量数据 需要用extracttype字段判断
- if util.IntAll(tmp["dataprocess"]) != 8 {
- return
- }
- //// 增量数据使用上面判断;全量数据使用下面配置
- //if util.IntAll(tmp["extracttype"]) != 1 {
- // return
- //}
- //针对产权数据,暂时不入es 索引库
- if util.IntAll(tmp["infoformat"]) == 3 {
- return
- }
- /**
- 数据抽取时,有的数据的发布时间是之前的,属于增量历史数据,在判重和同步到bidding表是,会添加history_updatetime
- 字段,所以下面判断才会处理
- */
- if stype == "bidding_history" && tmp["history_updatetime"] == nil {
- return
- }
- indexLock.Lock()
- index++
- indexLock.Unlock()
- newTmp, update := GetEsField(tmp, stype)
- newTmp["dataweight"] = 0 //索引数据新增 jy置顶字段
- //针对中国政府采购网,单独处理
- if util.ObjToString(tmp["site"]) == "中国政府采购网" {
- objectType := MatchService(tmp)
- if objectType != "" {
- newTmp["object_type"] = objectType
- }
- }
- if len(update) > 0 {
- updateBiddingPool <- []map[string]interface{}{{
- "_id": tmp["_id"],
- },
- {"$set": update},
- }
- }
- if util.ObjToString(newTmp["spidercode"]) == "a_jyxxfbpt_gg" {
- // 剑鱼信息发布数据 通过udp通知信息发布程序
- go UdpMethod(mongodb.BsonIdToSId(newTmp["_id"]))
- }
- saveEsPool <- newTmp
- }(tmp)
- tmp = map[string]interface{}{}
- }
- wg.Wait()
- log.Info("biddingTask over", zap.Int("count", c1), zap.Int("index", index))
- // 重采平台需要
- //mapInfo["stype"] = ""
- //datas, _ := json.Marshal(mapInfo)
- //var next = &net.UDPAddr{
- // IP: net.ParseIP("127.0.0.1"),
- // Port: 1910,
- //}
- //log.Info("bidding index es over", zap.Any("es", next), zap.String("mapinfo", string(datas)))
- }
- func biddingAllTask(mapInfo map[string]interface{}) {
- defer util.Catch()
- stype := util.ObjToString(mapInfo["stype"])
- q, _ := mapInfo["query"].(map[string]interface{})
- if q == nil {
- q = map[string]interface{}{
- "_id": map[string]interface{}{
- "$gt": mongodb.StringTOBsonId(mapInfo["gtid"].(string)),
- "$lte": mongodb.StringTOBsonId(mapInfo["lteid"].(string)),
- },
- }
- }
- ch := make(chan bool, 20)
- wg := &sync.WaitGroup{}
- //bidding库
- biddingConn := MgoB.GetMgoConn()
- it := biddingConn.DB(config.Conf.DB.MongoB.Dbname).C(config.Conf.DB.MongoB.Coll).Find(&q).Select(map[string]interface{}{
- "contenthtml": 0,
- }).Iter()
- c1, index := 0, 0
- var indexLock sync.Mutex
- for tmp := make(map[string]interface{}); it.Next(tmp); c1++ {
- if c1%20000 == 0 {
- log.Info("biddingAllTask", zap.Int("current:", c1))
- log.Info("biddingAllTask", zap.Any("current:_id =>", tmp["_id"]))
- }
- ch <- true
- wg.Add(1)
- go func(tmp map[string]interface{}) {
- defer func() {
- <-ch
- wg.Done()
- }()
- if sensitive := util.ObjToString(tmp["sensitive"]); sensitive == "测试" { //bidding中有敏感词,不生索引
- tmp = make(map[string]interface{})
- return
- }
- // 针对17833,需要单独屏蔽这个判断,不需要处理
- if util.IntAll(tmp["extracttype"]) == -1 {
- return
- }
- //针对产权数据,暂时不入es 索引库
- if util.IntAll(tmp["infoformat"]) == 3 {
- return
- }
- indexLock.Lock()
- index++
- indexLock.Unlock()
- newTmp, update := GetEsField(tmp, stype)
- //针对中国政府采购网,单独处理
- if util.ObjToString(tmp["site"]) == "中国政府采购网" {
- objectType := MatchService(tmp)
- if objectType != "" {
- newTmp["object_type"] = objectType
- }
- }
- newTmp["dataweight"] = 0 //索引数据新增 jy置顶字段
- if len(update) > 0 {
- updateBiddingPool <- []map[string]interface{}{{
- "_id": tmp["_id"],
- },
- {"$set": update},
- }
- }
- saveEsPool <- newTmp
- }(tmp)
- tmp = map[string]interface{}{}
- }
- wg.Wait()
- log.Info("biddingAllTask over", zap.Int("count", c1), zap.Int("index", index))
- }
- func biddingTaskById(mapInfo map[string]interface{}) {
- defer util.Catch()
- stype := util.ObjToString(mapInfo["stype"])
- infoid := util.ObjToString(mapInfo["infoid"])
- tmp, _ := MgoB.FindById(config.Conf.DB.MongoB.Coll, infoid, map[string]interface{}{"contenthtml": 0})
- if sensitive := util.ObjToString((*tmp)["sensitive"]); sensitive == "测试" { //bidding中有敏感词,不生索引
- return
- }
- if util.IntAll((*tmp)["extracttype"]) == 1 {
- newTmp, update := GetEsField(*tmp, stype)
- newTmp["dataweight"] = 0 //索引数据新增 jy置顶字段
- if len(update) > 0 {
- //updateBiddingPool <- []map[string]interface{}{{
- // "_id": mongodb.StringTOBsonId(infoid),
- //},
- // {"$set": update},
- //}
- }
- saveEsPool <- newTmp
- }
- }
- // GetEsField @Description ES字段
- // @Author J 2022/6/7 11:34 AM
- func GetEsField(tmp map[string]interface{}, stype string) (map[string]interface{}, map[string]interface{}) {
- newTmp := make(map[string]interface{})
- update := make(map[string]interface{}) // bidding 修改字段
- saveErr := make(map[string]interface{})
- for field, ftype := range config.Conf.DB.Es.FieldEs {
- if tmp[field] != nil { //
- if field == "purchasinglist" { //标的物处理
- purchasinglist_new := []map[string]interface{}{}
- if pcl, _ := tmp[field].([]interface{}); len(pcl) > 0 {
- for _, ls := range pcl {
- lsm_new := make(map[string]interface{})
- lsm := ls.(map[string]interface{})
- for pf, pftype := range config.Conf.DB.Es.FieldPurchasingList {
- lsmv := lsm[pf]
- if lsmv != nil && reflect.TypeOf(lsmv).String() == pftype {
- lsm_new[pf] = lsm[pf]
- }
- }
- if lsm_new != nil && len(lsm_new) > 0 {
- purchasinglist_new = append(purchasinglist_new, lsm_new)
- }
- }
- }
- if len(purchasinglist_new) > 0 {
- newTmp[field] = purchasinglist_new
- }
- } else if field == "procurementlist" {
- if tmp["procurementlist"] != nil {
- var arr []interface{}
- plist := tmp["procurementlist"].([]interface{})
- for _, p := range plist {
- p1 := p.(map[string]interface{})
- p2 := make(map[string]interface{})
- for k, v := range config.Conf.DB.Es.FieldProcurementList {
- if k == "projectname" && util.ObjToString(p1[k]) == "" {
- p2[k] = util.ObjToString(tmp["projectname"])
- } else if k == "buyer" && util.ObjToString(p1[k]) == "" && util.ObjToString(tmp["buyer"]) != "" {
- p2[k] = util.ObjToString(tmp["buyer"])
- } else if p1[k] != nil && reflect.TypeOf(p1[k]).String() == v {
- p2[k] = p1[k]
- }
- //else if k == "expurasingtime" && util.ObjToString(p1[k]) != "" {
- // res := getMethod(util.ObjToString(p1[k]))
- // if res != 0 {
- // p2[k] = res
- // }
- //}
- }
- arr = append(arr, p2)
- }
- if len(arr) > 0 {
- newTmp[field] = arr
- }
- }
- } else if field == "projectscope" {
- ps, _ := tmp["projectscope"].(string)
- newTmp["projectscope"] = ps
- if len(ps) > pscopeLength {
- saveErr["projectscope"] = ps
- saveErr["projectscope_length"] = len(ps)
- }
- } else if field == "winnerorder" { //中标候选
- winnerorder_new := []map[string]interface{}{}
- if winnerorder, _ := tmp[field].([]interface{}); len(winnerorder) > 0 {
- for _, win := range winnerorder {
- winMap_new := make(map[string]interface{})
- winMap := win.(map[string]interface{})
- for wf, wftype := range config.Conf.DB.Es.FieldWinnerOrder {
- wfv := winMap[wf]
- if wfv != nil && reflect.TypeOf(wfv).String() == wftype {
- if wf == "sort" && util.Int64All(wfv) > 100 {
- continue
- }
- winMap_new[wf] = winMap[wf]
- }
- }
- if winMap_new != nil && len(winMap_new) > 0 {
- winnerorder_new = append(winnerorder_new, winMap_new)
- }
- }
- }
- if len(winnerorder_new) > 0 {
- newTmp[field] = winnerorder_new
- }
- } else if field == "qualifies" {
- //项目资质
- qs := []string{}
- if q, _ := tmp[field].([]interface{}); len(q) > 0 {
- for _, v := range q {
- v1 := v.(map[string]interface{})
- qs = append(qs, util.ObjToString(v1["key"]))
- }
- }
- if len(qs) > 0 {
- newTmp[field] = strings.Join(qs, ",")
- }
- } else if field == "bidopentime" {
- if tmp[field] != nil && tmp["bidendtime"] == nil {
- newTmp["bidendtime"] = tmp[field]
- newTmp[field] = tmp[field]
- } else if tmp[field] == nil && tmp["bidendtime"] != nil {
- newTmp["bidendtime"] = tmp[field]
- newTmp[field] = tmp["bidendtime"]
- } else {
- if tmp["bidopentime"] != nil {
- newTmp[field] = tmp["bidopentime"]
- }
- }
- } else if field == "detail" { //过滤
- detail, _ := tmp[field].(string)
- detail = filterSpace.ReplaceAllString(detail, "")
- if len(detail) > pscopeLength {
- saveErr["detail"] = detail
- saveErr["detail_length"] = len(detail)
- }
- if tmp["cleartag"] != nil {
- if tmp["cleartag"].(bool) {
- text, _ := FilterDetail(detail)
- newTmp[field] = util.ObjToString(tmp["title"]) + " " + text
- } else {
- newTmp[field] = util.ObjToString(tmp["title"]) + " " + detail
- }
- } else {
- text, b := FilterDetail(detail)
- newTmp[field] = util.ObjToString(tmp["title"]) + " " + text
- update["cleartag"] = b
- }
- } else if field == "_id" || field == "topscopeclass" || field == "entidlist" {
- newTmp[field] = tmp[field]
- } else if field == "publishtime" || field == "comeintime" {
- //字段类型不正确,特别处理
- if tmp[field] != nil && util.Int64All(tmp[field]) > 0 {
- newTmp[field] = util.Int64All(tmp[field])
- }
- } else { //其它字段判断数据类型,不正确舍弃
- if fieldval := tmp[field]; reflect.TypeOf(fieldval).String() != ftype {
- continue
- } else {
- if fieldval != "" {
- newTmp[field] = fieldval
- }
- }
- }
- }
- }
- filetext := getFileText(tmp)
- if len([]rune(filetext)) > 10 {
- newTmp["filetext"] = filetext
- if len(filetext) > pscopeLength {
- saveErr["filetext"] = filetext
- saveErr["filetext_length"] = len(filetext)
- }
- }
- YuceEndtime(newTmp) // 预测结果时间
- if stype == "bidding" || stype == "bidding_history" {
- newTmp["createtime"] = time.Now().Unix() // es库数据创建时间,只有增量数据有
- newTmp["pici"] = time.Now().Unix() //createtime跟pici一样,为了剑鱼功能需要,并行存在一段时间,之后可以删掉createtime
- update["pici"] = time.Now().Unix()
- }
- if len(saveErr) > 0 {
- saveErr["infoid"] = mongodb.BsonIdToSId(tmp["_id"])
- saveErrBidPool <- saveErr
- }
- return newTmp, update
- }
- // @Description 采购意向 预计采购时间处理
- // @Author J 2022/6/7 8:04 PM
- func getMethod(str string) int64 {
- if TimeV1.MatchString(str) {
- arr := TimeV1.FindStringSubmatch(str)
- st := arr[1] + "0000"
- parseInt, err := strconv.ParseInt(st, 10, 64)
- if err == nil {
- return parseInt
- }
- } else if TimeV2.MatchString(str) {
- arr := TimeV2.FindStringSubmatch(str)
- str1 := arr[2]
- if len(str1) == 1 {
- str1 = "0" + str1
- }
- str2 := TimeClear.ReplaceAllString(arr[1], "") + TimeClear.ReplaceAllString(str1, "") + "00"
- parseInt, err := strconv.ParseInt(str2, 10, 64)
- if err == nil {
- return parseInt
- }
- }
- return 0
- }
- func FilterDetail(text string) (string, bool) {
- b := false // 清理标记
- for _, s := range config.Conf.DB.Es.DetailFilter {
- reg := regexp.MustCompile(s)
- if reg.MatchString(text) {
- text = reg.ReplaceAllString(text, "")
- if !b {
- b = true
- }
- }
- }
- return text, b
- }
- // @Description 附件内容
- // @Author J 2022/6/7 1:54 PM
- func getFileText(tmp map[string]interface{}) (filetext string) {
- if attchMap, ok := tmp["attach_text"].(map[string]interface{}); attchMap != nil && ok {
- for _, tmpData1 := range attchMap {
- if tmpData2, ok := tmpData1.(map[string]interface{}); tmpData2 != nil && ok {
- for _, result := range tmpData2 {
- if resultMap, ok := result.(map[string]interface{}); resultMap != nil && ok {
- if attach_url := util.ObjToString(resultMap["attach_url"]); attach_url != "" {
- bs := oss.OssGetObject(attach_url, mongodb.BsonIdToSId(tmp["_id"])) //oss读数据
- //if utf8.RuneCountInString(filetext+bs) < fileLength {
- // filetext += bs + "\n"
- //} else {
- // if utf8.RuneCountInString(bs) > fileLength {
- // filetext = bs[0:fileLength]
- // } else {
- // filetext = bs
- // }
- // break
- //}
- if len(filetext) > 500000 {
- filetext = filetext[0:500000]
- break
- } else {
- if len(bs) <= 500000 {
- filetext += bs + "\n"
- }
- }
- }
- }
- }
- }
- }
- }
- return
- }
- // 预测结果时间
- func YuceEndtime(tmp map[string]interface{}) {
- flag := false
- flag2 := false
- scope := []string{"信息技术_运维服务", "信息技术_软件开发", "信息技术_系统集成及安全", "信息技术_其他"}
- titles := []string{"短信服务", "短信发送服务"}
- details := []string{"短信发送服务", "短信服务平台", "短信服务项目"}
- subscopeclass := util.ObjToString(tmp["s_subscopeclass"])
- //先判断满足 s_subscopeclass 条件
- for _, v := range scope {
- if strings.Contains(subscopeclass, v) {
- flag = true
- break
- }
- }
- //满足 s_subscopeclass ,再去判断title detail
- if flag {
- title := util.ObjToString(tmp["title"])
- for _, v := range titles {
- if strings.Contains(title, v) {
- flag2 = true
- }
- }
- if !flag2 {
- detail := util.ObjToString(tmp["detail"])
- for _, v := range details {
- if strings.Contains(detail, v) {
- flag2 = true
- }
- }
- }
- }
- if !flag2 {
- return
- }
- subtype := util.ObjToString(tmp["subtype"])
- if subtype == "成交" || subtype == "合同" {
- // yucestarttime、yuceendtime
- yucestarttime, yuceendtime := int64(0), int64(0)
- // 项目周期中
- if util.ObjToString(tmp["projectperiod"]) != "" {
- dateStr := date1.FindStringSubmatch(util.ObjToString(tmp["projectperiod"]))
- if len(dateStr) == 2 {
- sdate := FormatDateStr(dateStr[0])
- edate := FormatDateStr(dateStr[1])
- if sdate < edate && sdate != 0 && edate != 0 {
- yucestarttime = sdate
- yuceendtime = edate
- }
- }
- }
- if yucestarttime > 0 && yuceendtime > yucestarttime {
- tmp["yuceendtime"] = yuceendtime
- return
- }
- // 预测开始时间 合同签订日期
- if yucestarttime == 0 {
- if util.IntAll(tmp["signaturedate"]) <= 0 {
- if util.IntAll(tmp["publishtime"]) <= 0 {
- return
- } else {
- yucestarttime = util.Int64All(tmp["publishtime"])
- }
- } else {
- yucestarttime = util.Int64All(tmp["signaturedate"])
- }
- }
- // 预测结束时间
- if yucestarttime > 0 && yuceendtime == 0 {
- if util.IntAll(tmp["project_duration"]) > 0 && util.ObjToString(tmp["project_timeunit"]) != "" {
- yuceendtime = YcEndTime(yucestarttime, util.IntAll(tmp["project_duration"]), util.ObjToString(tmp["project_timeunit"]))
- tmp["yuceendtime"] = yuceendtime
- }
- }
- }
- }
- func FormatDateStr(ds string) int64 {
- ds = strings.Replace(ds, "年", "-", -1)
- ds = strings.Replace(ds, "月", "-", -1)
- ds = strings.Replace(ds, "日", "", -1)
- ds = strings.Replace(ds, "/", "-", -1)
- ds = strings.Replace(ds, ".", "-", -1)
- location, err := time.ParseInLocation(util.Date_Short_Layout, ds, time.Local)
- if err != nil {
- log.Error("FormatDateStr", zap.Error(err))
- return 0
- } else {
- return location.Unix()
- }
- }
// YcEndTime adds a duration of num units to the Unix timestamp starttime and
// returns the resulting Unix timestamp.
//
// Supported units: "日历天"/"天"/"日" (days, as fixed 86400-second steps),
// "周" (weeks), "月" (months), "年" (years) and "工作日" (working days, padded
// with two extra calendar days per seven working days). Any other unit
// yields 0.
func YcEndTime(starttime int64, num int, unit string) int64 {
	begin := time.Unix(starttime, 0)
	switch unit {
	case "日历天", "天", "日":
		return starttime + int64(num*86400)
	case "周":
		return begin.AddDate(0, 0, num*7).Unix()
	case "月":
		return begin.AddDate(0, num, 0).Unix()
	case "年":
		return begin.AddDate(num, 0, 0).Unix()
	case "工作日":
		weekendDays := num / 7 * 2
		return begin.AddDate(0, 0, num+weekendDays).Unix()
	default:
		return 0
	}
}
- // UdpMethod @Description rpc调用信息发布程序接口
- // @Author J 2022/4/13 9:13 AM
- func UdpMethod(id string) {
- mapinfo := map[string]interface{}{
- "infoid": id,
- "stype": "jyfb_data_over",
- }
- datas, _ := json.Marshal(mapinfo)
- log.Info("UdpMethod", zap.Any("JyUdpAddr", JyUdpAddr), zap.String("mapinfo", string(datas)))
- _ = UdpClient.WriteUdp(datas, udp.OP_TYPE_DATA, JyUdpAddr)
- }
- //MatchService 针对中国招标网,匹配关键词打标签,object_type,货物、服务、工程,jsondata.item
- func MatchService(tmp map[string]interface{}) (res string) {
- if jsondata, ok := tmp["jsondata"]; ok {
- if da, ok := jsondata.(map[string]interface{}); ok {
- if item, ok := da["item"]; ok {
- services := []string{"货物", "服务", "工程"}
- for _, v := range services {
- if strings.Contains(util.ObjToString(item), v) {
- return v
- }
- }
- }
- }
- }
- return
- }
|