浏览代码

更新项目索引,添加 subtitle_projectname 字段

wcc 1 年之前
父节点
当前提交
c3a3268551

+ 31 - 23
createEsIndex/common.toml

@@ -7,30 +7,37 @@
 
 [db]
 [db.mongoB] ## bidding标讯数据
-    addr = "127.0.0.1:27083"
-#    addr = "192.168.3.206:27002"    ## 测试环境
-    dbname = "qfw"
+#    addr = "127.0.0.1:27083"
+#    dbname = "qfw"
+#    coll = "bidding"
+#    size = 15
+#    user = "SJZY_RWbid_ES"
+#    password = "SJZY@B4i4D5e6S"
+#    direct = true
+
+    addr = "192.168.3.206:27002"    ## 测试环境
+    dbname = "qfw_data"
     coll = "bidding"
     size = 15
-    user = "SJZY_RWbid_ES"
-    password = "SJZY@B4i4D5e6S"
+    user = "root"
+    password = "root"
     direct = true
 
 [db.mongoP] ## projectset 项目信息
-#    addr = "192.168.3.206:27002"
-#    dbname = "qfw_data"
-#    coll = "projectset"
-#    size = 15
-#    user = "root"
-#    password = "root"
-
-    addr = "127.0.0.1:27080"
-    dbname = "qfw"
-    coll = "projectset_20230904"
+    addr = "192.168.3.206:27002"
+    dbname = "qfw_data"
+    coll = "projectset"
     size = 15
-    user = ""
-    password = ""
-    direct = true
+    user = "root"
+    password = "root"
+
+#    addr = "127.0.0.1:27080"
+#    dbname = "qfw"
+#    coll = "projectset_20230904"
+#    size = 15
+#    user = ""
+#    password = ""
+#    direct = true
 
 [db.mongoQ] ##  winner
     addr = "192.168.3.206:27002"
@@ -69,15 +76,15 @@
     bucketname = "topjy"
     filesize = 500000  ## 单位字节,附件总字节长度限制;超过就不再读取
 [db.es]
-    addr = "http://127.0.0.1:19805"      ## 正常bidding 链接
-#    addr = "http://192.168.3.149:9201"      ## 正常bidding 链接
+#    addr = "http://127.0.0.1:19805"      ## 正常bidding 链接
+    addr = "http://192.168.3.149:9201"      ## 测试环境 bidding 链接
 #    addrp = "http://192.168.3.149:9201"    ##  采集使用的单机版地址
-    username = "es_all"
-    password = "TopJkO2E_d1x"
+#    username = "es_all"
+#    password = "TopJkO2E_d1x"
     size = 5
     indexb = "bidding"
 #    indextmp = "bidding_temporary"       ## 临时索引,其他程序需要;目前已不需要
-    indexp = "projectset_v1"
+    indexp = "projectset"
     indexwinner = "winner_v1"
     indexbuyer = "buyer_v3"
 detailfilter = ["(招标网|千里马|采招网|招标采购导航网|招标与采购网|中国招投标网|中国采购与招标网|中国采购与招标|优质采)[\\w\\W]{0,15}[http|https|htpps]?[a-z0-9:\\/\\/.]{0,20}(qianlima|zhaobiao|okcis|zbytb|infobidding|bidcenter|youzhicai|chinabidding|Chinabidding|CHINABIDDING)[a-z0-9.\\/\\/]{0,40}",
@@ -117,6 +124,7 @@ api = "http://172.17.162.36:19281/_send/_mail"
 
 [env]
     stype = 1           ## 默认0 正式环境;1测试环境。测试环境不会执行定时任务更新采购单位、中标单位、数据检测
+    dbfile = "./db"
 #    openpre = false      ## 是否开启预处理流程
 #    spectype = "day"    ## 定时任务类型;正式环境应该是 month 。day 表示每天创建一个索引;month 表示每个月创建一个
 

+ 4 - 3
createEsIndex/config/conf.go

@@ -96,7 +96,7 @@ type mysql struct {
 	Password string
 }
 
-//oss oss 阿里云配置
+// oss oss 阿里云配置
 type oss struct {
 	Endpoint     string
 	AccessKey    string
@@ -114,12 +114,13 @@ type mgo struct {
 	Direct   bool
 }
 
-//env 全局的相关配置
+// env holds the global, environment-related settings.
 type env struct {
 	Stype    int  // defaults to 0 (production); 1 is the test environment, which does not run the scheduled tasks that update buyers, winners and data checks
 	OpenPre  bool // off by default, i.e. the preprocessing pipeline is not enabled
 	Alias    string
 	SpecType string
+	Dbfile   string // path of the file the bitmap is read from
 }
 
 type es struct {
@@ -149,7 +150,7 @@ type es struct {
 	Indexb2   string
 }
 
-//PreConf 预处理 配置
+// PreConf 预处理 配置
 type PreConf struct {
 	Addr     string
 	Username string

+ 4 - 0
createEsIndex/go.mod

@@ -17,7 +17,10 @@ require (
 require (
 	github.com/BurntSushi/toml v1.2.0 // indirect
 	github.com/PuerkitoBio/goquery v1.8.0 // indirect
+	github.com/RoaringBitmap/roaring v1.9.0 // indirect
 	github.com/andybalholm/cascadia v1.3.1 // indirect
+	github.com/bits-and-blooms/bitset v1.12.0 // indirect
+	github.com/cespare/xxhash/v2 v2.2.0 // indirect
 	github.com/clbanning/mxj/v2 v2.7.0 // indirect
 	github.com/dchest/captcha v1.0.0 // indirect
 	github.com/fatih/color v1.15.0 // indirect
@@ -41,6 +44,7 @@ require (
 	github.com/mattn/go-runewidth v0.0.15 // indirect
 	github.com/mitchellh/mapstructure v1.5.0 // indirect
 	github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe // indirect
+	github.com/mschoch/smat v0.2.0 // indirect
 	github.com/nats-io/nkeys v0.4.5 // indirect
 	github.com/nats-io/nuid v1.0.1 // indirect
 	github.com/olekukonko/tablewriter v0.0.5 // indirect

+ 8 - 0
createEsIndex/go.sum

@@ -42,6 +42,8 @@ github.com/BurntSushi/toml v1.2.0/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbi
 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
 github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U=
 github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI=
+github.com/RoaringBitmap/roaring v1.9.0 h1:lwKhr90/j0jVXJyh5X+vQN1VVn77rQFfYnh6RDRGCcE=
+github.com/RoaringBitmap/roaring v1.9.0/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
 github.com/aliyun/aliyun-oss-go-sdk v2.2.5+incompatible h1:QoRMR0TCctLDqBCMyOu1eXdZyMw3F7uGA9qPn2J4+R8=
 github.com/aliyun/aliyun-oss-go-sdk v2.2.5+incompatible/go.mod h1:T/Aws4fEfogEE9v+HPhhw+CntffsBHJ8nXQCwKr0/g8=
 github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c=
@@ -49,7 +51,11 @@ github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEq
 github.com/aws/aws-sdk-go v1.43.21/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo=
 github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
 github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
+github.com/bits-and-blooms/bitset v1.12.0 h1:U/q1fAF7xXRhFCrhROzIfffYnu+dlS38vCZtmFVPHmA=
+github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
 github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
+github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
+github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
 github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
 github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
@@ -198,6 +204,8 @@ github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyua
 github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo=
 github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe h1:iruDEfMl2E6fbMZ9s0scYfZQ84/6SPL6zC8ACM2oIL0=
 github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc=
+github.com/mschoch/smat v0.2.0 h1:8imxQsjDm8yFEAVBe7azKmKSgzSkZXDuKkSq9374khM=
+github.com/mschoch/smat v0.2.0/go.mod h1:kc9mz7DoBKqDyiRL7VZN8KvXQMWeTaVnttLRXOlotKw=
 github.com/nats-io/nats.go v1.31.0 h1:/WFBHEc/dOKBF6qf1TZhrdEfTmOZ5JzdJ+Y3m6Y/p7E=
 github.com/nats-io/nats.go v1.31.0/go.mod h1:di3Bm5MLsoB4Bx61CBTsxuarI36WbhAwOm8QrW39+i8=
 github.com/nats-io/nkeys v0.4.5 h1:Zdz2BUlFm4fJlierwvGK+yl20IAKUm7eV6AAZXEhkPk=

+ 36 - 0
createEsIndex/init.go

@@ -3,9 +3,12 @@ package main
 import (
 	"context"
 	"esindex/config"
+	"flag"
 	"fmt"
+	"github.com/RoaringBitmap/roaring"
 	es7 "github.com/olivere/elastic/v7"
 	"go.uber.org/zap"
+	"io/ioutil"
 	util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/elastic"
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
@@ -13,6 +16,7 @@ import (
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/mysqldb"
 	"os"
 	"strings"
+	"sync"
 	"time"
 )
 
@@ -22,6 +26,10 @@ var (
 	BiddingField      = make(map[string]string, 200)       //bidding_processing_field, level=1 最外层字段,
 	BiddingLevelField = make(map[string]map[string]string) //level=2 的第二层字段
 	PreProcessField   = make(map[string]string, 500)       //预处理流程 bidding字段
+	dbfile            = flag.String("dbfile", "./db", "数据库文件")
+	cache             = roaring.NewBitmap()
+	cacheModify       = false    //控制10秒 定时写入文件
+	mutex             sync.Mutex // 互斥锁,用于保护 cache 的并发写入操作
 )
 
 // InitLog @Description
@@ -393,3 +401,31 @@ func GetIndexName(client *es7.Client, name string) (string, error) {
 	// 如果 name 既不是别名,也不是正式索引名称,则返回空字符串
 	return "", nil
 }
+
+// InitBitmap loads the project-name-subtitle bitmap from disk into cache and
+// starts the background goroutine that persists it.
+//
+// The file path is config.Conf.Env.Dbfile when that is non-empty, otherwise
+// the value of the -dbfile flag. A missing file is treated as a first run and
+// leaves cache empty. Any other read or decode failure is logged and start-up
+// continues with whatever cache holds at that point.
+func InitBitmap() {
+	if config.Conf.Env.Dbfile != "" {
+		dbfile = &config.Conf.Env.Dbfile
+	}
+	log.Info("InitBitmap", zap.String("dbfile", *dbfile))
+
+	// Read directly instead of Stat-then-Read: the read error already tells
+	// us whether the file exists, and there is no window between the calls.
+	bs, err := ioutil.ReadFile(*dbfile)
+	switch {
+	case os.IsNotExist(err):
+		// Nothing persisted yet.
+	case err != nil:
+		log.Info("InitBitmap", zap.Error(err))
+	case len(bs) > 0:
+		// bs is never modified after this call.
+		// NOTE(review): roaring's FromBuffer documentation says the bitmap
+		// may keep referencing the buffer - confirm before reusing bs.
+		if _, err := cache.FromBuffer(bs); err != nil {
+			log.Info("InitBitmap", zap.Error(err))
+		}
+	}
+
+	// Every 10 seconds, write the bitmap to disk if it has been modified.
+	go func() {
+		ticker := time.NewTicker(10 * time.Second)
+		defer ticker.Stop()
+		for range ticker.C {
+			if !cacheModify {
+				continue
+			}
+			// Clear the flag before saving so that a modification made
+			// while saveDb is running re-arms it and is picked up by the
+			// next tick instead of being silently dropped.
+			cacheModify = false
+			saveDb()
+		}
+	}()
+}

+ 10 - 0
createEsIndex/main.go

@@ -16,7 +16,10 @@ import (
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/udp"
 	"net"
 	"net/http"
+	"os"
+	"os/signal"
 	"sync"
+	"syscall"
 	"time"
 )
 
@@ -64,6 +67,7 @@ func init() {
 	InitMgo()
 	InitEs()
 	InitField()
+	InitBitmap()
 
 	//if config.Conf.Env.OpenPre {
 	//	InitPreProcessField()
@@ -116,6 +120,12 @@ func main() {
 	UdpClient.Listen(processUdpMsg)
 	log.Info("Udp服务监听", zap.String("port:", config.Conf.Udp.LocPort))
 
+	//监听异常退出信号;及时保存项目名称副标题数据
+	signalChan := make(chan os.Signal, 1)
+	signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM)
+	<-signalChan
+	saveDb()
+
 	ch := make(chan bool, 1)
 	<-ch
 }

+ 22 - 0
createEsIndex/project_es.go

@@ -216,6 +216,28 @@ func projectTask(data []byte, mapInfo map[string]interface{}) {
 				newTmp["bidcycle"] = int(day)
 			}
 		}
+
+		//项目名称副标题
+		subtitleProjectname := util.ObjToString(tmp["subtitle_projectname"])
+		if subtitleProjectname != "" {
+			newTmp["subtitle_projectname"] = subtitleProjectname
+			if !cache.Contains(uint32(hash(subtitleProjectname))) {
+				cache.Add(uint32(hash(subtitleProjectname)))
+				cacheModify = true
+			}
+		} else {
+			name := getNewName(tmp)
+			if name != "" {
+				newTmp["subtitle_projectname"] = name
+				update := make(map[string]interface{})
+				update["subtitle_projectname"] = name
+				res := MgoP.UpdateById(config.Conf.DB.MongoP.Coll, mongodb.BsonIdToSId(tmp["_id"]), map[string]interface{}{"$set": update})
+				if !res {
+					log.Info("项目数据", zap.Any(mongodb.BsonIdToSId(tmp["_id"]), "项目名称副标题更新失败"))
+				}
+			}
+		}
+
 		saveProjectEsPool <- newTmp
 		tmp = make(map[string]interface{})
 	}

+ 224 - 0
createEsIndex/utils.go

@@ -4,13 +4,16 @@ import (
 	"encoding/json"
 	"esindex/config"
 	"fmt"
+	"github.com/cespare/xxhash/v2"
 	"go.mongodb.org/mongo-driver/bson"
 	"go.uber.org/zap"
+	"jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/udp"
 	"math"
 	"net"
+	"os"
 	"regexp"
 	"sort"
 	"strconv"
@@ -501,3 +504,224 @@ func deletePreEsData(preId string) {
 	}
 
 }
+
+// saveDb writes the cache bitmap to the file named by *dbfile.
+//
+// The bitmap is serialised to "<dbfile>.tmp", synced, and then renamed over
+// the real file, so an interrupted write never leaves a truncated snapshot
+// behind. An empty bitmap is not written. All failures are logged; the
+// function never panics and returns nothing.
+//
+// mutex serialises concurrent saveDb calls.
+// NOTE(review): code that mutates cache without holding mutex can still race
+// with the serialisation done here - verify the writers.
+func saveDb() {
+	mutex.Lock()
+	defer mutex.Unlock()
+
+	// Nothing to persist.
+	if cache.GetCardinality() == 0 {
+		return
+	}
+
+	tmpPath := *dbfile + ".tmp"
+	fo, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644)
+	if err != nil {
+		log.Info("saveDb", zap.Error(err))
+		return
+	}
+
+	_, err = cache.WriteTo(fo)
+	if err == nil {
+		err = fo.Sync()
+	}
+	// Always close, but keep the first error encountered.
+	if cerr := fo.Close(); err == nil {
+		err = cerr
+	}
+	if err == nil {
+		err = os.Rename(tmpPath, *dbfile)
+	}
+	if err != nil {
+		log.Info("saveDb", zap.Error(err))
+		// Best effort: do not leave a partial temp file behind.
+		_ = os.Remove(tmpPath)
+	}
+}
+
+// getNewName derives a not-yet-used subtitle name for one project record.
+//
+// It reads projectname, projectcode, buyer, firsttime, createtime and the
+// titles of the entries under "list" from tmp, collects the distinct package
+// markers found in those titles, and passes everything to RenameProjectName.
+// Timestamps that are not positive are passed on as empty date strings.
+func getNewName(tmp map[string]interface{}) string {
+	const dayLayout = "2006-01-02"
+
+	var projectDate, createDate string
+	if ts := util.Int64All(tmp["firsttime"]); ts > 0 {
+		projectDate = time.Unix(ts, 0).Format(dayLayout)
+	}
+	if ts := util.Int64All(tmp["createtime"]); ts > 0 {
+		createDate = time.Unix(ts, 0).Format(dayLayout)
+	}
+
+	// Distinct package markers, in order of first appearance.
+	matchWords := make([]string, 0)
+	items, _ := tmp["list"].([]interface{})
+	for _, item := range items {
+		entry, isMap := item.(map[string]interface{})
+		if !isMap {
+			continue
+		}
+		for _, pkg := range GetPackages(util.ObjToString(entry["title"])) {
+			if IsInStringArray(pkg, matchWords) {
+				continue
+			}
+			matchWords = append(matchWords, pkg)
+		}
+	}
+
+	return RenameProjectName(
+		util.ObjToString(tmp["projectname"]),
+		util.ObjToString(tmp["projectcode"]),
+		strings.Join(matchWords, "、"),
+		projectDate,
+		util.ObjToString(tmp["buyer"]),
+		createDate,
+	)
+}
+
+// hash returns the 64-bit xxhash digest of src.
+//
+// Callers in this package narrow the result to uint32 before using it as a
+// bitmap key, so only the low 32 bits take part in membership checks.
+func hash(src string) uint64 {
+	digest := xxhash.Sum64String(src)
+	return digest
+}
+
+// RenameProjectName 获取新的不重复的项目名称
+func RenameProjectName(projectName, projectCode, packages, projectDate, buyer, createDate string) (newName string) {
+	//TODO 1.判断项目名称是否重复
+	var id uint64
+	defer func() {
+		if id > 0 && newName != "" {
+			cache.Add(uint32(id))
+			cacheModify = true
+		}
+	}()
+
+	//1.项目名称
+	if projectName != "" {
+		id = hash(projectName)
+		if !cache.Contains(uint32(id)) {
+			newName = projectName
+			return projectName
+		}
+	}
+	//TODO 2.1	项目名称+项目编码
+	if projectCode != "" {
+		newName = projectName + "_" + projectCode
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+	//TODO 2.2	项目名称+分包信息
+	if packages != "" {
+		newName = projectName + "_" + packages
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+	//TODO 2.3	项目名称+项目时间
+	if projectDate != "" {
+		newName = projectName + "_" + projectDate
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+
+	//TODO 2.4	项目名称+采购单位名称
+	if buyer != "" {
+		newName = projectName + "_" + buyer
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+	//TODO 3.1 项目名称+项目编码+分包信息
+	if projectCode != "" && packages != "" {
+		newName = projectName + "_" + projectCode + "_" + packages
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+	//TODO 3.2 项目名称+项目编码+项目时间
+	if projectCode != "" && projectDate != "" {
+		newName = projectName + "_" + projectCode + "_" + projectDate
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+	//TODO 3.3 项目名称+项目编码+采购单位
+	if projectCode != "" && buyer != "" {
+		newName = projectName + "_" + projectCode + "_" + buyer
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+
+	//TODO 3.4 项目名称+分包+项目时间
+	if packages != "" && projectDate != "" {
+		newName = projectName + "_" + packages + "_" + projectDate
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+	//TODO 3.5 项目名称+分包+采购单位
+	if packages != "" && buyer != "" {
+		newName = projectName + "_" + packages + "_" + buyer
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+	//TODO 3.6 项目名称+项目时间+采购单位
+	if projectDate != "" && buyer != "" {
+		newName = projectName + "_" + projectDate + "_" + buyer
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+
+	//TODO 4.1 项目名称+项目编码+分包信息+项目时间
+	if projectCode != "" && packages != "" && projectDate != "" {
+		newName = projectName + "_" + projectCode + "_" + packages + "_" + projectDate
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+	//TODO 4.2 项目名称+项目编码+分包信息+采购单位
+	if projectCode != "" && packages != "" && buyer != "" {
+		newName = projectName + "_" + projectCode + "_" + packages + "_" + buyer
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	}
+
+	//TODO 5 项目名称+项目编码+分包信息+项目时间+采购单位
+	if projectCode != "" && packages != "" && projectDate != "" && buyer != "" {
+		newName = projectName + "_" + projectCode + "_" + packages + "_" + projectDate + "_" + buyer
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		}
+	} else {
+		newName = projectName + "_" + projectCode + "_" + packages + "_" + projectDate + "_" + buyer + "_" + createDate
+		id = hash(newName)
+		if !cache.Contains(uint32(id)) {
+			return newName
+		} else {
+			newName = ""
+		}
+	}
+
+	return
+}
+
+// GetPackages 获取对应的分包
+func GetPackages(title string) (res []string) {
+	// 定义正则表达式
+	rea := regexp.MustCompile(`包\d{1,2}[-~、]\d{1,2}|\d{1,2}[-~、]\d{1,2}包`) //1-6包;01-06包;01、02包;包1、包2
+	//text := "中国绿发投资集团有限公司直属项目公司2023年第20批集中采购非招标项目(包10、12、14、17、18、19"
+	packages := rea.FindAllString(util.ObjToString(title), -1) //匹配的包
+
+	if len(packages) > 0 {
+		res = append(res, packages...)
+	}
+
+	reb := regexp.MustCompile(`(标段[1-9一二三四五六七八九]|[1-9一二三四五六七八九]标段|包[1-9一二三四五六七八九]?[0-9]|[1-9一二三四五六七八九]?[0-9]包|[a-kA-K]包)`) // 标题只有一个包2
+	pgs := reb.FindAllString(title, -1)
+	if len(pgs) > 0 {
+		for _, v := range pgs {
+			if !IsInStringArray(v, res) {
+				res = append(res, v)
+			}
+		}
+	}
+	return res
+}