123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352 |
- package main
- import (
- "context"
- "encoding/json"
- "fmt"
- "github.com/olivere/elastic/v7"
- "io"
- util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
- "log"
- "regexp"
- "sort"
- "strconv"
- "strings"
- "testing"
- )
- func TestEs(T *testing.T) {
- s := "aa"
- fmt.Println(strings.Contains(s, ","))
- //re := regexp.MustCompile(`包\d+`) // 标题只有一个包2
- //re := regexp.MustCompile(`\d+批包(\d+(?:、\d+)*)`)
- // 定义正则表达式
- //re3 := regexp.MustCompile(`(?:包)?(\d+(?:、\d+)*)包?`) //标题含有多个包;包10、12、14、17、18、19
- //re3 := regexp.MustCompile(`包(\d+(?:、\d+)*)`) //标题含有多个包;包10、12、14、17、18、19
- ////re3 := regexp.MustCompile(`\d+包`)//冀中股份2023年12月标准件计划框架协议采购2包12.7
- ////re2 := regexp.MustCompile(`标包\d+`)//2024一季度水火电一般工序类中小金工件(标包1)-招标公告
- //text := "中国绿发投资集团有限公司直属项目公司2023年第20批集中采购非招标项目(包10、12、14、17、18、19"
- //matches := re.FindAllString(util.ObjToString(text), -1)
- //log.Println(matches)
- //title := "中电科青岛科技产业园一期设计施工总承包1标段"
- //rea := regexp.MustCompile(`总承包\d标段`) //总承包1标段
- //maches := rea.FindAllString(title, -1)
- //log.Println("rea", maches)
- //reb := regexp.MustCompile(`包\d[-~]\d`) //包1-6
- //title = "2023年度分布式光伏发电项目主要设备(包1~6)设备采购合格供应商"
- //matches := reb.FindAllString(title, -1)
- //log.Println("reb", matches)
- //re := regexp.MustCompile(`\d+批包(\d+(?:、\d+)*)`)
- re := regexp.MustCompile(`包?\d{1,2}[-~、和](包)?\d{1,2}包?`) //1-6包;01-06包;01、02包;包1、包2
- title := "中国绿发投资集团有限公司直属项目公司2023年第20批集中标段1采购非招标项目(包10、12、14、17、18、19"
- matches := re.FindAllString(title, -1)
- log.Println("rec", matches)
- //
- //re2 := regexp.MustCompile(`包?\d{1,2}`) // 标题只有一个包2
- //re3 := regexp.MustCompile(`\d{1,2}包`) // 标题只有一个包2
- re3 := regexp.MustCompile(`(标段[1-9一二三四五六七八九]|包[1-9一二三四五六七八九]?[0-9]|[1-9一二三四五六七八九]?[0-9]包|[a-kA-K]包)`) // 标题只有一个包2
- title = "济南市历下标段3区东关中心幼儿园超市包21、22、23、24"
- matches = re3.FindAllString(title, -1)
- log.Println("aaa", matches)
- //ree := regexp.MustCompile(`第\d{1,2}[批]|\d{1,2}标段|标段\d{1,2}`)
- //ree2 := regexp.MustCompile(`(第)?\d{1,2}[标段\d{1,2}]`)
- //log.Println(ree2.FindAllString(title, -1))
- // 原始数据
- data := []string{"包1", "包1-2", "包4", "包4-5", "包3", "包5", "包9", "包11", "包12", "11包", "标段1", "包13"}
- // 调用去重函数
- uniquePackages := removeDuplicates(data)
- // 输出去重后的结果
- fmt.Println("去重后的数据:", uniquePackages)
- }
// packageRangeRe matches a package label of the form "包N" or "包N-M".
// Group 2 is N and group 3 (optional) is M.
var packageRangeRe = regexp.MustCompile(`(包)(\d+)(?:-(\d+))?`)

// removeDuplicates merges package labels so that every package number is
// reported once and consecutive numbers collapse into a single range.
//
// Each item is searched for "包N" or "包N-M". The numbers it covers are
// collected, sorted and grouped into runs of consecutive values:
//   - a run of length one is emitted using the matched text of the last item
//     that started at that number (so the original spelling is kept);
//   - a longer run is emitted as "包<first>-<last>".
//
// Items that cannot be interpreted — no "包N" match, a number that does not
// fit in an int, or a reversed range such as "包3-2" — are appended unchanged
// after the merged labels, in their input order.
//
// When no item carries a package number the result holds only the
// pass-through items; no empty placeholder entry is produced.
func removeDuplicates(data []string) []string {
	covered := make(map[int]bool)  // every package number seen
	labels := make(map[int]string) // start number -> matched text
	var passthrough []string       // items returned as-is

	for _, item := range data {
		m := packageRangeRe.FindStringSubmatch(item)
		if m == nil {
			passthrough = append(passthrough, item)
			continue
		}

		start, err := strconv.Atoi(m[2])
		if err != nil {
			// Digits overflowed int; keep the item rather than guessing.
			passthrough = append(passthrough, item)
			continue
		}
		end := start
		if m[3] != "" {
			end, err = strconv.Atoi(m[3])
			if err != nil || end < start {
				// Unparseable or reversed range: would otherwise vanish silently.
				passthrough = append(passthrough, item)
				continue
			}
		}

		for n := start; n <= end; n++ {
			covered[n] = true
		}
		labels[start] = m[0]
	}

	nums := make([]int, 0, len(covered))
	for n := range covered {
		nums = append(nums, n)
	}
	sort.Ints(nums)

	// Walk the sorted numbers one run of consecutive values at a time.
	var result []string
	for i := 0; i < len(nums); {
		j := i
		for j+1 < len(nums) && nums[j+1] == nums[j]+1 {
			j++
		}
		if i == j {
			result = append(result, labels[nums[i]])
		} else {
			result = append(result, fmt.Sprintf("包%d-%d", nums[i], nums[j]))
		}
		i = j + 1
	}

	return append(result, passthrough...)
}
- // syncEs 同步es 数据道信集群
- func TestSyncEs(T *testing.T) {
- //url := "http://172.17.4.184:19805"
- url := "http://127.0.0.1:19805"
- username := "es_all"
- password := "TopJkO2E_d1x"
- index := "bidding" //索引名称
- // 创建 Elasticsearch 客户端
- client, err := elastic.NewClient(
- elastic.SetURL(url),
- elastic.SetBasicAuth(username, password),
- elastic.SetSniff(false),
- )
- if err != nil {
- log.Fatalf("创建 Elasticsearch 客户端失败:%s", err)
- }
- url2 := "http://127.0.0.1:19905"
- username2 := "jybid"
- password2 := "Top2023_JEB01i@31"
- // 创建 Elasticsearch 客户端
- client2, err := elastic.NewClient(
- elastic.SetURL(url2),
- elastic.SetBasicAuth(username2, password2),
- elastic.SetSniff(false),
- )
- if err != nil {
- log.Fatalf("创建 Elasticsearch 客户端失败:%s", err)
- }
- rangeQuery := elastic.NewRangeQuery("id").Gte("65869b436977356f55a01b0b").Lt("6586a4196977356f55a02c79")
- query := elastic.NewBoolQuery().Must(rangeQuery)
- ctx := context.Background()
- //开始滚动搜索
- scrollID := ""
- scroll := "1m"
- searchSource := elastic.NewSearchSource().
- Query(query).
- Size(10000).
- Sort("_doc", true) //升序排序
- //Sort("_doc", false) //降序排序
- searchService := client.Scroll(index).
- Size(10000).
- Scroll(scroll).
- SearchSource(searchSource)
- res, err := searchService.Do(ctx)
- if err != nil {
- if err == io.EOF {
- fmt.Println("没有数据")
- } else {
- panic(err)
- }
- }
- defer client.ClearScroll().ScrollId(scrollID).Do(ctx) // 在退出时清理资源
- fmt.Println("总数是:", res.TotalHits())
- total := 0
- for len(res.Hits.Hits) > 0 {
- for _, hit := range res.Hits.Hits {
- var doc map[string]interface{}
- err := json.Unmarshal(hit.Source, &doc)
- if err != nil {
- log.Printf("解析文档失败:%s", err)
- continue
- }
- id := util.ObjToString(doc["id"])
- client2.Index().Index(index).Id(id).BodyJson(doc).Do(ctx)
- }
- total = total + len(res.Hits.Hits)
- scrollID = res.ScrollId
- res, err = client.Scroll().ScrollId(scrollID).Scroll(scroll).Do(ctx)
- log.Println("current count:", total)
- if err != nil {
- if err == io.EOF {
- // 滚动到最后一批数据,退出循环
- break
- }
- log.Printf("滚动搜索失败:%s", err)
- break // 处理错误时退出循环
- }
- }
- fmt.Println("结束~~~~~~~~~~~~~~~")
- }
- func TestGetP(T *testing.T) {
- MgoB := &mongodb.MongodbSim{
- //MongodbAddr: "172.17.189.140:27080",
- MongodbAddr: "127.0.0.1:27083",
- Size: 10,
- DbName: "qfw",
- UserName: "SJZY_RWbid_ES",
- Password: "SJZY@B4i4D5e6S",
- Direct: true,
- }
- MgoB.InitPool()
- //url := "http://172.17.4.184:19805"
- url := "http://127.0.0.1:19805"
- username := "es_all"
- password := "TopJkO2E_d1x"
- index := "projectset" //索引名称
- // 创建 Elasticsearch 客户端
- client, err := elastic.NewClient(
- elastic.SetURL(url),
- elastic.SetBasicAuth(username, password),
- elastic.SetSniff(false),
- )
- if err != nil {
- log.Fatalf("创建 Elasticsearch 客户端失败:%s", err)
- }
- rangeQuery := elastic.NewRangeQuery("pici").Gt("1672502400").Lte("1704038400")
- //termQ := elastic.NewTermQuery("multipackage", 0)
- //rangeQuery := elastic.NewRangeQuery("id").Gt("657b08556977356f5578cb25").Lte("657b08556977356f5578cb26")
- query := elastic.NewBoolQuery().Must(rangeQuery)
- ctx := context.Background()
- //开始滚动搜索
- scrollID := ""
- scroll := "1m"
- searchSource := elastic.NewSearchSource().
- Query(query).
- Size(1000).
- Sort("_doc", true) //升序排序
- //Sort("_doc", false) //降序排序
- searchService := client.Scroll(index).
- Size(1000).
- Scroll(scroll).
- SearchSource(searchSource)
- res, err := searchService.Do(ctx)
- if err != nil {
- if err == io.EOF {
- fmt.Println("没有数据")
- } else {
- panic(err)
- }
- }
- defer client.ClearScroll().ScrollId(scrollID).Do(ctx) // 在退出时清理资源
- fmt.Println("总数是:", res.TotalHits())
- total := 0
- for len(res.Hits.Hits) > 0 {
- for _, hit := range res.Hits.Hits {
- var doc map[string]interface{}
- err := json.Unmarshal(hit.Source, &doc)
- if err != nil {
- log.Printf("解析文档失败:%s", err)
- continue
- }
- //id := util.ObjToString(doc["id"])
- //log.Println(id)
- var matchWords = make([]string, 0)
- var matchList = make([]interface{}, 0)
- if list, ok := doc["list"].([]interface{}); ok {
- for _, v := range list {
- if da, ok := v.(map[string]interface{}); ok {
- if util.ObjToString(da["toptype"]) != "招标" {
- continue
- }
- title := util.ObjToString(da["title"])
- // 使用正则表达式进行匹配
- matches := GetMatches(title)
- if len(matches) > 0 {
- matchList = append(matchList, da)
- }
- matchWords = append(matchWords, matches...)
- }
- }
- }
- insert := make(map[string]interface{})
- insert["project_id"] = doc["id"]
- insert["_id"] = doc["id"]
- insert["multipackage"] = doc["multipackage"]
- insert["list"] = doc["list"]
- insert["projectname"] = doc["projectname"]
- insert["sourceinfourl"] = doc["sourceinfourl"]
- if len(matchWords) > 0 {
- insert["matchList"] = matchList
- insert["package_name"] = util.ObjToString(doc["projectname"]) + "-" + strings.Join(matchWords, "、")
- insert["type"] = 1
- }
- MgoB.SaveByOriID("wcc_project_20240304", insert)
- }
- total = total + len(res.Hits.Hits)
- scrollID = res.ScrollId
- res, err = client.Scroll().ScrollId(scrollID).Scroll(scroll).Do(ctx)
- log.Println("current count:", total)
- if err != nil {
- if err == io.EOF {
- // 滚动到最后一批数据,退出循环
- break
- }
- log.Printf("滚动搜索失败:%s", err)
- break // 处理错误时退出循环
- }
- }
- fmt.Println("结束~~~~~~~~~~~~~~~")
- }
|