浏览代码

添加项目 详情索引处理

wcc 6 月之前
父节点
当前提交
d0cacb1500

+ 2 - 0
createEsIndex/common.toml

@@ -86,6 +86,8 @@
     indexb = "bidding"
 #    indextmp = "bidding_temporary"       ## 临时索引,其他程序需要;目前已不需要
     indexp = "projectset"
+    indexpd = "projectdetail" ## 项目支持detail的新索引
+    detailcount = 50000 ## 项目索引中,detail汉字长度限制
     indexwinner = "winner_v1"
     indexbuyer = "buyer_v3"
 detailfilter = ["(招标网|千里马|采招网|招标采购导航网|招标与采购网|中国招投标网|中国采购与招标网|中国采购与招标|优质采)[\\w\\W]{0,15}[http|https|htpps]?[a-z0-9:\\/\\/.]{0,20}(qianlima|zhaobiao|okcis|zbytb|infobidding|bidcenter|youzhicai|chinabidding|Chinabidding|CHINABIDDING)[a-z0-9.\\/\\/]{0,40}",

+ 2 - 0
createEsIndex/config/conf.go

@@ -135,6 +135,8 @@ type es struct {
 	IndexB       string
 	TypeB        string
 	IndexP       string
+	IndexPD      string
+	DetailCount  int
 	TypeP        string
 	IndexWinner  string
 	TypeWinner   string

二进制
createEsIndex/createindex_es7_20241205


文件差异内容过多而无法显示
+ 3 - 0
createEsIndex/es_test.go


+ 1 - 1
createEsIndex/go.mod

@@ -11,7 +11,7 @@ require (
 	github.com/spf13/viper v1.15.0
 	go.mongodb.org/mongo-driver v1.10.2
 	go.uber.org/zap v1.23.0
-	jygit.jydev.jianyu360.cn/data_processing/common_utils v0.0.0-20231223031213-3f719e173cb5
+	jygit.jydev.jianyu360.cn/data_processing/common_utils v0.0.0-20240412074219-927f3f682cb3
 )
 
 require (

+ 2 - 0
createEsIndex/go.sum

@@ -593,6 +593,8 @@ honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9
 honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
 jygit.jydev.jianyu360.cn/data_processing/common_utils v0.0.0-20231223031213-3f719e173cb5 h1:IJlZ+JTn7UvVeHyALb+yWacmtE94TW2XvBIRgTyRmzU=
 jygit.jydev.jianyu360.cn/data_processing/common_utils v0.0.0-20231223031213-3f719e173cb5/go.mod h1:1Rp0ioZBhikjXHYYXmnzL6RNfvTDM/2XvRB+vuPLurI=
+jygit.jydev.jianyu360.cn/data_processing/common_utils v0.0.0-20240412074219-927f3f682cb3 h1:mTokQIoOu/oZ2oCSAPayIFfnglIHP0qbOw1Ez6biKDo=
+jygit.jydev.jianyu360.cn/data_processing/common_utils v0.0.0-20240412074219-927f3f682cb3/go.mod h1:1Rp0ioZBhikjXHYYXmnzL6RNfvTDM/2XvRB+vuPLurI=
 rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=
 rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0=
 rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA=

+ 8 - 0
createEsIndex/init.go

@@ -224,6 +224,14 @@ func InitEs() {
 		log.Info("InitEs", zap.String("华为云Addr3", config.Conf.DB.Es.Addr3))
 	}
 
+	if config.Conf.DB.Es.IndexPD == "" {
+		log.Warn("InitEs", zap.String("项目详情索引 ", "缺少项目详情索引配置,请检查是否需要配置项目详情"))
+	}
+
+	// 详情字段,默认设置50000汉字长度限制
+	if config.Conf.DB.Es.DetailCount == 0 {
+		config.Conf.DB.Es.DetailCount = 50000
+	}
 	log.Info("InitEs", zap.Any("duration", time.Since(now).Seconds()))
 }
 

+ 47 - 9
createEsIndex/main.go

@@ -39,15 +39,17 @@ var (
 	JyUdpAddr  *net.UDPAddr
 	NeUdpAddr  *net.UDPAddr
 
-	EsBulkSize        = 50                                      // es批量保存大小
-	updateBiddingPool = make(chan map[string]interface{}, 5000) //更新bingding数据
-	updateBiddingSp   = make(chan bool, 5)
-	saveEsPool        = make(chan map[string]interface{}, 5000) //保存binding数据到es
-	saveEsSp          = make(chan bool, 5)
-	saveProjectEsPool = make(chan map[string]interface{}, 5000) //保存project数据到es
-	saveProjectSp     = make(chan bool, 5)
-	saveEsAllPool     = make(chan map[string]interface{}, 5000) //存储单机版es,爬虫采集判重使用
-	saveEsAllSp       = make(chan bool, 5)
+	EsBulkSize              = 50                                      // es批量保存大小
+	updateBiddingPool       = make(chan map[string]interface{}, 5000) //更新bingding数据
+	updateBiddingSp         = make(chan bool, 5)
+	saveEsPool              = make(chan map[string]interface{}, 5000) //保存binding数据到es
+	saveEsSp                = make(chan bool, 5)
+	saveProjectEsPool       = make(chan map[string]interface{}, 5000) //保存project数据到es
+	saveProjectSp           = make(chan bool, 5)
+	saveProjectDetailEsPool = make(chan map[string]interface{}, 5000) //保存project detail 数据到es
+	saveProjectDetailSp     = make(chan bool, 5)
+	saveEsAllPool           = make(chan map[string]interface{}, 5000) //存储单机版es,爬虫采集判重使用
+	saveEsAllSp             = make(chan bool, 5)
 
 	saveBiddingAllPool  = make(chan map[string]interface{}, 5000) //保存binding数据到es,stype=bidding_all_data
 	saveBiddingAllBEsSp = make(chan bool, 5)
@@ -109,6 +111,7 @@ func main() {
 	//go SaveBiddingEsMethod()  //保存es bidding数据
 	go SaveAllEsMethod()      // 保存爬虫采集临时数据
 	go SaveProjectEs()        //保存项目索引数据
+	go SaveProjectDetailEs()  //保存项目索引,添加了详情字段的新索引
 	go SaveBiddingAllDataEs() //保存stype=bidding_all_data 数据
 
 	go SaveBidErr()
@@ -212,7 +215,24 @@ func processUdpMsg(act byte, data []byte, ra *net.UDPAddr) {
 						<-pool
 					}()
 					projectTask(data, mapInfo)
+					// 配置项目详情索引
+					if config.Conf.DB.Es.IndexPD != "" {
+						projectDetailTask(data, mapInfo)
+					}
 				}()
+			case "project_detail": //添加了详情字段的项目索引
+				if config.Conf.DB.Es.IndexPD != "" {
+					ProjectLastNodeResponse = time.Now().Unix()
+					pool <- true
+					go func() {
+						defer func() {
+							<-pool
+						}()
+						projectDetailTask(data, mapInfo)
+					}()
+				} else {
+					log.Info("升级项目索引", zap.String("项目详情索引 ", "缺少项目详情索引配置,请检查配置文件"))
+				}
 			case "project_all_data": //存量 projectset 数据
 				pool <- true
 				go func() {
@@ -620,6 +640,24 @@ func SaveProjectEs() {
 	}
 }
 
+// SaveProjectDetailEs consumes saveProjectDetailEsPool forever and writes each
+// project document to the project-detail index (config.Conf.DB.Es.IndexPD).
+// When a third cluster address (Addr3) is configured, the same document is
+// also written through Es3.
+func SaveProjectDetailEs() {
+	for {
+		doc := <-saveProjectDetailEsPool
+		// Remember both identifiers before the first save so they can be put
+		// back before the second one (presumably Es.Save strips them from the
+		// map; confirm against the Es client).
+		savedID, savedRawID := doc["id"], doc["_id"]
+		Es.Save(config.Conf.DB.Es.IndexPD, doc)
+		if config.Conf.DB.Es.Addr3 == "" {
+			continue
+		}
+		// Huawei Cloud cluster: stores bidding, project and Ping An data.
+		doc["id"] = savedID
+		doc["_id"] = savedRawID
+		Es3.Save(config.Conf.DB.Es.IndexPD, doc)
+	}
+}
+
 func checkMapJob() {
 	if config.Conf.Mail.Send {
 		log.Info("checkMapJob", zap.String("to:", config.Conf.Mail.To))

+ 284 - 0
createEsIndex/project_es.go

@@ -10,6 +10,8 @@ import (
 	"reflect"
 	"regexp"
 	"strconv"
+	"strings"
+	"sync"
 )
 
 var (
@@ -243,3 +245,285 @@ func projectTask(data []byte, mapInfo map[string]interface{}) {
 	}
 	log.Info("create project index...over", zap.Any("mapInfo", mapInfo), zap.Int("count", n))
 }
+
+// projectDetailTask builds project documents that carry an aggregated "detail"
+// text field and queues them on saveProjectDetailEsPool for indexing.
+//
+// data is not read by this function. mapInfo selects the projects to process:
+// either a ready-made "query" map (string _id bounds are converted to ObjectIds
+// unless the query has a "pici" key), or "gtid"/"lteid" strings that bound the
+// _id range.
+//
+// For every project the function copies the fields listed in ProjectField,
+// derives project_rate / prate_flag / bidcycle, appends the cleaned detail text
+// of list entries that have not been processed yet (tracked in the project's
+// "detail_ids" field), and writes the updated "detail_ids" back to MongoDB.
+// At most 10 projects are processed concurrently.
+func projectDetailTask(data []byte, mapInfo map[string]interface{}) {
+	defer util.Catch()
+	q, _ := mapInfo["query"].(map[string]interface{})
+	if q == nil {
+		q = map[string]interface{}{
+			"_id": map[string]interface{}{
+				"$gt":  mongodb.StringTOBsonId(mapInfo["gtid"].(string)),
+				"$lte": mongodb.StringTOBsonId(mapInfo["lteid"].(string)),
+			},
+		}
+	} else {
+		if q["pici"] == nil {
+			idMap, _ := q["_id"].(map[string]interface{})
+			if idMap != nil {
+				tmpQ := map[string]interface{}{}
+				for c, id := range idMap {
+					if idStr, ok := id.(string); ok && idStr != "" {
+						tmpQ[c] = mongodb.StringTOBsonId(idStr)
+					}
+				}
+				q["_id"] = tmpQ
+			}
+		}
+	}
+
+	conn := MgoP.GetMgoConn()
+	defer MgoP.DestoryMongoConn(conn)
+	count, _ := conn.DB(MgoP.DbName).C(config.Conf.DB.MongoP.Coll).Find(&q).Count()
+	log.Info("projectDetailTask", zap.String("coll", config.Conf.DB.MongoP.Coll), zap.Any("查询语句:", q), zap.Int64("同步总数:", count))
+	query := conn.DB(MgoP.DbName).C(config.Conf.DB.MongoP.Coll).Find(q).Iter()
+	n := 0
+	// ch bounds the number of projects processed at the same time.
+	ch := make(chan bool, 10)
+	wg := &sync.WaitGroup{}
+	for tmp := make(map[string]interface{}); query.Next(tmp); n++ {
+		if n%2000 == 0 {
+			log.Info("current", zap.Int("count", n))
+			log.Info("current", zap.Any("_id", tmp["_id"]))
+		}
+		ch <- true
+		wg.Add(1)
+		go func(tmp map[string]interface{}) {
+			defer func() {
+				<-ch
+				wg.Done()
+			}()
+			// The caller's deferred util.Catch does not cover this goroutine, so
+			// install the same guard here; an unrecovered panic in a worker would
+			// terminate the whole process.
+			defer util.Catch()
+			newTmp := make(map[string]interface{})
+			newTmp["s_projectname"] = tmp["projectname"]
+			for f, ftype := range ProjectField {
+				if tmp[f] != nil {
+					if f == "package" {
+						// Collapse the packages into one {winner, bidamount} entry per winner.
+						pp := map[string]map[string]interface{}{}
+						if packages, ok := tmp["package"].(map[string]interface{}); ok {
+							for _, pks := range packages {
+								if pk, ok := pks.([]interface{}); ok {
+									for _, v := range pk {
+										if p, ok := v.(map[string]interface{}); ok {
+											winner := util.ObjToString(p["winner"])
+											bidamount := util.Float64All((p["bidamount"]))
+											if len(winner) > 4 && bidamount > 0 {
+												p := map[string]interface{}{
+													"winner":    winner,
+													"bidamount": bidamount,
+												}
+												pp[winner] = p
+											}
+										}
+									}
+								}
+							}
+						} else {
+							winner := util.ObjToString(tmp["winner"])
+							bidamount := util.Float64All(tmp["bidamount"])
+							if len(winner) > 4 && bidamount > 0 {
+								p := map[string]interface{}{
+									"winner":    winner,
+									"bidamount": bidamount,
+								}
+								pp[winner] = p
+							}
+						}
+						pk1 := []map[string]interface{}{}
+						for _, v := range pp {
+							pk1 = append(pk1, v)
+						}
+						if len(pk1) > 0 {
+							newTmp["package1"] = pk1
+						}
+					} else if f == "topscopeclass" {
+						if topscopeclass, ok := tmp["topscopeclass"].([]interface{}); ok {
+							tc := []string{}
+							m2 := map[string]bool{}
+							for _, v := range topscopeclass {
+								str := util.ObjToString(v)
+								str = regLetter.ReplaceAllString(str, "") // strip letters
+								if !m2[str] {
+									m2[str] = true
+									tc = append(tc, str)
+								}
+							}
+							newTmp["topscopeclass"] = tc
+						}
+					} else if f == "list" {
+						if list, ok := tmp[f].([]interface{}); ok {
+							var newList []map[string]interface{}
+							for _, item := range list {
+								// Skip malformed entries instead of panicking on the assertion.
+								item1, ok := item.(map[string]interface{})
+								if !ok {
+									continue
+								}
+								listm := make(map[string]interface{})
+								for f1, ftype1 := range ProjectListF {
+									if item1[f1] != nil {
+										// The test must look at the list field f1; the outer f is
+										// always "list" in this branch.
+										if f1 == "topscopeclass" || f1 == "subscopeclass" {
+											listm[f1] = item1[f1]
+										} else {
+											if fieldval := item1[f1]; reflect.TypeOf(fieldval).String() != ftype1 {
+												continue
+											} else {
+												if fieldval != "" {
+													listm[f1] = fieldval
+												}
+											}
+										}
+									}
+								}
+								newList = append(newList, listm)
+							}
+							newTmp[f] = newList
+						}
+					} else if f == "budget" || f == "bidamount" || f == "sortprice" {
+						if tmp[f] != nil && util.Float64All(tmp[f]) <= 1000000000 {
+							newTmp[f] = tmp[f]
+						}
+					} else if f == "projectscope" {
+						projectscopeRune := []rune(util.ObjToString(tmp[f]))
+						if len(projectscopeRune) > 1000 {
+							// Truncate by runes so a multi-byte character is never cut in half.
+							newTmp[f] = string(projectscopeRune[:1000])
+						} else {
+							newTmp[f] = tmp[f]
+						}
+					} else if f == "ids" || f == "mpc" || f == "mpn" || f == "review_experts" || f == "winnerorder" ||
+						f == "entidlist" || f == "first_cooperation" || f == "subscopeclass" || f == "jgtime" {
+						newTmp[f] = tmp[f]
+					} else if f == "_id" {
+						newTmp["_id"] = mongodb.BsonIdToSId(tmp["_id"])
+						newTmp["id"] = mongodb.BsonIdToSId(tmp["_id"])
+					} else {
+						if fieldval := tmp[f]; reflect.TypeOf(fieldval).String() != ftype && ftype != "" {
+							continue
+						} else {
+							if fieldval != "" {
+								newTmp[f] = fieldval
+							}
+						}
+					}
+				}
+			}
+
+			budget := util.Float64All(newTmp["budget"])
+			bidamount := util.Float64All(newTmp["bidamount"])
+			if float64(budget) > 0 && float64(bidamount) > 0 {
+				rate := float64(1) - float64(bidamount)/float64(budget)
+				f, _ := strconv.ParseFloat(strconv.FormatFloat(rate, 'f', 4, 64), 64)
+				// Outside 0~0.6 no rate is produced: keep the budget, drop the bid
+				// amount and flag the abnormal discount rate in the index.
+				if f < 0 || f > 0.6 {
+					delete(newTmp, "bidamount")
+					newTmp["prate_flag"] = 1
+				} else {
+					newTmp["project_rate"] = f
+				}
+			}
+
+			bidopentime := util.Int64All(tmp["bidopentime"]) // bid opening time
+			fzb_publishtime := int64(0)                      // publishtime of the first tender ("招标") entry
+			bidcycle_flag := false                           // whether bidcycle has already been computed
+
+			// Helper field of the detail index: ids of list entries whose detail
+			// has already been merged, joined with ASCII commas ("1,2,3").
+			detail_ids := util.ObjToString(tmp["detail_ids"])
+			detailIds := make([]string, 0)
+			if detail_ids != "" {
+				detailIds = strings.Split(detail_ids, ",")
+			}
+			detail := make([]string, 0) // sentences that make up the final detail field
+			// Start from the detail text already stored in the index for this project.
+			projectID := mongodb.BsonIdToSId(tmp["_id"])
+			_, oldProject := Es.GetById(config.Conf.DB.Es.IndexPD, projectID)
+			if oldProject != nil {
+				oldDetail := oldProject["detail"]
+				if oddetail, ok := oldDetail.(string); ok {
+					if oddetail != "" {
+						old_details := strings.Split(oddetail, " ")
+						detail = append(detail, old_details...)
+					}
+				}
+			}
+			// Number of Han characters collected so far; compared with DetailCount below.
+			totalNumCount := CountChineseCharacters(detail)
+
+			// A missing or malformed list yields a nil slice, so the loop is skipped.
+			list, _ := tmp["list"].([]interface{})
+			for _, m := range list {
+				tmpM, ok := m.(map[string]interface{})
+				if !ok {
+					continue
+				}
+				if bidamount, ok := tmpM["bidamount"].(string); ok && len(bidamount) > 0 { // string bidamount is converted to float
+					tmpB := util.Float64All(tmpM["bidamount"])
+					tmpM["bidamount"] = tmpB
+				}
+				// bidcycle: number of days between publication and bid opening.
+				if !bidcycle_flag && bidopentime > 0 { // without a project-level bidopentime there is nothing to compute
+					if toptype := util.ObjToString(tmpM["toptype"]); toptype == "招标" {
+						zb_bidopentime := util.Int64All(tmpM["bidopentime"])
+						zb_publishtime := util.Int64All(tmpM["publishtime"])
+						if zb_publishtime > 0 {
+							if zb_bidopentime > 0 {
+								if tmpTime := zb_bidopentime - zb_publishtime; tmpTime > 0 {
+									f_day := float64(tmpTime) / float64(86400)
+									day := math.Ceil(f_day)
+									// Store on the outgoing document; tmp is discarded after this goroutine.
+									newTmp["bidcycle"] = int(day)
+									bidcycle_flag = true
+								}
+							}
+							if fzb_publishtime == 0 { // only the first tender entry is remembered
+								fzb_publishtime = zb_publishtime
+							}
+						}
+					}
+				}
+				// Merge the detail of list entries that were not processed before,
+				// as long as the Han-character budget is not exhausted.
+				infoid := util.ObjToString(tmpM["infoid"])
+				if infoid != "" && !IsInStringArray(infoid, detailIds) && (totalNumCount < config.Conf.DB.Es.DetailCount) {
+					// Ids that sort after this boundary are read from "bidding",
+					// all others from "bidding_back".
+					coll := "bidding_back"
+					if infoid > "5a862e7040d2d9bbe88e3b1f" {
+						coll = "bidding"
+					}
+					biddingData, _ := MgoB.FindById(coll, infoid, nil)
+					if biddingData == nil {
+						// Do not dereference a missing document; leave the id unmarked
+						// so it is looked up again on the next run.
+						log.Info("projectDetailTask", zap.String("coll", coll), zap.String("bidding not found, infoid", infoid))
+						continue
+					}
+					detailIds = append(detailIds, infoid)
+					biddingDetail := util.ObjToString((*biddingData)["detail"])
+					da, _ := CleanHTMLTags(biddingDetail)
+					sentences := RemoveDuplicates(SplitTextByChinesePunctuation(da))
+					detail = append(detail, sentences...)
+					// Keep the running total current so DetailCount also limits newly added text.
+					totalNumCount += CountChineseCharacters(sentences)
+				}
+			}
+			if len(detail) > 0 {
+				detailNew := RemoveDuplicates(detail)
+				newTmp["detail"] = strings.Join(detailNew, " ")
+			}
+
+			// bidcycle fallback: when no list entry produced a value, use the first
+			// tender publishtime together with the project-level bidopentime.
+			if !bidcycle_flag && bidopentime > 0 && fzb_publishtime > 0 {
+				if tmpTime := bidopentime - fzb_publishtime; tmpTime > 0 {
+					f_day := float64(tmpTime) / float64(86400)
+					day := math.Ceil(f_day)
+					newTmp["bidcycle"] = int(day)
+				}
+			}
+
+			// Project name subtitle: taken from the stored value as is, without
+			// any separate processing here.
+			subtitleProjectname := util.ObjToString(tmp["subtitle_projectname"])
+			if subtitleProjectname != "" {
+				newTmp["subtitle_projectname"] = subtitleProjectname
+			}
+
+			// Persist the ids of the bidding records that have been merged so far.
+			if len(detailIds) > 0 {
+				new_bidding_ids := strings.Join(detailIds, ",")
+				update := map[string]interface{}{
+					"detail_ids": new_bidding_ids,
+				}
+				MgoP.UpdateById(config.Conf.DB.MongoP.Coll, projectID, map[string]interface{}{"$set": update})
+			}
+
+			saveProjectDetailEsPool <- newTmp
+		}(tmp)
+		tmp = map[string]interface{}{}
+	}
+	wg.Wait()
+	log.Info("create projectDetailTask index...over", zap.Any("mapInfo", mapInfo), zap.Int("count", n))
+}

+ 140 - 0
createEsIndex/utils.go

@@ -1,11 +1,13 @@
 package main
 
 import (
+	"bytes"
 	"encoding/json"
 	"fmt"
 	"github.com/cespare/xxhash/v2"
 	"go.mongodb.org/mongo-driver/bson"
 	"go.uber.org/zap"
+	"golang.org/x/net/html"
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
@@ -878,3 +880,141 @@ func getRegsResult(data string, regs []*RegexpInfo) (res bool, a string) {
 	}
 	return false, ""
 }
+
+// CleanHTMLTags parses htmlContent and returns its text with the markup removed.
+//
+// Every text node is emitted with its ASCII spaces stripped, <br> becomes a
+// newline, and each <li> that is a direct child of a <ul> is written as
+// "- <content>\n". All other elements, tables included, contribute only the
+// text of their descendants. The result is trimmed of surrounding whitespace.
+// The error is the one returned by html.Parse.
+//
+// Each node is visited exactly once. Earlier code recursed into list items and
+// then walked the same children again, which emitted list text twice; its
+// table branch looked for <tr> directly under <table> and never matched,
+// because the parser always places rows inside a <tbody>.
+func CleanHTMLTags(htmlContent string) (string, error) {
+	doc, err := html.Parse(strings.NewReader(htmlContent))
+	if err != nil {
+		return "", err
+	}
+
+	var buf bytes.Buffer
+
+	var walk func(n *html.Node)
+	walkChildren := func(n *html.Node) {
+		for child := n.FirstChild; child != nil; child = child.NextSibling {
+			walk(child)
+		}
+	}
+	walk = func(n *html.Node) {
+		switch n.Type {
+		case html.TextNode:
+			// Drop every ASCII space inside the text node.
+			buf.WriteString(strings.ReplaceAll(n.Data, " ", ""))
+			return
+		case html.ElementNode:
+			switch {
+			case n.Data == "br":
+				buf.WriteString("\n")
+			case n.Data == "li" && n.Parent != nil && n.Parent.Type == html.ElementNode && n.Parent.Data == "ul":
+				// Unordered list item: bullet, content, line break.
+				buf.WriteString("- ")
+				walkChildren(n)
+				buf.WriteString("\n")
+				return
+			}
+		}
+		walkChildren(n)
+	}
+
+	walk(doc)
+
+	return strings.TrimSpace(buf.String()), nil
+}
+
+// chinesePunctuationSplitRe matches a single full-width punctuation mark or a
+// run of whitespace. It is compiled once here instead of on every call.
+var chinesePunctuationSplitRe = regexp.MustCompile(`[,。!?、;:]|\s+`)
+
+// SplitTextByChinesePunctuation splits text into phrases at full-width Chinese
+// punctuation (,。!?、;:) and at runs of whitespace.
+//
+// Non-breaking spaces (U+00A0) are first converted to ordinary spaces, because
+// \s in Go regular expressions only matches ASCII whitespace. Each phrase is
+// trimmed and empty phrases are dropped; the result is nil when nothing is left.
+func SplitTextByChinesePunctuation(text string) []string {
+	text = strings.ReplaceAll(text, "\u00A0", " ")
+
+	var result []string
+	for _, part := range chinesePunctuationSplitRe.Split(text, -1) {
+		if trimmed := strings.TrimSpace(part); trimmed != "" {
+			result = append(result, trimmed)
+		}
+	}
+	return result
+}
+
+// RemoveDuplicates removes repeated phrases from strs and also every phrase
+// that is a substring of another kept phrase, regardless of which of the two
+// comes first. The relative order of the surviving phrases is preserved.
+// It returns nil for empty input.
+func RemoveDuplicates(strs []string) []string {
+	var result []string
+	for _, str := range strs {
+		// Skip str when a phrase that is already kept contains it (this also
+		// covers exact duplicates).
+		covered := false
+		for _, kept := range result {
+			if strings.Contains(kept, str) {
+				covered = true
+				break
+			}
+		}
+		if covered {
+			continue
+		}
+		// str is new: drop the kept phrases that str itself contains, then
+		// keep str. The filter reuses result's backing array in place.
+		filtered := result[:0]
+		for _, kept := range result {
+			if !strings.Contains(str, kept) {
+				filtered = append(filtered, kept)
+			}
+		}
+		result = append(filtered, str)
+	}
+	return result
+}
+
+// CountChineseCharacters 函数统计字符串数组中汉字的总数
+func CountChineseCharacters(strs []string) int {
+	var totalCount int
+	for _, str := range strs {
+		for _, r := range str {
+			// 判断字符是否为汉字且不是标点符号
+			if unicode.Is(unicode.Han, r) && !unicode.IsPunct(r) {
+				totalCount++
+			}
+		}
+	}
+	return totalCount
+}

部分文件因为文件数量过多而无法显示