wcc 1 zi în urmă
părinte
comite
6d8fced7ea

+ 6 - 168
project_chuan/project_new.go

@@ -9,6 +9,7 @@ import (
 	util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	"log"
 	"sort"
+	"strconv"
 
 	//"sort"
 	"strings"
@@ -30,7 +31,7 @@ func SearchProjectFullScoring(client *elastic.Client, target InputData, projectN
 		city = util.ObjToString((*res)["city"])
 	}
 
-	// 1. 精准查询(权重 1.0)
+	// 1. 精准查询
 	preciseHits, err := searchPrecise(client, projectName, province, city, publish, 20)
 	if err != nil {
 		return nil, err
@@ -41,7 +42,7 @@ func SearchProjectFullScoring(client *elastic.Client, target InputData, projectN
 		}
 	}
 
-	// 2. 分词查询(权重 0.8)
+	// 2. 分词查询
 	tokenHits, err := searchByToken(client, projectName, province, city, publish, 20)
 	if err != nil {
 		return nil, err
@@ -52,7 +53,7 @@ func SearchProjectFullScoring(client *elastic.Client, target InputData, projectN
 		}
 	}
 
-	// 3. common 查询(权重 0.5)
+	// 3. common 查询
 	commonHits, err := searchCommon(client, projectName, province, city, publish, 10)
 	if err != nil {
 		return nil, err
@@ -104,28 +105,16 @@ func SearchProjectFullScoring(client *elastic.Client, target InputData, projectN
 		}
 
 		score := calculateConfidenceScore(target, candidate)
-		candidate.Score = score
+		//candidate.Score = score
+		candidate.Score, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", score), 64)
 		allCandidates = append(allCandidates, candidate)
 	}
 
-	// 打印打分调试
-	//for i, c := range allCandidates {
-	//	fmt.Printf("Candidat 排序前: %d Score: %.4f\n", i, c.Score)
-	//}
-
 	// 排序(降序)
 	sort.SliceStable(allCandidates, func(i, j int) bool {
 		return allCandidates[i].Score > allCandidates[j].Score
 	})
 
-	//for i, c := range allCandidates {
-	//	fmt.Printf("Candidate 排序后: %d Score: %.4f\n", i, c.Score)
-	//}
-	//// 5. 排序
-	//sort.Slice(allCandidates, func(i, j int) bool {
-	//	return allCandidates[i].Score > allCandidates[j].Score
-	//})
-
 	for _, doc := range allCandidates {
 		item := map[string]interface{}{
 			"id":          doc.Id,
@@ -154,51 +143,6 @@ func SearchProjectFullScoring(client *elastic.Client, target InputData, projectN
 	return results, nil
 }
 
-// searchPrecise 精准查询
-func searchPrecise22(client *elastic.Client, projectName, province, city, publish string, maxResults int) ([]*elastic.SearchHit, error) {
-	fieldsToTry := []string{"projectname.pname", "title", "detail"}
-	filtersToTry := [][]elastic.Query{
-		{elastic.NewTermsQuery("subtype", "中标", "成交", "合同", "单一")},
-		{elastic.NewTermsQuery("toptype", "招标", "预告", "采购意向")},
-		{elastic.NewTermsQuery("toptype", "拟建")},
-	}
-	var allResults []*elastic.SearchHit
-	seenIDs := make(map[string]bool)
-
-	query := elastic.NewBoolQuery()
-
-	for _, field := range fieldsToTry {
-		if field == "detail" && len(allResults) > maxResults {
-			break
-		}
-		for _, filter := range filtersToTry {
-			//query := elastic.NewBoolQuery().
-			query.Must(elastic.NewMultiMatchQuery(projectName, field).Type("phrase")).
-				Filter(filter...)
-
-			fetchFields := elastic.NewFetchSourceContext(true).Include("id", "title", "projectname", "projectcode", "bidamount", "area", "city", "toptype", "subtype", "buyer", "budget", "buyerperson", "buyertel", "s_winner", "winnertel", "agency", "publishtime")
-
-			searchResult, err := client.Search().
-				Index("bidding").
-				Query(query).
-				Size(maxResults).
-				FetchSourceContext(fetchFields).
-				Do(context.Background())
-			if err != nil {
-				return nil, err
-			}
-
-			for _, hit := range searchResult.Hits.Hits {
-				if !seenIDs[hit.Id] {
-					seenIDs[hit.Id] = true
-					allResults = append(allResults, hit)
-				}
-			}
-		}
-	}
-	return allResults, nil
-}
-
 func searchPrecise(client *elastic.Client, projectName, province, city, publish string, maxResults int) ([]*elastic.SearchHit, error) {
 	fieldsToTry := []string{"projectname.pname", "title", "detail"}
 	filtersToTry := [][]elastic.Query{
@@ -295,11 +239,6 @@ func searchPrecise(client *elastic.Client, projectName, province, city, publish
 					return nil, err
 				}
 
-				// 打印 query JSON(调试用)
-				//if sourceQ, err := query.Source(); err == nil {
-				//	log.Println(printInterfaceAsJSON(sourceQ))
-				//}
-
 				for _, hit := range searchResult.Hits.Hits {
 					if !seenIDs[hit.Id] {
 						seenIDs[hit.Id] = true
@@ -318,66 +257,6 @@ func searchPrecise(client *elastic.Client, projectName, province, city, publish
 	return allResults, nil
 }
 
-// searchByToken 分词查询
-func searchByToken22(client *elastic.Client, projectName, province, city, publish string, maxResults int) ([]*elastic.SearchHit, error) {
-	fieldsToTry := []string{"projectname.pname", "title", "detail"}
-	filtersToTry := [][]elastic.Query{
-		{elastic.NewTermsQuery("subtype", "中标", "成交", "合同", "单一")},
-		{elastic.NewTermsQuery("toptype", "招标", "预告", "采购意向")},
-		{elastic.NewTermsQuery("toptype", "拟建")},
-	}
-	analyzeResp, err := client.IndexAnalyze().
-		Index("bidding").
-		Analyzer("ik_smart").
-		Text(projectName).
-		Do(context.Background())
-	if err != nil {
-		return nil, err
-	}
-
-	var tokens []string
-	for _, token := range analyzeResp.Tokens {
-		tokens = append(tokens, token.Token)
-	}
-	if len(tokens) == 0 {
-		return nil, fmt.Errorf("no tokens found from ik_smart")
-	}
-	queryText := strings.Join(tokens, " ")
-
-	var allHits []*elastic.SearchHit
-	seen := make(map[string]bool)
-
-	for _, filter := range filtersToTry {
-		query := elastic.NewBoolQuery().
-			Must(elastic.NewMultiMatchQuery(queryText, fieldsToTry...).MinimumShouldMatch("100%")).
-			Filter(filter...)
-
-		searchResult, err := client.Search().
-			Index("bidding").
-			Query(query).
-			Size(maxResults).
-			Do(context.Background())
-		if err != nil {
-			continue
-		}
-
-		for _, hit := range searchResult.Hits.Hits {
-			if !seen[hit.Id] {
-				seen[hit.Id] = true
-				allHits = append(allHits, hit)
-				if len(allHits) >= maxResults {
-					break
-				}
-			}
-		}
-
-		if len(allHits) >= maxResults {
-			break
-		}
-	}
-	return allHits, nil
-}
-
 func searchByToken(client *elastic.Client, projectName, province, city, publish string, maxResults int) ([]*elastic.SearchHit, error) {
 	fieldsToTry := []string{"projectname.pname", "title", "detail"}
 	filtersToTry := [][]elastic.Query{
@@ -510,47 +389,6 @@ func searchByToken(client *elastic.Client, projectName, province, city, publish
 	return allHits, nil
 }
 
-// searchCommon common 查询
-func searchCommon22(client *elastic.Client, projectName, province, city, publish string, maxResults int) ([]*elastic.SearchHit, error) {
-	queryMap := map[string]interface{}{
-		"bool": map[string]interface{}{
-			"should": []interface{}{
-				map[string]interface{}{"common": map[string]interface{}{"projectname.pname": map[string]interface{}{"query": projectName, "cutoff_frequency": 0.01, "low_freq_operator": "and", "boost": 0.2}}},
-				map[string]interface{}{"common": map[string]interface{}{"title": map[string]interface{}{"query": projectName, "cutoff_frequency": 0.01, "low_freq_operator": "and", "boost": 0.2}}},
-				map[string]interface{}{"common": map[string]interface{}{"detail": map[string]interface{}{"query": projectName, "cutoff_frequency": 0.01, "low_freq_operator": "and", "boost": 0.1}}},
-			},
-			"minimum_should_match": 1,
-		},
-	}
-	queryBytes, _ := json.Marshal(queryMap)
-	queryBase64 := base64.StdEncoding.EncodeToString(queryBytes)
-	query := elastic.NewWrapperQuery(queryBase64)
-
-	fetchFields := elastic.NewFetchSourceContext(true).Include("id", "title", "projectname", "projectcode", "bidamount", "area", "city", "toptype", "subtype", "buyer", "budget", "buyerperson", "buyertel", "s_winner", "winnertel", "agency", "publishtime")
-
-	searchResult, err := client.Search().
-		Index("bidding").
-		Query(query).
-		Size(maxResults).
-		FetchSourceContext(fetchFields).
-		Do(context.Background())
-	if err != nil {
-		return nil, err
-	}
-
-	var allHits []*elastic.SearchHit
-	seen := make(map[string]bool)
-
-	for _, hit := range searchResult.Hits.Hits {
-		if !seen[hit.Id] {
-			seen[hit.Id] = true
-			allHits = append(allHits, hit)
-		}
-	}
-
-	return allHits, nil
-}
-
 func searchCommon(client *elastic.Client, projectName, province, city, publish string, maxResults int) ([]*elastic.SearchHit, error) {
 	fields := []string{"projectname.pname", "title", "detail"}
 	var t time.Time

+ 105 - 16
project_chuan/project_test.go

@@ -5,6 +5,7 @@ import (
 	"github.com/olivere/elastic/v7"
 	"github.com/xuri/excelize/v2"
 	"go.uber.org/zap"
+	util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
 	"strconv"
@@ -125,9 +126,6 @@ func TestSearchProjectFullScoring(t *testing.T) {
 		return
 	}
 
-	//读取 Excel文件 条件参数
-	//var targets []InputData
-	// Start from row 1 to skip the header
 	for i, row := range rows {
 		if i == 0 {
 			continue
@@ -154,22 +152,113 @@ func TestSearchProjectFullScoring(t *testing.T) {
 		areacode := rows[i][2]
 		publish := rows[i][5]
 
+		fmt.Println(i, projectName)
 		res, err := SearchProjectFullScoring(client, target, projectName, areacode, publish)
-		fmt.Println(res, err)
+		if err != nil {
+			fmt.Println(err)
+		}
+
+		dds := make([]string, 0)
+		for k, v := range res {
+			if k > 2 {
+				break
+			}
+
+			id := util.ObjToString(v["id"])
+			jyurl := GetJyURLByID(id)
+			socre := v["score"]
+			pname := v["projectname"]
+			dd := fmt.Sprintf("%v,%v,%v", pname, socre, jyurl)
+			dds = append(dds, dd)
+		}
 
+		ddda := strings.Join(dds, "\n")
+		f.SetCellValue("Sheet1", fmt.Sprintf("%s%d", "K", i+1), ddda)
 	}
 
-	//for i := 2; i < len(rows); i++ {
-	//	projectName := rows[i][1]
-	//	areacode := rows[i][2]
-	//	publish := rows[i][5]
-	//
-	//	res, err := SearchProjectFullScoring(client, projectName, areacode, publish)
-	//	fmt.Println(res, err)
-	//	results, err := searchES24(client, projectName, "", 20, 50)
-	//
-	//	fmt.Println(results, err)
-	//
-	//}
+}
+
+func TestSearchProjectFullScoring2(t *testing.T) {
+	// Manual integration test: runs SearchProjectFullScoring for each data row of the workbook and writes the top candidates to column J. NOTE(review): endpoints and credentials below are hard-coded.
+	url := "http://127.0.0.1:19908"
+	username := "jybid"
+	password := "Top2023_JEB01i@31"
+
+	client, err := elastic.NewClient(
+		elastic.SetURL(url),
+		elastic.SetBasicAuth(username, password),
+		elastic.SetSniff(false),
+	)
+	if err != nil {
+		log.Fatal("创建 Elasticsearch 客户端失败", zap.Error(err))
+	}
+	MgoQY = &mongodb.MongodbSim{
+		//MongodbAddr: "172.31.31.202:27081,172.20.45.128:27080",
+		MongodbAddr: "127.0.0.1:27083",
+		Size:        10,
+		DbName:      "mixdata",
+		UserName:    "SJZY_RWbid_ES",
+		Password:    "SJZY@B4i4D5e6S",
+		Direct:      true,
+	}
+	MgoQY.InitPool()
+	// Open the workbook; the deferred func saves it (the error returned by Save is discarded) and then closes it when the test returns.
+	f, err := excelize.OpenFile("./剑鱼匹配疑似问题7月.xlsx")
+	if err != nil {
+		fmt.Println(err)
+		return
+	}
+	defer func() {
+		f.Save()
+		if err := f.Close(); err != nil {
+			fmt.Println(err)
+		}
+	}()
+
+	rows, err := f.GetRows("Sheet1")
+	if err != nil {
+		fmt.Println(err)
+		return
+	}
+	// Row index 0 is skipped as the header; rows whose first cell is empty are skipped further down.
+	for i, row := range rows {
+		if i == 0 {
+			continue
+		}
+		// NOTE(review): row[0] panics on a zero-length row; presumably GetRows can return such rows for blank lines - TODO confirm and guard.
+		target := InputData{
+			ProjectName: row[0],
+		}
+
+		projectName := rows[i][0]
+		if projectName == "" {
+			continue
+		}
+		areacode := ""
+		publish := ""
+		// areacode and publish are left empty, so the call below receives only the project name.
+		fmt.Println(i, projectName)
+		res, err := SearchProjectFullScoring(client, target, projectName, areacode, publish)
+		if err != nil {
+			fmt.Println(err)
+		}
+		// Keep at most the first three results (k > 2 stops the loop), each formatted as "projectname,score,url".
+		dds := make([]string, 0)
+		for k, v := range res {
+			if k > 2 {
+				break
+			}
+
+			id := util.ObjToString(v["id"])
+			jyurl := GetJyURLByID(id)
+			socre := v["score"]
+			pname := v["projectname"]
+			dd := fmt.Sprintf("%v,%v,%v", pname, socre, jyurl)
+			dds = append(dds, dd)
+		}
+		// Store the newline-joined entries in column J of spreadsheet row i+1; the error returned by SetCellValue is ignored.
+		ddda := strings.Join(dds, "\n")
+		f.SetCellValue("Sheet1", fmt.Sprintf("%s%d", "J", i+1), ddda)
+	}
 
 }

+ 24 - 30
project_chuan/score.go

@@ -1,7 +1,6 @@
 package main
 
 import (
-	"fmt"
 	"math"
 	"strings"
 	"time"
@@ -10,11 +9,6 @@ import (
 	"github.com/adrg/strutil/metrics"
 )
 
-type ScoredResult struct {
-	Document   EsDocument
-	Confidence float64
-}
-
 type InputData struct {
 	ProjectCode       string
 	Area              string
@@ -50,7 +44,7 @@ type EsDocument struct {
 }
 
 // calculateConfidenceScore calculates a score from 0-100 based on multiple factors.
-func calculateConfidenceScore(target InputData, candidate EsDocument) float64 {
+func calculateConfidenceScore22(target InputData, candidate EsDocument) float64 {
 	var totalScore float64 = 0.0
 	nameSimilarity := EnhancedSimilarity(strings.ReplaceAll(target.ProjectName, target.ConstructionUnit, ""), strings.ReplaceAll(strings.ReplaceAll(candidate.Projectname, candidate.Buyer, ""), target.ConstructionUnit, ""))
 	// 1. Project Name Similarity (Weight: 40 points)
@@ -92,6 +86,29 @@ func calculateConfidenceScore(target InputData, candidate EsDocument) float64 {
 	return totalScore
 }
 
+func calculateConfidenceScore(target InputData, candidate EsDocument) float64 {
+	var totalScore float64 = 0.0
+	nameSimilarity := EnhancedSimilarity(strings.ReplaceAll(target.ProjectName, target.ConstructionUnit, ""), strings.ReplaceAll(strings.ReplaceAll(candidate.Projectname, candidate.Buyer, ""), target.ConstructionUnit, ""))
+	// 1. Project name similarity, compared after removing the construction-unit / buyer text from both names (multiplier: 80)
+	totalScore += nameSimilarity * 80.0
+	//log.Println(target.ProjectName, candidate.Projectname, nameSimilarity)
+
+	// 2. Construction unit vs. candidate buyer similarity (multiplier: 10)
+	unitSimilarity := EnhancedSimilarity(target.ConstructionUnit, candidate.Buyer)
+	totalScore += unitSimilarity * 10.0
+
+	//log.Println(target.ConstructionUnit, candidate.Buyer, unitSimilarity)
+	// 3. Region bonus: +5 when a non-empty target Area equals the candidate's, and a further +5 when a non-empty City also matches.
+	if target.Area != "" && target.Area == candidate.Area {
+		totalScore += 5
+		if target.City != "" && target.City == candidate.City {
+			totalScore += 5
+		}
+	}
+
+	return totalScore
+}
+
 // 计算时间
 func TimeProximity(t1, t2 time.Time) float64 {
 	// 解析时间
@@ -189,26 +206,3 @@ func checkLongestCommonSubstring(s1, s2 string) float64 {
 	}
 	return float64(maxLen) / float64(avgLen)
 }
-
-func main000() {
-	str1 := "绿色低碳节能项目"
-	str2 := "河北纵横集团丰南钢铁有限公司绿色低碳节能项目"
-
-	similarity := EnhancedSimilarity(str1, str2)
-	fmt.Printf("相似度: %.4f\n", similarity)
-
-	// 更多测试用例
-	testCases := []struct {
-		s1, s2 string
-	}{
-		{"绿色低碳", "绿色低碳节能项目"},
-		{"钢铁项目", "丰南钢铁有限公司项目"},
-		{"节能环保", "新能源开发"},
-		{"相同的字符串", "相同的字符串"},
-	}
-
-	for _, tc := range testCases {
-		sim := EnhancedSimilarity(tc.s1, tc.s2)
-		fmt.Printf("'%s' vs '%s': %.4f\n", tc.s1, tc.s2, sim)
-	}
-}

BIN
project_chuan/剑鱼匹配疑似问题7月.xlsx


BIN
project_chuan/项目信息.xlsx