project_new.go 16 KB


  1. package main
  2. import (
  3. "context"
  4. "encoding/base64"
  5. "encoding/json"
  6. "fmt"
  7. "github.com/olivere/elastic/v7"
  8. util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  9. "log"
  10. "sort"
  11. "strconv"
  12. //"sort"
  13. "strings"
  14. "time"
  15. )
  16. func SearchProjectFullScoring(client *elastic.Client, target InputData, projectName, areacode, publish string) ([]map[string]interface{}, error) {
  17. seenIDs := make(map[string]*elastic.SearchHit)
  18. province, city := "", ""
  19. if areacode != "" {
  20. code := areacode[:6]
  21. where := map[string]interface{}{
  22. "code": code,
  23. }
  24. res, _ := MgoQY.FindOne("address_new_2020", where)
  25. province = util.ObjToString((*res)["province"])
  26. city = util.ObjToString((*res)["city"])
  27. }
  28. // 1. 精准查询
  29. preciseHits, err := searchPrecise(client, projectName, province, city, publish, 20)
  30. if err != nil {
  31. return nil, err
  32. }
  33. for _, hit := range preciseHits {
  34. if _, exists := seenIDs[hit.Id]; !exists {
  35. seenIDs[hit.Id] = hit
  36. }
  37. }
  38. // 2. 分词查询(
  39. tokenHits, err := searchByToken(client, projectName, province, city, publish, 20)
  40. if err != nil {
  41. return nil, err
  42. }
  43. for _, hit := range tokenHits {
  44. if _, exists := seenIDs[hit.Id]; !exists {
  45. seenIDs[hit.Id] = hit
  46. }
  47. }
  48. // 3. common 查询
  49. commonHits, err := searchCommon(client, projectName, province, city, publish, 10)
  50. if err != nil {
  51. return nil, err
  52. }
  53. for _, hit := range commonHits {
  54. if _, exists := seenIDs[hit.Id]; !exists {
  55. seenIDs[hit.Id] = hit
  56. }
  57. }
  58. // 4. 合并 + 打分增强
  59. var results []map[string]interface{}
  60. //nameScore := computeNameScore(projectName)
  61. var allCandidates = []EsDocument{}
  62. for id, hit := range seenIDs {
  63. var doc map[string]interface{}
  64. if err := json.Unmarshal(hit.Source, &doc); err != nil {
  65. continue
  66. }
  67. // 从 Mongo 读取 detail 字段用于后续 buyer 过滤
  68. bidd, _ := MgoB.FindById("bidding", id, nil)
  69. detail := util.ObjToString((*bidd)["detail"])
  70. doc["detail"] = detail
  71. /**
  72. "id", "title", "projectname", "projectcode", "bidamount", "area", "city",
  73. "toptype", "subtype", "buyer", "budget", "buyerperson", "buyertel",
  74. "s_winner", "winnertel", "agency", "publishtime"
  75. */
  76. candidate := EsDocument{
  77. Id: util.ObjToString(doc["id"]),
  78. Title: util.ObjToString(doc["title"]),
  79. Projectname: util.ObjToString(doc["projectname"]),
  80. Toptype: util.ObjToString(doc["toptype"]),
  81. Subtype: util.ObjToString(doc["subtype"]),
  82. Area: util.ObjToString(doc["area"]),
  83. City: util.ObjToString(doc["city"]),
  84. Buyer: util.ObjToString(doc["buyer"]),
  85. SWinner: util.ObjToString(doc["s_winner"]),
  86. Bidamount: util.Float64All(doc["bidamount"]),
  87. Publishtime: util.Int64All(doc["publishtime"]),
  88. Agency: util.ObjToString(doc["agency"]),
  89. WinnerTel: util.ObjToString(doc["winnertel"]),
  90. BuyerTel: util.ObjToString(doc["buyertel"]),
  91. BuyerPerson: util.ObjToString(doc["buyerperson"]),
  92. Budget: util.Float64All(doc["budget"]),
  93. }
  94. score := calculateConfidenceScore(target, candidate)
  95. //candidate.Score = score
  96. candidate.Score, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", score), 64)
  97. allCandidates = append(allCandidates, candidate)
  98. }
  99. // 排序(降序)
  100. sort.SliceStable(allCandidates, func(i, j int) bool {
  101. return allCandidates[i].Score > allCandidates[j].Score
  102. })
  103. for _, doc := range allCandidates {
  104. item := map[string]interface{}{
  105. "id": doc.Id,
  106. "title": doc.Title,
  107. "projectname": doc.Projectname,
  108. "projectcode": doc.ProjectCode,
  109. "toptype": doc.Toptype,
  110. "subtype": doc.Subtype,
  111. "area": doc.Area,
  112. "city": doc.City,
  113. "buyer": doc.Buyer,
  114. "budget": doc.Budget,
  115. "bidamount": doc.Bidamount,
  116. "winner": doc.Winner,
  117. "detail": doc.Detail,
  118. "publishtime": doc.Publishtime,
  119. "agency": doc.Agency,
  120. "s_winner": doc.SWinner,
  121. "winnertel": doc.WinnerTel,
  122. "buyertel": doc.BuyerTel,
  123. "buyerperson": doc.BuyerPerson,
  124. "score": doc.Score,
  125. }
  126. results = append(results, item)
  127. }
  128. return results, nil
  129. }
  130. func searchPrecise(client *elastic.Client, projectName, province, city, publish string, maxResults int) ([]*elastic.SearchHit, error) {
  131. fieldsToTry := []string{"projectname.pname", "title", "detail"}
  132. filtersToTry := [][]elastic.Query{
  133. {elastic.NewTermsQuery("subtype", "中标", "成交", "合同", "单一")},
  134. {elastic.NewTermsQuery("toptype", "招标", "预告", "采购意向")},
  135. {elastic.NewTermsQuery("toptype", "拟建")},
  136. }
  137. var allResults []*elastic.SearchHit
  138. seenIDs := make(map[string]bool)
  139. // 解析发布时间
  140. var t time.Time
  141. var err error
  142. if publish != "" {
  143. t, err = time.Parse("200601", publish)
  144. if err != nil {
  145. log.Println("时间解析失败:", err)
  146. }
  147. }
  148. for _, field := range fieldsToTry {
  149. var dateRangeStart, dateRangeEnd int64
  150. if !t.IsZero() {
  151. if field == "detail" {
  152. dateRangeStart, dateRangeEnd = getYearRange(t, 60)
  153. } else {
  154. dateRangeStart, dateRangeEnd = getYearRange(t, 36)
  155. }
  156. }
  157. for _, filters := range filtersToTry {
  158. var queries []*elastic.BoolQuery
  159. if field == "detail" {
  160. // detail 只加时间 + filter
  161. query := elastic.NewBoolQuery()
  162. query = query.Must(elastic.NewMultiMatchQuery(projectName, field).Type("phrase"))
  163. query = query.Filter(filters...)
  164. if !t.IsZero() {
  165. query = query.Must(elastic.NewRangeQuery("publishtime").Gte(dateRangeStart).Lt(dateRangeEnd))
  166. }
  167. queries = append(queries, query)
  168. } else {
  169. // 1. 省 + 市 查询
  170. if province != "" && city != "" {
  171. query1 := elastic.NewBoolQuery()
  172. query1 = query1.Must(elastic.NewMultiMatchQuery(projectName, field).Type("phrase"))
  173. query1 = query1.Must(elastic.NewTermQuery("province", province))
  174. query1 = query1.Must(elastic.NewTermQuery("city", city))
  175. query1 = query1.Filter(filters...)
  176. if !t.IsZero() {
  177. query1 = query1.Must(elastic.NewRangeQuery("publishtime").Gte(dateRangeStart).Lt(dateRangeEnd))
  178. }
  179. queries = append(queries, query1)
  180. }
  181. // 2. 仅省份(城市为空或不同)
  182. if province != "" {
  183. query2 := elastic.NewBoolQuery()
  184. query2 = query2.Must(elastic.NewMultiMatchQuery(projectName, field).Type("phrase"))
  185. query2 = query2.Must(elastic.NewTermQuery("province", province))
  186. query2 = query2.Filter(filters...)
  187. if !t.IsZero() {
  188. query2 = query2.Must(elastic.NewRangeQuery("publishtime").Gte(dateRangeStart).Lt(dateRangeEnd))
  189. }
  190. queries = append(queries, query2)
  191. }
  192. // 3. 不限制省市
  193. if province != "" {
  194. query3 := elastic.NewBoolQuery()
  195. query3 = query3.Must(elastic.NewMultiMatchQuery(projectName, field).Type("phrase"))
  196. query3 = query3.Filter(filters...)
  197. if !t.IsZero() {
  198. query3 = query3.Must(elastic.NewRangeQuery("publishtime").Gte(dateRangeStart).Lt(dateRangeEnd))
  199. }
  200. queries = append(queries, query3)
  201. }
  202. }
  203. for _, query := range queries {
  204. fetchFields := elastic.NewFetchSourceContext(true).Include(
  205. "id", "title", "projectname", "projectcode", "bidamount", "area", "city",
  206. "toptype", "subtype", "buyer", "budget", "buyerperson", "buyertel",
  207. "s_winner", "winnertel", "agency", "publishtime")
  208. searchResult, err := client.Search().
  209. Index("bidding").
  210. Query(query).
  211. Size(maxResults).
  212. FetchSourceContext(fetchFields).
  213. Do(context.Background())
  214. if err != nil {
  215. return nil, err
  216. }
  217. for _, hit := range searchResult.Hits.Hits {
  218. if !seenIDs[hit.Id] {
  219. seenIDs[hit.Id] = true
  220. allResults = append(allResults, hit)
  221. }
  222. }
  223. }
  224. }
  225. // detail 的命中足够就提前结束
  226. if field == "detail" && len(allResults) > maxResults {
  227. break
  228. }
  229. }
  230. return allResults, nil
  231. }
  232. func searchByToken(client *elastic.Client, projectName, province, city, publish string, maxResults int) ([]*elastic.SearchHit, error) {
  233. fieldsToTry := []string{"projectname.pname", "title", "detail"}
  234. filtersToTry := [][]elastic.Query{
  235. {elastic.NewTermsQuery("subtype", "中标", "成交", "合同", "单一")},
  236. {elastic.NewTermsQuery("toptype", "招标", "预告", "采购意向")},
  237. {elastic.NewTermsQuery("toptype", "拟建")},
  238. }
  239. // 解析时间
  240. var t time.Time
  241. var err error
  242. if publish != "" {
  243. t, err = time.Parse("200601", publish)
  244. if err != nil {
  245. log.Println("时间解析失败:", err)
  246. }
  247. }
  248. // 分词处理
  249. analyzeResp, err := client.IndexAnalyze().
  250. Index("bidding").
  251. Analyzer("ik_smart").
  252. Text(projectName).
  253. Do(context.Background())
  254. if err != nil {
  255. return nil, err
  256. }
  257. var tokens []string
  258. for _, token := range analyzeResp.Tokens {
  259. tokens = append(tokens, token.Token)
  260. }
  261. if len(tokens) == 0 {
  262. return nil, fmt.Errorf("no tokens found from ik_smart")
  263. }
  264. queryText := strings.Join(tokens, " ")
  265. // 指定返回字段
  266. fetchFields := elastic.NewFetchSourceContext(true).Include(
  267. "id", "title", "projectname", "projectcode", "bidamount", "area", "city",
  268. "toptype", "subtype", "buyer", "budget", "buyerperson", "buyertel",
  269. "s_winner", "winnertel", "agency", "publishtime")
  270. var allHits []*elastic.SearchHit
  271. seen := make(map[string]bool)
  272. for _, field := range fieldsToTry {
  273. var dateRangeStart, dateRangeEnd int64
  274. if !t.IsZero() {
  275. if field == "detail" {
  276. dateRangeStart, dateRangeEnd = getYearRange(t, 60)
  277. } else {
  278. dateRangeStart, dateRangeEnd = getYearRange(t, 36)
  279. }
  280. }
  281. for _, filters := range filtersToTry {
  282. var queries []*elastic.BoolQuery
  283. if field == "detail" {
  284. query := elastic.NewBoolQuery().
  285. Must(elastic.NewMatchQuery(field, queryText)).
  286. Filter(filters...)
  287. if !t.IsZero() {
  288. query = query.Must(elastic.NewRangeQuery("publishtime").Gte(dateRangeStart).Lt(dateRangeEnd))
  289. }
  290. queries = append(queries, query)
  291. } else {
  292. // 省+市
  293. if province != "" && city != "" {
  294. q := elastic.NewBoolQuery().
  295. Must(elastic.NewMatchQuery(field, queryText)).
  296. Must(elastic.NewTermQuery("province", province)).
  297. Must(elastic.NewTermQuery("city", city)).
  298. Filter(filters...)
  299. if !t.IsZero() {
  300. q = q.Must(elastic.NewRangeQuery("publishtime").Gte(dateRangeStart).Lt(dateRangeEnd))
  301. }
  302. queries = append(queries, q)
  303. }
  304. // 仅省
  305. if province != "" {
  306. q := elastic.NewBoolQuery().
  307. Must(elastic.NewMatchQuery(field, queryText)).
  308. Must(elastic.NewTermQuery("province", province)).
  309. Filter(filters...)
  310. if !t.IsZero() {
  311. q = q.Must(elastic.NewRangeQuery("publishtime").Gte(dateRangeStart).Lt(dateRangeEnd))
  312. }
  313. queries = append(queries, q)
  314. }
  315. // 无省市
  316. q := elastic.NewBoolQuery().
  317. Must(elastic.NewMatchQuery(field, queryText)).
  318. Filter(filters...)
  319. if !t.IsZero() {
  320. q = q.Must(elastic.NewRangeQuery("publishtime").Gte(dateRangeStart).Lt(dateRangeEnd))
  321. }
  322. queries = append(queries, q)
  323. }
  324. for _, query := range queries {
  325. searchResult, err := client.Search().
  326. Index("bidding").
  327. Query(query).
  328. Size(maxResults).
  329. FetchSourceContext(fetchFields).
  330. Do(context.Background())
  331. if err != nil {
  332. continue
  333. }
  334. for _, hit := range searchResult.Hits.Hits {
  335. if !seen[hit.Id] {
  336. seen[hit.Id] = true
  337. allHits = append(allHits, hit)
  338. if len(allHits) >= maxResults {
  339. return allHits, nil
  340. }
  341. }
  342. }
  343. }
  344. }
  345. if field == "detail" && len(allHits) >= maxResults {
  346. break
  347. }
  348. }
  349. return allHits, nil
  350. }
  351. func searchCommon(client *elastic.Client, projectName, province, city, publish string, maxResults int) ([]*elastic.SearchHit, error) {
  352. fields := []string{"projectname.pname", "title", "detail"}
  353. var t time.Time
  354. var err error
  355. if publish != "" {
  356. t, err = time.Parse("200601", publish)
  357. if err != nil {
  358. log.Println("时间解析失败:", err)
  359. }
  360. }
  361. var allHits []*elastic.SearchHit
  362. seen := make(map[string]bool)
  363. fetchFields := elastic.NewFetchSourceContext(true).Include(
  364. "id", "title", "projectname", "projectcode", "bidamount", "area", "city",
  365. "toptype", "subtype", "buyer", "budget", "buyerperson", "buyertel",
  366. "s_winner", "winnertel", "agency", "publishtime")
  367. for _, field := range fields {
  368. var dateRangeStart, dateRangeEnd int64
  369. if !t.IsZero() {
  370. if field == "detail" {
  371. dateRangeStart, dateRangeEnd = getYearRange(t, 60)
  372. } else {
  373. dateRangeStart, dateRangeEnd = getYearRange(t, 36)
  374. }
  375. }
  376. var queries []map[string]interface{}
  377. commonQuery := func(f string, boost float64) map[string]interface{} {
  378. return map[string]interface{}{
  379. "common": map[string]interface{}{
  380. f: map[string]interface{}{
  381. "query": projectName,
  382. "cutoff_frequency": 0.01,
  383. "low_freq_operator": "and",
  384. "boost": boost,
  385. },
  386. },
  387. }
  388. }
  389. if field == "detail" {
  390. // 只做普通匹配 + 时间
  391. boolQuery := map[string]interface{}{
  392. "bool": map[string]interface{}{
  393. "must": []interface{}{
  394. commonQuery(field, 0.1),
  395. },
  396. },
  397. }
  398. if !t.IsZero() {
  399. boolQuery["bool"].(map[string]interface{})["filter"] = []interface{}{
  400. map[string]interface{}{
  401. "range": map[string]interface{}{
  402. "publishtime": map[string]interface{}{
  403. "gte": dateRangeStart,
  404. "lt": dateRangeEnd,
  405. },
  406. },
  407. },
  408. }
  409. }
  410. queries = append(queries, boolQuery)
  411. } else {
  412. // 1. 省+市
  413. if province != "" && city != "" {
  414. q := map[string]interface{}{
  415. "bool": map[string]interface{}{
  416. "must": []interface{}{
  417. commonQuery(field, 0.2),
  418. map[string]interface{}{"term": map[string]interface{}{"province": province}},
  419. map[string]interface{}{"term": map[string]interface{}{"city": city}},
  420. },
  421. },
  422. }
  423. if !t.IsZero() {
  424. q["bool"].(map[string]interface{})["filter"] = []interface{}{
  425. map[string]interface{}{
  426. "range": map[string]interface{}{
  427. "publishtime": map[string]interface{}{
  428. "gte": dateRangeStart,
  429. "lt": dateRangeEnd,
  430. },
  431. },
  432. },
  433. }
  434. }
  435. queries = append(queries, q)
  436. }
  437. // 2. 仅省
  438. if province != "" {
  439. q := map[string]interface{}{
  440. "bool": map[string]interface{}{
  441. "must": []interface{}{
  442. commonQuery(field, 0.2),
  443. map[string]interface{}{"term": map[string]interface{}{"province": province}},
  444. },
  445. },
  446. }
  447. if !t.IsZero() {
  448. q["bool"].(map[string]interface{})["filter"] = []interface{}{
  449. map[string]interface{}{
  450. "range": map[string]interface{}{
  451. "publishtime": map[string]interface{}{
  452. "gte": dateRangeStart,
  453. "lt": dateRangeEnd,
  454. },
  455. },
  456. },
  457. }
  458. }
  459. queries = append(queries, q)
  460. }
  461. // 3. 不加省市
  462. q := map[string]interface{}{
  463. "bool": map[string]interface{}{
  464. "must": []interface{}{
  465. commonQuery(field, 0.2),
  466. },
  467. },
  468. }
  469. if !t.IsZero() {
  470. q["bool"].(map[string]interface{})["filter"] = []interface{}{
  471. map[string]interface{}{
  472. "range": map[string]interface{}{
  473. "publishtime": map[string]interface{}{
  474. "gte": dateRangeStart,
  475. "lt": dateRangeEnd,
  476. },
  477. },
  478. },
  479. }
  480. }
  481. queries = append(queries, q)
  482. }
  483. for _, q := range queries {
  484. // 编码 query 为 base64
  485. queryBytes, _ := json.Marshal(q)
  486. queryBase64 := base64.StdEncoding.EncodeToString(queryBytes)
  487. query := elastic.NewWrapperQuery(queryBase64)
  488. searchResult, err := client.Search().
  489. Index("bidding").
  490. Query(query).
  491. Size(maxResults).
  492. FetchSourceContext(fetchFields).
  493. Do(context.Background())
  494. if err != nil {
  495. log.Println("searchCommon 查询失败:", err)
  496. continue
  497. }
  498. for _, hit := range searchResult.Hits.Hits {
  499. if !seen[hit.Id] {
  500. seen[hit.Id] = true
  501. allHits = append(allHits, hit)
  502. if len(allHits) >= maxResults {
  503. return allHits, nil
  504. }
  505. }
  506. }
  507. }
  508. }
  509. return allHits, nil
  510. }
  // getYearRange returns the Unix-second bounds of the window reaching m months
  // before and m months after baseDate: start = baseDate - m months,
  // end = baseDate + m months.
  //
  // Despite the name, the unit of m is months, not years. A negative m is
  // treated as its absolute value so that start never exceeds end. Month
  // arithmetic follows time.Time.AddDate, which normalises overflowing days.
  func getYearRange(baseDate time.Time, m int) (start, end int64) {
  	if m < 0 {
  		m = -m
  	}
  	start = baseDate.AddDate(0, -m, 0).Unix()
  	end = baseDate.AddDate(0, m, 0).Unix()
  	return start, end
  }
  516. }