123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364 |
- package main
- import (
- "context"
- "encoding/json"
- "fmt"
- "github.com/olivere/elastic/v7"
- util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "regexp"
- "sort"
- "strings"
- "unicode"
- )
// Patterns applied by FilterGeneric to strip generic wording from a search
// keyword. Each whole-string pattern (anchored with ^...$) only fires when the
// entire keyword is generic, in which case the keyword is reduced to "".
var (
	// FilterReg matches a keyword built exclusively from the characters of
	// common boilerplate words (announcement, project, address, phone,
	// postcode, date, contact, tender, result, deal, purchase, ...).
	FilterReg = regexp.MustCompile("^[的人号时元万公告项目地址电话邮编日期联系招标中结果成交项目项目采购采购项目政府采购公告更正公告]+$")

	// FilterReg_1 matches a keyword that is, as a whole, a 1-3 digit number,
	// a 1-2 character Chinese numeral, or one of a fixed list of generic
	// words (contact, phone, address, serial number, purchase, ...).
	FilterReg_1 = regexp.MustCompile("^([0-9]{1,3}|[零一二三四五六七八九十]{1,2}|联系人?|电话|地址|编号|采购|政府采购|成交|更正|招标|中标|变更|结果)$")

	// FilterReg_2 matches a keyword that consists only of punctuation marks,
	// brackets and symbols.
	FilterReg_2 = regexp.MustCompile("^[)\\)>》】\\]}}〕,,;;::'\"“”。.\\??、/+=\\_—*&……\\^%$¥@!!`~·(\\(<《【\\[{{〔]+$")

	// FilterReg_3 matches one trailing "项目", "公告" or "公示" at the end of
	// a keyword; it is not anchored at the start.
	FilterReg_3 = regexp.MustCompile("(项目|公告|公示)$")
)
- // getSearch 模拟网站所有,只提供精准和分词;不再打分;只过滤省市,只匹配标题、项目名称;根据参数决定是否匹配详情;
- func getSearch(client *elastic.Client, projectName, areacode string, isDetail int) ([]map[string]interface{}, error) {
- var results []map[string]interface{}
- seenIDs := make(map[string]*elastic.SearchHit)
- province, city := "", ""
- if areacode != "" {
- code := areacode[:6]
- where := map[string]interface{}{
- "code": code,
- }
- res, _ := MgoQY.FindOne("address_new_2020", where)
- province = util.ObjToString((*res)["province"])
- city = util.ObjToString((*res)["city"])
- }
- //fmt.Println(province, city)
- projectName = RemoveInvisibleChars(projectName)
- projectName = FilterGeneric(projectName)
- // 1. 精准查询
- preciseHits, err := searchPreciseOther(client, projectName, province, city, isDetail)
- if err != nil {
- return nil, err
- }
- for _, hit := range preciseHits {
- if _, exists := seenIDs[hit.Id]; !exists {
- seenIDs[hit.Id] = hit
- }
- }
- // 2. 分词查询(
- tokenHits, err := searchByTokenOther(client, projectName, province, city, isDetail)
- if err != nil {
- return nil, err
- }
- for _, hit := range tokenHits {
- if _, exists := seenIDs[hit.Id]; !exists {
- seenIDs[hit.Id] = hit
- }
- }
- for id, hit := range seenIDs {
- var doc map[string]interface{}
- if err = json.Unmarshal(hit.Source, &doc); err != nil {
- continue
- }
- // 从 Mongo 读取 detail 字段用于后续 buyer 过滤
- bidd, _ := MgoB.FindById("bidding", id, nil)
- detail := util.ObjToString((*bidd)["detail"])
- if detail != "" {
- doc["detail"] = detail
- }
- results = append(results, doc)
- }
- sort.SliceStable(results, func(i, j int) bool {
- return util.Int64All(results[i]["publishtime"]) > util.Int64All(results[j]["publishtime"])
- })
- return results, nil
- }
- // searchPreciseOther 精准搜索;m默认项目名称+标题;详情可选参数
- func searchPreciseOther(client *elastic.Client, projectName, area, city string, isDetail int) ([]*elastic.SearchHit, error) {
- fieldsToTry := []string{"projectname.pname", "title"}
- if isDetail > 0 {
- fieldsToTry = append(fieldsToTry, "detail")
- }
- filtersToTry := [][]elastic.Query{
- {elastic.NewTermsQuery("subtype", "中标", "成交", "合同", "单一")},
- {elastic.NewTermsQuery("toptype", "招标", "预告", "采购意向", "拟建")},
- }
- runQuery := func(withArea bool) ([]*elastic.SearchHit, error) {
- var allResults []*elastic.SearchHit
- seenIDs := make(map[string]bool)
- for _, field := range fieldsToTry {
- for _, filters := range filtersToTry {
- var queries []*elastic.BoolQuery
- queryBase := elastic.NewBoolQuery().
- Must(elastic.NewMultiMatchQuery(projectName, field).Type("phrase")).
- Filter(filters...)
- if withArea {
- if area != "" {
- queryBase = queryBase.Must(elastic.NewTermQuery("area", area))
- }
- if city != "" {
- queryBase = queryBase.Must(elastic.NewTermQuery("city", city))
- }
- }
- queries = append(queries, queryBase)
- for _, query := range queries {
- fetchFields := elastic.NewFetchSourceContext(true).Include(
- "id", "title", "projectname", "projectcode", "bidamount", "area", "city",
- "toptype", "subtype", "buyer", "budget", "buyerperson", "buyertel",
- "s_winner", "winnertel", "agency", "publishtime")
- searchResult, err := client.Search().
- Index("bidding").
- Query(query).
- FetchSourceContext(fetchFields).
- Do(context.Background())
- if err != nil {
- return nil, err
- }
- for _, hit := range searchResult.Hits.Hits {
- if !seenIDs[hit.Id] {
- seenIDs[hit.Id] = true
- allResults = append(allResults, hit)
- }
- }
- }
- }
- }
- return allResults, nil
- }
- // 第一次查询:包含省份和城市过滤(如果有)
- results, err := runQuery(true)
- if err != nil {
- return nil, err
- }
- if len(results) == 0 && area != "" {
- // 如果查不到,并且存在省份条件,再执行一次去掉 area 的查询
- return runQuery(false)
- }
- return results, nil
- }
- // searchByTokenOther 分词查询;
- func searchByTokenOther2(client *elastic.Client, projectName, province, city string, isDetail int) ([]*elastic.SearchHit, error) {
- fieldsToTry := []string{"projectname.pname", "title"}
- if isDetail > 0 {
- fieldsToTry = append(fieldsToTry, "detail")
- }
- filtersToTry := [][]elastic.Query{
- {elastic.NewTermsQuery("subtype", "中标", "成交", "合同", "单一")},
- {elastic.NewTermsQuery("toptype", "招标", "预告", "采购意向", "拟建")},
- }
- // 分词处理
- analyzeResp, err := client.IndexAnalyze().
- Index("bidding").
- Analyzer("ik_smart").
- Text(projectName).
- Do(context.Background())
- if err != nil {
- return nil, err
- }
- var tokens []string
- for _, token := range analyzeResp.Tokens {
- tokens = append(tokens, token.Token)
- }
- if len(tokens) == 0 {
- return nil, fmt.Errorf("no tokens found from ik_smart")
- }
- queryText := strings.Join(tokens, " ")
- // 指定返回字段
- fetchFields := elastic.NewFetchSourceContext(true).Include(
- "id", "title", "projectname", "projectcode", "bidamount", "area", "city",
- "toptype", "subtype", "buyer", "budget", "buyerperson", "buyertel",
- "s_winner", "winnertel", "agency", "publishtime")
- var allHits []*elastic.SearchHit
- seen := make(map[string]bool)
- for _, field := range fieldsToTry {
- for _, filters := range filtersToTry {
- var queries []*elastic.BoolQuery
- if field == "detail" {
- query := elastic.NewBoolQuery()
- if province != "" {
- query = query.Must(elastic.NewTermQuery("area", province))
- }
- query = query.Must(elastic.NewMatchQuery(field, queryText)).
- Filter(filters...)
- queries = append(queries, query)
- } else {
- // 省+市
- if province != "" && city != "" {
- q := elastic.NewBoolQuery().
- Must(elastic.NewMatchQuery(field, queryText)).
- Must(elastic.NewTermQuery("area", province)).
- Must(elastic.NewTermQuery("city", city)).
- Filter(filters...)
- queries = append(queries, q)
- }
- // 仅省
- if province != "" {
- q := elastic.NewBoolQuery().
- Must(elastic.NewMatchQuery(field, queryText)).
- Must(elastic.NewTermQuery("area", province)).
- Filter(filters...)
- queries = append(queries, q)
- }
- }
- for _, query := range queries {
- searchResult, err := client.Search().
- Index("bidding").
- Query(query).
- FetchSourceContext(fetchFields).
- Do(context.Background())
- if err != nil {
- continue
- }
- for _, hit := range searchResult.Hits.Hits {
- if !seen[hit.Id] {
- seen[hit.Id] = true
- allHits = append(allHits, hit)
- }
- }
- }
- }
- }
- return allHits, nil
- }
- func searchByTokenOther(client *elastic.Client, projectName, province, city string, isDetail int) ([]*elastic.SearchHit, error) {
- fieldsToTry := []string{"projectname.pname", "title"}
- if isDetail > 0 {
- fieldsToTry = append(fieldsToTry, "detail")
- }
- filtersToTry := [][]elastic.Query{
- {elastic.NewTermsQuery("subtype", "中标", "成交", "合同", "单一")},
- {elastic.NewTermsQuery("toptype", "招标", "预告", "采购意向", "拟建")},
- }
- // 分词处理
- analyzeResp, err := client.IndexAnalyze().
- Index("bidding").
- Analyzer("ik_smart").
- Text(projectName).
- Do(context.Background())
- if err != nil {
- return nil, err
- }
- var tokens []string
- for _, token := range analyzeResp.Tokens {
- tokens = append(tokens, token.Token)
- }
- if len(tokens) == 0 {
- return nil, fmt.Errorf("no tokens found from ik_smart")
- }
- queryText := strings.Join(tokens, " ")
- // 指定返回字段
- fetchFields := elastic.NewFetchSourceContext(true).Include(
- "id", "title", "projectname", "projectcode", "bidamount", "area", "city",
- "toptype", "subtype", "buyer", "budget", "buyerperson", "buyertel",
- "s_winner", "winnertel", "agency", "publishtime")
- // 抽象出内部查询逻辑,参数控制是否使用省份过滤
- runQuery := func(withProvince bool) ([]*elastic.SearchHit, error) {
- var allHits []*elastic.SearchHit
- seen := make(map[string]bool)
- for _, field := range fieldsToTry {
- for _, filters := range filtersToTry {
- //query := elastic.NewBoolQuery().
- // Must(elastic.NewMatchQuery(field, queryText)).
- // Filter(filters...)
- query := elastic.NewBoolQuery().
- Must(
- elastic.NewMultiMatchQuery(queryText, field).
- MinimumShouldMatch("100%"),
- ).
- Filter(filters...)
- // 动态加上 area/city 条件
- if withProvince && province != "" {
- query = query.Must(elastic.NewTermQuery("area", province))
- }
- if city != "" {
- query = query.Must(elastic.NewTermQuery("city", city))
- }
- searchResult, err := client.Search().
- Index("bidding").
- Query(query).
- FetchSourceContext(fetchFields).
- Do(context.Background())
- if err != nil {
- continue
- }
- for _, hit := range searchResult.Hits.Hits {
- if !seen[hit.Id] {
- seen[hit.Id] = true
- allHits = append(allHits, hit)
- }
- }
- }
- }
- return allHits, nil
- }
- // 第一次尝试带上 province
- results, err := runQuery(true)
- if err != nil {
- return nil, err
- }
- // 如果查不到,并且设置了省份,则再试一次去掉 province
- if len(results) == 0 && province != "" {
- return runQuery(false)
- }
- return results, nil
- }
// RemoveInvisibleChars returns s without control characters and other
// non-graphic runes. A rune is kept only if unicode.IsGraphic reports true and
// unicode.IsControl reports false, so letters, CJK text, digits, punctuation,
// symbols and space separators survive while tabs, newlines and format
// characters are dropped.
func RemoveInvisibleChars(s string) string {
	var kept strings.Builder
	kept.Grow(len(s))
	for _, r := range s {
		if !unicode.IsGraphic(r) || unicode.IsControl(r) {
			continue // invisible: drop it
		}
		kept.WriteRune(r)
	}
	return kept.String()
}
- // FilterGeneric 通用词处理
- func FilterGeneric(keyWords string) string {
- keyWords = FilterReg_3.ReplaceAllString(keyWords, "")
- keyWords = FilterReg_2.ReplaceAllString(keyWords, "")
- keyWords = FilterReg_1.ReplaceAllString(keyWords, "")
- keyWords = FilterReg.ReplaceAllString(keyWords, "")
- return keyWords
- }
|