123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224 |
- package extract
- import (
- "data_ai/clean"
- "data_ai/prompt"
- "data_ai/ul"
- log "github.com/donnie4w/go-logger/logger"
- "github.com/shopspring/decimal"
- qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "regexp"
- "strings"
- "unicode/utf8"
- )
// Reg matches every run of characters that is not an ASCII digit, an ASCII
// letter, or a character in the range U+4E00..U+9FA5.
var Reg = regexp.MustCompile("[^0-9A-Za-z\u4e00-\u9fa5]+")

// Filter matches a single HTML-like tag (<...>) or a single whitespace
// character, including U+3000, U+2003 and U+00A0.
var Filter = regexp.MustCompile("<[^>]*?>|[\\s\u3000\u2003\u00a0]")

// FilterDetail reduces con to its bare text. It first removes everything
// matched by Filter (tags and whitespace) and then removes everything matched
// by Reg, so only digits, ASCII letters and U+4E00..U+9FA5 characters remain.
func FilterDetail(con string) string {
	withoutMarkup := Filter.ReplaceAllString(con, "")
	return Reg.ReplaceAllString(withoutMarkup, "")
}
var (
	// SpecialTextReg matches phrases that point the reader to the original
	// web page or to an attachment instead of carrying the content inline.
	SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)")

	// CleanReg0 is a cheap pre-check for a scientific-notation exponent
	// marker ("e+" or "E+").
	CleanReg0 = regexp.MustCompile("([eE][\\+])")

	// CleanReg1 matches a colon-prefixed scientific-notation number such as
	// ":1.5e+06". Group 3 is the mantissa and group 5 the exponent digits
	// (one or more zeros followed by a digit 6-9).
	CleanReg1 = regexp.MustCompile("([::](([1-9][.][0-9]+)([eE][\\+])([0]+[6-9])))")

	// CleanReg2 matches a scientific-notation number without requiring the
	// colon prefix. Group 3 is the mantissa and group 5 the exponent digits
	// (optional zeros followed by a digit 6-9).
	CleanReg2 = regexp.MustCompile("((([1-9][.][0-9]+)([eE][\\+])([0]*[6-9])))")
)
- // 确认抽取范围
- func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
- dict := map[string]interface{}{}
- sess := ul.SourceMgo.GetMgoConn()
- defer ul.SourceMgo.DestoryMongoConn(sess)
- total := 0
- it := sess.DB(ul.SourceMgo.DbName).C(ul.Ext_Name).Find(&q).Select(map[string]interface{}{"_id": 1, "ai_zhipu": 1}).Iter()
- for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
- if total%1000 == 0 {
- log.Debug("cur index ", total)
- }
- if tmp["ai_zhipu"] == nil { //已经识别的数据-不再识别
- tmpid := ul.BsonTOStringId(tmp["_id"])
- dict[tmpid] = tmpid
- }
- tmp = make(map[string]interface{})
- }
- return dict
- }
- // 获取附件名字信息
- func GetFnsInfo(tmp map[string]interface{}) []string {
- arr := []string{}
- if projectinfo := qu.ObjToMap(tmp["projectinfo"]); projectinfo != nil {
- if attachments := qu.ObjToMap((*projectinfo)["attachments"]); attachments != nil {
- for _, v := range *attachments {
- if info := qu.ObjToMap(v); info != nil {
- if filename := qu.ObjToString((*info)["filename"]); filename != "" {
- arr = append(arr, filename)
- }
- }
- }
- }
- }
- return arr
- }
- // 获取正文数据
- func getDetailText(v map[string]interface{}, tmpid string) string {
- //按照最新文本请求的数据···
- //detail := ul.PostDetailContentHtmlText("detail", tmpid)
- //if detail != "" {
- // return detail
- //}
- detail := qu.ObjToString(v["detail"])
- if ul.IsTool {
- if details := qu.ObjToString(v["details"]); details != "" {
- detail = details
- }
- filetext := qu.ObjToString(v["filetext"])
- if utf8.RuneCountInString(detail) < 100 && filetext != "" {
- detail = filetext
- }
- }
- return detail
- }
- // 获取标的物-过滤产权-拟建
- func getPurList(v map[string]interface{}, detail string, f_info map[string]interface{}) map[string]interface{} {
- if qu.ObjToString(v["toptype"]) == "拟建" || qu.ObjToString(v["toptype"]) == "产权" {
- return map[string]interface{}{}
- }
- p_data := map[string]interface{}{}
- p_data["detail"] = qu.ObjToString(v["title"]) + "\n" + detail
- p_data["site"] = v["site"]
- p_data["attach_text"] = v["attach_text"]
- p_data["toptype"] = v["toptype"]
- if f_info["s_toptype"] != nil {
- p_data["toptype"] = f_info["s_toptype"]
- }
- //结果有 标的物+标的物label+剑鱼码
- if p_info := ul.PostPurchasingList(p_data); len(p_info) > 0 {
- if qu.IntAll(p_info["status"]) == 200 {
- //消息体
- message := qu.ObjToMap(p_info["message"])
- if message != nil {
- return *message
- }
- return map[string]interface{}{}
- }
- }
- return map[string]interface{}{}
- }
- /*
- ****************************************
- ****************************************
- ****************************************
- */
- // 过滤信息规则···
- func NotInProgressInfo(title string, detail string, v map[string]interface{}) bool {
- if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil {
- return true
- }
- detail = FilterDetail(detail) //只保留文本内容
- dl := utf8.RuneCountInString(detail) //文本长度
- if dl <= 20 || (dl <= 50 && ul.SpecialTextReg.MatchString(detail)) {
- return true
- }
- return false
- }
- // 二次校验采购单位
- func CheckOutBuyerInfo(f_data map[string]interface{}) {
- if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" {
- if zp_buyer := prompt.AcquireBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil {
- if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
- f_data["s_buyer"] = ns_buyer
- }
- }
- }
- }
- // 二次校验采购单位
- func CheckOutDeepSeekBuyerInfo(f_data map[string]interface{}) {
- if s_buyer := qu.ObjToString(f_data["s_buyer"]); s_buyer != "" {
- if zp_buyer := prompt.AcquireDeepSeekBuyerInfo(s_buyer); zp_buyer["实体单位"] != nil {
- if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
- f_data["s_buyer"] = ns_buyer
- }
- }
- }
- }
// MergeInfo merges the given field maps into one new map.
//
// The maps are applied in slice order, so when the same key occurs more than
// once the value from the later map wins. The inputs are not modified, and
// the result is always non-nil, even for a nil or empty infos.
func MergeInfo(infos []map[string]interface{}) map[string]interface{} {
	merged := make(map[string]interface{})
	for i := range infos {
		for key := range infos[i] {
			merged[key] = infos[i][key]
		}
	}
	return merged
}
- // 强制逻辑判断数据
- func ForcedLogicDecideInfo(f_data map[string]interface{}) {
- //多单位不能一致,原则大模型
- s_buyer := qu.ObjToString(f_data["s_buyer"])
- s_winner := qu.ObjToString(f_data["s_winner"])
- if s_buyer == s_winner && s_buyer != "" {
- /*
- 1、若单位名称-不含公司保留采购单位
- 2、若单位名称-含公司保留中标单位
- */
- if strings.Contains(s_buyer, "公司") {
- f_data["s_buyer"] = ""
- } else {
- f_data["s_winner"] = ""
- }
- }
- //代理机构
- if s_agency := qu.ObjToString(f_data["s_agency"]); s_agency != "" {
- if s_agency == s_buyer || s_agency == s_winner {
- f_data["s_agency"] = ""
- }
- }
- }
- // 科学计数法标记
- func ScientificUnit(detail string) (string, float64) {
- if !CleanReg0.MatchString(detail) {
- return "", 0.0
- }
- x, u := "", ""
- //符合条件1···修最后一个金额
- if arr := CleanReg1.FindAllString(detail, -1); len(arr) > 0 {
- str1 := arr[len(arr)-1]
- x = CleanReg1.ReplaceAllString(str1, "${3}")
- u = CleanReg1.ReplaceAllString(str1, "${5}")
- } else {
- //符合条件2···修第一个金额
- if str2 := CleanReg2.FindString(detail); str2 != "" {
- x = CleanReg2.ReplaceAllString(str2, "${3}")
- u = CleanReg2.ReplaceAllString(str2, "${5}")
- } else {
- return "", 0.0
- }
- }
- ut := qu.IntAll(u)
- if ut >= 10 {
- return "", 0.0
- }
- if xf := qu.Float64All(x); xf > 0.0 {
- a := decimal.NewFromFloat(xf)
- b := decimal.New(1, int32(ut))
- v := a.Mul(b)
- fv, _ := v.Float64()
- sv := v.String()
- if fv > 10000000000 {
- return "", 0.0
- }
- return sv, fv
- }
- return "", 0.0
- }
|