123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290 |
- package extract
- import (
- "data_ai/clean"
- "data_ai/prompt"
- "data_ai/ul"
- log "github.com/donnie4w/go-logger/logger"
- qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "strings"
- "sync"
- "time"
- "unicode/utf8"
- )
- // 识别结构化字段
- func ExtractFieldInfo(sid string, eid string) {
- q := map[string]interface{}{
- "_id": map[string]interface{}{
- "$gt": ul.StringTOBsonId(sid),
- "$lte": ul.StringTOBsonId(eid),
- },
- }
- //先查询抽取表-确定大模型需要识别到范围
- dict := ConfrimExtractInfo(q)
- log.Debug("查询语句...", q, "~", len(dict))
- if len(dict) >= ul.MaxUdp { //根据数量限制使用具体模型
- log.Debug("数量超过限制临时使用:glm-4-flashx")
- ul.FlashModel = "glm-4-flashx"
- } else {
- ul.FlashModel = "glm-4-flash"
- }
- pool_mgo := make(chan bool, ul.Reading)
- wg_mgo := &sync.WaitGroup{}
- sess := ul.BidMgo.GetMgoConn()
- defer ul.BidMgo.DestoryMongoConn(sess)
- total, isok := 0, 0
- it := sess.DB(ul.BidMgo.DbName).C(ul.Bid_Name).Find(&q).Sort("_id").Iter()
- for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
- if total%1000 == 0 {
- log.Debug("cur ai index ", total)
- }
- tmpid := ul.BsonTOStringId(tmp["_id"])
- infoformat := qu.IntAll(tmp["infoformat"])
- if infoformat > 1 || dict[tmpid] == nil {
- tmp = make(map[string]interface{})
- continue
- }
- isok++
- pool_mgo <- true
- wg_mgo.Add(1)
- go func(tmp map[string]interface{}) {
- defer func() {
- <-pool_mgo
- wg_mgo.Done()
- }()
- info := map[string]interface{}{}
- u_id := ul.BsonTOStringId(tmp["_id"])
- //大模型数据···
- ai_zhipu := ResolveInfo(tmp, u_id)
- if len(ai_zhipu) > 0 {
- info["ai_zhipu"] = ai_zhipu
- }
- //科学计数法标记···数据标记···会冗余
- s, f := ScientificUnit(qu.ObjToString(tmp["detail"]))
- if s != "" && f > 0.0 {
- info["e_bidamount"] = f
- }
- //更新方法
- if len(info) > 0 && u_id != "" {
- ul.SourceMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{
- "$set": info,
- })
- }
- }(tmp)
- tmp = make(map[string]interface{})
- }
- wg_mgo.Wait()
- log.Debug("ai is over ...", sid, "~", eid)
- }
- // 获取处理数据...
- func ResolveInfo(v map[string]interface{}, tmpid string) map[string]interface{} {
- title := qu.ObjToString(v["title"])
- old_detail := getDetailText(v, tmpid) //获取正文文本
- //是否表格
- isTable := false
- if strings.Contains(old_detail, "<table>") {
- isTable = false //可以屏蔽表格的识别内容
- }
- //过滤信息
- if NotInProgressInfo(title, old_detail, v) {
- return map[string]interface{}{}
- }
- //识别结构,短文本结构
- f_data, shorText := map[string]interface{}{}, false
- if utf8.RuneCountInString(old_detail) < 100 {
- shorText = true
- }
- //文本格式转换
- new_detail := ul.HttpConvertToMarkdown(old_detail)
- //特殊文本转换
- new_detail = CleanText(new_detail)
- //短文本判断是否有效性
- if shorText {
- if info := prompt.AcquireJudgeShortInfo(new_detail); info["结果"] != "是" {
- return map[string]interface{}{}
- }
- }
- //获取外围字段数据-拆分合并字段
- f_info_1 := prompt.AcquireExtractFieldInfoFirst(new_detail)
- f_info_2 := prompt.AcquireExtractFieldInfoSecond(new_detail)
- f_info_3 := prompt.AcquireExtractFieldInfoThird(new_detail)
- f_info := MergeInfo([]map[string]interface{}{f_info_1, f_info_2, f_info_3})
- //非短文本以下识别-纯测试
- if !shorText {
- //获取分包信息
- if pkg := prompt.AcquireNewMultiplePackageInfo(new_detail, isTable); len(pkg) > 0 {
- f_info["s_pkg"] = pkg
- }
- //获取分类字段数据
- s_toptype, s_subtype := prompt.AcquireClassInfo(new_detail, title, qu.ObjToString(v["toptype"]))
- f_info["s_toptype"] = s_toptype
- f_info["s_subtype"] = s_subtype
- //调用标的物识别
- if !ul.IsTool && !ul.IsLocal {
- if s_purchasinglist := getPurList(v, old_detail, f_info); len(s_purchasinglist) > 0 {
- f_info["s_purchasinglist"] = s_purchasinglist
- }
- }
- }
- //字段清洗
- fns := GetFnsInfo(v) //获取附件名字
- f_data = clean.CleanFieldInfo(f_info, fns, isTable)
- //采购单位二级校验
- CheckOutBuyerInfo(f_data)
- //标题提取采购单位
- if qu.ObjToString(f_data["s_buyer"]) == "" {
- if zp_buyer := prompt.AcquireBuyerInfo(title); zp_buyer["实体单位"] != nil {
- if ns_buyer := clean.CleanBuyer(qu.ObjToString(zp_buyer["实体单位"])); ns_buyer != "" {
- f_data["s_buyer"] = ns_buyer
- }
- }
- }
- //强制逻辑判断-
- ForcedLogicDecideInfo(f_data)
- //返回数据
- return f_data
- }
- /*
- ************************************************************
- ************************************************************
- ************************************************************
- 支持新模型-deepseek的轮询查询
- */
- func RunDeepSeek() {
- log.Debug("执行轮询定时···deepseek···", ul.Ext_Name)
- tmp_data := ul.PyMgo.FindById(ul.Ext_Name, "67c50d2088dabe81a67a2468")
- ttt := ExtractDeepSeekInfo(tmp_data)
- log.Debug(ttt)
- return
- for {
- log.Debug("开始处理线程数···", ul.Reading)
- pool_mgo := make(chan bool, ul.Reading)
- wg_mgo := &sync.WaitGroup{}
- sess := ul.PyMgo.GetMgoConn()
- defer ul.PyMgo.DestoryMongoConn(sess)
- q, total, isok := map[string]interface{}{}, 0, 0
- it := sess.DB(ul.PyMgo.DbName).C(ul.Ext_Name).Find(&q).Iter()
- for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
- if total%100 == 0 {
- log.Debug("cur ai index ", total)
- }
- isok++
- if tmp["ai_updatetime"] != nil {
- tmp = make(map[string]interface{})
- continue
- }
- pool_mgo <- true
- wg_mgo.Add(1)
- go func(tmp map[string]interface{}) {
- defer func() {
- <-pool_mgo
- wg_mgo.Done()
- }()
- u_id := ul.BsonTOStringId(tmp["_id"])
- //抽取deepseek数据···并更新
- data := ExtractDeepSeekInfo(tmp)
- update_info := make(map[string]interface{}, 0)
- if len(data) > 0 && u_id != "" {
- tmp["ai_zhipu"] = data
- ul.ChooseCheckDataAI(tmp, &update_info)
- if update_info["com_package"] == nil { //构建单包信息···
- com_package := ul.CreatSingleFieldInfo(tmp, update_info)
- update_info["com_package"] = com_package
- }
- update_info["ai_zhipu"] = data
- }
- update_info["ai_updatetime"] = time.Now().Unix()
- ul.PyMgo.UpdateById(ul.Ext_Name, u_id, map[string]interface{}{
- "$set": update_info,
- })
- }(tmp)
- tmp = make(map[string]interface{})
- }
- wg_mgo.Wait()
- log.Debug("deepseek完毕······", isok)
- break
- time.Sleep(time.Second * 1800)
- }
- }
- // deepseek模型识别字段
- func ExtractDeepSeekInfo(tmp map[string]interface{}) map[string]interface{} {
- //基础信息
- tmpid := ul.BsonTOStringId(tmp["_id"])
- title := qu.ObjToString(tmp["title"])
- old_detail := getDetailText(tmp, tmpid) //获取正文文本
- //过滤信息
- if NotInProgressInfo(title, old_detail, tmp) {
- return map[string]interface{}{}
- }
- //识别结构,短文本结构,不想进行分类识别
- shorText := false
- if utf8.RuneCountInString(old_detail) < 100 {
- shorText = true
- }
- //文本格式转换
- new_detail := ul.HttpConvertToMarkdown(title + "\n" + old_detail)
- //特殊文本转换
- new_detail = CleanText(new_detail)
- //短文本判断是否有效性
- if shorText {
- if info := prompt.AcquireJudgeDeepSeekShortInfo(new_detail); info["结果"] != "是" {
- return map[string]interface{}{}
- }
- }
- //获取通用该字段
- f_info := prompt.AcquireExtractFieldDeepSeekInfo(new_detail)
- //******字段清洗******
- f_data := clean.CleanDeepSeekInfo(f_info, tmp)
- //******二级校验******
- CheckOutDeepSeekBuyerInfo(f_data)
- //******强制判断******
- ForcedLogicDecideInfo(f_data)
- return f_data
- }
// RunResetUpdateFieldInfo is currently disabled and does nothing; its
// parameters are ignored.
//
// The implementation that used to live here (kept only as commented-out code)
// worked as follows:
//
//  1. For every id in arr, load the document from collection name via
//     ul.SourceMgo.FindById and build a prompt from its "detail" field with
//     PromptFieldText.
//  2. Call ai.PostZhiPuAI with that prompt until it returns a non-empty
//     result, giving up after 5 attempts and remembering the id as failed.
//  3. Write the result (possibly empty) to collection s_name under the
//     "zhipu" key with ul.SourceMgo.UpdateById.
//  4. If any ids failed, call itself again with just those ids — i.e. retry
//     without an upper bound.
//
// NOTE(review): step 4 recurses indefinitely while the model keeps failing;
// bound the retries before re-enabling.
func RunResetUpdateFieldInfo(arr []string, name string, s_name string) {
	// Intentionally empty: see the function comment.
}
|