package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"log"
	"math/big"
	"regexp"
	"strings"

	util "app.yhyue.com/moapp/jybase/common"
	"app.yhyue.com/moapp/jybase/encrypt"
	mg "app.yhyue.com/moapp/jybase/mongodb"
	"github.com/gogf/gf/v2/util/gconv"
	"github.com/tealeg/xlsx"
)

// config mirrors the configuration structure loaded by util.ReadConfig.
type config struct {
	Mongodb struct {
		Main *mgoConf
	}
}

// mgoConf holds the connection settings of one MongoDB instance.
type mgoConf struct {
	Address         string
	Size            int
	DbName          string
	UserName        string
	Password        string
	Collection      string
	Collection_back string
}

var (
	// Sysconfig is the configuration populated in init.
	Sysconfig *config
	// MQFW is the MongoDB handle used for every query and write in this program.
	MQFW mg.MongodbSim

	// newlineRe matches one line break (LF or CRLF). It is compiled once here
	// because RemoveNewlines is called for every field of every record.
	newlineRe = regexp.MustCompile(`\r?\n`)
)

// init loads the configuration and opens the MongoDB connection pool.
func init() {
	util.ReadConfig(&Sysconfig)
	// Fail with a readable message instead of a nil-pointer panic when the
	// configuration could not be loaded or lacks the mongodb.main section.
	if Sysconfig == nil || Sysconfig.Mongodb.Main == nil {
		log.Fatal("configuration is missing the mongodb.main section")
	}
	conf := Sysconfig.Mongodb.Main
	MQFW = mg.MongodbSim{
		MongodbAddr: conf.Address,
		Size:        conf.Size,
		DbName:      conf.DbName,
		UserName:    conf.UserName,
		Password:    conf.Password,
	}
	MQFW.InitPool()
}

// xiufu is a one-off repair: for every yusuan_fileitem document of city
// "呼和浩特市" whose district is "开发区", it replaces the "/开发区/" path
// segment of file_path and sets district to the full district name.
// It is not called from main.
func xiufu() {
	sess := MQFW.GetMgoConn()
	defer MQFW.DestoryMongoConn(sess)

	query := map[string]interface{}{
		"district": "开发区",
		"city":     "呼和浩特市",
	}
	i := 0
	it := sess.DB("yusuan").C("yusuan_fileitem").Find(query).Iter()
	for tp := make(map[string]interface{}); it.Next(&tp); {
		_id := mg.BsonIdToSId(tp["_id"])
		file_path := gconv.String(tp["file_path"])
		file_path = strings.ReplaceAll(file_path, `/开发区/`, `/呼和浩特经济技术开发区/`)
		MQFW.UpdateById("yusuan_fileitem", _id, map[string]interface{}{
			"$set": map[string]interface{}{
				"file_path": file_path,
				"district":  "呼和浩特经济技术开发区",
			},
		})
		i++
		if i%100 == 0 {
			log.Println("i:", i)
		}
		// Start the next document from an empty map.
		tp = make(map[string]interface{})
	}
	log.Println(i)
}

// projectHash builds the de-duplication / lookup key of a project record:
// the four text fields, stripped of line breaks, joined with "@@".
// The year parameter is accepted for call-site symmetry but is not part of
// the key.
func projectHash(year, projectname, procure_content, kpi, institution string) string {
	projectname = RemoveNewlines(projectname)
	procure_content = RemoveNewlines(procure_content)
	kpi = RemoveNewlines(kpi)
	institution = RemoveNewlines(institution)
	return fmt.Sprintf("%s@@%s@@%s@@%s", projectname, procure_content, kpi, institution)
}

// purchasingHash builds the de-duplication / lookup key of a purchasing
// record: projectname, pro_item, number, price and institution joined with
// "@@". The price is round-tripped through float64 so that differently
// formatted numbers produce the same text. year and pro_code are not part of
// the key.
//
// NOTE(review): main passes the record's totalprice as the unitprice
// argument, matching the total-price column read from the spreadsheets.
func purchasingHash(year, projectname, pro_code, pro_item, institution, number, unitprice string) string {
	projectname = RemoveNewlines(projectname)
	pro_item = RemoveNewlines(pro_item)
	institution = RemoveNewlines(institution)
	unitprice = gconv.String(gconv.Float64(unitprice))
	return fmt.Sprintf("%s@@%s@@%s@@%s@@%s", projectname, pro_item, number, unitprice, institution)
}

// openFirstSheet opens the workbook at path and returns its first sheet.
// The program terminates with a descriptive message when the workbook cannot
// be opened or contains no sheet; previously this surfaced as a nil-pointer
// or index panic right after the error was printed.
func openFirstSheet(path string) *xlsx.Sheet {
	xlFile, err := xlsx.OpenFile(path)
	if err != nil {
		log.Fatalf("open workbook %q: %v", path, err)
	}
	if len(xlFile.Sheets) == 0 {
		log.Fatalf("workbook %q contains no sheets", path)
	}
	return xlFile.Sheets[0]
}

// cellString returns the text of the idx-th cell of row, or "" when the row
// has no such cell, so that short rows do not cause an index panic.
func cellString(row *xlsx.Row, idx int) string {
	if row == nil || idx < 0 || idx >= len(row.Cells) || row.Cells[idx] == nil {
		return ""
	}
	return row.Cells[idx].String()
}

// loadProjectRows adds one project key per data row of the first sheet of
// path to m. The first three rows are skipped; the key is built from the
// cells at zero-based indexes 7..10 (spreadsheet columns H..K).
func loadProjectRows(m map[string]bool, path string) {
	sheet := openFirstSheet(path)
	for i, row := range sheet.Rows {
		if i < 3 {
			continue
		}
		key := projectHash("",
			cellString(row, 7),  // projectname
			cellString(row, 8),  // procure_content
			cellString(row, 9),  // kpi
			cellString(row, 10), // institution
		)
		m[key] = true
	}
}

// loadPurchasingRows adds one purchasing key per data row of the first sheet
// of path to m. The first two rows are skipped; the key is built from the
// cells at zero-based indexes 7..11 (spreadsheet columns H..L). Unlike
// purchasingHash, the price text is used as written in the sheet.
// When traceInstitution is non-empty, rows of that institution are logged.
func loadPurchasingRows(m map[string]bool, path, traceInstitution string) {
	sheet := openFirstSheet(path)
	for i, row := range sheet.Rows {
		if i < 2 {
			continue
		}
		rawTotal := cellString(row, 10)
		projectname := RemoveNewlines(cellString(row, 7))
		pro_item := RemoveNewlines(cellString(row, 8))
		number := RemoveNewlines(cellString(row, 9))
		totalprice := RemoveNewlines(rawTotal)
		institution := RemoveNewlines(cellString(row, 11))

		// A non-empty institution implies the row has at least 12 cells, so
		// indexing cell 10 below is in range.
		trace := traceInstitution != "" && institution == traceInstitution
		if trace {
			log.Println(row.Cells[10])
			log.Println(rawTotal)
		}
		key := fmt.Sprintf("%s@@%s@@%s@@%s@@%s", projectname, pro_item, number, totalprice, institution)
		m[key] = true
		if trace {
			log.Println("---", key)
		}
	}
}

// initZXZ loads the "zxz" reference set: project keys from 1.xlsx..7.xlsx and
// purchasing keys from the three dated workbooks. The number of distinct keys
// is logged.
func initZXZ() map[string]bool {
	m := map[string]bool{}
	for _, f := range []string{"1.xlsx", "2.xlsx", "3.xlsx", "4.xlsx", "5.xlsx", "6.xlsx", "7.xlsx"} {
		loadProjectRows(m, f)
	}
	for _, f := range []string{"20230523_1.xlsx", "20240510_1.xlsx", "20240510_2.xlsx"} {
		loadPurchasingRows(m, f, "")
	}
	log.Println(len(m))
	return m
}

// initJCL loads the "jcl" reference set: project keys from j1.xlsx..j8.xlsx
// and purchasing keys from mx_j3.xlsx. The number of distinct keys is logged.
func initJCL() map[string]bool {
	m := map[string]bool{}
	for _, f := range []string{"j1.xlsx", "j2.xlsx", "j3.xlsx", "j4.xlsx", "j5.xlsx", "j6.xlsx", "j7.xlsx", "j8.xlsx"} {
		loadProjectRows(m, f)
	}
	// Only mx_j3.xlsx is loaded; an earlier revision also listed mx_j1.xlsx,
	// mx_j2.xlsx and mx_j4.xlsx.
	for _, f := range []string{"mx_j3.xlsx"} {
		loadPurchasingRows(m, f, "石家庄市城市更新促进中心")
	}
	log.Println(len(m))
	return m
}

// SafeDivide divides a by b using rational arithmetic on the operands scaled
// to nine decimal places, to reduce binary floating-point rounding artefacts.
// Digits beyond the ninth decimal place of either operand are truncated.
// It returns 0 when b truncates to zero at that scale, because big.Rat.Quo
// panics on a zero divisor.
func SafeDivide(a, b float64) float64 {
	const scale = 1e9
	divisor := int64(b * scale)
	if divisor == 0 {
		return 0
	}
	aRat := big.NewRat(int64(a*scale), scale)
	bRat := big.NewRat(divisor, scale)
	quotient := new(big.Rat).Quo(aRat, bRat)
	// The "exact" flag is irrelevant: the nearest float64 is what we want.
	result, _ := quotient.Float64()
	return result
}

// matchContext carries the state shared by every record handled in main.
type matchContext struct {
	target string          // collection that receives the marked records
	zxz    map[string]bool // reference keys loaded by initZXZ
	jcl    map[string]bool // reference keys loaded by initJCL
	saved  map[string]bool // keys already written during this run
}

// records dereferences a query result, treating a nil pointer as empty.
func records(p *[]map[string]interface{}) []map[string]interface{} {
	if p == nil {
		return nil
	}
	return *p
}

// store stamps the zxz/jcl flags on pv, saves it into the target collection
// and then writes the encoded id back as "eid". When the key is absent from
// the jcl set, a document with the same hash_code in huipuColl still counts
// as a jcl hit.
func (c *matchContext) store(huipuColl, hs string, pv map[string]interface{}) {
	pv["zxz"] = util.If(c.zxz[hs], "是", "否")
	pv["jcl"] = util.If(c.jcl[hs], "是", "否")
	if !c.jcl[hs] {
		hashCode := gconv.String(pv["hash_code"])
		if MQFW.Count(huipuColl, map[string]interface{}{
			"hash_code": hashCode,
		}) > 0 {
			pv["jcl"] = "是"
		}
	}
	id := MQFW.Save(c.target, pv)
	if id != "" {
		MQFW.UpdateById(c.target, id, map[string]interface{}{
			"$set": map[string]interface{}{
				"eid": encrypt.EncodeArticleId2ByCheck(id),
			},
		})
	}
}

// handleProject annotates one yusuan_project record with the match
// information and stores it unless its key was already stored. In table mode
// the record is kept only when vkey occurs in projectname, procure_content
// or kpi.
func (c *matchContext) handleProject(pv map[string]interface{}, keyWords, paragraph string, table int, vkey string, tableMode bool) {
	pv["matchkey"] = keyWords
	pv["paragraph"] = paragraph
	pv["table"] = table
	pv["type"] = "project"

	year := gconv.String(pv["year"])
	projectname := RemoveNewlines(gconv.String(pv["projectname"]))
	procureContent := gconv.String(pv["procure_content"])
	kpi := gconv.String(pv["kpi"])
	institution := RemoveNewlines(gconv.String(pv["institution"]))
	pv["projectname"] = projectname
	pv["institution"] = institution

	// Records that do not mention the keyword are skipped. (An unreachable
	// "table_jiexi" marker used to follow the skip; it never executed.)
	if tableMode && !strings.Contains(projectname, vkey) &&
		!strings.Contains(procureContent, vkey) && !strings.Contains(kpi, vkey) {
		return
	}

	hs := projectHash(year, projectname, procureContent, kpi, institution)
	if c.saved[hs] {
		return
	}
	// Debug trace for one specific source file item.
	if tableMode && gconv.String(pv["fileitem_id"]) == "660491d7138c4f04f70f5838" {
		log.Println(hs)
	}
	c.saved[hs] = true
	c.store("project_huipu", hs, pv)
}

// handlePurchasing annotates one yusuan_purchasing record with the match
// information and stores it unless its key was already stored. In table mode
// the record is kept only when vkey occurs in projectname or pro_item.
func (c *matchContext) handlePurchasing(pv map[string]interface{}, keyWords, paragraph string, table int, vkey string, tableMode bool) {
	pv["matchkey"] = keyWords
	pv["paragraph"] = paragraph
	pv["table"] = table
	pv["type"] = "purchasing"

	year := gconv.String(pv["year"])
	projectname := RemoveNewlines(gconv.String(pv["projectname"]))
	proCode := gconv.String(pv["pro_code"])
	proItem := RemoveNewlines(gconv.String(pv["pro_item"]))
	institution := RemoveNewlines(gconv.String(pv["institution"]))
	number := gconv.String(pv["number"])
	totalprice := gconv.String(pv["totalprice"])
	pv["projectname"] = projectname
	pv["pro_item"] = proItem
	pv["institution"] = institution

	// Records that do not mention the keyword are skipped. (An unreachable
	// "table_jiexi" marker used to follow the skip; it never executed.)
	if tableMode && !strings.Contains(projectname, vkey) && !strings.Contains(proItem, vkey) {
		return
	}

	hs := purchasingHash(year, projectname, proCode, proItem, institution, number, totalprice)
	if c.saved[hs] {
		return
	}
	c.saved[hs] = true
	c.store("purchasing_huipu", hs, pv)
}

// main walks every yusuan_txt document that has exists_key = 1, looks up the
// project and purchasing records parsed from the same file, and copies them
// into the target collection annotated with the matched keyword, the
// paragraph and the zxz/jcl reference flags.
//
// For each entry of a document's "detail" list:
//   - table == 0 (body-text match, no table): every record of the file is
//     taken;
//   - otherwise (table match): a record is taken only when one of the
//     comma-separated keywords occurs in its text fields.
//
// Each distinct key is stored at most once per run.
func main() {
	jclHashes := initJCL()
	ctx := &matchContext{
		target: "hp_zxl_2",
		zxz:    initZXZ(),
		jcl:    jclHashes,
		saved:  map[string]bool{},
	}

	sess := MQFW.GetMgoConn()
	defer MQFW.DestoryMongoConn(sess)

	query := map[string]interface{}{
		"exists_key": 1,
	}
	it := sess.DB("yusuan").C("yusuan_txt").Find(query).Select(map[string]interface{}{
		"detail":    1,
		"file_path": 1,
	}).Iter()

	notfind := 0
	ccc := 0
	for {
		// A fresh map per document: the early "continue" below must not leave
		// the previous document's fields behind for the next decode.
		tp := make(map[string]interface{})
		if !it.Next(&tp) {
			break
		}
		file_path := gconv.String(tp["file_path"])
		detail := gconv.Maps(tp["detail"])
		ccc++
		if ccc%100 == 0 {
			log.Println("txt 解析:", ccc)
		}

		// The second return value of Find is not used; an absent or failed
		// result is handled as "no records" below.
		projectRes, _ := MQFW.Find("yusuan_project", map[string]interface{}{
			"file_path": file_path,
		}, nil, nil, false, -1, -1)
		purchasingRes, _ := MQFW.Find("yusuan_purchasing", map[string]interface{}{
			"file_path": file_path,
		}, nil, nil, false, -1, -1)
		projects := records(projectRes)
		purchasings := records(purchasingRes)
		if len(projects) == 0 && len(purchasings) == 0 {
			// Nothing was parsed from this file: report it and move on.
			fmt.Println(file_path)
			notfind++
			continue
		}

		for _, v := range detail {
			key_words := gconv.String(v["key_words"])
			paragraph := gconv.String(v["paragraph"])
			table := gconv.Int(v["table"])

			if table == 0 {
				// Body-text match: take every record of the file.
				for _, pv := range projects {
					ctx.handleProject(pv, key_words, paragraph, table, "", false)
				}
				for _, pv := range purchasings {
					ctx.handlePurchasing(pv, key_words, paragraph, table, "", false)
				}
				continue
			}

			// Table match: keep only records mentioning one of the keywords.
			for _, vkey := range strings.Split(key_words, ",") {
				for _, pv := range projects {
					ctx.handleProject(pv, key_words, paragraph, table, vkey, true)
				}
				for _, pv := range purchasings {
					ctx.handlePurchasing(pv, key_words, paragraph, table, vkey, true)
				}
			}
		}
	}
	log.Println("未解析到数据数量:", notfind)
}

// HashCode returns the hexadecimal SHA-256 digest of input.
func HashCode(input string) string {
	sum := sha256.Sum256([]byte(input))
	return hex.EncodeToString(sum[:])
}

// RemoveNewlines returns str with every LF and CRLF line break removed.
// A carriage return that is not followed by a line feed is kept.
func RemoveNewlines(str string) string {
	return newlineRe.ReplaceAllString(str, "")
}