// Command main moves public-opinion records between MongoDB collections and
// Excel workbooks: readFile imports reviewed workbooks, exportFiles collects
// new records from the crawler collections and writes them to a workbook.
package main

import (
	"fmt"
	llog "log"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/spf13/viper"
	"github.com/xuri/excelize/v2"
	"go.uber.org/zap"
	"jygit.jydev.jianyu360.cn/data_processing/common_utils"
	"jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
	"jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
)

var (
	// GF holds the configuration decoded by InitConfig.
	GF GlobalConf
	// Mgo is the connection exportFiles reads Env.Coll1, Env.Coll2 and
	// Env.Keyword from.
	Mgo *mongodb.MongodbSim
	// MgoB is the connection exportFiles reads Env.Collb from.
	MgoB *mongodb.MongodbSim
	// MgoN is the connection both readFile and exportFiles save records to.
	MgoN *mongodb.MongodbSim

	// infourls is the set of source URLs already stored in the MgoN collection.
	infourls = make(map[string]bool, 0)
	// insertUrl is the set of source URLs queued during the current export run.
	insertUrl = make(map[string]bool, 0)
	//specialChars = []string{"�", "Γ", "ΩΙ", "Δ", "Σ", "≤", "ζ", "Ψ", "®", "Φ", "ώ", "≈",
	//	"’Ώ", "μ", "Λ", "Θ", "ß", "±", "Ύ", "©", "φ", "¬", "≤", "Й", "ж", "Щ", "Ъ"}

	// zkMap maps a site name to its key-account type; it is filled by readZK.
	zkMap = make(map[string]string)
)

// Patterns used by ParseDateString, compiled once instead of on every call.
var (
	// dateYMDPattern matches year, month and day with optional separators.
	dateYMDPattern = regexp.MustCompile(`(\d{4})[年.\-/]?(\d{1,2})[月.\-/]?(\d{1,2})日?`)
	// dateMDPattern matches month and day only.
	dateMDPattern = regexp.MustCompile(`(\d{1,2})[月.\-/]?(\d{1,2})日?`)
)

// InitConfig reads the TOML configuration through viper and decodes it into GF.
// It returns a wrapped error when the file cannot be read or decoded.
func InitConfig() (err error) {
	viper.SetConfigFile("config.toml") // explicit configuration file path
	// NOTE(review): in current viper releases SetConfigName appears to clear the
	// path set by SetConfigFile, so the search paths below would be what is
	// actually used — confirm which lookup is intended.
	viper.SetConfigName("config") // configuration file name without extension
	viper.SetConfigType("toml")   // needed when the file name carries no extension
	viper.AddConfigPath("./")
	viper.AddConfigPath("./conf/")  // also look under the working directory
	viper.AddConfigPath("../conf/") // also look one level up

	if err = viper.ReadInConfig(); err != nil {
		return fmt.Errorf("read config: %w", err)
	}
	if err = viper.Unmarshal(&GF); err != nil {
		return fmt.Errorf("decode config: %w", err)
	}
	return nil
}

// InitLog configures the shared logger (info level, JSON format, empty path).
// A failure is only printed; the program keeps running.
func InitLog() {
	err := log.InitLog(
		//log.Path("./logs/log.out"),
		log.Path(""),
		log.Level("info"),
		log.Compress(true),
		log.MaxSize(10),
		log.MaxBackups(10),
		log.MaxAge(7),
		log.Format("json"),
	)
	if err != nil {
		fmt.Printf("InitLog failed: %v\n", err)
	}
}

// InitMgo builds the three MongoDB handles from GF and initialises their pools.
func InitMgo() {
	Mgo = &mongodb.MongodbSim{
		MongodbAddr: GF.Mongo.Host,
		DbName:      GF.Mongo.DB,
		Size:        GF.Mongo.Size,
		UserName:    GF.Mongo.Username,
		Password:    GF.Mongo.Password,
		// NOTE(review): this reads the Direct flag of MongoB, not of Mongo —
		// looks like a copy-paste slip; confirm against GlobalConf before changing.
		Direct: GF.MongoB.Direct,
	}
	Mgo.InitPool()

	MgoB = &mongodb.MongodbSim{
		MongodbAddr: GF.MongoB.Host,
		DbName:      GF.MongoB.DB,
		Size:        GF.MongoB.Size,
		UserName:    GF.MongoB.Username,
		Password:    GF.MongoB.Password,
		Direct:      GF.MongoB.Direct,
	}
	MgoB.InitPool()

	MgoN = &mongodb.MongodbSim{
		MongodbAddr: GF.MongoN.Host,
		DbName:      GF.MongoN.DB,
		Size:        GF.MongoN.Size,
		UserName:    GF.MongoN.Username,
		Password:    GF.MongoN.Password,
		Direct:      GF.MongoN.Direct,
	}
	MgoN.InitPool()
}

// main loads the configuration, sets up logging and MongoDB, then blocks.
// The actual jobs are currently disabled below.
func main() {
	// Without a configuration every later step would run on zero values.
	if err := InitConfig(); err != nil {
		llog.Fatalf("InitConfig failed: %v", err)
	}
	InitLog()
	InitMgo()
	//readFile() // read workbook files and write them into the database
	//readZK()
	//exportFiles()
	fmt.Println("44444")
	select {}
}

// readFile imports every workbook listed in GF.Env.Files into the MgoN
// collection. A workbook that cannot be read is logged and skipped so the
// remaining files are still processed.
func readFile() {
	for _, file := range GF.Env.Files {
		fmt.Println("开始读取文件:", file)
		if err := importExcelFile(file); err != nil {
			log.Error("readFile", zap.String("file", file), zap.Error(err))
			continue
		}
		log.Info("readFile", zap.String(file, " over"))
	}
	fmt.Println("所有文件 读取结束")
}

// importExcelFile saves each data row of the "有效数据" sheet of file to the
// MgoN collection. The first row is treated as the header and skipped.
// Handling one file per call lets the deferred Close run as soon as that file
// is done instead of at the end of the whole import.
func importExcelFile(file string) error {
	f, err := excelize.OpenFile(file, excelize.Options{
		ShortDatePattern: "yyyy/mm/dd",
	})
	if err != nil {
		return fmt.Errorf("open %q: %w", file, err)
	}
	defer func() {
		if err := f.Close(); err != nil {
			fmt.Println(err)
		}
	}()

	rows, err := f.GetRows("有效数据")
	if err != nil {
		return fmt.Errorf("read rows of %q: %w", file, err)
	}

	// Columns after the six mandatory ones, in sheet order; the third one is
	// the relevance reported by the language model.
	optional := []string{"wtype", "rule_type", "model_ai", "model_weight", "l2"}

	for i := 1; i < len(rows); i++ {
		if i%100 == 0 {
			log.Info("readFile", zap.Int(file+" 当前读取行数:", i))
		}
		row := rows[i]
		if len(row) == 0 {
			continue // nothing to import from an empty row
		}
		insert := map[string]interface{}{
			"pubulishtime": strings.Replace(cellAt(row, 0), "/", "-", -1),
			"site":         cellAt(row, 1),
			"title":        cellAt(row, 2),
			"detail":       cellAt(row, 3),
			"infourl":      cellAt(row, 4),
			"type":         cellAt(row, 5),
			"createtime":   time.Now().Format("2006-01-02:15:04:05"),
		}
		for j, key := range optional {
			if idx := 6 + j; len(row) > idx {
				insert[key] = row[idx]
			}
		}
		MgoN.Save(GF.MongoN.Coll, insert)
	}
	return nil
}

// cellAt returns row[i], or "" when the row is shorter than that. Rows can be
// shorter than the header, so direct indexing may panic.
func cellAt(row []string, i int) string {
	if i < 0 || i >= len(row) {
		return ""
	}
	return row[i]
}

// keepForExport applies the content filters shared by all export sources:
// title and detail must be non-empty, detail must not contain any of
// GF.Env.Specials, and isOutData must not exclude the record. When the record
// is kept, the second result of isOutData is stored under "typea".
func keepForExport(doc map[string]interface{}) bool {
	detail := util.ObjToString(doc["detail"])
	if util.ObjToString(doc["title"]) == "" || detail == "" {
		return false
	}
	if contains(detail, GF.Env.Specials) {
		return false
	}
	// title/content exclusion words
	excluded, label := isOutData(doc)
	if excluded {
		return false
	}
	doc["typea"] = label
	return true
}

// exportFiles collects records inside the configured comeintime window from
// Env.Coll1, Env.Coll2 and (when set) Env.Collb, drops records whose URL is
// already stored in the MgoN collection or was queued earlier in this run,
// saves every remaining record to the MgoN collection and writes it as a row
// of a dated workbook in the current working directory.
func exportFiles() {
	sess := Mgo.GetMgoConn()
	defer Mgo.DestoryMongoConn(sess)
	sessN := MgoN.GetMgoConn()
	defer MgoN.DestoryMongoConn(sessN)
	fmt.Println("开始数据导出")

	now := time.Now()
	targetTime := time.Date(now.Year(), now.Month(), now.Day()+GF.Env.Start, GF.Env.Shour, 0, 0, 0, now.Location())
	todayTime := time.Date(now.Year(), now.Month(), now.Day()+GF.Env.End, GF.Env.Ehour, 0, 0, 0, now.Location())
	q := map[string]interface{}{
		"comeintime": map[string]interface{}{
			"$gt":  targetTime.Unix(),
			"$lte": todayTime.Unix(),
		},
	}
	log.Info("dealBidding", zap.Any("q", q))

	// 1. Load every infourl that has already been saved.
	query := sessN.DB(GF.MongoN.DB).C(GF.MongoN.Coll).Find(nil).Select(map[string]interface{}{
		"contenthtml": 0}).Iter()
	count := 0
	for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
		if infourl := util.ObjToString(tmp["infourl"]); infourl != "" {
			infourls[infourl] = true
		}
		if count%10000 == 0 {
			log.Info("infourl", zap.Int("current", count))
		}
	}
	log.Info("infourl", zap.Int("来源网址数量:", len(infourls)))

	// Timestamp of the first day of last month; older records are dropped.
	lastMonthTimestamp := getLastMonthFirstDayTimestamp()

	// 2. Official-site records.
	var data = make([]map[string]interface{}, 0)
	count2 := 0
	query2 := sess.DB(GF.Mongo.DB).C(GF.Env.Coll1).Find(q).Select(map[string]interface{}{
		"contenthtml": 0}).Iter()
	for tmp := make(map[string]interface{}); query2.Next(tmp); count2++ {
		if count2%100 == 0 {
			log.Info("count2", zap.Int("current", count2))
		}
		infourl := util.ObjToString(tmp["infourl"])
		if infourls[infourl] || insertUrl[infourl] {
			continue
		}
		if util.ObjToString(tmp["pubulishtime"]) == "" {
			continue
		}
		if !keepForExport(tmp) {
			continue
		}
		// An unparsable date yields 0 and is dropped by the window check below.
		datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
		// Published before the first of last month, or in the future: skip.
		if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
			continue
		}
		if datetime > 0 {
			tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
		}
		tmp["wtype"] = "官网-python"
		tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] // key-account type
		data = append(data, tmp)
		insertUrl[infourl] = true
		// tmp is now owned by data; give the iterator a fresh map.
		tmp = map[string]interface{}{}
	}
	log.Info(GF.Env.Coll1, zap.Int("数量:", count2))

	// 3. Baidu records.
	count3 := 0
	query3 := sess.DB(GF.Mongo.DB).C(GF.Env.Coll2).Find(q).Select(map[string]interface{}{
		"contenthtml": 0}).Iter()
	for tmp := make(map[string]interface{}); query3.Next(tmp); count3++ {
		if count3%100 == 0 {
			log.Info("count3", zap.Int("current", count3))
		}
		infourl := util.ObjToString(tmp["infourl"])
		if infourls[infourl] || insertUrl[infourl] {
			continue
		}
		if util.ObjToString(tmp["pubulishtime"]) == "" {
			continue
		}
		if !keepForExport(tmp) {
			continue
		}
		// Fill in a missing site from the keyword collection.
		if util.ObjToString(tmp["site"]) == "" && util.ObjToString(tmp["searchwords"]) != "" {
			res, _ := Mgo.FindOne(GF.Env.Keyword, map[string]interface{}{"key": tmp["searchwords"]})
			// Guard against a nil result instead of dereferencing it blindly.
			if res != nil {
				tmp["site"] = (*res)["site"]
			}
		}
		datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
		// Published before the first of last month, or in the future: skip.
		if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
			continue
		}
		if datetime > 0 {
			tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
		}
		tmp["wtype"] = "百度-Python"
		tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] // key-account type
		data = append(data, tmp)
		insertUrl[infourl] = true
		tmp = map[string]interface{}{}
	}
	log.Info(GF.Env.Coll2, zap.Int("数量:", count3))

	// Official-site (lua) records; these use "href" and a numeric "publishtime".
	if GF.Env.Collb != "" {
		sess2 := MgoB.GetMgoConn()
		defer MgoB.DestoryMongoConn(sess2)
		count4 := 0
		query4 := sess2.DB(GF.MongoB.DB).C(GF.Env.Collb).Find(q).Select(map[string]interface{}{
			"contenthtml": 0}).Iter()
		for tmp := make(map[string]interface{}); query4.Next(tmp); count4++ {
			if count4%100 == 0 {
				log.Info("count4", zap.Int("current", count4))
			}
			infourl := util.ObjToString(tmp["href"])
			if infourls[infourl] || insertUrl[infourl] {
				continue
			}
			if !keepForExport(tmp) {
				continue
			}
			datetime := util.Int64All(tmp["publishtime"])
			// Published before the first of last month, or in the future: skip.
			if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
				continue
			}
			if datetime > 0 {
				tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
			}
			tmp["infourl"] = tmp["href"]
			tmp["wtype"] = "官网-lua"
			tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] // key-account type
			data = append(data, tmp)
			insertUrl[infourl] = true
			tmp = map[string]interface{}{}
		}
		log.Info(GF.Env.Collb, zap.Int("数量:", count4))
	}

	// 4. Write the workbook.
	file := now.Format("20060102") + "惠普_舆情数据.xlsx"
	exportFile := file
	if currentPwd, err := os.Getwd(); err != nil {
		// Fall back to a path relative to the working directory.
		log.Error("exportFiles", zap.Error(err))
	} else {
		exportFile = filepath.Join(currentPwd, file)
	}

	xlsx := excelize.NewFile(excelize.Options{ShortDatePattern: "yyyy/m/dd"})
	styleOne, _ := xlsx.NewStyle(
		&excelize.Style{
			Alignment: &excelize.Alignment{
				Horizontal: "left",
				Vertical:   "left",
			},
		},
	)
	line := 0
	sheet := "有效数据"
	xlsx.NewSheet(sheet)
	xlsx.DeleteSheet("Sheet1")
	_ = xlsx.SetColWidth(sheet, "A", "A", 20)
	_ = xlsx.SetColWidth(sheet, "B", "B", 25)
	_ = xlsx.SetColWidth(sheet, "C", "C", 30)
	_ = xlsx.SetColWidth(sheet, "D", "D", 35)
	_ = xlsx.SetColWidth(sheet, "E", "E", 45)
	_ = xlsx.SetColWidth(sheet, "F", "F", 20)
	subtitles := []interface{}{"发布日期", "来源单位", "标题", "正文内容", "来源网址", "重客类型", "采集方式", "规则相关度", "ai相关度", "ai权重", "L2"}
	line++
	// The first row is the header.
	_ = xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &subtitles)
	//file := "20230825惠普_舆情.xlsx"
	fmt.Println("导出数据总数:-------", len(data))

	for k, row := range data {
		llog.Println("导出数据-------", k)
		line++
		cell := fmt.Sprintf("%s%d", "A", line)
		val := []interface{}{
			row["pubulishtime"],
			row["site"],
			row["title"],
			row["detail"],
			row["infourl"],
			row["zk"],
			row["wtype"],
			row["typea"],
		}

		// Values persisted with the record. They default to whatever the record
		// already carries and are replaced by the model output when the model is
		// called; nothing visible in this file stores model results on the
		// record itself.
		typeAI, typeWeight, name := row["type_ai"], row["type_weight"], row["name"]

		// Call the Zhipu AI model when it is configured.
		if GF.Env.Key != "" && GF.Env.Model != "" {
			res := ZpRelated(GF.Env.Key, GF.Env.Model, util.ObjToString(row["title"]), util.ObjToString(row["detail"]))
			//res := normalChat(util.ObjToString(data[k]["title"]), util.ObjToString(data[k]["detail"]))
			typeAI, typeWeight = res["type_ai"], res["type_weight"]
			val = append(val, typeAI, typeWeight)
			// Original note: a site on the L0 list needs no L2 value; otherwise
			// the subject recognised by the model is returned.
			// NOTE(review): the condition below adds the name when the site IS in
			// zkMap, which reads as the opposite of that note — confirm.
			if zkMap[util.ObjToString(row["site"])] != "" {
				name = res["name"]
				val = append(val, name)
			}
		}

		insert := map[string]interface{}{
			"pubulishtime": strings.Replace(util.ObjToString(row["pubulishtime"]), "/", "-", -1),
			"site":         row["site"],
			"title":        row["title"],
			"detail":       row["detail"],
			"infourl":      row["infourl"],
			"type":         row["zk"],
			"wtype":        row["wtype"],
			"label":        row["typea"],
			"type_ai":      typeAI,
			"type_weight":  typeWeight,
			"name":         name,
			"createtime":   time.Now().Format("2006-01-02:15:04:05"),
		}
		MgoN.Save(GF.MongoN.Coll, insert)

		if err := xlsx.SetSheetRow(sheet, cell, &val); err != nil {
			// Logged as an error: the logger is set to info, so a debug entry
			// would never be written.
			log.Error("set sheetrow line err", zap.Any(cell, err))
			return
		}
		_ = xlsx.SetCellStyle(sheet, cell, "BA"+strconv.Itoa(line), styleOne)
	}

	xlsx.Path = exportFile
	if err := xlsx.Save(); err != nil {
		log.Error("exportFiles", zap.String("file", exportFile), zap.Error(err))
		return
	}
	fmt.Println("数据导出结束")
}

// readZK loads the key-account types from "重客类型.xlsx" into zkMap, using the
// second column of Sheet1 as key and the third as value. The first row is
// treated as the header.
func readZK() {
	f, err := excelize.OpenFile("重客类型.xlsx")
	if err != nil {
		fmt.Println(err)
		return
	}
	defer func() {
		if err := f.Close(); err != nil {
			fmt.Println(err)
		}
	}()

	rows, err := f.GetRows("Sheet1")
	if err != nil {
		fmt.Println(err)
		return
	}
	for i := 1; i < len(rows); i++ {
		// Rows without both columns would panic on indexing; skip them.
		if len(rows[i]) < 3 {
			continue
		}
		zkMap[rows[i][1]] = rows[i][2]
	}
}

// contains reports whether data contains at least one of the strings in specials.
func contains(data string, specials []string) bool {
	for _, v := range specials {
		if strings.Contains(data, v) {
			return true
		}
	}
	return false
}

// ParseDateString extracts a date from dateString and returns it as a Unix
// timestamp at 00:00:00 UTC.
//
// Two shapes are recognised anywhere in the string: year-month-day and
// month-day, with 年/月/日, '.', '-' or '/' as optional separators. A month-day
// date is placed in the current year. An error is returned when no date is
// found or when the month or day is out of range.
func ParseDateString(dateString string) (int64, error) {
	var year, month, day int

	// Atoi cannot fail below: the patterns only capture short runs of digits.
	if m := dateYMDPattern.FindStringSubmatch(dateString); m != nil {
		year, _ = strconv.Atoi(m[1])
		month, _ = strconv.Atoi(m[2])
		day, _ = strconv.Atoi(m[3])
	} else if m := dateMDPattern.FindStringSubmatch(dateString); m != nil {
		// This pattern has two groups only: month and day.
		year = time.Now().Year()
		month, _ = strconv.Atoi(m[1])
		day, _ = strconv.Atoi(m[2])
	} else {
		return 0, fmt.Errorf("unrecognized date format")
	}

	// Reject values time.Date would otherwise silently roll over.
	if month < 1 || month > 12 || day < 1 || day > 31 {
		return 0, fmt.Errorf("date out of range in %q", dateString)
	}
	parsedTime := time.Date(year, time.Month(month), day, 0, 0, 0, 0, time.UTC)
	return parsedTime.Unix(), nil
}