123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538 |
- package main
- import (
- "fmt"
- "github.com/spf13/viper"
- "github.com/xuri/excelize/v2"
- "go.uber.org/zap"
- "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
- "jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
- llog "log"
- "os"
- "regexp"
- "strconv"
- "strings"
- "time"
- )
- var (
- GF GlobalConf
- Mgo *mongodb.MongodbSim //读取公司名录 MongoDB,也是更新的链接地址
- MgoB *mongodb.MongodbSim //读取公司名录 MongoDB,也是更新的链接地址
- MgoN *mongodb.MongodbSim //
- infourls = make(map[string]bool, 0) //
- insertUrl = make(map[string]bool, 0)
- //specialChars = []string{"�", "Γ", "ΩΙ", "Δ", "Σ", "≤", "ζ", "Ψ", "®", "Φ", "ώ", "≈",
- // "’Ώ", "μ", "Λ", "Θ", "ß", "±", "Ύ", "©", "φ", "¬", "≤", "Й", "ж", "Щ", "Ъ"}
- zkMap = make(map[string]string) //重客类型
- )
- func InitConfig() (err error) {
- viper.SetConfigFile("config.toml") // 指定配置文件路径
- viper.SetConfigName("config") // 配置文件名称(无扩展名)
- viper.SetConfigType("toml") // 如果配置文件的名称中没有扩展名,则需要配置此项
- viper.AddConfigPath("./")
- viper.AddConfigPath("./conf/") // 还可以在工作目录中查找配置
- viper.AddConfigPath("../conf/") // 还可以在工作目录中查找配置
- err = viper.ReadInConfig() // 查找并读取配置文件
- if err != nil { // 处理读取配置文件的错误
- return
- }
- err = viper.Unmarshal(&GF)
- return err
- }
- func InitLog() {
- err := log.InitLog(
- //log.Path("./logs/log.out"),
- log.Path(""),
- log.Level("info"),
- log.Compress(true),
- log.MaxSize(10),
- log.MaxBackups(10),
- log.MaxAge(7),
- log.Format("json"),
- )
- if err != nil {
- fmt.Printf("InitLog failed: %v\n", err)
- }
- }
- func InitMgo() {
- Mgo = &mongodb.MongodbSim{
- MongodbAddr: GF.Mongo.Host,
- DbName: GF.Mongo.DB,
- Size: GF.Mongo.Size,
- UserName: GF.Mongo.Username,
- Password: GF.Mongo.Password,
- Direct: GF.MongoB.Direct,
- }
- Mgo.InitPool()
- MgoB = &mongodb.MongodbSim{
- MongodbAddr: GF.MongoB.Host,
- DbName: GF.MongoB.DB,
- Size: GF.MongoB.Size,
- UserName: GF.MongoB.Username,
- Password: GF.MongoB.Password,
- Direct: GF.MongoB.Direct,
- }
- MgoB.InitPool()
- MgoN = &mongodb.MongodbSim{
- MongodbAddr: GF.MongoN.Host,
- DbName: GF.MongoN.DB,
- Size: GF.MongoN.Size,
- UserName: GF.MongoN.Username,
- Password: GF.MongoN.Password,
- Direct: GF.MongoN.Direct,
- }
- MgoN.InitPool()
- }
- func main() {
- InitConfig()
- InitLog()
- InitMgo()
- //readFile() // 读文件写入数据库
- readZK() //
- exportFiles()
- select {}
- }
- // readFile 读取文件
- func readFile() {
- files := GF.Env.Files
- //ch := make(chan bool, 10)
- //wg := &sync.WaitGroup{}
- if len(files) > 0 {
- for _, file := range files {
- //ch <- true
- //wg.Add(1)
- //go func(file string) {
- // defer func() {
- // <-ch
- // wg.Done()
- // }()
- fmt.Println("开始读取文件:", file)
- _, err := os.Stat(file)
- if err != nil {
- log.Error("readFile", zap.Error(err))
- }
- f, err := excelize.OpenFile(file, excelize.Options{
- ShortDatePattern: "yyyy/mm/dd",
- })
- if err != nil {
- fmt.Println(err)
- return
- }
- defer func() {
- if err := f.Close(); err != nil {
- fmt.Println(err)
- }
- }()
- rows, err := f.GetRows("有效数据")
- if err != nil {
- fmt.Println(err)
- return
- }
- for i := 1; i < len(rows); i++ {
- if i%100 == 0 {
- log.Info("readFile", zap.Int(file+" 当前读取行数:", i))
- }
- insert := map[string]interface{}{
- "pubulishtime": strings.Replace(rows[i][0], "/", "-", -1),
- "site": rows[i][1],
- "title": rows[i][2],
- "detail": rows[i][3],
- "infourl": rows[i][4],
- "type": rows[i][5],
- "createtime": time.Now().Format("2006-01-02:15:04:05"),
- }
- if len(rows[i]) > 6 {
- insert["wtype"] = rows[i][6]
- }
- if len(rows[i]) > 7 {
- insert["rule_type"] = rows[i][7]
- }
- //大模型给的是否相关
- if len(rows[i]) > 8 {
- insert["model_ai"] = rows[i][8]
- }
- if len(rows[i]) > 9 {
- insert["model_weight"] = rows[i][9]
- }
- if len(rows[i]) > 10 {
- insert["l2"] = rows[i][10]
- }
- MgoN.Save(GF.MongoN.Coll, insert)
- }
- log.Info("readFile", zap.String(file, " over"))
- //}(file)
- }
- }
- //wg.Wait()
- fmt.Println("所有文件 读取结束")
- }
- // exportFiles 导出文件
- func exportFiles() {
- sess := Mgo.GetMgoConn()
- defer Mgo.DestoryMongoConn(sess)
- sessN := MgoN.GetMgoConn()
- defer MgoN.DestoryMongoConn(sessN)
- fmt.Println("开始数据导出")
- now := time.Now()
- targetTime := time.Date(now.Year(), now.Month(), now.Day()+GF.Env.Start, GF.Env.Shour, 0, 0, 0, now.Location())
- todayTime := time.Date(now.Year(), now.Month(), now.Day()+GF.Env.End, GF.Env.Ehour, 0, 0, 0, now.Location())
- q := map[string]interface{}{
- "comeintime": map[string]interface{}{
- "$gt": targetTime.Unix(),
- "$lte": todayTime.Unix(),
- },
- }
- log.Info("dealBidding", zap.Any("q", q))
- query := sessN.DB(GF.MongoN.DB).C(GF.MongoN.Coll).Find(nil).Select(map[string]interface{}{
- "contenthtml": 0}).Iter()
- count := 0
- //1.读取所有 已经保存交付的infourl
- for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
- infourl := util.ObjToString(tmp["infourl"])
- if infourl != "" {
- infourls[infourl] = true
- }
- if count%10000 == 0 {
- log.Info("infourl", zap.Int("current", count))
- }
- }
- log.Info("infourl", zap.Int("来源网址数量:", len(infourls)))
- //上月1号时间戳
- lastMonthTimestamp := getLastMonthFirstDayTimestamp()
- //2.官网数据
- var data = make([]map[string]interface{}, 0)
- count2 := 0
- query2 := sess.DB(GF.Mongo.DB).C(GF.Env.Coll1).Find(q).Select(map[string]interface{}{
- "contenthtml": 0}).Iter()
- for tmp := make(map[string]interface{}); query2.Next(tmp); count2++ {
- if count2%100 == 0 {
- log.Info("count2", zap.Int("current", count2))
- }
- infourl := util.ObjToString(tmp["infourl"])
- if infourls[infourl] || insertUrl[infourl] {
- continue
- } else {
- pubulishtime := util.ObjToString(tmp["pubulishtime"])
- if pubulishtime == "" {
- continue
- }
- if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
- continue
- }
- if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
- continue
- }
- //标题内容排除词
- rsa, rsb := isOutData(tmp)
- if rsa {
- continue
- } else {
- tmp["typea"] = rsb
- }
- datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
- ////发布时间,小于上月1号,直接过滤
- if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
- continue
- }
- if datetime > 0 {
- tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
- }
- tmp["wtype"] = "官网-python"
- tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
- data = append(data, tmp)
- insertUrl[infourl] = true
- }
- tmp = map[string]interface{}{}
- }
- log.Info(GF.Env.Coll1, zap.Int("数量:", count2))
- //3.百度数据
- count3 := 0
- query3 := sess.DB(GF.Mongo.DB).C(GF.Env.Coll2).Find(q).Select(map[string]interface{}{
- "contenthtml": 0}).Iter()
- for tmp := make(map[string]interface{}); query3.Next(tmp); count3++ {
- if count3%100 == 0 {
- log.Info("count3", zap.Int("current", count3))
- }
- infourl := util.ObjToString(tmp["infourl"])
- if infourls[infourl] || insertUrl[infourl] {
- continue
- } else {
- pubulishtime := util.ObjToString(tmp["pubulishtime"])
- if pubulishtime == "" {
- continue
- }
- if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
- continue
- }
- if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
- continue
- }
- //标题内容排除词
- rsa, rsb := isOutData(tmp)
- if rsa {
- continue
- } else {
- tmp["typea"] = rsb
- }
- if util.ObjToString(tmp["site"]) == "" && util.ObjToString(tmp["searchwords"]) != "" {
- res, _ := Mgo.FindOne(GF.Env.Keyword, map[string]interface{}{"key": tmp["searchwords"]})
- re := *res
- tmp["site"] = re["site"]
- }
- datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
- ////发布时间,小于上月1号,直接过滤
- if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
- continue
- }
- if datetime > 0 {
- tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
- }
- tmp["wtype"] = "百度-Python"
- tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
- data = append(data, tmp)
- insertUrl[infourl] = true
- }
- tmp = map[string]interface{}{}
- }
- log.Info(GF.Env.Coll2, zap.Int("数量:", count3))
- //官网-lua 数据bidding_yq
- if GF.Env.Collb != "" {
- sess2 := MgoB.GetMgoConn()
- defer MgoB.DestoryMongoConn(sess2)
- count4 := 0
- query4 := sess2.DB(GF.MongoB.DB).C(GF.Env.Collb).Find(q).Select(map[string]interface{}{
- "contenthtml": 0}).Iter()
- for tmp := make(map[string]interface{}); query4.Next(tmp); count4++ {
- if count4%100 == 0 {
- log.Info("count4", zap.Int("current", count4))
- }
- infourl := util.ObjToString(tmp["href"])
- if infourls[infourl] || insertUrl[infourl] {
- continue
- } else {
- if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
- continue
- }
- if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
- continue
- }
- //标题内容排除词
- rsa, rsb := isOutData(tmp)
- if rsa {
- continue
- } else {
- tmp["typea"] = rsb
- }
- datetime := util.Int64All(tmp["publishtime"])
- ////发布时间,小于上月1号,直接过滤
- if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
- continue
- }
- if datetime > 0 {
- tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
- }
- tmp["infourl"] = tmp["href"]
- tmp["wtype"] = "官网-lua"
- tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
- data = append(data, tmp)
- insertUrl[infourl] = true
- }
- tmp = map[string]interface{}{}
- }
- log.Info(GF.Env.Collb, zap.Int("数量:", count4))
- }
- //4.
- file := now.Format("20060102") + "惠普_舆情数据.xlsx"
- //var xlsx *excelize.File
- currentPwd, _ := os.Getwd()
- exportFile := fmt.Sprintf("%s/%s", currentPwd, file)
- xlsx := excelize.NewFile(excelize.Options{ShortDatePattern: "yyyy/m/dd"})
- styleOne, _ := xlsx.NewStyle(
- &excelize.Style{
- Alignment: &excelize.Alignment{
- Horizontal: "left",
- Vertical: "left",
- },
- },
- )
- line := 0
- sheet := "有效数据"
- xlsx.NewSheet(sheet)
- xlsx.DeleteSheet("Sheet1")
- _ = xlsx.SetColWidth(sheet, "A", "A", 20)
- _ = xlsx.SetColWidth(sheet, "B", "B", 25)
- _ = xlsx.SetColWidth(sheet, "C", "C", 30)
- _ = xlsx.SetColWidth(sheet, "D", "D", 35)
- _ = xlsx.SetColWidth(sheet, "E", "E", 45)
- _ = xlsx.SetColWidth(sheet, "F", "F", 20)
- subtitles := []interface{}{"发布日期", "来源单位", "标题", "正文内容", "来源网址", "重客类型", "采集方式", "规则相关度", "ai相关度", "ai权重", "L2"}
- line++
- //设置第一行title
- _ = xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &subtitles)
- //file := "20230825惠普_舆情.xlsx"
- fmt.Println("导出数据总数:-------", len(data))
- for k, _ := range data {
- llog.Println("导出数据-------", k)
- line++
- val := []interface{}{
- data[k]["pubulishtime"], data[k]["site"], data[k]["title"], data[k]["detail"], data[k]["infourl"], data[k]["zk"], data[k]["wtype"], data[k]["typea"],
- }
- //调用智普AI
- if GF.Env.Key != "" && GF.Env.Model != "" {
- res := ZpRelated(GF.Env.Key, GF.Env.Model, util.ObjToString(data[k]["title"]), util.ObjToString(data[k]["detail"]))
- //res := normalChat(util.ObjToString(data[k]["title"]), util.ObjToString(data[k]["detail"]))
- val = append(val, res["type_ai"])
- val = append(val, res["type_weight"])
- //只要在L0 名单里,就不需要返回L2,否则返回大模型识别的主体
- if zkMap[util.ObjToString(data[k]["site"])] != "" {
- val = append(val, res["name"])
- }
- }
- insert := map[string]interface{}{
- "pubulishtime": strings.Replace(util.ObjToString(data[k]["pubulishtime"]), "/", "-", -1),
- "site": data[k]["site"],
- "title": data[k]["title"],
- "detail": data[k]["detail"],
- "infourl": data[k]["infourl"],
- "type": data[k]["zk"],
- "wtype": data[k]["wtype"],
- "label": data[k]["typea"],
- "type_ai": data[k]["type_ai"],
- "type_weight": data[k]["type_weight"],
- "name": data[k]["name"],
- "createtime": time.Now().Format("2006-01-02:15:04:05"),
- }
- MgoN.Save(GF.MongoN.Coll, insert)
- err := xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &val)
- if err != nil {
- log.Debug("set sheetrow line err", zap.Any(fmt.Sprintf("%s%d", "A", line), err))
- return
- }
- _ = xlsx.SetCellStyle(sheet, fmt.Sprintf("%s%d", "A", line), "BA"+strconv.Itoa(line), styleOne)
- }
- xlsx.Path = exportFile
- xlsx.Save()
- fmt.Println("数据导出结束")
- }
- // readZK 读取重客类型
- func readZK() {
- f, err := excelize.OpenFile("重客类型.xlsx")
- if err != nil {
- fmt.Println(err)
- return
- }
- defer func() {
- if err := f.Close(); err != nil {
- fmt.Println(err)
- }
- }()
- rows, err := f.GetRows("Sheet1")
- if err != nil {
- fmt.Println(err)
- return
- }
- //
- for i := 1; i < len(rows); i++ {
- zkMap[rows[i][1]] = rows[i][2]
- }
- }
// contains reports whether data includes at least one of the substrings in
// specials. It returns false when specials is empty.
func contains(data string, specials []string) bool {
	for i := 0; i < len(specials); i++ {
		if strings.Index(data, specials[i]) >= 0 {
			return true
		}
	}
	return false
}
// dateStringPatterns are the expressions ParseDateString tries, in order.
// They are compiled once at package initialisation instead of on every call.
var dateStringPatterns = []*regexp.Regexp{
	// Four-digit year, month and day with optional 年/月/日, ".", "-" or "/"
	// separators, e.g. "2024-03-15", "2024年3月5日", "20240315".
	regexp.MustCompile(`(\d{4})[年.\-/]?(\d{1,2})[月.\-/]?(\d{1,2})日?`),
	// Two short numeric groups, e.g. "3月5日".
	regexp.MustCompile(`(\d{1,2})[月.\-/]?(\d{1,2})日?`),
}

// ParseDateString converts a date string into a Unix timestamp (seconds) at
// midnight UTC of the recognised date.
//
// The first pattern in dateStringPatterns that matches anywhere in
// dateString wins. Its first group is used as the year, the second as the
// month and the third, when present, as the day (otherwise day 1).
//
// NOTE(review): for the second pattern this means a value such as "3月5日" is
// read as year 3, month 5, day 1 rather than as a month and a day. Callers
// currently discard such results through their date-range check; confirm the
// intended meaning before changing it.
//
// It returns 0 and an error when no pattern matches.
func ParseDateString(dateString string) (int64, error) {
	for _, re := range dateStringPatterns {
		match := re.FindStringSubmatch(dateString)
		if len(match) == 0 {
			continue
		}
		// The groups consist of digits only, so Atoi cannot fail here.
		year, _ := strconv.Atoi(match[1])
		month, _ := strconv.Atoi(match[2])
		day := 1
		if len(match) >= 4 && match[3] != "" {
			day, _ = strconv.Atoi(match[3])
		}
		parsedTime := time.Date(year, time.Month(month), day, 0, 0, 0, 0, time.UTC)
		return parsedTime.Unix(), nil
	}
	return 0, fmt.Errorf("unrecognized date format")
}
|