main.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. package main
  2. import (
  3. "fmt"
  4. "github.com/spf13/viper"
  5. "github.com/xuri/excelize/v2"
  6. "go.uber.org/zap"
  7. "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
  9. "jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
  10. llog "log"
  11. "os"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "time"
  16. )
  17. var (
  18. GF GlobalConf
  19. Mgo *mongodb.MongodbSim //读取公司名录 MongoDB,也是更新的链接地址
  20. MgoB *mongodb.MongodbSim //读取公司名录 MongoDB,也是更新的链接地址
  21. MgoN *mongodb.MongodbSim //
  22. infourls = make(map[string]bool, 0) //
  23. insertUrl = make(map[string]bool, 0)
  24. //specialChars = []string{"�", "Γ", "ΩΙ", "Δ", "Σ", "≤", "ζ", "Ψ", "®", "Φ", "ώ", "≈",
  25. // "’Ώ", "μ", "Λ", "Θ", "ß", "±", "Ύ", "©", "φ", "¬", "≤", "Й", "ж", "Щ", "Ъ"}
  26. zkMap = make(map[string]string) //重客类型
  27. )
  28. func InitConfig() (err error) {
  29. viper.SetConfigFile("config.toml") // 指定配置文件路径
  30. viper.SetConfigName("config") // 配置文件名称(无扩展名)
  31. viper.SetConfigType("toml") // 如果配置文件的名称中没有扩展名,则需要配置此项
  32. viper.AddConfigPath("./")
  33. viper.AddConfigPath("./conf/") // 还可以在工作目录中查找配置
  34. viper.AddConfigPath("../conf/") // 还可以在工作目录中查找配置
  35. err = viper.ReadInConfig() // 查找并读取配置文件
  36. if err != nil { // 处理读取配置文件的错误
  37. return
  38. }
  39. err = viper.Unmarshal(&GF)
  40. return err
  41. }
  42. func InitLog() {
  43. err := log.InitLog(
  44. //log.Path("./logs/log.out"),
  45. log.Path(""),
  46. log.Level("info"),
  47. log.Compress(true),
  48. log.MaxSize(10),
  49. log.MaxBackups(10),
  50. log.MaxAge(7),
  51. log.Format("json"),
  52. )
  53. if err != nil {
  54. fmt.Printf("InitLog failed: %v\n", err)
  55. }
  56. }
  57. func InitMgo() {
  58. Mgo = &mongodb.MongodbSim{
  59. MongodbAddr: GF.Mongo.Host,
  60. DbName: GF.Mongo.DB,
  61. Size: GF.Mongo.Size,
  62. UserName: GF.Mongo.Username,
  63. Password: GF.Mongo.Password,
  64. Direct: GF.MongoB.Direct,
  65. }
  66. Mgo.InitPool()
  67. MgoB = &mongodb.MongodbSim{
  68. MongodbAddr: GF.MongoB.Host,
  69. DbName: GF.MongoB.DB,
  70. Size: GF.MongoB.Size,
  71. UserName: GF.MongoB.Username,
  72. Password: GF.MongoB.Password,
  73. Direct: GF.MongoB.Direct,
  74. }
  75. MgoB.InitPool()
  76. MgoN = &mongodb.MongodbSim{
  77. MongodbAddr: GF.MongoN.Host,
  78. DbName: GF.MongoN.DB,
  79. Size: GF.MongoN.Size,
  80. UserName: GF.MongoN.Username,
  81. Password: GF.MongoN.Password,
  82. Direct: GF.MongoN.Direct,
  83. }
  84. MgoN.InitPool()
  85. }
  86. func main() {
  87. InitConfig()
  88. InitLog()
  89. InitMgo()
  90. //readFile() // 读文件写入数据库
  91. //readZK() //
  92. //exportFiles()
  93. fmt.Println("44444")
  94. select {}
  95. }
  96. // readFile 读取文件
  97. func readFile() {
  98. files := GF.Env.Files
  99. //ch := make(chan bool, 10)
  100. //wg := &sync.WaitGroup{}
  101. if len(files) > 0 {
  102. for _, file := range files {
  103. //ch <- true
  104. //wg.Add(1)
  105. //go func(file string) {
  106. // defer func() {
  107. // <-ch
  108. // wg.Done()
  109. // }()
  110. fmt.Println("开始读取文件:", file)
  111. _, err := os.Stat(file)
  112. if err != nil {
  113. log.Error("readFile", zap.Error(err))
  114. }
  115. f, err := excelize.OpenFile(file, excelize.Options{
  116. ShortDatePattern: "yyyy/mm/dd",
  117. })
  118. if err != nil {
  119. fmt.Println(err)
  120. return
  121. }
  122. defer func() {
  123. if err := f.Close(); err != nil {
  124. fmt.Println(err)
  125. }
  126. }()
  127. rows, err := f.GetRows("有效数据")
  128. if err != nil {
  129. fmt.Println(err)
  130. return
  131. }
  132. for i := 1; i < len(rows); i++ {
  133. if i%100 == 0 {
  134. log.Info("readFile", zap.Int(file+" 当前读取行数:", i))
  135. }
  136. insert := map[string]interface{}{
  137. "pubulishtime": strings.Replace(rows[i][0], "/", "-", -1),
  138. "site": rows[i][1],
  139. "title": rows[i][2],
  140. "detail": rows[i][3],
  141. "infourl": rows[i][4],
  142. "type": rows[i][5],
  143. "createtime": time.Now().Format("2006-01-02:15:04:05"),
  144. }
  145. if len(rows[i]) > 6 {
  146. insert["wtype"] = rows[i][6]
  147. }
  148. if len(rows[i]) > 7 {
  149. insert["rule_type"] = rows[i][7]
  150. }
  151. //大模型给的是否相关
  152. if len(rows[i]) > 8 {
  153. insert["model_ai"] = rows[i][8]
  154. }
  155. if len(rows[i]) > 9 {
  156. insert["model_weight"] = rows[i][9]
  157. }
  158. if len(rows[i]) > 10 {
  159. insert["l2"] = rows[i][10]
  160. }
  161. MgoN.Save(GF.MongoN.Coll, insert)
  162. }
  163. log.Info("readFile", zap.String(file, " over"))
  164. //}(file)
  165. }
  166. }
  167. //wg.Wait()
  168. fmt.Println("所有文件 读取结束")
  169. }
  170. // exportFiles 导出文件
  171. func exportFiles() {
  172. sess := Mgo.GetMgoConn()
  173. defer Mgo.DestoryMongoConn(sess)
  174. sessN := MgoN.GetMgoConn()
  175. defer MgoN.DestoryMongoConn(sessN)
  176. fmt.Println("开始数据导出")
  177. now := time.Now()
  178. targetTime := time.Date(now.Year(), now.Month(), now.Day()+GF.Env.Start, GF.Env.Shour, 0, 0, 0, now.Location())
  179. todayTime := time.Date(now.Year(), now.Month(), now.Day()+GF.Env.End, GF.Env.Ehour, 0, 0, 0, now.Location())
  180. q := map[string]interface{}{
  181. "comeintime": map[string]interface{}{
  182. "$gt": targetTime.Unix(),
  183. "$lte": todayTime.Unix(),
  184. },
  185. }
  186. log.Info("dealBidding", zap.Any("q", q))
  187. query := sessN.DB(GF.MongoN.DB).C(GF.MongoN.Coll).Find(nil).Select(map[string]interface{}{
  188. "contenthtml": 0}).Iter()
  189. count := 0
  190. //1.读取所有 已经保存交付的infourl
  191. for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
  192. infourl := util.ObjToString(tmp["infourl"])
  193. if infourl != "" {
  194. infourls[infourl] = true
  195. }
  196. if count%10000 == 0 {
  197. log.Info("infourl", zap.Int("current", count))
  198. }
  199. }
  200. log.Info("infourl", zap.Int("来源网址数量:", len(infourls)))
  201. //上月1号时间戳
  202. lastMonthTimestamp := getLastMonthFirstDayTimestamp()
  203. //2.官网数据
  204. var data = make([]map[string]interface{}, 0)
  205. count2 := 0
  206. query2 := sess.DB(GF.Mongo.DB).C(GF.Env.Coll1).Find(q).Select(map[string]interface{}{
  207. "contenthtml": 0}).Iter()
  208. for tmp := make(map[string]interface{}); query2.Next(tmp); count2++ {
  209. if count2%100 == 0 {
  210. log.Info("count2", zap.Int("current", count2))
  211. }
  212. infourl := util.ObjToString(tmp["infourl"])
  213. if infourls[infourl] || insertUrl[infourl] {
  214. continue
  215. } else {
  216. pubulishtime := util.ObjToString(tmp["pubulishtime"])
  217. if pubulishtime == "" {
  218. continue
  219. }
  220. if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
  221. continue
  222. }
  223. if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
  224. continue
  225. }
  226. //标题内容排除词
  227. rsa, rsb := isOutData(tmp)
  228. if rsa {
  229. continue
  230. } else {
  231. tmp["typea"] = rsb
  232. }
  233. datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
  234. ////发布时间,小于上月1号,直接过滤
  235. if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
  236. continue
  237. }
  238. if datetime > 0 {
  239. tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
  240. }
  241. tmp["wtype"] = "官网-python"
  242. tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
  243. data = append(data, tmp)
  244. insertUrl[infourl] = true
  245. }
  246. tmp = map[string]interface{}{}
  247. }
  248. log.Info(GF.Env.Coll1, zap.Int("数量:", count2))
  249. //3.百度数据
  250. count3 := 0
  251. query3 := sess.DB(GF.Mongo.DB).C(GF.Env.Coll2).Find(q).Select(map[string]interface{}{
  252. "contenthtml": 0}).Iter()
  253. for tmp := make(map[string]interface{}); query3.Next(tmp); count3++ {
  254. if count3%100 == 0 {
  255. log.Info("count3", zap.Int("current", count3))
  256. }
  257. infourl := util.ObjToString(tmp["infourl"])
  258. if infourls[infourl] || insertUrl[infourl] {
  259. continue
  260. } else {
  261. pubulishtime := util.ObjToString(tmp["pubulishtime"])
  262. if pubulishtime == "" {
  263. continue
  264. }
  265. if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
  266. continue
  267. }
  268. if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
  269. continue
  270. }
  271. //标题内容排除词
  272. rsa, rsb := isOutData(tmp)
  273. if rsa {
  274. continue
  275. } else {
  276. tmp["typea"] = rsb
  277. }
  278. if util.ObjToString(tmp["site"]) == "" && util.ObjToString(tmp["searchwords"]) != "" {
  279. res, _ := Mgo.FindOne(GF.Env.Keyword, map[string]interface{}{"key": tmp["searchwords"]})
  280. re := *res
  281. tmp["site"] = re["site"]
  282. }
  283. datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
  284. ////发布时间,小于上月1号,直接过滤
  285. if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
  286. continue
  287. }
  288. if datetime > 0 {
  289. tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
  290. }
  291. tmp["wtype"] = "百度-Python"
  292. tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
  293. data = append(data, tmp)
  294. insertUrl[infourl] = true
  295. }
  296. tmp = map[string]interface{}{}
  297. }
  298. log.Info(GF.Env.Coll2, zap.Int("数量:", count3))
  299. //官网-lua 数据bidding_yq
  300. if GF.Env.Collb != "" {
  301. sess2 := MgoB.GetMgoConn()
  302. defer MgoB.DestoryMongoConn(sess2)
  303. count4 := 0
  304. query4 := sess2.DB(GF.MongoB.DB).C(GF.Env.Collb).Find(q).Select(map[string]interface{}{
  305. "contenthtml": 0}).Iter()
  306. for tmp := make(map[string]interface{}); query4.Next(tmp); count4++ {
  307. if count4%100 == 0 {
  308. log.Info("count4", zap.Int("current", count4))
  309. }
  310. infourl := util.ObjToString(tmp["href"])
  311. if infourls[infourl] || insertUrl[infourl] {
  312. continue
  313. } else {
  314. if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
  315. continue
  316. }
  317. if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
  318. continue
  319. }
  320. //标题内容排除词
  321. rsa, rsb := isOutData(tmp)
  322. if rsa {
  323. continue
  324. } else {
  325. tmp["typea"] = rsb
  326. }
  327. datetime := util.Int64All(tmp["publishtime"])
  328. ////发布时间,小于上月1号,直接过滤
  329. if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
  330. continue
  331. }
  332. if datetime > 0 {
  333. tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
  334. }
  335. tmp["infourl"] = tmp["href"]
  336. tmp["wtype"] = "官网-lua"
  337. tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
  338. data = append(data, tmp)
  339. insertUrl[infourl] = true
  340. }
  341. tmp = map[string]interface{}{}
  342. }
  343. log.Info(GF.Env.Collb, zap.Int("数量:", count4))
  344. }
  345. //4.
  346. file := now.Format("20060102") + "惠普_舆情数据.xlsx"
  347. //var xlsx *excelize.File
  348. currentPwd, _ := os.Getwd()
  349. exportFile := fmt.Sprintf("%s/%s", currentPwd, file)
  350. xlsx := excelize.NewFile(excelize.Options{ShortDatePattern: "yyyy/m/dd"})
  351. styleOne, _ := xlsx.NewStyle(
  352. &excelize.Style{
  353. Alignment: &excelize.Alignment{
  354. Horizontal: "left",
  355. Vertical: "left",
  356. },
  357. },
  358. )
  359. line := 0
  360. sheet := "有效数据"
  361. xlsx.NewSheet(sheet)
  362. xlsx.DeleteSheet("Sheet1")
  363. _ = xlsx.SetColWidth(sheet, "A", "A", 20)
  364. _ = xlsx.SetColWidth(sheet, "B", "B", 25)
  365. _ = xlsx.SetColWidth(sheet, "C", "C", 30)
  366. _ = xlsx.SetColWidth(sheet, "D", "D", 35)
  367. _ = xlsx.SetColWidth(sheet, "E", "E", 45)
  368. _ = xlsx.SetColWidth(sheet, "F", "F", 20)
  369. subtitles := []interface{}{"发布日期", "来源单位", "标题", "正文内容", "来源网址", "重客类型", "采集方式", "规则相关度", "ai相关度", "ai权重", "L2"}
  370. line++
  371. //设置第一行title
  372. _ = xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &subtitles)
  373. //file := "20230825惠普_舆情.xlsx"
  374. fmt.Println("导出数据总数:-------", len(data))
  375. for k, _ := range data {
  376. llog.Println("导出数据-------", k)
  377. line++
  378. val := []interface{}{
  379. data[k]["pubulishtime"], data[k]["site"], data[k]["title"], data[k]["detail"], data[k]["infourl"], data[k]["zk"], data[k]["wtype"], data[k]["typea"],
  380. }
  381. //调用智普AI
  382. if GF.Env.Key != "" && GF.Env.Model != "" {
  383. res := ZpRelated(GF.Env.Key, GF.Env.Model, util.ObjToString(data[k]["title"]), util.ObjToString(data[k]["detail"]))
  384. //res := normalChat(util.ObjToString(data[k]["title"]), util.ObjToString(data[k]["detail"]))
  385. val = append(val, res["type_ai"])
  386. val = append(val, res["type_weight"])
  387. //只要在L0 名单里,就不需要返回L2,否则返回大模型识别的主体
  388. if zkMap[util.ObjToString(data[k]["site"])] != "" {
  389. val = append(val, res["name"])
  390. }
  391. }
  392. insert := map[string]interface{}{
  393. "pubulishtime": strings.Replace(util.ObjToString(data[k]["pubulishtime"]), "/", "-", -1),
  394. "site": data[k]["site"],
  395. "title": data[k]["title"],
  396. "detail": data[k]["detail"],
  397. "infourl": data[k]["infourl"],
  398. "type": data[k]["zk"],
  399. "wtype": data[k]["wtype"],
  400. "label": data[k]["typea"],
  401. "type_ai": data[k]["type_ai"],
  402. "type_weight": data[k]["type_weight"],
  403. "name": data[k]["name"],
  404. "createtime": time.Now().Format("2006-01-02:15:04:05"),
  405. }
  406. MgoN.Save(GF.MongoN.Coll, insert)
  407. err := xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &val)
  408. if err != nil {
  409. log.Debug("set sheetrow line err", zap.Any(fmt.Sprintf("%s%d", "A", line), err))
  410. return
  411. }
  412. _ = xlsx.SetCellStyle(sheet, fmt.Sprintf("%s%d", "A", line), "BA"+strconv.Itoa(line), styleOne)
  413. }
  414. xlsx.Path = exportFile
  415. xlsx.Save()
  416. fmt.Println("数据导出结束")
  417. }
  418. // readZK 读取重客类型
  419. func readZK() {
  420. f, err := excelize.OpenFile("重客类型.xlsx")
  421. if err != nil {
  422. fmt.Println(err)
  423. return
  424. }
  425. defer func() {
  426. if err := f.Close(); err != nil {
  427. fmt.Println(err)
  428. }
  429. }()
  430. rows, err := f.GetRows("Sheet1")
  431. if err != nil {
  432. fmt.Println(err)
  433. return
  434. }
  435. //
  436. for i := 1; i < len(rows); i++ {
  437. zkMap[rows[i][1]] = rows[i][2]
  438. }
  439. }
  // contains reports whether data includes at least one of the substrings
  // listed in specials. An empty or nil list never matches.
  func contains(data string, specials []string) bool {
  	for idx := 0; idx < len(specials); idx++ {
  		if strings.Index(data, specials[idx]) >= 0 {
  			return true
  		}
  	}
  	return false
  }
  // Date patterns used by ParseDateString, compiled once instead of per call.
  var (
  	// fullDatePattern matches year, month and day, each optionally followed by
  	// 年/月/日 or one of ". - /" (so "20230506" matches as well).
  	fullDatePattern = regexp.MustCompile(`(\d{4})[年.\-/]?(\d{1,2})[月.\-/]?(\d{1,2})日?`)
  	// monthDayPattern matches a month and a day without a year.
  	monthDayPattern = regexp.MustCompile(`(\d{1,2})[月.\-/]?(\d{1,2})日?`)
  )

  // ParseDateString converts the first date found in dateString into a Unix
  // timestamp (seconds) at 00:00:00 UTC of that day.
  //
  // Two shapes are recognised, tried in this order:
  //   - year-month-day, e.g. "2023-05-06", "2023/5/6", "2023年5月6日", "20230506";
  //   - month-day without a year, e.g. "5月6日" or "05-06", which is placed in
  //     the current UTC year.
  //
  // The month-day form used to feed its two captures into the year and month
  // slots, so "3月5日" became 1 May of year 3; it is now read as month and day,
  // and is rejected when the month is outside 1-12 or the day outside 1-31.
  //
  // It returns 0 and an error when neither shape is found.
  func ParseDateString(dateString string) (int64, error) {
  	// strconv.Atoi cannot fail below: every capture is one to four ASCII digits.
  	if m := fullDatePattern.FindStringSubmatch(dateString); m != nil {
  		year, _ := strconv.Atoi(m[1])
  		month, _ := strconv.Atoi(m[2])
  		day, _ := strconv.Atoi(m[3])
  		return time.Date(year, time.Month(month), day, 0, 0, 0, 0, time.UTC).Unix(), nil
  	}

  	if m := monthDayPattern.FindStringSubmatch(dateString); m != nil {
  		month, _ := strconv.Atoi(m[1])
  		day, _ := strconv.Atoi(m[2])
  		if month >= 1 && month <= 12 && day >= 1 && day <= 31 {
  			year := time.Now().UTC().Year()
  			return time.Date(year, time.Month(month), day, 0, 0, 0, 0, time.UTC).Unix(), nil
  		}
  	}

  	return 0, fmt.Errorf("unrecognized date format")
  }