main.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538
  1. package main
  2. import (
  3. "fmt"
  4. "github.com/spf13/viper"
  5. "github.com/xuri/excelize/v2"
  6. "go.uber.org/zap"
  7. "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
  9. "jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
  10. llog "log"
  11. "os"
  12. "regexp"
  13. "strconv"
  14. "strings"
  15. "time"
  16. )
  17. var (
  18. GF GlobalConf
  19. Mgo *mongodb.MongodbSim //读取公司名录 MongoDB,也是更新的链接地址
  20. MgoB *mongodb.MongodbSim //读取公司名录 MongoDB,也是更新的链接地址
  21. MgoN *mongodb.MongodbSim //
  22. infourls = make(map[string]bool, 0) //
  23. insertUrl = make(map[string]bool, 0)
  24. //specialChars = []string{"�", "Γ", "ΩΙ", "Δ", "Σ", "≤", "ζ", "Ψ", "®", "Φ", "ώ", "≈",
  25. // "’Ώ", "μ", "Λ", "Θ", "ß", "±", "Ύ", "©", "φ", "¬", "≤", "Й", "ж", "Щ", "Ъ"}
  26. zkMap = make(map[string]string) //重客类型
  27. )
  28. func InitConfig() (err error) {
  29. viper.SetConfigFile("config.toml") // 指定配置文件路径
  30. viper.SetConfigName("config") // 配置文件名称(无扩展名)
  31. viper.SetConfigType("toml") // 如果配置文件的名称中没有扩展名,则需要配置此项
  32. viper.AddConfigPath("./")
  33. viper.AddConfigPath("./conf/") // 还可以在工作目录中查找配置
  34. viper.AddConfigPath("../conf/") // 还可以在工作目录中查找配置
  35. err = viper.ReadInConfig() // 查找并读取配置文件
  36. if err != nil { // 处理读取配置文件的错误
  37. return
  38. }
  39. err = viper.Unmarshal(&GF)
  40. return err
  41. }
  42. func InitLog() {
  43. err := log.InitLog(
  44. //log.Path("./logs/log.out"),
  45. log.Path(""),
  46. log.Level("info"),
  47. log.Compress(true),
  48. log.MaxSize(10),
  49. log.MaxBackups(10),
  50. log.MaxAge(7),
  51. log.Format("json"),
  52. )
  53. if err != nil {
  54. fmt.Printf("InitLog failed: %v\n", err)
  55. }
  56. }
  57. func InitMgo() {
  58. Mgo = &mongodb.MongodbSim{
  59. MongodbAddr: GF.Mongo.Host,
  60. DbName: GF.Mongo.DB,
  61. Size: GF.Mongo.Size,
  62. UserName: GF.Mongo.Username,
  63. Password: GF.Mongo.Password,
  64. Direct: GF.MongoB.Direct,
  65. }
  66. Mgo.InitPool()
  67. MgoB = &mongodb.MongodbSim{
  68. MongodbAddr: GF.MongoB.Host,
  69. DbName: GF.MongoB.DB,
  70. Size: GF.MongoB.Size,
  71. UserName: GF.MongoB.Username,
  72. Password: GF.MongoB.Password,
  73. Direct: GF.MongoB.Direct,
  74. }
  75. MgoB.InitPool()
  76. MgoN = &mongodb.MongodbSim{
  77. MongodbAddr: GF.MongoN.Host,
  78. DbName: GF.MongoN.DB,
  79. Size: GF.MongoN.Size,
  80. UserName: GF.MongoN.Username,
  81. Password: GF.MongoN.Password,
  82. Direct: GF.MongoN.Direct,
  83. }
  84. MgoN.InitPool()
  85. }
  86. func main() {
  87. InitConfig()
  88. InitLog()
  89. InitMgo()
  90. //readFile() // 读文件写入数据库
  91. readZK() //
  92. exportFiles()
  93. select {}
  94. }
  95. // readFile 读取文件
  96. func readFile() {
  97. files := GF.Env.Files
  98. //ch := make(chan bool, 10)
  99. //wg := &sync.WaitGroup{}
  100. if len(files) > 0 {
  101. for _, file := range files {
  102. //ch <- true
  103. //wg.Add(1)
  104. //go func(file string) {
  105. // defer func() {
  106. // <-ch
  107. // wg.Done()
  108. // }()
  109. fmt.Println("开始读取文件:", file)
  110. _, err := os.Stat(file)
  111. if err != nil {
  112. log.Error("readFile", zap.Error(err))
  113. }
  114. f, err := excelize.OpenFile(file, excelize.Options{
  115. ShortDatePattern: "yyyy/mm/dd",
  116. })
  117. if err != nil {
  118. fmt.Println(err)
  119. return
  120. }
  121. defer func() {
  122. if err := f.Close(); err != nil {
  123. fmt.Println(err)
  124. }
  125. }()
  126. rows, err := f.GetRows("有效数据")
  127. if err != nil {
  128. fmt.Println(err)
  129. return
  130. }
  131. for i := 1; i < len(rows); i++ {
  132. if i%100 == 0 {
  133. log.Info("readFile", zap.Int(file+" 当前读取行数:", i))
  134. }
  135. insert := map[string]interface{}{
  136. "pubulishtime": strings.Replace(rows[i][0], "/", "-", -1),
  137. "site": rows[i][1],
  138. "title": rows[i][2],
  139. "detail": rows[i][3],
  140. "infourl": rows[i][4],
  141. "type": rows[i][5],
  142. "createtime": time.Now().Format("2006-01-02:15:04:05"),
  143. }
  144. if len(rows[i]) > 6 {
  145. insert["wtype"] = rows[i][6]
  146. }
  147. if len(rows[i]) > 7 {
  148. insert["rule_type"] = rows[i][7]
  149. }
  150. //大模型给的是否相关
  151. if len(rows[i]) > 8 {
  152. insert["model_ai"] = rows[i][8]
  153. }
  154. if len(rows[i]) > 9 {
  155. insert["model_weight"] = rows[i][9]
  156. }
  157. if len(rows[i]) > 10 {
  158. insert["l2"] = rows[i][10]
  159. }
  160. MgoN.Save(GF.MongoN.Coll, insert)
  161. }
  162. log.Info("readFile", zap.String(file, " over"))
  163. //}(file)
  164. }
  165. }
  166. //wg.Wait()
  167. fmt.Println("所有文件 读取结束")
  168. }
  169. // exportFiles 导出文件
  170. func exportFiles() {
  171. sess := Mgo.GetMgoConn()
  172. defer Mgo.DestoryMongoConn(sess)
  173. sessN := MgoN.GetMgoConn()
  174. defer MgoN.DestoryMongoConn(sessN)
  175. fmt.Println("开始数据导出")
  176. now := time.Now()
  177. targetTime := time.Date(now.Year(), now.Month(), now.Day()+GF.Env.Start, GF.Env.Shour, 0, 0, 0, now.Location())
  178. todayTime := time.Date(now.Year(), now.Month(), now.Day()+GF.Env.End, GF.Env.Ehour, 0, 0, 0, now.Location())
  179. q := map[string]interface{}{
  180. "comeintime": map[string]interface{}{
  181. "$gt": targetTime.Unix(),
  182. "$lte": todayTime.Unix(),
  183. },
  184. }
  185. log.Info("dealBidding", zap.Any("q", q))
  186. query := sessN.DB(GF.MongoN.DB).C(GF.MongoN.Coll).Find(nil).Select(map[string]interface{}{
  187. "contenthtml": 0}).Iter()
  188. count := 0
  189. //1.读取所有 已经保存交付的infourl
  190. for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
  191. infourl := util.ObjToString(tmp["infourl"])
  192. if infourl != "" {
  193. infourls[infourl] = true
  194. }
  195. if count%10000 == 0 {
  196. log.Info("infourl", zap.Int("current", count))
  197. }
  198. }
  199. log.Info("infourl", zap.Int("来源网址数量:", len(infourls)))
  200. //上月1号时间戳
  201. lastMonthTimestamp := getLastMonthFirstDayTimestamp()
  202. //2.官网数据
  203. var data = make([]map[string]interface{}, 0)
  204. count2 := 0
  205. query2 := sess.DB(GF.Mongo.DB).C(GF.Env.Coll1).Find(q).Select(map[string]interface{}{
  206. "contenthtml": 0}).Iter()
  207. for tmp := make(map[string]interface{}); query2.Next(tmp); count2++ {
  208. if count2%100 == 0 {
  209. log.Info("count2", zap.Int("current", count2))
  210. }
  211. infourl := util.ObjToString(tmp["infourl"])
  212. if infourls[infourl] || insertUrl[infourl] {
  213. continue
  214. } else {
  215. pubulishtime := util.ObjToString(tmp["pubulishtime"])
  216. if pubulishtime == "" {
  217. continue
  218. }
  219. if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
  220. continue
  221. }
  222. if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
  223. continue
  224. }
  225. //标题内容排除词
  226. rsa, rsb := isOutData(tmp)
  227. if rsa {
  228. continue
  229. } else {
  230. tmp["typea"] = rsb
  231. }
  232. datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
  233. ////发布时间,小于上月1号,直接过滤
  234. if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
  235. continue
  236. }
  237. if datetime > 0 {
  238. tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
  239. }
  240. tmp["wtype"] = "官网-python"
  241. tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
  242. data = append(data, tmp)
  243. insertUrl[infourl] = true
  244. }
  245. tmp = map[string]interface{}{}
  246. }
  247. log.Info(GF.Env.Coll1, zap.Int("数量:", count2))
  248. //3.百度数据
  249. count3 := 0
  250. query3 := sess.DB(GF.Mongo.DB).C(GF.Env.Coll2).Find(q).Select(map[string]interface{}{
  251. "contenthtml": 0}).Iter()
  252. for tmp := make(map[string]interface{}); query3.Next(tmp); count3++ {
  253. if count3%100 == 0 {
  254. log.Info("count3", zap.Int("current", count3))
  255. }
  256. infourl := util.ObjToString(tmp["infourl"])
  257. if infourls[infourl] || insertUrl[infourl] {
  258. continue
  259. } else {
  260. pubulishtime := util.ObjToString(tmp["pubulishtime"])
  261. if pubulishtime == "" {
  262. continue
  263. }
  264. if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
  265. continue
  266. }
  267. if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
  268. continue
  269. }
  270. //标题内容排除词
  271. rsa, rsb := isOutData(tmp)
  272. if rsa {
  273. continue
  274. } else {
  275. tmp["typea"] = rsb
  276. }
  277. if util.ObjToString(tmp["site"]) == "" && util.ObjToString(tmp["searchwords"]) != "" {
  278. res, _ := Mgo.FindOne(GF.Env.Keyword, map[string]interface{}{"key": tmp["searchwords"]})
  279. re := *res
  280. tmp["site"] = re["site"]
  281. }
  282. datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
  283. ////发布时间,小于上月1号,直接过滤
  284. if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
  285. continue
  286. }
  287. if datetime > 0 {
  288. tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
  289. }
  290. tmp["wtype"] = "百度-Python"
  291. tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
  292. data = append(data, tmp)
  293. insertUrl[infourl] = true
  294. }
  295. tmp = map[string]interface{}{}
  296. }
  297. log.Info(GF.Env.Coll2, zap.Int("数量:", count3))
  298. //官网-lua 数据bidding_yq
  299. if GF.Env.Collb != "" {
  300. sess2 := MgoB.GetMgoConn()
  301. defer MgoB.DestoryMongoConn(sess2)
  302. count4 := 0
  303. query4 := sess2.DB(GF.MongoB.DB).C(GF.Env.Collb).Find(q).Select(map[string]interface{}{
  304. "contenthtml": 0}).Iter()
  305. for tmp := make(map[string]interface{}); query4.Next(tmp); count4++ {
  306. if count4%100 == 0 {
  307. log.Info("count4", zap.Int("current", count4))
  308. }
  309. infourl := util.ObjToString(tmp["href"])
  310. if infourls[infourl] || insertUrl[infourl] {
  311. continue
  312. } else {
  313. if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
  314. continue
  315. }
  316. if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
  317. continue
  318. }
  319. //标题内容排除词
  320. rsa, rsb := isOutData(tmp)
  321. if rsa {
  322. continue
  323. } else {
  324. tmp["typea"] = rsb
  325. }
  326. datetime := util.Int64All(tmp["publishtime"])
  327. ////发布时间,小于上月1号,直接过滤
  328. if datetime < lastMonthTimestamp || datetime > time.Now().Unix() {
  329. continue
  330. }
  331. if datetime > 0 {
  332. tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
  333. }
  334. tmp["infourl"] = tmp["href"]
  335. tmp["wtype"] = "官网-lua"
  336. tmp["zk"] = zkMap[util.ObjToString(tmp["site"])] //重客类型
  337. data = append(data, tmp)
  338. insertUrl[infourl] = true
  339. }
  340. tmp = map[string]interface{}{}
  341. }
  342. log.Info(GF.Env.Collb, zap.Int("数量:", count4))
  343. }
  344. //4.
  345. file := now.Format("20060102") + "惠普_舆情数据.xlsx"
  346. //var xlsx *excelize.File
  347. currentPwd, _ := os.Getwd()
  348. exportFile := fmt.Sprintf("%s/%s", currentPwd, file)
  349. xlsx := excelize.NewFile(excelize.Options{ShortDatePattern: "yyyy/m/dd"})
  350. styleOne, _ := xlsx.NewStyle(
  351. &excelize.Style{
  352. Alignment: &excelize.Alignment{
  353. Horizontal: "left",
  354. Vertical: "left",
  355. },
  356. },
  357. )
  358. line := 0
  359. sheet := "有效数据"
  360. xlsx.NewSheet(sheet)
  361. xlsx.DeleteSheet("Sheet1")
  362. _ = xlsx.SetColWidth(sheet, "A", "A", 20)
  363. _ = xlsx.SetColWidth(sheet, "B", "B", 25)
  364. _ = xlsx.SetColWidth(sheet, "C", "C", 30)
  365. _ = xlsx.SetColWidth(sheet, "D", "D", 35)
  366. _ = xlsx.SetColWidth(sheet, "E", "E", 45)
  367. _ = xlsx.SetColWidth(sheet, "F", "F", 20)
  368. subtitles := []interface{}{"发布日期", "来源单位", "标题", "正文内容", "来源网址", "重客类型", "采集方式", "规则相关度", "ai相关度", "ai权重", "L2"}
  369. line++
  370. //设置第一行title
  371. _ = xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &subtitles)
  372. //file := "20230825惠普_舆情.xlsx"
  373. fmt.Println("导出数据总数:-------", len(data))
  374. for k, _ := range data {
  375. llog.Println("导出数据-------", k)
  376. line++
  377. val := []interface{}{
  378. data[k]["pubulishtime"], data[k]["site"], data[k]["title"], data[k]["detail"], data[k]["infourl"], data[k]["zk"], data[k]["wtype"], data[k]["typea"],
  379. }
  380. //调用智普AI
  381. if GF.Env.Key != "" && GF.Env.Model != "" {
  382. res := ZpRelated(GF.Env.Key, GF.Env.Model, util.ObjToString(data[k]["title"]), util.ObjToString(data[k]["detail"]))
  383. //res := normalChat(util.ObjToString(data[k]["title"]), util.ObjToString(data[k]["detail"]))
  384. val = append(val, res["type_ai"])
  385. val = append(val, res["type_weight"])
  386. //只要在L0 名单里,就不需要返回L2,否则返回大模型识别的主体
  387. if zkMap[util.ObjToString(data[k]["site"])] != "" {
  388. val = append(val, res["name"])
  389. }
  390. }
  391. insert := map[string]interface{}{
  392. "pubulishtime": strings.Replace(util.ObjToString(data[k]["pubulishtime"]), "/", "-", -1),
  393. "site": data[k]["site"],
  394. "title": data[k]["title"],
  395. "detail": data[k]["detail"],
  396. "infourl": data[k]["infourl"],
  397. "type": data[k]["zk"],
  398. "wtype": data[k]["wtype"],
  399. "label": data[k]["typea"],
  400. "type_ai": data[k]["type_ai"],
  401. "type_weight": data[k]["type_weight"],
  402. "name": data[k]["name"],
  403. "createtime": time.Now().Format("2006-01-02:15:04:05"),
  404. }
  405. MgoN.Save(GF.MongoN.Coll, insert)
  406. err := xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &val)
  407. if err != nil {
  408. log.Debug("set sheetrow line err", zap.Any(fmt.Sprintf("%s%d", "A", line), err))
  409. return
  410. }
  411. _ = xlsx.SetCellStyle(sheet, fmt.Sprintf("%s%d", "A", line), "BA"+strconv.Itoa(line), styleOne)
  412. }
  413. xlsx.Path = exportFile
  414. xlsx.Save()
  415. fmt.Println("数据导出结束")
  416. }
  417. // readZK 读取重客类型
  418. func readZK() {
  419. f, err := excelize.OpenFile("重客类型.xlsx")
  420. if err != nil {
  421. fmt.Println(err)
  422. return
  423. }
  424. defer func() {
  425. if err := f.Close(); err != nil {
  426. fmt.Println(err)
  427. }
  428. }()
  429. rows, err := f.GetRows("Sheet1")
  430. if err != nil {
  431. fmt.Println(err)
  432. return
  433. }
  434. //
  435. for i := 1; i < len(rows); i++ {
  436. zkMap[rows[i][1]] = rows[i][2]
  437. }
  438. }
  439. // contains contains
  440. func contains(data string, specials []string) bool {
  441. for _, v := range specials {
  442. if strings.Contains(data, v) {
  443. return true
  444. }
  445. }
  446. return false
  447. }
  448. // ParseDateString s时间字符串转时间戳
  449. func ParseDateString(dateString string) (int64, error) {
  450. // Regular expressions for different date formats
  451. regexPatterns := []string{
  452. //`^(\d{4})年(\d{1,2})月(\d{1,2})日?$`,
  453. //`^(\d{4})年(\d{1,2})月$`,
  454. //`^(\d{4})-(\d{1,2})-(\d{1,2})$`,
  455. //`^(\d{4})年(\d{1,2})$`,
  456. //`^(\d{4})\-(\d{1,2})$`,
  457. //`^(\d{4})\.(\d{1,2})\.(\d{1,2})$`,
  458. //`^(\d{4})\.(\d{1,2})$`,
  459. //`^(\d{4})年$`,
  460. //`^(\d{4})$`,
  461. `(\d{4})[年.\-/]?(\d{1,2})[月.\-/]?(\d{1,2})日?`,
  462. `(\d{1,2})[月.\-/]?(\d{1,2})日?`,
  463. }
  464. for _, pattern := range regexPatterns {
  465. re := regexp.MustCompile(pattern)
  466. match := re.FindStringSubmatch(dateString)
  467. if len(match) > 0 {
  468. year, _ := strconv.Atoi(match[1])
  469. month, _ := strconv.Atoi(match[2])
  470. var day int
  471. if len(match) >= 4 && match[3] != "" {
  472. day, _ = strconv.Atoi(match[3])
  473. } else {
  474. day = 1
  475. }
  476. parsedTime := time.Date(year, time.Month(month), day, 0, 0, 0, 0, time.UTC)
  477. return parsedTime.Unix(), nil
  478. }
  479. }
  480. return 0, fmt.Errorf("unrecognized date format")
  481. }