main.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. package main
  2. import (
  3. "fmt"
  4. "github.com/spf13/viper"
  5. "github.com/xuri/excelize/v2"
  6. "go.uber.org/zap"
  7. "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  8. "jygit.jydev.jianyu360.cn/data_processing/common_utils/log"
  9. "jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
  10. "os"
  11. "regexp"
  12. "strconv"
  13. "strings"
  14. "time"
  15. )
  16. var (
  17. GF GlobalConf
  18. Mgo *mongodb.MongodbSim //读取公司名录 MongoDB,也是更新的链接地址
  19. MgoB *mongodb.MongodbSim //读取公司名录 MongoDB,也是更新的链接地址
  20. MgoN *mongodb.MongodbSim //
  21. infourls = make(map[string]bool, 0) //
  22. insertUrl = make(map[string]bool, 0)
  23. //specialChars = []string{"�", "Γ", "ΩΙ", "Δ", "Σ", "≤", "ζ", "Ψ", "®", "Φ", "ώ", "≈",
  24. // "’Ώ", "μ", "Λ", "Θ", "ß", "±", "Ύ", "©", "φ", "¬", "≤", "Й", "ж", "Щ", "Ъ"}
  25. )
  26. func InitConfig() (err error) {
  27. viper.SetConfigFile("config.toml") // 指定配置文件路径
  28. viper.SetConfigName("config") // 配置文件名称(无扩展名)
  29. viper.SetConfigType("toml") // 如果配置文件的名称中没有扩展名,则需要配置此项
  30. viper.AddConfigPath("./")
  31. viper.AddConfigPath("./conf/") // 还可以在工作目录中查找配置
  32. viper.AddConfigPath("../conf/") // 还可以在工作目录中查找配置
  33. err = viper.ReadInConfig() // 查找并读取配置文件
  34. if err != nil { // 处理读取配置文件的错误
  35. return
  36. }
  37. err = viper.Unmarshal(&GF)
  38. return err
  39. }
  40. func InitLog() {
  41. err := log.InitLog(
  42. //log.Path("./logs/log.out"),
  43. log.Path(""),
  44. log.Level("info"),
  45. log.Compress(true),
  46. log.MaxSize(10),
  47. log.MaxBackups(10),
  48. log.MaxAge(7),
  49. log.Format("json"),
  50. )
  51. if err != nil {
  52. fmt.Printf("InitLog failed: %v\n", err)
  53. }
  54. }
  55. func InitMgo() {
  56. Mgo = &mongodb.MongodbSim{
  57. MongodbAddr: GF.Mongo.Host,
  58. DbName: GF.Mongo.DB,
  59. Size: GF.Mongo.Size,
  60. UserName: GF.Mongo.Username,
  61. Password: GF.Mongo.Password,
  62. Direct: GF.MongoB.Direct,
  63. }
  64. Mgo.InitPool()
  65. MgoB = &mongodb.MongodbSim{
  66. MongodbAddr: GF.MongoB.Host,
  67. DbName: GF.MongoB.DB,
  68. Size: GF.MongoB.Size,
  69. UserName: GF.MongoB.Username,
  70. Password: GF.MongoB.Password,
  71. Direct: GF.MongoB.Direct,
  72. }
  73. MgoB.InitPool()
  74. MgoN = &mongodb.MongodbSim{
  75. MongodbAddr: GF.MongoN.Host,
  76. DbName: GF.MongoN.DB,
  77. Size: GF.MongoN.Size,
  78. UserName: GF.MongoN.Username,
  79. Password: GF.MongoN.Password,
  80. Direct: GF.MongoN.Direct,
  81. }
  82. MgoN.InitPool()
  83. }
  84. func main() {
  85. InitConfig()
  86. InitLog()
  87. InitMgo()
  88. //readFile()
  89. exportFiles()
  90. select {}
  91. }
  92. // readFile 读取文件
  93. func readFile() {
  94. files := GF.Env.Files
  95. //ch := make(chan bool, 10)
  96. //wg := &sync.WaitGroup{}
  97. if len(files) > 0 {
  98. for _, file := range files {
  99. //ch <- true
  100. //wg.Add(1)
  101. //go func(file string) {
  102. // defer func() {
  103. // <-ch
  104. // wg.Done()
  105. // }()
  106. fmt.Println("开始读取文件:", file)
  107. _, err := os.Stat(file)
  108. if err != nil {
  109. log.Error("readFile", zap.Error(err))
  110. }
  111. f, err := excelize.OpenFile(file, excelize.Options{
  112. ShortDatePattern: "yyyy/mm/dd",
  113. })
  114. if err != nil {
  115. fmt.Println(err)
  116. return
  117. }
  118. defer func() {
  119. if err := f.Close(); err != nil {
  120. fmt.Println(err)
  121. }
  122. }()
  123. rows, err := f.GetRows("有效数据")
  124. if err != nil {
  125. fmt.Println(err)
  126. return
  127. }
  128. for i := 1; i < len(rows); i++ {
  129. if i%100 == 0 {
  130. log.Info("readFile", zap.Int(file+" 当前读取行数:", i))
  131. }
  132. insert := map[string]interface{}{
  133. "pubulishtime": strings.Replace(rows[i][0], "/", "-", -1),
  134. "site": rows[i][1],
  135. "title": rows[i][2],
  136. "detail": rows[i][3],
  137. "infourl": rows[i][4],
  138. "type": rows[i][5],
  139. "createtime": time.Now().Format("2006-01-02:15:04:05"),
  140. }
  141. if len(rows[i]) > 6 {
  142. insert["wtype"] = rows[i][6]
  143. }
  144. MgoN.Save(GF.MongoN.Coll, insert)
  145. }
  146. log.Info("readFile", zap.String(file, " over"))
  147. //}(file)
  148. }
  149. }
  150. //wg.Wait()
  151. fmt.Println("所有文件 读取结束")
  152. }
  153. // exportFiles 导出文件
  154. func exportFiles() {
  155. sess := Mgo.GetMgoConn()
  156. defer Mgo.DestoryMongoConn(sess)
  157. sessN := MgoN.GetMgoConn()
  158. defer MgoN.DestoryMongoConn(sessN)
  159. fmt.Println("开始数据导出")
  160. now := time.Now()
  161. targetTime := time.Date(now.Year(), now.Month(), now.Day()+GF.Env.Start, GF.Env.Shour, 0, 0, 0, now.Location())
  162. todayTime := time.Date(now.Year(), now.Month(), now.Day()+GF.Env.End, GF.Env.Ehour, 0, 0, 0, now.Location())
  163. q := map[string]interface{}{
  164. "comeintime": map[string]interface{}{
  165. "$gt": targetTime.Unix(),
  166. "$lte": todayTime.Unix(),
  167. },
  168. }
  169. log.Info("dealBidding", zap.Any("q", q))
  170. query := sessN.DB(GF.MongoN.DB).C(GF.MongoN.Coll).Find(nil).Select(map[string]interface{}{
  171. "contenthtml": 0}).Iter()
  172. count := 0
  173. //1.读取所有 已经保存交付的infourl
  174. for tmp := make(map[string]interface{}); query.Next(tmp); count++ {
  175. infourl := util.ObjToString(tmp["infourl"])
  176. if infourl != "" {
  177. infourls[infourl] = true
  178. }
  179. if count%1000 == 0 {
  180. log.Info("infourl", zap.Int("current", count))
  181. }
  182. }
  183. log.Info("infourl", zap.Int("来源网址数量:", len(infourls)))
  184. //2.官网数据
  185. var data = make([]map[string]interface{}, 0)
  186. count2 := 0
  187. query2 := sess.DB(GF.Mongo.DB).C(GF.Env.Coll1).Find(q).Select(map[string]interface{}{
  188. "contenthtml": 0}).Iter()
  189. for tmp := make(map[string]interface{}); query2.Next(tmp); count2++ {
  190. if count2%100 == 0 {
  191. log.Info("count2", zap.Int("current", count2))
  192. }
  193. infourl := util.ObjToString(tmp["infourl"])
  194. if infourls[infourl] || insertUrl[infourl] {
  195. continue
  196. } else {
  197. pubulishtime := util.ObjToString(tmp["pubulishtime"])
  198. if pubulishtime == "" {
  199. continue
  200. }
  201. if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
  202. continue
  203. }
  204. if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
  205. continue
  206. }
  207. datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
  208. if datetime > 0 {
  209. tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
  210. }
  211. tmp["wtype"] = "官网-python"
  212. data = append(data, tmp)
  213. insertUrl[infourl] = true
  214. }
  215. tmp = map[string]interface{}{}
  216. }
  217. log.Info(GF.Env.Coll1, zap.Int("数量:", count2))
  218. //3.百度数据
  219. count3 := 0
  220. query3 := sess.DB(GF.Mongo.DB).C(GF.Env.Coll2).Find(q).Select(map[string]interface{}{
  221. "contenthtml": 0}).Iter()
  222. for tmp := make(map[string]interface{}); query3.Next(tmp); count3++ {
  223. if count3%100 == 0 {
  224. log.Info("count3", zap.Int("current", count3))
  225. }
  226. infourl := util.ObjToString(tmp["infourl"])
  227. if infourls[infourl] || insertUrl[infourl] {
  228. continue
  229. } else {
  230. pubulishtime := util.ObjToString(tmp["pubulishtime"])
  231. if pubulishtime == "" {
  232. continue
  233. }
  234. if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
  235. continue
  236. }
  237. if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
  238. continue
  239. }
  240. if util.ObjToString(tmp["site"]) == "" && util.ObjToString(tmp["searchwords"]) != "" {
  241. res, _ := Mgo.FindOne(GF.Env.Keyword, map[string]interface{}{"key": tmp["searchwords"]})
  242. re := *res
  243. tmp["site"] = re["site"]
  244. }
  245. datetime, _ := ParseDateString(util.ObjToString(tmp["pubulishtime"]))
  246. if datetime > 0 {
  247. tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
  248. }
  249. tmp["wtype"] = "百度-Python"
  250. data = append(data, tmp)
  251. insertUrl[infourl] = true
  252. }
  253. tmp = map[string]interface{}{}
  254. }
  255. log.Info(GF.Env.Coll2, zap.Int("数量:", count3))
  256. //官网-lua 数据bidding_yq
  257. if GF.Env.Collb != "" {
  258. sess2 := MgoB.GetMgoConn()
  259. defer MgoB.DestoryMongoConn(sess2)
  260. count4 := 0
  261. query4 := sess2.DB(GF.MongoB.DB).C(GF.Env.Collb).Find(q).Select(map[string]interface{}{
  262. "contenthtml": 0}).Iter()
  263. for tmp := make(map[string]interface{}); query4.Next(tmp); count4++ {
  264. if count4%100 == 0 {
  265. log.Info("count4", zap.Int("current", count4))
  266. }
  267. infourl := util.ObjToString(tmp["href"])
  268. if infourls[infourl] || insertUrl[infourl] {
  269. continue
  270. } else {
  271. if util.ObjToString(tmp["title"]) == "" || util.ObjToString(tmp["detail"]) == "" {
  272. continue
  273. }
  274. if contains(util.ObjToString(tmp["detail"]), GF.Env.Specials) {
  275. continue
  276. }
  277. datetime := util.Int64All(tmp["publishtime"])
  278. if datetime > 0 {
  279. tmp["pubulishtime"] = time.Unix(datetime, 0).Format("2006-01-02")
  280. }
  281. tmp["infourl"] = tmp["href"]
  282. tmp["wtype"] = "官网-lua"
  283. data = append(data, tmp)
  284. insertUrl[infourl] = true
  285. }
  286. tmp = map[string]interface{}{}
  287. }
  288. log.Info(GF.Env.Collb, zap.Int("数量:", count4))
  289. }
  290. //4.
  291. file := now.Format("20060102") + "惠普_舆情数据.xlsx"
  292. //var xlsx *excelize.File
  293. currentPwd, _ := os.Getwd()
  294. exportFile := fmt.Sprintf("%s/%s", currentPwd, file)
  295. xlsx := excelize.NewFile(excelize.Options{ShortDatePattern: "yyyy/m/dd"})
  296. styleOne, _ := xlsx.NewStyle(
  297. &excelize.Style{
  298. Alignment: &excelize.Alignment{
  299. Horizontal: "left",
  300. Vertical: "left",
  301. },
  302. },
  303. )
  304. line := 0
  305. sheet := "有效数据"
  306. xlsx.NewSheet(sheet)
  307. xlsx.DeleteSheet("Sheet1")
  308. _ = xlsx.SetColWidth(sheet, "A", "A", 20)
  309. _ = xlsx.SetColWidth(sheet, "B", "B", 25)
  310. _ = xlsx.SetColWidth(sheet, "C", "C", 30)
  311. _ = xlsx.SetColWidth(sheet, "D", "D", 35)
  312. _ = xlsx.SetColWidth(sheet, "E", "E", 45)
  313. _ = xlsx.SetColWidth(sheet, "F", "F", 20)
  314. subtitles := []interface{}{"发布日期", "来源单位", "标题", "正文内容", "来源网址", "重客类型", "采集方式"}
  315. line++
  316. //设置第一行title
  317. _ = xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &subtitles)
  318. //file := "20230825惠普_舆情.xlsx"
  319. fmt.Println("导出数据总数:-------", len(data))
  320. for k, _ := range data {
  321. fmt.Println("导出数据-------", k)
  322. line++
  323. val := []interface{}{
  324. data[k]["pubulishtime"], data[k]["site"], data[k]["title"], data[k]["detail"], data[k]["infourl"], "", data[k]["wtype"],
  325. }
  326. err := xlsx.SetSheetRow(sheet, fmt.Sprintf("%s%d", "A", line), &val)
  327. if err != nil {
  328. log.Debug("set sheetrow line err", zap.Any(fmt.Sprintf("%s%d", "A", line), err))
  329. return
  330. }
  331. _ = xlsx.SetCellStyle(sheet, fmt.Sprintf("%s%d", "A", line), "BA"+strconv.Itoa(line), styleOne)
  332. }
  333. xlsx.Path = exportFile
  334. xlsx.Save()
  335. fmt.Println("数据导出结束")
  336. }
  337. // contains contains
  338. func contains(data string, specials []string) bool {
  339. for _, v := range specials {
  340. if strings.Contains(data, v) {
  341. return true
  342. }
  343. }
  344. return false
  345. }
  346. func ParseDateString(dateString string) (int64, error) {
  347. // Regular expressions for different date formats
  348. regexPatterns := []string{
  349. //`^(\d{4})年(\d{1,2})月(\d{1,2})日?$`,
  350. //`^(\d{4})年(\d{1,2})月$`,
  351. //`^(\d{4})-(\d{1,2})-(\d{1,2})$`,
  352. //`^(\d{4})年(\d{1,2})$`,
  353. //`^(\d{4})\-(\d{1,2})$`,
  354. //`^(\d{4})\.(\d{1,2})\.(\d{1,2})$`,
  355. //`^(\d{4})\.(\d{1,2})$`,
  356. //`^(\d{4})年$`,
  357. //`^(\d{4})$`,
  358. `(\d{4})[年.\-/]?(\d{1,2})[月.\-/]?(\d{1,2})日?`,
  359. `(\d{1,2})[月.\-/]?(\d{1,2})日?`,
  360. }
  361. for _, pattern := range regexPatterns {
  362. re := regexp.MustCompile(pattern)
  363. match := re.FindStringSubmatch(dateString)
  364. if len(match) > 0 {
  365. year, _ := strconv.Atoi(match[1])
  366. month, _ := strconv.Atoi(match[2])
  367. var day int
  368. if len(match) >= 4 && match[3] != "" {
  369. day, _ = strconv.Atoi(match[3])
  370. } else {
  371. day = 1
  372. }
  373. parsedTime := time.Date(year, time.Month(month), day, 0, 0, 0, 0, time.UTC)
  374. return parsedTime.Unix(), nil
  375. }
  376. }
  377. return 0, fmt.Errorf("unrecognized date format")
  378. }