qyxy_nation_type.go

package main

import (
	"context"
	"encoding/json"
	"fmt"
	"github.com/olivere/elastic/v7"
	"github.com/xuri/excelize/v2"
	"io"
	util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
	"log"
	"math"
	"os"
	"regexp"
)

type Company struct {
	CompanyName   string  `json:"company_name"`
	Capital       float64 `json:"capital"`
	CompanyType   string  `json:"company_type"`
	CompanyStatus string  `json:"company_status"`
}

var (
	indexName = "qyxy"
)

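// getQyxyNationToFiles connects to Elasticsearch and, for each selected
// national_sub industry group, exports the top companies by registered capital
// into per-group .xlsx files under the output directory.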
func getQyxyNationToFiles() {
	url := "http://172.17.4.184:19908"
	//url := "http://127.0.0.1:19908"
	username := "jybid"
	password := "Top2023_JEB01i@31"
	// Create the Elasticsearch client.
	client, err := elastic.NewClient(
		elastic.SetURL(url),
		elastic.SetBasicAuth(username, password),
		elastic.SetSniff(false),
	)
	if err != nil {
		log.Fatalf("Elasticsearch connection failed: %v", err)
	}
	//nationalSubs := []string{"石油和天然气开采业", "文教、工美、体育和娱乐用品制造业", "石油、煤炭及其他燃料加工业", "化学原料和化学制品制造业",
	//	"医药制造业", "通用设备制造业", "专用设备制造业", "汽车制造业", "电信、广播电视和卫星传输服务", "互联网和相关服务", "软件和信息技术服务业", "货币金融服务",
	//	"资本市场服务", "保险业", "其他金融业", "其他金融业", "教育", "新闻和出版业", "生态保护和环境治理业"}
	nationalSubs := []string{"生态保护和环境治理业"}
	outputDir := "output"
	// Make sure the output directory exists.
	if err := os.MkdirAll(outputDir, 0755); err != nil {
		log.Fatalf("failed to create output directory: %v", err)
	}
	err = exportSelectedGroupsToFiles(client, "national_sub", nationalSubs, outputDir)
	if err != nil {
		log.Fatalf("export failed: %v", err)
	}
	log.Println("🎉 all groups exported")
}

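// Example wiring (a minimal sketch; this file declares package main but contains
// no main function, so the real entry point presumably lives elsewhere in the
// package — the calls below are only illustrative):
//
//	func main() {
//		getQyxyNationToFiles() // export selected groups to per-group files
//		// getQyxyNation()     // or: export every group into one workbook
//		// dealXlsx()          // or: re-filter an already exported workbook
//	}
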
// getQyxyNation exports the top companies of every national_sub group into a
// single workbook, top_companies_by_industry_levels.xlsx.
func getQyxyNation() {
	url := "http://172.17.4.184:19908"
	//url := "http://127.0.0.1:19908"
	username := "jybid"
	password := "Top2023_JEB01i@31"
	// Create the Elasticsearch client.
	client, err := elastic.NewClient(
		elastic.SetURL(url),
		elastic.SetBasicAuth(username, password),
		elastic.SetSniff(false),
	)
	if err != nil {
		log.Fatalf("Elasticsearch connection failed: %v", err)
	}
	// 1. Create the Excel workbook.
	f := excelize.NewFile()
	// 2. Define the grouping fields.
	groupFields := []string{"national_sub"}
	sheetNames := []string{"按二级分类"}
	for i, field := range groupFields {
		log.Printf("querying and exporting: %s\n", field)
		err := exportGroupToSheet2(client, f, field, sheetNames[i])
		if err != nil {
			log.Fatalf("export failed (%s): %v", field, err)
		}
	}
	// Remove the default sheet.
	f.DeleteSheet("Sheet1")
	// Save the workbook.
	if err := f.SaveAs("top_companies_by_industry_levels.xlsx"); err != nil {
		log.Fatalf("save failed: %v", err)
	}
	log.Println("✅ all groups exported: top_companies_by_industry_levels.xlsx")
}

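// exportGroupToSheet writes, for every bucket of groupField, the top 5 companies
// by capital (a terms aggregation with a top_hits sub-aggregation) into sheetName.
// The search body it builds is roughly the following (a sketch for orientation,
// not captured from a live request):
//
//	{
//	  "size": 0,
//	  "query": {"bool": {"must": [{"term": {"company_status": "存续"}}],
//	                     "must_not": [{"term": {"company_type": "个体工商户"}}]}},
//	  "aggs": {"by_group": {"terms": {"field": "<groupField>", "size": 500},
//	    "aggs": {"top_capital_companies": {"top_hits": {"size": 5,
//	      "sort": [{"capital": {"order": "desc"}}]}}}}}
//	}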
func exportGroupToSheet(client *elastic.Client, f *excelize.File, groupField, sheetName string) error {
	ctx := context.Background()
	// Query: companies in 存续 status, excluding 个体工商户.
	query := elastic.NewBoolQuery().
		Must(elastic.NewTermQuery("company_status", "存续")).
		MustNot(elastic.NewTermQuery("company_type", "个体工商户"))
	// Source field filter.
	sourceCtx := elastic.NewFetchSourceContext(true).Include(
		"company_name", "capital", "company_type", "company_status", groupField,
	)
	// Build the aggregations.
	topHitsAgg := elastic.NewTopHitsAggregation().
		Size(5).
		Sort("capital", false).
		FetchSourceContext(sourceCtx)
	termsAgg := elastic.NewTermsAggregation().Field(groupField).Size(500).
		SubAggregation("top_capital_companies", topHitsAgg)
	// Execute the search.
	res, err := client.Search().
		Index(indexName).
		Query(query).
		Size(0).
		Aggregation("by_group", termsAgg).
		Do(ctx)
	if err != nil {
		return fmt.Errorf("Elasticsearch query failed: %v", err)
	}
	// Create the worksheet and write the header row.
	index, _ := f.NewSheet(sheetName)
	headers := []string{groupField, "company_name", "capital", "company_type", "company_status"}
	for i, h := range headers {
		cell, _ := excelize.CoordinatesToCellName(i+1, 1)
		f.SetCellValue(sheetName, cell, h)
	}
	// Parse the aggregation result and write the rows.
	agg, ok := res.Aggregations.Terms("by_group")
	if !ok {
		return fmt.Errorf("group aggregation result not found")
	}
	row := 2
	for _, bucket := range agg.Buckets {
		topHits, ok := bucket.TopHits("top_capital_companies")
		if !ok {
			continue
		}
		for _, hit := range topHits.Hits.Hits {
			var c Company
			if err := json.Unmarshal(hit.Source, &c); err != nil {
				continue
			}
			data := []interface{}{bucket.Key, c.CompanyName, c.Capital, c.CompanyType, c.CompanyStatus}
			for col, val := range data {
				cell, _ := excelize.CoordinatesToCellName(col+1, row)
				f.SetCellValue(sheetName, cell, val)
			}
			row++
		}
	}
	// Activate the sheet when exporting the first-level classification.
	if sheetName == "按一级分类" {
		f.SetActiveSheet(index)
	}
	return nil
}

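// exportGroupToSheet3 is an unfinished variant: it only fetches the per-group
// document counts and logs each bucket; the per-group 5% fetch is still
// commented out, so nothing beyond the header row is written.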
func exportGroupToSheet3(client *elastic.Client, f *excelize.File, groupField, sheetName string) error {
	ctx := context.Background()
	// Query: companies in 存续 status, excluding 个体工商户.
	query := elastic.NewBoolQuery().
		Must(elastic.NewTermQuery("company_status", "存续")).
		MustNot(elastic.NewTermQuery("company_type", "个体工商户"))
	// Stage 1: fetch the per-group document counts.
	countAgg := elastic.NewTermsAggregation().Field(groupField).Size(500)
	countRes, err := client.Search().
		Index(indexName).
		Query(query).
		Size(0).
		Aggregation("by_group", countAgg).
		Do(ctx)
	if err != nil {
		return fmt.Errorf("failed to fetch group counts: %v", err)
	}
	// Parse the group buckets.
	agg, ok := countRes.Aggregations.Terms("by_group")
	if !ok {
		return fmt.Errorf("group aggregation result not found")
	}
	// Create the worksheet (sheet creation itself is commented out; only the header row is written).
	//index, _ := f.NewSheet(sheetName)
	headers := []string{groupField, "company_name", "capital", "company_type", "company_status"}
	for i, h := range headers {
		cell, _ := excelize.CoordinatesToCellName(i+1, 1)
		f.SetCellValue(sheetName, cell, h)
	}
	// Stage 2: fetch 5% of each group's documents in batches (not implemented).
	//row := 2
	for _, bucket := range agg.Buckets {
		log.Println(bucket)
		// Take 5% of the group's documents, at least 1.
		//size := int(math.Max(1, float64(bucket.DocCount)*0.05))
	}
	return nil
}

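// exportGroupToSheet21 queries each group directly for its top 5% of companies
// by capital. Note that the per-group search attaches no aggregations, so the
// Terms/TopHits lookups on its result never succeed and no data rows are
// written; it appears to have been superseded by exportGroupToSheet2 below.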
func exportGroupToSheet21(client *elastic.Client, f *excelize.File, groupField, sheetName string) error {
	ctx := context.Background()
	// Query: companies in 存续 status, excluding 个体工商户.
	query := elastic.NewBoolQuery().
		Must(elastic.NewTermQuery("company_status", "存续")).
		MustNot(elastic.NewTermQuery("company_type", "个体工商户"))
	// Step 1: fetch the groups and each group's document count.
	countAgg := elastic.NewTermsAggregation().Field(groupField).Size(500)
	countRes, err := client.Search().
		Index(indexName).
		Query(query).
		Size(0).
		Aggregation("by_group", countAgg).
		Do(ctx)
	if err != nil {
		return fmt.Errorf("failed to fetch group counts: %v", err)
	}
	agg, ok := countRes.Aggregations.Terms("by_group")
	if !ok {
		return fmt.Errorf("group aggregation result not found")
	}
	// Create the worksheet and write the header row.
	index, _ := f.NewSheet(sheetName)
	headers := []string{groupField, "company_name", "capital", "company_type", "company_status"}
	for i, h := range headers {
		cell, _ := excelize.CoordinatesToCellName(i+1, 1)
		f.SetCellValue(sheetName, cell, h)
	}
	row := 2
	// Step 2: query each group and take the top 5% of its documents.
	sourceCtx := elastic.NewFetchSourceContext(true).Include(
		"company_name", "capital", "company_type", "company_status", groupField,
	)
	for _, bucket := range agg.Buckets {
		groupVal := bucket.Key
		docCount := bucket.DocCount
		size := int(math.Max(1, float64(docCount)*0.05))
		groupStr := fmt.Sprintf("%v", groupVal)
		query := elastic.NewBoolQuery().Filter(
			elastic.NewTermQuery(groupField, groupStr),
		)
		src, _ := query.Source()
		b, _ := json.MarshalIndent(src, "", " ")
		fmt.Println("Query body:\n", string(b))
		//sourceCtx := elastic.NewFetchSourceContext(true).Include("company_name", "capital") // make sure it is not nil
		res, err := client.Search().
			Index(indexName).
			Query(query).
			Sort("capital", false).
			Size(size).
			FetchSourceContext(sourceCtx).
			Do(ctx)
		if err != nil {
			fmt.Printf("query failed group=%v, err=%v, type=%T\n", groupVal, err, err)
			continue
		}
		if res == nil {
			fmt.Println("empty search result (res=nil)")
			continue
		}
		fmt.Printf("query succeeded: %d hits\n", res.Hits.TotalHits.Value)
		subAgg, ok := res.Aggregations.Terms("by_group")
		if !ok || len(subAgg.Buckets) == 0 {
			continue
		}
		topHits, ok := subAgg.Buckets[0].TopHits("top_capital_companies")
		if !ok {
			continue
		}
		for _, hit := range topHits.Hits.Hits {
			var c Company
			if err := json.Unmarshal(hit.Source, &c); err != nil {
				continue
			}
			data := []interface{}{groupVal, c.CompanyName, c.Capital, c.CompanyType, c.CompanyStatus}
			for col, val := range data {
				cell, _ := excelize.CoordinatesToCellName(col+1, row)
				f.SetCellValue(sheetName, cell, val)
			}
			row++
		}
	}
	// Set the active sheet when exporting the first-level classification.
	if sheetName == "按一级分类" {
		f.SetActiveSheet(index)
	}
	return nil
}

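// exportGroupToSheet22 is a scroll-based variant: for each group it scrolls the
// matching documents sorted by capital descending and writes the first 5% into
// sheetName. The deferred scroll.Clear inside the loop only runs when the
// function returns; the explicit Clear after each group's loop does the actual
// per-group cleanup.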
func exportGroupToSheet22(client *elastic.Client, f *excelize.File, groupField, sheetName string) error {
	ctx := context.Background()
	// Query: companies in 存续 status, excluding 个体工商户.
	query := elastic.NewBoolQuery().
		Must(elastic.NewTermQuery("company_status", "存续")).
		MustNot(elastic.NewTermQuery("company_type", "个体工商户"))
	// Aggregation: group by groupField.
	countAgg := elastic.NewTermsAggregation().Field(groupField).Size(500)
	countRes, err := client.Search().
		Index(indexName).
		Query(query).
		Size(0).
		Aggregation("by_group", countAgg).
		Do(ctx)
	if err != nil {
		return fmt.Errorf("failed to fetch group counts: %v", err)
	}
	agg, ok := countRes.Aggregations.Terms("by_group")
	if !ok {
		return fmt.Errorf("group aggregation result not found")
	}
	// Create the sheet and write the header row.
	index, _ := f.NewSheet(sheetName)
	headers := []string{groupField, "company_name", "capital", "company_type", "company_status"}
	for i, h := range headers {
		cell, _ := excelize.CoordinatesToCellName(i+1, 1)
		f.SetCellValue(sheetName, cell, h)
	}
	row := 2
	// Walk each group and fetch its top 5% of companies.
	for _, bucket := range agg.Buckets {
		groupVal := bucket.Key
		docCount := bucket.DocCount
		if docCount == 0 {
			continue
		}
		size := int(math.Max(1, float64(docCount)*0.05))
		groupStr := fmt.Sprintf("%v", groupVal)
		// Build the sub-query (the exists filter avoids sort errors on missing capital).
		subQuery := elastic.NewBoolQuery().
			Must(elastic.NewTermQuery(groupField, groupStr)).
			Must(elastic.NewExistsQuery("capital"))
		sourceCtx := elastic.NewFetchSourceContext(true).Include(
			"company_name", "capital", "company_type", "company_status", groupField,
		)
		// Scroll the group, sorted by capital descending.
		scroll := client.Scroll(indexName).
			Query(subQuery).
			SortWithInfo(elastic.SortInfo{
				Field:     "capital",
				Ascending: false,
				Missing:   "_last",
			}).
			Size(500). // 500 documents per batch
			FetchSourceContext(sourceCtx)
		defer scroll.Clear(ctx) // NOTE: deferred inside a loop, so it only runs on function return
		var fetched int
		for fetched < size {
			res, err := scroll.Do(ctx)
			if err == io.EOF {
				break // no more data
			}
			if err != nil {
				fmt.Printf("scroll query failed group=%v, err=%v\n", groupVal, err)
				break
			}
			for _, hit := range res.Hits.Hits {
				if fetched >= size {
					break
				}
				var c map[string]interface{}
				if err := json.Unmarshal(hit.Source, &c); err != nil {
					continue
				}
				data := []interface{}{
					groupStr,
					c["company_name"],
					c["capital"],
					c["company_type"],
					c["company_status"],
				}
				for col, val := range data {
					cell, _ := excelize.CoordinatesToCellName(col+1, row)
					f.SetCellValue(sheetName, cell, val)
				}
				row++
				fetched++
			}
		}
		// Release this group's scroll context.
		_ = scroll.Clear(ctx)
		// NOTE: err here is still the (nil) error from the count query above, so this
		// block is effectively dead; scroll errors are handled inside the loop.
		if err != nil {
			fmt.Printf("query failed group=%v, err=%v, type=%T\n", groupVal, err, err)
			if elasticErr, ok := err.(*elastic.Error); ok && elasticErr.Details != nil {
				fmt.Printf("error details: type=%v, reason=%v\n", elasticErr.Details.Type, elasticErr.Details.Reason)
				for i, cause := range elasticErr.Details.RootCause {
					fmt.Printf("RootCause[%d]: %v (%v)\n", i, cause.Reason, cause.Type)
				}
			}
			continue
		}
	}
	// Set the active sheet when exporting the first-level classification.
	if sheetName == "按一级分类" {
		f.SetActiveSheet(index)
	}
	return nil
}

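// exportGroupToSheet2 is the variant actually used by getQyxyNation: for each
// groupField bucket it scrolls the group's documents sorted by capital
// descending, keeps the top 5%, and writes them to sheetName, rolling over to
// "<sheetName>_N" sheets once the Excel row limit is reached.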
func exportGroupToSheet2(client *elastic.Client, f *excelize.File, groupField, sheetName string) error {
	const maxRowsPerSheet = 1048576
	ctx := context.Background()
	// Query: companies in 存续 status, excluding 个体工商户.
	query := elastic.NewBoolQuery().
		Must(elastic.NewTermQuery("company_status", "存续")).
		MustNot(elastic.NewTermQuery("company_type", "个体工商户"))
	// Aggregation: group by groupField.
	countAgg := elastic.NewTermsAggregation().Field(groupField).Size(500)
	countRes, err := client.Search().
		Index(indexName).
		Query(query).
		Size(0).
		Aggregation("by_group", countAgg).
		Do(ctx)
	if err != nil {
		return fmt.Errorf("failed to fetch group counts: %v", err)
	}
	agg, ok := countRes.Aggregations.Terms("by_group")
	if !ok {
		return fmt.Errorf("group aggregation result not found")
	}
	headers := []string{groupField, "company_name", "capital", "company_type", "company_status"}
	sheetIndex := 0
	currSheet := sheetName
	f.NewSheet(currSheet)
	writeHeaders(f, currSheet, headers)
	row := 2
	for _, bucket := range agg.Buckets {
		groupVal := bucket.Key
		docCount := bucket.DocCount
		if docCount == 0 {
			continue
		}
		size := int(math.Max(1, float64(docCount)*0.05))
		groupStr := fmt.Sprintf("%v", groupVal)
		subQuery := elastic.NewBoolQuery().
			Must(elastic.NewTermQuery(groupField, groupStr)).
			Must(elastic.NewExistsQuery("capital"))
		sourceCtx := elastic.NewFetchSourceContext(true).Include(
			"company_name", "capital", "company_type", "company_status", groupField,
		)
		scroll := client.Scroll(indexName).
			Query(subQuery).
			SortWithInfo(elastic.SortInfo{
				Field:     "capital",
				Ascending: false,
				Missing:   "_last",
			}).
			Size(500).
			FetchSourceContext(sourceCtx)
		defer scroll.Clear(ctx) // NOTE: deferred inside a loop, so the contexts are only released on return
		var fetched int
		for fetched < size {
			res, err := scroll.Do(ctx)
			if err == io.EOF {
				break
			}
			if err != nil {
				log.Printf("scroll query failed group=%v, err=%v\n", groupVal, err)
				break
			}
			for _, hit := range res.Hits.Hits {
				if fetched >= size {
					break
				}
				// Past the Excel row limit: roll over to a new sheet.
				if row > maxRowsPerSheet {
					sheetIndex++
					currSheet = fmt.Sprintf("%s_%d", sheetName, sheetIndex)
					f.NewSheet(currSheet)
					writeHeaders(f, currSheet, headers)
					row = 2
				}
				var c map[string]interface{}
				if err := json.Unmarshal(hit.Source, &c); err != nil {
					continue
				}
				if util.ObjToString(c["company_status"]) != "存续" {
					continue
				}
				data := []interface{}{
					groupStr,
					c["company_name"],
					c["capital"],
					c["company_type"],
					c["company_status"],
				}
				for col, val := range data {
					cell, _ := excelize.CoordinatesToCellName(col+1, row)
					f.SetCellValue(currSheet, cell, val)
				}
				row++
				fetched++
			}
		}
	}
	// Set the active sheet when exporting the first-level classification.
	if sheetName == "按一级分类" {
		if index, err := f.GetSheetIndex(sheetName); err == nil {
			f.SetActiveSheet(index)
		}
	}
	return nil
}

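// writeHeaders writes the column titles into row 1 of sheetName.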
func writeHeaders(f *excelize.File, sheetName string, headers []string) {
	for i, h := range headers {
		cell, _ := excelize.CoordinatesToCellName(i+1, 1)
		f.SetCellValue(sheetName, cell, h)
	}
}

const maxRowsPerSheet = 1048576

// exportSelectedGroupsToFiles exports each selected group value into one or more
// .xlsx files under outputDir, taking the top 20% of the group's companies by capital.
func exportSelectedGroupsToFiles(client *elastic.Client, groupField string, groupValues []string, outputDir string) error {
	ctx := context.Background()
	headers := []string{groupField, "company_name", "capital", "company_type", "company_status"}
	for _, groupVal := range groupValues {
		// Build the per-group query.
		subQuery := elastic.NewBoolQuery().
			Must(
				elastic.NewTermQuery("company_status", "存续"),
				elastic.NewTermQuery(groupField, groupVal),
			).
			MustNot(elastic.NewTermQuery("company_type", "个体工商户")).
			Must(elastic.NewExistsQuery("capital"))
		// Fetch the group's total document count.
		countRes, err := client.Count(indexName).Query(subQuery).Do(ctx)
		if err != nil || countRes == 0 {
			log.Printf("skipping group %s (no data or count query failed)\n", groupVal)
			continue
		}
		// Take the top 20% by capital, at least 1 document.
		size := int(math.Max(1, float64(countRes)*0.2))
		sourceCtx := elastic.NewFetchSourceContext(true).Include(
			"company_name", "capital", "company_type", "company_status", groupField,
		)
		scroll := client.Scroll(indexName).
			Query(subQuery).
			SortWithInfo(elastic.SortInfo{
				Field:     "capital",
				Ascending: false,
				Missing:   "_last",
			}).
			Size(500).
			FetchSourceContext(sourceCtx)
		defer scroll.Clear(ctx) // NOTE: deferred inside a loop, released on function return
		part := 1
		row := 2
		f := excelize.NewFile()
		sheetName := "数据"
		f.NewSheet(sheetName)
		writeHeaders(f, sheetName, headers)
		fetched := 0
		for fetched < size {
			res, err := scroll.Do(ctx)
			if err == io.EOF {
				break
			}
			if err != nil {
				log.Printf("scroll query failed group=%v, err=%v\n", groupVal, err)
				break
			}
			for _, hit := range res.Hits.Hits {
				if fetched >= size {
					break
				}
				if row > maxRowsPerSheet {
					// Past the row limit: save the current file and start a new one.
					// NOTE: the file name still says "top5%" although 20% is sampled above.
					filename := fmt.Sprintf("%s/%s_top5%%_part%d.xlsx", outputDir, sanitizeFileName(groupVal), part)
					f.DeleteSheet("Sheet1")
					if err := f.SaveAs(filename); err != nil {
						log.Printf("❌ failed to save file [%s]: %v\n", filename, err)
					} else {
						log.Printf("✅ saved: %s (%d rows)", filename, row-2)
					}
					// Reset for the next part.
					part++
					row = 2
					f = excelize.NewFile()
					f.NewSheet(sheetName)
					writeHeaders(f, sheetName, headers)
				}
				var c map[string]interface{}
				if err := json.Unmarshal(hit.Source, &c); err != nil {
					continue
				}
				data := []interface{}{
					groupVal,
					c["company_name"],
					c["capital"],
					c["company_type"],
					c["company_status"],
				}
				for col, val := range data {
					cell, _ := excelize.CoordinatesToCellName(col+1, row)
					f.SetCellValue(sheetName, cell, val)
				}
				row++
				fetched++
			}
		}
		// Save the last (possibly partial) file.
		if row > 2 {
			filename := fmt.Sprintf("%s/%s_top5%%_part%d.xlsx", outputDir, sanitizeFileName(groupVal), part)
			f.DeleteSheet("Sheet1")
			if err := f.SaveAs(filename); err != nil {
				log.Printf("❌ failed to save file [%s]: %v\n", filename, err)
			} else {
				log.Printf("✅ export finished: %s (%d rows)", filename, row-2)
			}
		}
	}
	return nil
}

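// sanitizeFileName makes a group value safe to use as a file name.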
func sanitizeFileName(name string) string {
	// Replace characters that are not allowed in file names.
	re := regexp.MustCompile(`[\\/:*?"<>|]`)
	return re.ReplaceAllString(name, "_")
}

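// dealXlsx post-processes an exported workbook: it reads the sheet at index 3 of
// top_companies_by_industry_levels.xlsx, keeps only rows whose company_status
// column equals 存续, and saves them to a new workbook.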
func dealXlsx() {
	log.Println("start processing data")
	// Open the source workbook.
	f, err := excelize.OpenFile("top_companies_by_industry_levels.xlsx")
	if err != nil {
		panic(err)
	}
	// Take the worksheet at index 3 (not the first sheet).
	sheetName := f.GetSheetName(3)
	rows, err := f.GetRows(sheetName)
	if err != nil {
		panic(err)
	}
	// Create a new workbook for the filtered rows.
	newFile := excelize.NewFile()
	newSheet := newFile.GetSheetName(0)
	// Walk the source rows and keep only companies in 存续 status.
	rowIndex := 0
	for i, row := range rows {
		if rowIndex == 0 {
			// Write the header row.
			_ = newFile.SetSheetRow(newSheet, fmt.Sprintf("A%d", rowIndex+1), &row)
			rowIndex++
			continue
		}
		if i%10000 == 0 && len(row) > 1 {
			log.Println("progress:", i, row[1])
		}
		// Company status column (assumed to be the 5th column, index 4).
		if len(row) >= 5 && row[4] == "存续" {
			_ = newFile.SetSheetRow(newSheet, fmt.Sprintf("A%d", rowIndex+1), &row)
			rowIndex++
		}
	}
	// Save the filtered workbook.
	if err := newFile.SaveAs("筛选后-存续公司3.xlsx"); err != nil {
		panic(err)
	}
	log.Println("filtering done, result saved as 筛选后-存续公司3.xlsx")
}