|
@@ -13,12 +13,382 @@ import (
|
|
|
"jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
|
|
|
"log"
|
|
|
"os"
|
|
|
+ "regexp"
|
|
|
"strconv"
|
|
|
"strings"
|
|
|
"testing"
|
|
|
"time"
|
|
|
)
|
|
|
|
|
|
// Patterns used to peel administrative-region names off an organization
// name. Every bracket expression is a negated character class, so each rune
// listed in it is excluded individually (not as a word). Alternatives are
// tried left to right at the leftmost position where any of them can match.

// provinceRe matches a run of non-excluded runes ending in 省, 自治区 or
// 特别行政区, or one of the literal names 北京, 天津, 上海, 重庆.
var provinceRe = regexp.MustCompile(`(?P<province>[^省自治区特别行政区]+省|[^省自治区特别行政区]+自治区|[^省自治区特别行政区]+特别行政区|北京|天津|上海|重庆)`)

// cityRe matches a run of non-excluded runes ending in 市, 州, 盟, 地区 or
// 自治州.
var cityRe = regexp.MustCompile(`(?P<city>[^市州盟地区]+市|[^市州盟地区]+州|[^市州盟地区]+盟|[^市州盟地区]+地区|[^市州盟地区]+自治州)`)

// countyRe matches a run of non-excluded runes ending in 县, 区, 旗, 自治县
// or 自治旗.
var countyRe = regexp.MustCompile(`(?P<county>[^县区旗]+县|[^县区旗]+区|[^县区旗]+旗|[^县区旗]+自治县|[^县区旗]+自治旗)`)
|
|
|
+
|
|
|
+func TestXlsx2(T *testing.T) {
|
|
|
+ filePath := "中国政府机构目录树0425.xlsx"
|
|
|
+
|
|
|
+ // 打开文件
|
|
|
+ f, err := excelize.OpenFile(filePath)
|
|
|
+ if err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 读取 Sheet2 所有行数据
|
|
|
+ sheet2Rows, err := f.GetRows("爬取网站清单")
|
|
|
+ if err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 提取 Sheet2 中每行的 B/F/G 字段组合为字符串数组
|
|
|
+ var sheet2Lines []string
|
|
|
+ domains := make([]string, 0)
|
|
|
+ for k, row := range sheet2Rows {
|
|
|
+ if k == 0 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ var parts []string
|
|
|
+ if len(row) > 0 && row[0] != "" { // A 列
|
|
|
+ parts = append(parts, row[0])
|
|
|
+ }
|
|
|
+
|
|
|
+ if len(row) > 1 && row[1] != "" { // B 列
|
|
|
+ parts = append(parts, row[1])
|
|
|
+ }
|
|
|
+ if len(row) > 5 && row[5] != "" { // F 列
|
|
|
+ parts = append(parts, row[5])
|
|
|
+ }
|
|
|
+ if len(row) > 6 && row[6] != "" { // G 列(可能没有)
|
|
|
+ parts = append(parts, row[6])
|
|
|
+ }
|
|
|
+ sheet2Lines = append(sheet2Lines, strings.Join(parts, " "))
|
|
|
+ domains = append(domains, row[4])
|
|
|
+ }
|
|
|
+
|
|
|
+ // 遍历 Sheet1,每行拿 B/C 列去匹配
|
|
|
+ sheet1Rows, err := f.GetRows("组织架构带层级")
|
|
|
+ if err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ for i, row := range sheet1Rows {
|
|
|
+ if i <= 3 {
|
|
|
+ continue // 跳过标题行
|
|
|
+ }
|
|
|
+
|
|
|
+ if i%100 == 0 {
|
|
|
+ log.Println("current", i)
|
|
|
+ }
|
|
|
+ var match bool
|
|
|
+
|
|
|
+ province, city, county, mainPart := row[4], row[5], row[6], row[7]
|
|
|
+ if mainPart == "" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ domain := ""
|
|
|
+ for k, line := range sheet2Lines {
|
|
|
+ if !strings.Contains(line, mainPart) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ if strings.Contains(mainPart, "社保中心") {
|
|
|
+ mainPart = strings.ReplaceAll(mainPart, "社保中心", "人力资源和社会保障局")
|
|
|
+ }
|
|
|
+
|
|
|
+ if !strings.Contains(line, mainPart) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ //区县不为空
|
|
|
+ if county != "" {
|
|
|
+ if strings.Contains(line, county) {
|
|
|
+ match = true
|
|
|
+ domain = domains[k]
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ if city != "" {
|
|
|
+ if strings.Contains(line, city) {
|
|
|
+ match = true
|
|
|
+ domain = domains[k]
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ if province != "" {
|
|
|
+ if strings.Contains(line, province) {
|
|
|
+ match = true
|
|
|
+ domain = domains[k]
|
|
|
+ break
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ match = true
|
|
|
+ domain = domains[k]
|
|
|
+ break
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ // 如果匹配成功,在 H 列写入 “是”
|
|
|
+ if match {
|
|
|
+ f.SetCellValue("组织架构带层级", fmt.Sprintf("I%d", i+1), "是")
|
|
|
+ f.SetCellValue("组织架构带层级", fmt.Sprintf("J%d", i+1), domain)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 保存文件
|
|
|
+ if err := f.Save(); err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ fmt.Println("TestXlsx2 匹配完成")
|
|
|
+
|
|
|
+}
|
|
|
+
|
|
|
+// 提取行政区划信息
|
|
|
+func extractRegionInfo1(name string) (province, city, county, mainPart string) {
|
|
|
+ original := name
|
|
|
+
|
|
|
+ // 提取省
|
|
|
+ provinceMatch := provinceRe.FindString(name)
|
|
|
+ if provinceMatch != "" {
|
|
|
+ province = provinceMatch
|
|
|
+ name = strings.Replace(name, provinceMatch, "", 1)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 提取市
|
|
|
+ cityMatch := cityRe.FindString(name)
|
|
|
+ if cityMatch != "" {
|
|
|
+ city = cityMatch
|
|
|
+ name = strings.Replace(name, cityMatch, "", 1)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 提取区县
|
|
|
+ countyMatch := countyRe.FindString(name)
|
|
|
+ if countyMatch != "" {
|
|
|
+ county = countyMatch
|
|
|
+ name = strings.Replace(name, countyMatch, "", 1)
|
|
|
+ }
|
|
|
+
|
|
|
+ mainPart = strings.TrimSpace(name)
|
|
|
+
|
|
|
+ // 若全部都没提取到,则主干就是原始内容
|
|
|
+ if province == "" && city == "" && county == "" {
|
|
|
+ mainPart = original
|
|
|
+ }
|
|
|
+
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
+func extractRegionInfo(name string) (province, city, county, mainPart string) {
|
|
|
+ //original := name
|
|
|
+
|
|
|
+ // 提取省
|
|
|
+ provinceMatch := provinceRe.FindString(name)
|
|
|
+ if provinceMatch != "" {
|
|
|
+ province = provinceMatch
|
|
|
+ name = strings.Replace(name, provinceMatch, "", 1)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 提取市
|
|
|
+ cityMatch := cityRe.FindString(name)
|
|
|
+ if cityMatch != "" {
|
|
|
+ city = cityMatch
|
|
|
+ name = strings.Replace(name, cityMatch, "", 1)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 提取区县
|
|
|
+ countyMatch := countyRe.FindString(name)
|
|
|
+ if countyMatch != "" {
|
|
|
+ county = countyMatch
|
|
|
+ name = strings.Replace(name, countyMatch, "", 1)
|
|
|
+ }
|
|
|
+
|
|
|
+ mainPart = strings.TrimSpace(name)
|
|
|
+
|
|
|
+ // 若没有省市县,尝试去掉国家级前缀
|
|
|
+ if province == "" && city == "" && county == "" {
|
|
|
+ prefixes := []string{
|
|
|
+ "中华人民共和国",
|
|
|
+ "中国",
|
|
|
+ "国家",
|
|
|
+ "国务院",
|
|
|
+ }
|
|
|
+ for _, prefix := range prefixes {
|
|
|
+ if strings.HasPrefix(mainPart, prefix) {
|
|
|
+ mainPart = strings.TrimPrefix(mainPart, prefix)
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ mainPart = strings.TrimSpace(mainPart)
|
|
|
+ }
|
|
|
+
|
|
|
+ return
|
|
|
+}
|
|
|
+
|
|
|
+func TestXlsx(T *testing.T) {
|
|
|
+
|
|
|
+ filePath := "中国政府机构目录树0425.xlsx"
|
|
|
+ // 打开文件
|
|
|
+ f, err := excelize.OpenFile(filePath)
|
|
|
+ if err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 遍历 Sheet1,每行拿 B/C 列去匹配
|
|
|
+ sheet1Rows, err := f.GetRows("组织架构带层级")
|
|
|
+ if err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ for i, row := range sheet1Rows {
|
|
|
+ if i <= 1 {
|
|
|
+ continue // 跳过标题行
|
|
|
+ }
|
|
|
+
|
|
|
+ if i%100 == 0 {
|
|
|
+ log.Println("current", i)
|
|
|
+ }
|
|
|
+
|
|
|
+ if len(row) < 4 || row[3] == "" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ name := row[3]
|
|
|
+ province, city, county, mainPart := extractRegionInfo(name)
|
|
|
+
|
|
|
+ if province != "" && row[4] == "" {
|
|
|
+ cell := fmt.Sprintf("E%d", i+1)
|
|
|
+ f.SetCellValue("组织架构带层级", cell, strings.ReplaceAll(province, "省", ""))
|
|
|
+ }
|
|
|
+
|
|
|
+ if city != "" && row[5] == "" {
|
|
|
+ cell := fmt.Sprintf("F%d", i+1)
|
|
|
+ f.SetCellValue("组织架构带层级", cell, city)
|
|
|
+ }
|
|
|
+
|
|
|
+ if county != "" && row[6] == "" {
|
|
|
+ cell := fmt.Sprintf("G%d", i+1)
|
|
|
+ f.SetCellValue("组织架构带层级", cell, county)
|
|
|
+ }
|
|
|
+ if mainPart != "" {
|
|
|
+ cell := fmt.Sprintf("H%d", i+1)
|
|
|
+ f.SetCellValue("组织架构带层级", cell, mainPart)
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ // 保存文件
|
|
|
+ if err := f.Save(); err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func TestMatchWeb(T *testing.T) {
|
|
|
+ filePath := "政府机构--网站碰撞.xlsx"
|
|
|
+
|
|
|
+ // 打开文件
|
|
|
+ f, err := excelize.OpenFile(filePath)
|
|
|
+ if err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 读取 Sheet2 所有行数据
|
|
|
+ sheet2Rows, err := f.GetRows("爬取网站清单")
|
|
|
+ if err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ // 提取 Sheet2 中每行的 B/F/G 字段组合为字符串数组
|
|
|
+ var sheet2Lines []string
|
|
|
+ for k, row := range sheet2Rows {
|
|
|
+ if k == 0 {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ var parts []string
|
|
|
+ if len(row) > 1 && row[1] != "" { // B 列
|
|
|
+ parts = append(parts, row[1])
|
|
|
+ }
|
|
|
+ if len(row) > 5 && row[5] != "" { // F 列
|
|
|
+ parts = append(parts, row[5])
|
|
|
+ }
|
|
|
+ if len(row) > 6 && row[6] != "" { // G 列(可能没有)
|
|
|
+ parts = append(parts, row[6])
|
|
|
+ }
|
|
|
+ sheet2Lines = append(sheet2Lines, strings.Join(parts, " "))
|
|
|
+ }
|
|
|
+
|
|
|
+ // 遍历 Sheet1,每行拿 B/C 列去匹配
|
|
|
+ sheet1Rows, err := f.GetRows("政府机构")
|
|
|
+ if err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ for i, row := range sheet1Rows {
|
|
|
+ if i <= 1 {
|
|
|
+ continue // 跳过标题行
|
|
|
+ }
|
|
|
+
|
|
|
+ if i%100 == 0 {
|
|
|
+ log.Println("current", i)
|
|
|
+ }
|
|
|
+ var match bool
|
|
|
+ var bVal, cVal string
|
|
|
+
|
|
|
+ if len(row) > 1 {
|
|
|
+ bVal = strings.TrimSpace(row[1]) // B列
|
|
|
+ }
|
|
|
+ if len(row) > 2 {
|
|
|
+ cVal = strings.TrimSpace(row[2]) // C列
|
|
|
+ }
|
|
|
+
|
|
|
+ // 用 B 列匹配 Sheet2 所有行
|
|
|
+ if bVal != "" {
|
|
|
+ for _, line := range sheet2Lines {
|
|
|
+ if strings.Contains(line, bVal) {
|
|
|
+ match = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 如果 B 匹配失败且 C 有值,再用 C 匹配
|
|
|
+ if !match && cVal != "" {
|
|
|
+ for _, line := range sheet2Lines {
|
|
|
+ if strings.Contains(line, cVal) {
|
|
|
+ match = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+ if strings.Contains(cVal, "社保中心") {
|
|
|
+ val2 := strings.ReplaceAll(cVal, "社保中心", "人力资源和社会保障局")
|
|
|
+ if strings.Contains(line, val2) {
|
|
|
+ match = true
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 如果匹配成功,在 H 列写入 “是”
|
|
|
+ if match {
|
|
|
+ cell := fmt.Sprintf("H%d", i+1)
|
|
|
+ f.SetCellValue("政府机构", cell, "是")
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // 保存文件
|
|
|
+ if err := f.Save(); err != nil {
|
|
|
+ panic(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ fmt.Println("匹配完成,结果已写入 Sheet1 的 H 列")
|
|
|
+}
|
|
|
+
|
|
|
func TestExportCompanyType(T *testing.T) {
|
|
|
ctx := context.Background()
|
|
|
// MongoDB 连接字符串
|