wcc 1 周之前
父節點
當前提交
ce8a0f370a
共有 6 個文件被更改,包括 435 次插入和 5 次删除
  1. 二進制
      environment/environment.exe
  2. 1 1
      graph/graph_test.go
  3. 64 4
      test/test_test.go
  4. 370 0
      xlsx/xlsx_test.go
  5. 二進制
      xlsx/中国政府机构目录树0425.xlsx
  6. 二進制
      xlsx/政府机构--网站碰撞.xlsx

二進制
environment/environment → environment/environment.exe


+ 1 - 1
graph/graph_test.go

@@ -33,7 +33,7 @@ func TestCheckLegalRelationships(t *testing.T) {
 	defer client.Close()
 	//names := []string{"北京剑鱼信息技术有限公司", "河南拓普计算机网络工程有限公司", "上海元藩投资有限公司"}
 	names := []string{"万达集团股份有限公司", "万达石化有限公司", "山东万达电缆有限公司", "山东万达化工有限公司", "山东万达热电有限公司", "山东万达进出口有限公司", "山东耐斯特炭黑有限公司", "山东万达宝通轮胎有限公司", "山东明宇化学有限公司", "大连万达集团股份有限公司", "大连万达(上海)金融集团有限公司", "大连万达集团咨询服务有限公司", "北京万达足球俱乐部有限公司", "北京红舸科技文化有限公司", "北京万达文化产业集团有限公司"}
-	has, result, err := client.CheckLegalRelationships(names, 4, 1)
+	has, result, _, err := client.CheckLegalRelationships(names, 4, 1)
 	log.Println(has, result, err)
 }
 

File diff suppressed because it is too large
+ 64 - 4
test/test_test.go


+ 370 - 0
xlsx/xlsx_test.go

@@ -13,12 +13,382 @@ import (
 	"jygit.jydev.jianyu360.cn/data_processing/common_utils/mongodb"
 	"log"
 	"os"
+	"regexp"
 	"strconv"
 	"strings"
 	"testing"
 	"time"
 )
 
+var (
+	// Region regexes: provinceRe matches a run ending in 省/自治区/特别行政区 or a bare 北京/天津/上海/重庆; cityRe a run ending in 市/州/盟/地区/自治州; countyRe a run ending in 县/区/旗/自治县/自治旗.
+	provinceRe = regexp.MustCompile(`(?P<province>[^省自治区特别行政区]+省|[^省自治区特别行政区]+自治区|[^省自治区特别行政区]+特别行政区|北京|天津|上海|重庆)`)
+	cityRe     = regexp.MustCompile(`(?P<city>[^市州盟地区]+市|[^市州盟地区]+州|[^市州盟地区]+盟|[^市州盟地区]+地区|[^市州盟地区]+自治州)`)
+	countyRe   = regexp.MustCompile(`(?P<county>[^县区旗]+县|[^县区旗]+区|[^县区旗]+旗|[^县区旗]+自治县|[^县区旗]+自治旗)`)
+)
+
+// TestXlsx2 matches every organisation on the "组织架构带层级" sheet of
+// 中国政府机构目录树0425.xlsx against the crawled-site list on the
+// "爬取网站清单" sheet and writes the outcome back into the same workbook.
+//
+// Organisation rows with index 0-3 are skipped as headers. For the remaining
+// rows the region columns E/F/G and the core name in column H are read. A site
+// row matches when its joined A/B/F/G text contains the core name (or, for
+// names containing "社保中心", the variant with that term replaced by
+// "人力资源和社会保障局") and also contains the most specific non-empty region:
+// county first, then city, then province. When no region is set, the name
+// alone decides. On a match column I is set to "是" and column J to the site
+// row's column-E value. The workbook is saved in place.
+func TestXlsx2(T *testing.T) {
+	const (
+		filePath  = "中国政府机构目录树0425.xlsx"
+		siteSheet = "爬取网站清单"
+		orgSheet  = "组织架构带层级"
+	)
+
+	// cellAt returns row[idx], or "" when the row has fewer cells than that.
+	// Rows may be shorter than the sheet is wide, so direct indexing can panic.
+	cellAt := func(row []string, idx int) string {
+		if idx < len(row) {
+			return row[idx]
+		}
+		return ""
+	}
+
+	f, err := excelize.OpenFile(filePath)
+	if err != nil {
+		T.Fatalf("open %s: %v", filePath, err)
+	}
+
+	siteRows, err := f.GetRows(siteSheet)
+	if err != nil {
+		T.Fatalf("read sheet %s: %v", siteSheet, err)
+	}
+
+	// siteLines[k] is the searchable text of a site row; domains[k] is the
+	// column-E value of that same row, kept index-aligned with siteLines.
+	siteLines := make([]string, 0, len(siteRows))
+	domains := make([]string, 0, len(siteRows))
+	for k, row := range siteRows {
+		if k == 0 {
+			continue // header row
+		}
+		var parts []string
+		for _, col := range []int{0, 1, 5, 6} { // columns A, B, F, G
+			if v := cellAt(row, col); v != "" {
+				parts = append(parts, v)
+			}
+		}
+		siteLines = append(siteLines, strings.Join(parts, " "))
+		domains = append(domains, cellAt(row, 4))
+	}
+
+	orgRows, err := f.GetRows(orgSheet)
+	if err != nil {
+		T.Fatalf("read sheet %s: %v", orgSheet, err)
+	}
+
+	for i, row := range orgRows {
+		if i <= 3 {
+			continue // header rows
+		}
+		if i%100 == 0 {
+			log.Println("current", i)
+		}
+
+		province, city, county := cellAt(row, 4), cellAt(row, 5), cellAt(row, 6)
+		mainPart := cellAt(row, 7)
+		if mainPart == "" {
+			continue
+		}
+
+		// The alias is computed once per row so that matching does not depend
+		// on the order of the site rows.
+		altPart := ""
+		if strings.Contains(mainPart, "社保中心") {
+			altPart = strings.ReplaceAll(mainPart, "社保中心", "人力资源和社会保障局")
+		}
+
+		// Most specific region available; empty means "no region constraint".
+		region := county
+		if region == "" {
+			region = city
+		}
+		if region == "" {
+			region = province
+		}
+
+		matched := false
+		domain := ""
+		for k, line := range siteLines {
+			nameHit := strings.Contains(line, mainPart) ||
+				(altPart != "" && strings.Contains(line, altPart))
+			if !nameHit {
+				continue
+			}
+			if region == "" || strings.Contains(line, region) {
+				matched = true
+				domain = domains[k]
+				break
+			}
+		}
+		if !matched {
+			continue
+		}
+
+		// Sheet rows are 1-based while the slice index is 0-based.
+		rowNum := i + 1
+		if err := f.SetCellValue(orgSheet, fmt.Sprintf("I%d", rowNum), "是"); err != nil {
+			T.Fatalf("set I%d: %v", rowNum, err)
+		}
+		if err := f.SetCellValue(orgSheet, fmt.Sprintf("J%d", rowNum), domain); err != nil {
+			T.Fatalf("set J%d: %v", rowNum, err)
+		}
+	}
+
+	if err := f.Save(); err != nil {
+		T.Fatalf("save %s: %v", filePath, err)
+	}
+
+	fmt.Println("TestXlsx2  匹配完成")
+}
+
+// extractRegionInfo1 splits an organisation name into its province, city and
+// county parts plus the remaining core name.
+//
+// Each part is the leftmost match of provinceRe, cityRe and countyRe, applied
+// in that order; a matched part is removed from the name before the next
+// pattern runs. mainPart is what is left, with surrounding whitespace trimmed.
+// When no region part is found at all, mainPart is the untouched input.
+func extractRegionInfo1(name string) (province, city, county, mainPart string) {
+	original := name
+
+	if loc := provinceRe.FindStringIndex(name); loc != nil {
+		province = name[loc[0]:loc[1]]
+		rest := name[loc[1]:]
+		// The municipalities are matched without their "市" suffix. Drop it
+		// here, otherwise it would leak into the county or the core name
+		// (e.g. "北京市朝阳区" would yield the county "市朝阳区").
+		switch province {
+		case "北京", "天津", "上海", "重庆":
+			rest = strings.TrimPrefix(rest, "市")
+		}
+		name = name[:loc[0]] + rest
+	}
+
+	if loc := cityRe.FindStringIndex(name); loc != nil {
+		city = name[loc[0]:loc[1]]
+		name = name[:loc[0]] + name[loc[1]:]
+	}
+
+	if loc := countyRe.FindStringIndex(name); loc != nil {
+		county = name[loc[0]:loc[1]]
+		name = name[:loc[0]] + name[loc[1]:]
+	}
+
+	mainPart = strings.TrimSpace(name)
+
+	// Nothing was extracted: the core name is the original input.
+	if province == "" && city == "" && county == "" {
+		mainPart = original
+	}
+
+	return province, city, county, mainPart
+}
+
+// extractRegionInfo splits an organisation name into its province, city and
+// county parts plus the remaining core name.
+//
+// Each part is the leftmost match of provinceRe, cityRe and countyRe, applied
+// in that order; a matched part is removed from the name before the next
+// pattern runs. mainPart is what is left, with surrounding whitespace trimmed.
+// When no region part is found, the first matching national-level prefix
+// ("中华人民共和国", "中国", "国家", "国务院") is stripped from mainPart.
+func extractRegionInfo(name string) (province, city, county, mainPart string) {
+	if loc := provinceRe.FindStringIndex(name); loc != nil {
+		province = name[loc[0]:loc[1]]
+		rest := name[loc[1]:]
+		// The municipalities are matched without their "市" suffix. Drop it
+		// here, otherwise it would leak into the county or the core name
+		// (e.g. "北京市朝阳区" would yield the county "市朝阳区").
+		switch province {
+		case "北京", "天津", "上海", "重庆":
+			rest = strings.TrimPrefix(rest, "市")
+		}
+		name = name[:loc[0]] + rest
+	}
+
+	if loc := cityRe.FindStringIndex(name); loc != nil {
+		city = name[loc[0]:loc[1]]
+		name = name[:loc[0]] + name[loc[1]:]
+	}
+
+	if loc := countyRe.FindStringIndex(name); loc != nil {
+		county = name[loc[0]:loc[1]]
+		name = name[:loc[0]] + name[loc[1]:]
+	}
+
+	mainPart = strings.TrimSpace(name)
+	if province != "" || city != "" || county != "" {
+		return province, city, county, mainPart
+	}
+
+	// No region found: strip at most one national-level prefix. The longer
+	// "中华人民共和国" is listed before the shorter "中国" so it is tried first.
+	prefixes := []string{
+		"中华人民共和国",
+		"中国",
+		"国家",
+		"国务院",
+	}
+	for _, prefix := range prefixes {
+		if strings.HasPrefix(mainPart, prefix) {
+			mainPart = strings.TrimPrefix(mainPart, prefix)
+			break
+		}
+	}
+	mainPart = strings.TrimSpace(mainPart)
+
+	return province, city, county, mainPart
+}
+
+func TestXlsx(T *testing.T) {
+
+	filePath := "中国政府机构目录树0425.xlsx"
+	// 打开文件
+	f, err := excelize.OpenFile(filePath)
+	if err != nil {
+		panic(err)
+	}
+
+	// 遍历 Sheet1,每行拿 B/C 列去匹配
+	sheet1Rows, err := f.GetRows("组织架构带层级")
+	if err != nil {
+		panic(err)
+	}
+
+	for i, row := range sheet1Rows {
+		if i <= 1 {
+			continue // 跳过标题行
+		}
+
+		if i%100 == 0 {
+			log.Println("current", i)
+		}
+
+		if len(row) < 4 || row[3] == "" {
+			continue
+		}
+
+		name := row[3]
+		province, city, county, mainPart := extractRegionInfo(name)
+
+		if province != "" && row[4] == "" {
+			cell := fmt.Sprintf("E%d", i+1)
+			f.SetCellValue("组织架构带层级", cell, strings.ReplaceAll(province, "省", ""))
+		}
+
+		if city != "" && row[5] == "" {
+			cell := fmt.Sprintf("F%d", i+1)
+			f.SetCellValue("组织架构带层级", cell, city)
+		}
+
+		if county != "" && row[6] == "" {
+			cell := fmt.Sprintf("G%d", i+1)
+			f.SetCellValue("组织架构带层级", cell, county)
+		}
+		if mainPart != "" {
+			cell := fmt.Sprintf("H%d", i+1)
+			f.SetCellValue("组织架构带层级", cell, mainPart)
+		}
+
+	}
+
+	// 保存文件
+	if err := f.Save(); err != nil {
+		panic(err)
+	}
+}
+
+// TestMatchWeb marks which organisations on the "政府机构" sheet of
+// 政府机构--网站碰撞.xlsx appear in the crawled-site list on the
+// "爬取网站清单" sheet of the same workbook.
+//
+// Each site row (the first row is skipped as a header) is reduced to its
+// non-empty B/F/G cells joined by spaces. For every organisation row with
+// index >= 2, the trimmed column-B value is searched for in those lines; if it
+// is empty or not found, the trimmed column-C value is tried, together with
+// its variant where "社保中心" is replaced by "人力资源和社会保障局". On a match
+// column H of that row is set to "是". The workbook is saved in place.
+func TestMatchWeb(T *testing.T) {
+	const (
+		filePath  = "政府机构--网站碰撞.xlsx"
+		siteSheet = "爬取网站清单"
+		orgSheet  = "政府机构"
+	)
+
+	f, err := excelize.OpenFile(filePath)
+	if err != nil {
+		T.Fatalf("open %s: %v", filePath, err)
+	}
+
+	siteRows, err := f.GetRows(siteSheet)
+	if err != nil {
+		T.Fatalf("read sheet %s: %v", siteSheet, err)
+	}
+
+	siteLines := make([]string, 0, len(siteRows))
+	for k, row := range siteRows {
+		if k == 0 {
+			continue // header row
+		}
+		var parts []string
+		for _, col := range []int{1, 5, 6} { // columns B, F, G
+			if col < len(row) && row[col] != "" {
+				parts = append(parts, row[col])
+			}
+		}
+		siteLines = append(siteLines, strings.Join(parts, " "))
+	}
+
+	// foundIn reports whether any site line contains any non-empty needle.
+	foundIn := func(needles ...string) bool {
+		for _, line := range siteLines {
+			for _, needle := range needles {
+				if needle != "" && strings.Contains(line, needle) {
+					return true
+				}
+			}
+		}
+		return false
+	}
+
+	orgRows, err := f.GetRows(orgSheet)
+	if err != nil {
+		T.Fatalf("read sheet %s: %v", orgSheet, err)
+	}
+
+	for i, row := range orgRows {
+		if i <= 1 {
+			continue // header rows
+		}
+		if i%100 == 0 {
+			log.Println("current", i)
+		}
+
+		var bVal, cVal string
+		if len(row) > 1 {
+			bVal = strings.TrimSpace(row[1]) // column B
+		}
+		if len(row) > 2 {
+			cVal = strings.TrimSpace(row[2]) // column C
+		}
+
+		matched := foundIn(bVal)
+		if !matched && cVal != "" {
+			// Built once per row rather than once per site line.
+			alias := ""
+			if strings.Contains(cVal, "社保中心") {
+				alias = strings.ReplaceAll(cVal, "社保中心", "人力资源和社会保障局")
+			}
+			matched = foundIn(cVal, alias)
+		}
+		if !matched {
+			continue
+		}
+
+		cell := fmt.Sprintf("H%d", i+1) // sheet rows are 1-based
+		if err := f.SetCellValue(orgSheet, cell, "是"); err != nil {
+			T.Fatalf("set %s: %v", cell, err)
+		}
+	}
+
+	if err := f.Save(); err != nil {
+		T.Fatalf("save %s: %v", filePath, err)
+	}
+
+	fmt.Println("匹配完成,结果已写入 Sheet1 的 H 列")
+}
+
 func TestExportCompanyType(T *testing.T) {
 	ctx := context.Background()
 	// MongoDB 连接字符串

二進制
xlsx/中国政府机构目录树0425.xlsx


二進制
xlsx/政府机构--网站碰撞.xlsx


Some files were not shown because too many files changed in this diff