123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388 |
- package main
- import (
- "bytes"
- "fmt"
- "golang.org/x/net/html"
- util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
- "regexp"
- "sort"
- "strconv"
- "strings"
- "unicode"
- "unicode/utf8"
- )
// Date patterns used by getMethod. Each pattern is anchored to the whole
// input and captures the numeric date components as submatches.
var (
	// TimeV1 matches a four-digit year, optionally followed by "年" or ".".
	TimeV1 = regexp.MustCompile(`^(\d{4})[年.]?$`)
	// TimeV2 matches a four-digit year and a one- or two-digit month. The
	// separator after the year ("年", ".", "/" or "-") and the one after the
	// month ("月", ".", "/" or "-") are both optional.
	TimeV2 = regexp.MustCompile(`^(\d{4})[年./-]?(\d{1,2})[月./-]?$`)
	// TimeV3 matches year, month and a one- or two-digit day, with the same
	// optional separators as TimeV2 and an optional trailing "日".
	TimeV3 = regexp.MustCompile(`^(\d{4})[年./-]?(\d{1,2})[月./-]?(\d{1,2})[日]?$`)
)
- // GetJyURLByID 获取剑鱼地址
- func GetJyURLByID(id string) string {
- var Url = "https://www.jianyu360.com/article/content/%s.html"
- url := fmt.Sprintf(Url, util.CommonEncodeArticle("content", id))
- return url
- }
// IsHanStart reports whether the first character of s is a Han character or
// any other Unicode letter. An empty string yields false.
func IsHanStart(s string) bool {
	if s == "" {
		return false
	}
	// Only the leading rune matters, so decode just that one.
	first, _ := utf8.DecodeRuneInString(s)
	return unicode.Is(unicode.Han, first) || unicode.IsLetter(first)
}
// IsHan reports whether c belongs to the Unicode Han script.
func IsHan(c rune) bool {
	return unicode.Is(unicode.Han, c)
}
// IsBracketStartWithHan reports whether s starts with an ASCII "(" whose
// matching-side content (everything up to the first ")") begins with a Han
// character or another Unicode letter.
//
// It returns false when s does not start with "(", when no ")" follows, or
// when the parentheses are empty.
func IsBracketStartWithHan(s string) bool {
	if !strings.HasPrefix(s, "(") {
		return false
	}
	closeIdx := strings.IndexByte(s, ')')
	if closeIdx < 0 {
		return false
	}
	inner := s[1:closeIdx]
	if inner == "" {
		return false
	}
	// Classify the first rune, not the first byte: the lead byte of a
	// multi-byte UTF-8 sequence would otherwise be mistaken for a Latin-1
	// letter and punctuation such as "。" would be accepted.
	first, _ := utf8.DecodeRuneInString(inner)
	return unicode.IsLetter(first) || unicode.Is(unicode.Han, first)
}
- // IsCompanyName 判断字符串是否以汉字开头、以括号开头并且括号里面是汉字、以"公司"结尾,其中一个条件符合即返回true,否则返回false
- func IsCompanyName(s string) bool {
- r := []rune(s)
- //if len(r) >= 6 && (string(r[len(r)-6:]) == "有限公司" || string(r[len(r)-6:]) == "股份有限公司") {
- // return (IsHanStart(s) || IsBracketStartWithHan(s))
- //} else if len(r) >= 2 && string(r[len(r)-2:]) == "公司" {
- // return (IsHanStart(s) || IsBracketStartWithHan(s))
- //}
- if len(r) >= 2 {
- return (IsHanStart(s) || IsBracketStartWithHan(s))
- }
- return false
- }
// hanRunRe matches one maximal run of consecutive Han characters. It is
// compiled once at package scope instead of on every call.
var hanRunRe = regexp.MustCompile(`[\p{Han}]+`)

// GetChineseCharacters returns the first contiguous run of Han characters
// found in s. Later runs are ignored, and the empty string is returned when s
// contains no Han character at all.
func GetChineseCharacters(s string) string {
	return hanRunRe.FindString(s)
}
- func getCompanyName(name string) string {
- if IsCompanyName(name) {
- return name
- }
- return GetChineseCharacters(name)
- }
// IsUnicodeStart reports whether a rune can be decoded from the start of s.
// An empty string yields false.
//
// NOTE(review): utf8.DecodeRuneInString reports a width of at least 1 for any
// non-empty string (an invalid byte decodes as a one-byte RuneError), so this
// currently returns true for every non-empty input. Confirm whether a
// stricter check (valid UTF-8, or a non-ASCII first character) was intended.
func IsUnicodeStart(s string) bool {
	if s == "" {
		return false
	}
	_, width := utf8.DecodeRuneInString(s)
	return width >= 1
}
// RemoveDuplicateSuffix collapses consecutive repetitions of suffix at the
// end of str into a single occurrence, e.g. ("山东大学大学", "大学") gives
// "山东大学".
//
// suffix is treated as literal text, so characters that are special in
// regular expressions (such as "(", ")" or ".") are matched as themselves.
// str is returned unchanged when suffix is empty or str does not end with it.
func RemoveDuplicateSuffix(str string, suffix string) string {
	if suffix == "" || !strings.HasSuffix(str, suffix) {
		return str
	}
	// Strip every trailing repetition, then put exactly one back.
	base := strings.TrimSuffix(str, suffix)
	for strings.HasSuffix(base, suffix) {
		base = strings.TrimSuffix(base, suffix)
	}
	return base + suffix
}
- //func findName(name string) []map[string]interface{} {
- // filter := bson.M{"name": name, "status": 1}
- // info, _ := Mgo.Find(wccBuyer, filter, nil, nil, false, -1, -1)
- //
- // return *info
- //}
- //
- //func findNameID(id string) []map[string]interface{} {
- // filter := bson.M{"name_id": id, "status": 1}
- // info, _ := Mgo.Find(wccBuyer, filter, nil, nil, false, -1, -1)
- //
- // return *info
- //}
// isStringRepeating reports whether str consists entirely of one unit
// repeated two or more times, e.g. "山东大学山东大学" or "aaaa".
//
// Strings shorter than two bytes (including the empty string) return true,
// which keeps the result the previous byte-by-byte comparison gave for them.
func isStringRepeating(str string) bool {
	if len(str) < 2 {
		return true
	}
	// A string is a repetition of a shorter unit exactly when it occurs
	// inside its own doubling at an offset other than 0 and len(str).
	// Dropping the first and last byte of the doubling rules those two out.
	doubled := str + str
	return strings.Contains(doubled[1:len(doubled)-1], str)
}
- // @Description 采购意向 预计采购时间处理
- // @Author J 2022/6/7 8:04 PM
- func getMethod(str string) int64 {
- // Handle "YYYY" format
- if TimeV1.MatchString(str) {
- arr := TimeV1.FindStringSubmatch(str)
- st := arr[1] + "0000"
- parseInt, err := strconv.ParseInt(st, 10, 64)
- if err == nil {
- return parseInt
- }
- }
- // Handle "YYYYMM" or "YYYY/MM" or "YYYY-MM" or "YYYY.MM" format
- if TimeV2.MatchString(str) {
- arr := TimeV2.FindStringSubmatch(str)
- year := arr[1]
- month := arr[2]
- if len(month) == 1 {
- month = "0" + month
- }
- str2 := year + month + "00"
- parseInt, err := strconv.ParseInt(str2, 10, 64)
- if err == nil {
- return parseInt
- }
- }
- // Handle "YYYYMMDD" or "YYYY/MM/DD" or "YYYY-MM-DD" or "YYYY.MM.DD" format
- if TimeV3.MatchString(str) {
- match := TimeV3.FindStringSubmatch(str)
- if len(match) >= 4 {
- year := match[1]
- month := match[2]
- day := match[3]
- if len(month) == 1 {
- month = "0" + month
- }
- if len(day) == 1 {
- day = "0" + day
- }
- dateStr := year + month + day
- parseInt, err := strconv.ParseInt(dateStr, 10, 64)
- if err == nil {
- return parseInt
- }
- }
- }
- return 0
- }
// ProcessTopscopeclass post-processes industry classification labels.
//
// Each entry of tops is expected to end in one variable marker character;
// that last character is removed to obtain the cleaned top-level name. Empty
// entries are skipped. Cleaned names are de-duplicated and kept in the order
// in which they first appear.
//
// For every cleaned name that is not a substring of any entry originally in
// subs, a fallback entry "<name>_其它" is appended to subs.
//
// It returns tops unchanged, the extended subs, and the cleaned names (a
// non-nil slice even when it is empty).
func ProcessTopscopeclass(tops, subs []string) ([]string, []string, []string) {
	cleanedTops := make([]string, 0, len(tops))
	seen := make(map[string]struct{}, len(tops))
	for _, top := range tops {
		if top == "" {
			// Nothing to strip; slicing off a last character here used to panic.
			continue
		}
		_, lastSize := utf8.DecodeLastRuneInString(top)
		cleaned := top[:len(top)-lastSize]
		if _, dup := seen[cleaned]; dup {
			continue
		}
		seen[cleaned] = struct{}{}
		cleanedTops = append(cleanedTops, cleaned)
	}

	// Coverage is judged against the caller's entries only, not against the
	// fallback entries appended below.
	originalSubs := subs
	for _, top := range cleanedTops {
		covered := false
		for _, sub := range originalSubs {
			if strings.Contains(sub, top) {
				covered = true
				break
			}
		}
		if !covered {
			subs = append(subs, top+"_其它")
		}
	}

	return tops, subs, cleanedTops
}
// IsInStringArray reports whether str is an element of arr.
//
// The lookup sorts a private copy of arr and binary-searches it, so the
// caller's slice is never reordered by this call.
func IsInStringArray(str string, arr []string) bool {
	sorted := make([]string, len(arr))
	copy(sorted, arr)
	sort.Strings(sorted)

	pos := sort.SearchStrings(sorted, str)
	return pos < len(sorted) && sorted[pos] == str
}
- // getProject 根据标讯ID获取项目信息
- func getProject(id string) map[string]interface{} {
- where := map[string]interface{}{
- "ids": id,
- }
- p, _ := MgoR.FindOne("projectset_20230904", where)
- project := *p
- return project
- }
- // CleanHTMLTags 处理HTML内容并返回清理后的文本
- func CleanHTMLTags(htmlContent string) (string, error) {
- // 解析HTML内容
- doc, err := html.Parse(strings.NewReader(htmlContent))
- if err != nil {
- return "", err
- }
- var buf bytes.Buffer
- // 递归函数,用来遍历 HTML 树
- var f func(*html.Node)
- f = func(n *html.Node) {
- // 处理文本节点
- if n.Type == html.TextNode {
- // 去掉文本节点中的所有空格
- //buf.WriteString(n.Data)
- // 去掉文本节点中的所有空格
- trimmedText := strings.ReplaceAll(n.Data, " ", "") // 去掉所有空格
- buf.WriteString(trimmedText)
- }
- // 处理元素节点
- if n.Type == html.ElementNode {
- // 调试:输出当前节点的类型和标签名
- //fmt.Printf("ElementNode: %s\n", n.Data)
- // 处理 <br> 标签,插入换行
- if n.Data == "br" {
- buf.WriteString("\n")
- } else if n.Data == "table" {
- // 处理表格标签 <table>
- for tr := n.FirstChild; tr != nil; tr = tr.NextSibling {
- if tr.Type == html.ElementNode && tr.Data == "tr" {
- // 遍历每行中的 <td> 单元格
- for td := tr.FirstChild; td != nil; td = td.NextSibling {
- if td.Data == "td" {
- buf.WriteString("[TD] ")
- f(td) // 递归处理 <td> 中的内容
- }
- }
- }
- }
- //buf.WriteString("Table End\n")
- } else if n.Data == "ul" {
- // 处理无序列表 <ul>
- for li := n.FirstChild; li != nil; li = li.NextSibling {
- if li.Data == "li" {
- buf.WriteString("- ")
- f(li)
- buf.WriteString("\n")
- }
- }
- }
- }
- // 遍历子节点
- for child := n.FirstChild; child != nil; child = child.NextSibling {
- f(child)
- }
- }
- // 启动递归遍历
- f(doc)
- // 去除多余空格
- trimmedText := strings.TrimSpace(buf.String())
- return trimmedText, nil
- }
// sentenceSplitRe matches a single sentence-level punctuation mark or a run
// of whitespace. It is compiled once at package scope instead of per call.
var sentenceSplitRe = regexp.MustCompile(`[,。!?、;:]|\s+`)

// SplitTextByChinesePunctuation breaks text into phrases at the punctuation
// marks listed in sentenceSplitRe and at runs of whitespace.
//
// Non-breaking spaces (U+00A0) are converted to ordinary spaces first so
// that they also act as separators. Each phrase is trimmed and empty phrases
// are dropped; the result is nil when nothing remains.
func SplitTextByChinesePunctuation(text string) []string {
	text = strings.ReplaceAll(text, "\u00A0", " ")

	var result []string
	for _, part := range sentenceSplitRe.Split(text, -1) {
		if trimmed := strings.TrimSpace(part); trimmed != "" {
			result = append(result, trimmed)
		}
	}
	return result
}
// RemoveDuplicates removes repeated strings and strings that are contained
// in another, longer entry, regardless of which of the two appears first.
//
// Surviving entries keep their relative input order. The result is nil when
// strs is empty.
func RemoveDuplicates(strs []string) []string {
	var result []string
	for _, str := range strs {
		// Skip str when an entry already kept contains it (this also covers
		// exact duplicates).
		covered := false
		for _, kept := range result {
			if strings.Contains(kept, str) {
				covered = true
				break
			}
		}
		if covered {
			continue
		}

		// str is new: drop every earlier entry that str itself contains,
		// filtering in place, then keep str.
		remaining := result[:0]
		for _, kept := range result {
			if !strings.Contains(str, kept) {
				remaining = append(remaining, kept)
			}
		}
		result = append(remaining, str)
	}
	return result
}
// CountChineseCharacters returns the total number of Han characters across
// all strings in strs. Characters classified as punctuation are never
// counted.
func CountChineseCharacters(strs []string) int {
	total := 0
	for i := range strs {
		for _, ch := range strs[i] {
			if !unicode.Is(unicode.Han, ch) || unicode.IsPunct(ch) {
				continue
			}
			total++
		}
	}
	return total
}
|