package main
import (
"bytes"
"fmt"
"log"
"regexp"
"sort"
"strings"
"golang.org/x/net/html"
)
// chinesePunctuationSplitter2 matches one separator: a listed punctuation
// mark in either its ASCII or its full-width (Chinese) form, an ideographic
// space, or a run of ASCII white space. It is compiled once at package scope
// so that SplitTextByChinesePunctuation2 does not recompile it on every call.
//
// The full-width forms are spelled as \x{...} escapes: U+FF0C (comma),
// U+FF01 (exclamation mark), U+FF1F (question mark), U+FF1B (semicolon),
// U+FF1A (colon) and U+3000 (ideographic space, which \s does not cover
// because \s is ASCII-only in Go's regexp syntax).
var chinesePunctuationSplitter2 = regexp.MustCompile(`[,。!?、;:\x{FF0C}\x{FF01}\x{FF1F}\x{FF1B}\x{FF1A}\x{3000}]|\s+`)

// SplitTextByChinesePunctuation2 splits text into segments at Chinese
// punctuation (comma, full stop, exclamation mark, question mark, enumeration
// comma, semicolon, colon; ASCII and full-width forms alike) and at white
// space.
//
// Each segment is trimmed of surrounding white space, and segments that are
// empty after trimming are dropped. The result is nil when text contains no
// non-empty segment.
func SplitTextByChinesePunctuation2(text string) []string {
	var result []string
	for _, part := range chinesePunctuationSplitter2.Split(text, -1) {
		// Adjacent separators (for example a punctuation mark followed by a
		// space) produce empty pieces; keep only pieces with content.
		if trimmed := strings.TrimSpace(part); trimmed != "" {
			result = append(result, trimmed)
		}
	}
	return result
}
// RemoveDuplicates2 returns the distinct values of strs in ascending order.
//
// The slice is sorted in place and then compacted in place, so the caller's
// slice is modified and the returned slice shares its backing array. An empty
// or nil slice is returned unchanged.
func RemoveDuplicates2(strs []string) []string {
	if len(strs) == 0 {
		return strs
	}

	// Sorting places equal values next to each other, so one pass that
	// compares each value with the last kept one is enough.
	sort.Strings(strs)

	// unique aliases the front of strs; appending to it overwrites positions
	// that have already been read.
	unique := strs[:1]
	for _, s := range strs[1:] {
		if s != unique[len(unique)-1] {
			unique = append(unique, s)
		}
	}
	return unique
}
// CleanHTMLTags2 parses htmlContent as an HTML document and returns the
// concatenated data of its text nodes, with leading and trailing white space
// removed.
//
// Text below <script> and <style> elements is skipped: the parser stores
// their source code as text nodes, and that code is not part of the readable
// text of the document. Text nodes are joined without any separator, so text
// from neighbouring elements is not separated by extra white space.
//
// The error returned by html.Parse, if any, is passed through unchanged.
func CleanHTMLTags2(htmlContent string) (string, error) {
	doc, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return "", err
	}

	var buf bytes.Buffer
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		switch n.Type {
		case html.TextNode:
			buf.WriteString(n.Data)
		case html.ElementNode:
			// Do not descend into elements whose children are code, not prose.
			if n.Data == "script" || n.Data == "style" {
				return
			}
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(doc)

	return strings.TrimSpace(buf.String()), nil
}
func main2() {
htmlContent := `
致各潜在供应商:
按照公司车辆使用计划,现需采购商务车一辆。请各位潜在供应商参加我司北京办公区行政车辆的谈判采购。现就有关事宜告知如下: 1.采购需求
物资名称 | 规格型号 | 计量单位 | 暂定数量 | 备注 |
商务用车 | 别克GL8 | 辆 | 1 | 排气量3.0升以下 |