util.go 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. package main
  2. import (
  3. "bytes"
  4. "fmt"
  5. "golang.org/x/net/html"
  6. util "jygit.jydev.jianyu360.cn/data_processing/common_utils"
  7. "regexp"
  8. "sort"
  9. "strconv"
  10. "strings"
  11. "unicode"
  12. "unicode/utf8"
  13. )
  14. var (
  15. TimeV1 = regexp.MustCompile("^(\\d{4})[年.]?$")
  16. TimeV2 = regexp.MustCompile("^(\\d{4})[年./-]?(\\d{1,2})[月./-]?$")
  17. TimeV3 = regexp.MustCompile("^(\\d{4})[年./-]?(\\d{1,2})[月./-]?(\\d{1,2})[日]?$")
  18. )
  19. // GetJyURLByID 获取剑鱼地址
  20. func GetJyURLByID(id string) string {
  21. var Url = "https://www.jianyu360.com/article/content/%s.html"
  22. url := fmt.Sprintf(Url, util.CommonEncodeArticle("content", id))
  23. return url
  24. }
  25. // IsHanStart 判断字符串是否以汉字开头或者以字母开头
  26. func IsHanStart(s string) bool {
  27. if s == "" {
  28. return false
  29. }
  30. return unicode.Is(unicode.Scripts["Han"], []rune(s)[0]) || unicode.IsLetter([]rune(s)[0])
  31. }
  32. // IsHan 判断字符是否为汉字
  33. func IsHan(c rune) bool {
  34. return unicode.Is(unicode.Scripts["Han"], c)
  35. }
  36. // IsBracketStartWithHan 判断公司名称是否以圆括号开头且括号内汉字开头
  37. func IsBracketStartWithHan(s string) bool {
  38. if len(s) == 0 || s[0] != '(' {
  39. return false
  40. }
  41. // 索引 i 和 j 分别是左右圆括号的位置,如果找不到右圆括号则返回 false
  42. i, j := 0, 0
  43. for j = i + 1; j < len(s); j++ {
  44. if s[j] == ')' {
  45. break
  46. }
  47. }
  48. if j >= len(s) {
  49. return false
  50. }
  51. // 检查圆括号内是否以汉字或字母开头
  52. bracketContent := s[i+1 : j]
  53. if len(bracketContent) == 0 || (!unicode.IsLetter(rune(bracketContent[0])) && !IsHan([]rune(bracketContent)[0])) {
  54. return false
  55. }
  56. return true
  57. }
  58. // IsCompanyName 判断字符串是否以汉字开头、以括号开头并且括号里面是汉字、以"公司"结尾,其中一个条件符合即返回true,否则返回false
  59. func IsCompanyName(s string) bool {
  60. r := []rune(s)
  61. //if len(r) >= 6 && (string(r[len(r)-6:]) == "有限公司" || string(r[len(r)-6:]) == "股份有限公司") {
  62. // return (IsHanStart(s) || IsBracketStartWithHan(s))
  63. //} else if len(r) >= 2 && string(r[len(r)-2:]) == "公司" {
  64. // return (IsHanStart(s) || IsBracketStartWithHan(s))
  65. //}
  66. if len(r) >= 2 {
  67. return (IsHanStart(s) || IsBracketStartWithHan(s))
  68. }
  69. return false
  70. }
  71. // GetChineseCharacters 提取字符串中的汉字
  72. func GetChineseCharacters(s string) string {
  73. re := regexp.MustCompile(`[\p{Han}]+`)
  74. return re.FindString(s)
  75. }
  76. func getCompanyName(name string) string {
  77. if IsCompanyName(name) {
  78. return name
  79. }
  80. return GetChineseCharacters(name)
  81. }
  82. func IsUnicodeStart(s string) bool {
  83. if len(s) == 0 {
  84. return false
  85. }
  86. _, size := utf8.DecodeRuneInString(s)
  87. return size > 0
  88. }
  89. // RemoveDuplicateSuffix 去除字符串末尾的重复字词
  90. func RemoveDuplicateSuffix(str string, suffix string) string {
  91. // 构建正则表达式:^(.*?)(重复的结尾词)+$
  92. re := regexp.MustCompile(fmt.Sprintf(`^(.*?)(%s)+$`, suffix))
  93. matches := re.FindStringSubmatch(str)
  94. if len(matches) == 3 {
  95. return matches[1] + matches[2]
  96. }
  97. return str
  98. }
  99. //func findName(name string) []map[string]interface{} {
  100. // filter := bson.M{"name": name, "status": 1}
  101. // info, _ := Mgo.Find(wccBuyer, filter, nil, nil, false, -1, -1)
  102. //
  103. // return *info
  104. //}
  105. //
  106. //func findNameID(id string) []map[string]interface{} {
  107. // filter := bson.M{"name_id": id, "status": 1}
  108. // info, _ := Mgo.Find(wccBuyer, filter, nil, nil, false, -1, -1)
  109. //
  110. // return *info
  111. //}
  112. // isStringRepeating 判断字符串内字符完全重复,例如:山东大学山东大学
  113. func isStringRepeating(str string) bool {
  114. for i := 0; i < len(str); i++ {
  115. for j := i + 1; j < len(str); j++ {
  116. if str[i] != str[j] {
  117. return false
  118. }
  119. }
  120. }
  121. return true
  122. }
  123. // @Description 采购意向 预计采购时间处理
  124. // @Author J 2022/6/7 8:04 PM
  125. func getMethod(str string) int64 {
  126. // Handle "YYYY" format
  127. if TimeV1.MatchString(str) {
  128. arr := TimeV1.FindStringSubmatch(str)
  129. st := arr[1] + "0000"
  130. parseInt, err := strconv.ParseInt(st, 10, 64)
  131. if err == nil {
  132. return parseInt
  133. }
  134. }
  135. // Handle "YYYYMM" or "YYYY/MM" or "YYYY-MM" or "YYYY.MM" format
  136. if TimeV2.MatchString(str) {
  137. arr := TimeV2.FindStringSubmatch(str)
  138. year := arr[1]
  139. month := arr[2]
  140. if len(month) == 1 {
  141. month = "0" + month
  142. }
  143. str2 := year + month + "00"
  144. parseInt, err := strconv.ParseInt(str2, 10, 64)
  145. if err == nil {
  146. return parseInt
  147. }
  148. }
  149. // Handle "YYYYMMDD" or "YYYY/MM/DD" or "YYYY-MM-DD" or "YYYY.MM.DD" format
  150. if TimeV3.MatchString(str) {
  151. match := TimeV3.FindStringSubmatch(str)
  152. if len(match) >= 4 {
  153. year := match[1]
  154. month := match[2]
  155. day := match[3]
  156. if len(month) == 1 {
  157. month = "0" + month
  158. }
  159. if len(day) == 1 {
  160. day = "0" + day
  161. }
  162. dateStr := year + month + day
  163. parseInt, err := strconv.ParseInt(dateStr, 10, 64)
  164. if err == nil {
  165. return parseInt
  166. }
  167. }
  168. }
  169. return 0
  170. }
  171. // ProcessTopscopeclass 处理行业分类
  172. func ProcessTopscopeclass(tops, subs []string) ([]string, []string, []string) {
  173. // 去除 tops 中每个元素末尾的不固定字符
  174. cleanedTops := make([]string, 0)
  175. for _, top := range tops {
  176. parts := strings.Split(top, "")
  177. cleanedTop := strings.Join(parts[:len(parts)-1], "")
  178. if !IsInStringArray(cleanedTop, cleanedTops) {
  179. cleanedTops = append(cleanedTops, cleanedTop)
  180. }
  181. }
  182. // 用于标记 cleanedTops 中已存在于 subs 的元素
  183. presentMap := make(map[string]bool)
  184. // 遍历 subs 数组,标记已存在的 cleanedTops 元素
  185. for _, sub := range subs {
  186. for _, top := range cleanedTops {
  187. if strings.Contains(sub, top) {
  188. presentMap[top] = true
  189. }
  190. }
  191. }
  192. // 补充缺失的 cleanedTops 元素到 subs 中
  193. for _, top := range cleanedTops {
  194. if !presentMap[top] {
  195. subs = append(subs, top+"_其它")
  196. }
  197. }
  198. return tops, subs, cleanedTops
  199. }
  200. // IsInStringArray 判断数组中是否存在字符串
  201. func IsInStringArray(str string, arr []string) bool {
  202. // 先对字符串数组进行排序
  203. sort.Strings(arr)
  204. // 使用二分查找算法查找字符串
  205. pos := sort.SearchStrings(arr, str)
  206. // 如果找到了则返回 true,否则返回 false
  207. return pos < len(arr) && arr[pos] == str
  208. }
  209. // getProject 根据标讯ID获取项目信息
  210. func getProject(id string) map[string]interface{} {
  211. where := map[string]interface{}{
  212. "ids": id,
  213. }
  214. p, _ := MgoR.FindOne("projectset_20230904", where)
  215. project := *p
  216. return project
  217. }
  218. // CleanHTMLTags 处理HTML内容并返回清理后的文本
  219. func CleanHTMLTags(htmlContent string) (string, error) {
  220. // 解析HTML内容
  221. doc, err := html.Parse(strings.NewReader(htmlContent))
  222. if err != nil {
  223. return "", err
  224. }
  225. var buf bytes.Buffer
  226. // 递归函数,用来遍历 HTML 树
  227. var f func(*html.Node)
  228. f = func(n *html.Node) {
  229. // 处理文本节点
  230. if n.Type == html.TextNode {
  231. // 去掉文本节点中的所有空格
  232. //buf.WriteString(n.Data)
  233. // 去掉文本节点中的所有空格
  234. trimmedText := strings.ReplaceAll(n.Data, " ", "") // 去掉所有空格
  235. buf.WriteString(trimmedText)
  236. }
  237. // 处理元素节点
  238. if n.Type == html.ElementNode {
  239. // 调试:输出当前节点的类型和标签名
  240. //fmt.Printf("ElementNode: %s\n", n.Data)
  241. // 处理 <br> 标签,插入换行
  242. if n.Data == "br" {
  243. buf.WriteString("\n")
  244. } else if n.Data == "table" {
  245. // 处理表格标签 <table>
  246. for tr := n.FirstChild; tr != nil; tr = tr.NextSibling {
  247. if tr.Type == html.ElementNode && tr.Data == "tr" {
  248. // 遍历每行中的 <td> 单元格
  249. for td := tr.FirstChild; td != nil; td = td.NextSibling {
  250. if td.Data == "td" {
  251. buf.WriteString("[TD] ")
  252. f(td) // 递归处理 <td> 中的内容
  253. }
  254. }
  255. }
  256. }
  257. //buf.WriteString("Table End\n")
  258. } else if n.Data == "ul" {
  259. // 处理无序列表 <ul>
  260. for li := n.FirstChild; li != nil; li = li.NextSibling {
  261. if li.Data == "li" {
  262. buf.WriteString("- ")
  263. f(li)
  264. buf.WriteString("\n")
  265. }
  266. }
  267. }
  268. }
  269. // 遍历子节点
  270. for child := n.FirstChild; child != nil; child = child.NextSibling {
  271. f(child)
  272. }
  273. }
  274. // 启动递归遍历
  275. f(doc)
  276. // 去除多余空格
  277. trimmedText := strings.TrimSpace(buf.String())
  278. return trimmedText, nil
  279. }
  280. // SplitTextByChinesePunctuation 根据中文断句,拆分语句
  281. func SplitTextByChinesePunctuation(text string) []string {
  282. // Regular expression pattern for Chinese punctuation and spaces
  283. // This pattern splits by Chinese punctuation, spaces, and keeps them for splitting.
  284. //pattern := `[。!?;,:\s]+`
  285. // 替换掉所有的 NBSP(不间断空格)为普通空格
  286. text = strings.ReplaceAll(text, "\u00A0", " ")
  287. pattern := `[,。!?、;:]|\s+`
  288. re := regexp.MustCompile(pattern)
  289. // Split the text by the pattern
  290. parts := re.Split(text, -1)
  291. // Filter out empty strings resulting from split
  292. var result []string
  293. for _, part := range parts {
  294. trimmed := strings.TrimSpace(part)
  295. if trimmed != "" {
  296. result = append(result, trimmed)
  297. }
  298. }
  299. return result
  300. }
  301. // RemoveDuplicates 去除字符串数组中重复数据;并去除被长语句包含的短语句
  302. func RemoveDuplicates(strs []string) []string {
  303. var result []string
  304. for _, str := range strs {
  305. // 检查当前短语是否已被 result 中的任何一个较长短语包含
  306. shouldAdd := true
  307. for _, resStr := range result {
  308. if strings.Contains(resStr, str) {
  309. // 如果已有的短语包含当前短语,则不添加当前短语
  310. shouldAdd = false
  311. break
  312. }
  313. }
  314. if shouldAdd {
  315. // 将当前短语添加到结果中
  316. result = append(result, str)
  317. // 再次遍历一遍,移除包含当前短语的任何较短短语
  318. for i := len(result) - 2; i >= 0; i-- {
  319. if strings.Contains(result[i], str) {
  320. // 如果之前的较短短语包含当前短语,则移除该较短短语
  321. result = append(result[:i], result[i+1:]...)
  322. }
  323. }
  324. }
  325. }
  326. return result
  327. }
  328. // CountChineseCharacters 函数统计字符串数组中汉字的总数
  329. func CountChineseCharacters(strs []string) int {
  330. var totalCount int
  331. for _, str := range strs {
  332. for _, r := range str {
  333. // 判断字符是否为汉字且不是标点符号
  334. if unicode.Is(unicode.Han, r) && !unicode.IsPunct(r) {
  335. totalCount++
  336. }
  337. }
  338. }
  339. return totalCount
  340. }