util.go 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. // util
  2. package main
  3. import (
  4. "fmt"
  5. "log"
  6. "regexp"
  7. "strings"
  8. "time"
  9. "github.com/PuerkitoBio/goquery"
  10. "golang.org/x/net/html"
  11. )
  12. var (
  13. theadWords = "(标段|标包|标包号|包号|包段|子包号|子标段名称|子项|包件号|包件代码|包件编号|分包编码|分包名称|分标编号|分标编码|合同段|包件名称|标包名称|" +
  14. "中标单位|中标人|中商人|成交人|成交人名称|供应商|供应商名称|项目名称|项目地址|标的|标的名称|标项名称|采购合同|" +
  15. "成交价格|中标报价|简要规格描述)"
  16. theadWords_order = "(包件号|标的|标段|候选人|供应商)"
  17. theadWordsReg = regexp.MustCompile(theadWords)
  18. theadWordsReg_order = regexp.MustCompile(theadWords_order)
  19. delRowKeys = "未成交|未中标原因"
  20. delRowKeysReg = regexp.MustCompile(delRowKeys)
  21. //负向表头,用于剔除干扰表格
  22. reverseTheadKeys = map[string][]string{
  23. "bidlist": []string{"品牌", "规格型号", "数量", "单价", "报价得分", "总分"},
  24. //"spotcheck": []string{"项目名称", "抽取家数"},
  25. }
  26. //联合体投标判断
  27. consortium = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))"
  28. consortiumKeysReg = regexp.MustCompile(consortium)
  29. clearKeys = []string{"承包(一|二|三|四)级", "开标(\\d)(室|厅)", "\\d+\\.\\d+", "\\d+(.{1,10}).(pdf|doc|zip|rar)"}
  30. clearKeysBack = []string{"上一篇", "下一篇", "历史业绩", "候选人企业业绩"}
  31. )
  32. func getIdFromDate(startStr, endStr string) (string, string) {
  33. start, _ := time.Parse("2006-01-02", startStr)
  34. end, _ := time.Parse("2006-01-02", endStr)
  35. // 昨天凌晨0点时间戳
  36. hexTimestamp1 := fmt.Sprintf("%X", start.Unix()) + "0000000000000000"
  37. // 今天凌晨0点时间戳
  38. hexTimestamp2 := fmt.Sprintf("%X", end.Unix()) + "0000000000000000"
  39. return hexTimestamp1, hexTimestamp2
  40. }
  41. // 判断是否有嵌套表格
  42. func tableIsPackage(htmlContent string) (bool, int) {
  43. //判断是否有多层表格嵌套
  44. if hasNestedTables(htmlContent) {
  45. //log.Println("表格嵌套")
  46. return false, 0
  47. }
  48. ispack := false
  49. tablesMixRows := 0
  50. tablesData := getPackAgeByTable(htmlContent)
  51. for _, dataRows := range tablesData {
  52. // for k, v := range dataRows {
  53. // log.Println(i, k, v)
  54. // }
  55. if len(dataRows) > 2 {
  56. ispack = true
  57. }
  58. if tablesMixRows < len(dataRows) {
  59. tablesMixRows = len(dataRows)
  60. }
  61. }
  62. //log.Println(ispack, tablesMixRows)
  63. return ispack, tablesMixRows
  64. }
  65. // 提取疑似表格分包数据
  66. func getPackAgeByTable(htmlContent string) map[string][]map[string]string {
  67. // 解析HTML文档
  68. doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
  69. if err != nil {
  70. log.Println(err)
  71. }
  72. // 遍历所有表格
  73. tableDataRows := map[string][]map[string]string{}
  74. doc.Find("table").Each(func(i int, table *goquery.Selection) {
  75. var headers []string
  76. var rows []map[string]string
  77. // 遍历表格行
  78. table.Find("tr").Each(func(rowIdx int, row *goquery.Selection) {
  79. // 自动识别表头行(根据单元格内容特征)
  80. if isHeaderRow(row) && len(headers) < 1 {
  81. isDelHeader := false
  82. tmphead := []string{}
  83. bidTheadKeysIndexNum := map[string]int{} //记录满足剔除的表头的阈值
  84. row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
  85. text := strings.TrimSpace(cell.Text())
  86. tmphead = append(tmphead, text)
  87. if delRowKeysReg.MatchString(text) {
  88. isDelHeader = true
  89. }
  90. //如果是标的物、评分、抽查列表,剔除
  91. for k, theadKeys := range reverseTheadKeys {
  92. for _, v := range theadKeys {
  93. if strings.Contains(text, v) {
  94. bidTheadKeysIndexNum[k]++
  95. }
  96. }
  97. if bidTheadKeysIndexNum[k] > 1 { //满足一个以上的表头删除
  98. isDelHeader = true
  99. }
  100. }
  101. })
  102. //log.Println("tmphead th", tmphead)
  103. if len(tmphead) < 1 {
  104. row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
  105. text := strings.TrimSpace(cell.Text())
  106. tmphead = append(tmphead, text)
  107. if delRowKeysReg.MatchString(text) {
  108. isDelHeader = true
  109. }
  110. //如果是标的物、评分、抽查列表,剔除
  111. for k, theadKeys := range reverseTheadKeys {
  112. for _, v := range theadKeys {
  113. if strings.Contains(text, v) {
  114. bidTheadKeysIndexNum[k]++
  115. }
  116. }
  117. if bidTheadKeysIndexNum[k] > 1 {
  118. isDelHeader = true
  119. }
  120. }
  121. })
  122. }
  123. //log.Println("tmphead td", tmphead)
  124. if !isDelHeader {
  125. headers = append(headers, tmphead...)
  126. }
  127. //log.Println("headers", headers)
  128. }
  129. // 处理数据行
  130. if len(headers) > 0 {
  131. isDelRows := false //是否需要屏蔽词
  132. rowData := make(map[string]string)
  133. row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
  134. if cellIdx < len(headers) {
  135. header := headers[cellIdx]
  136. text := strings.TrimSpace(cell.Text())
  137. rowData[header] = text
  138. if delRowKeysReg.MatchString(text) {
  139. isDelRows = true
  140. }
  141. }
  142. })
  143. //log.Println(isDelRows, rowData)
  144. if !isDelRows {
  145. rows = append(rows, rowData)
  146. }
  147. }
  148. })
  149. tableDataRows[fmt.Sprint(i)] = rows
  150. })
  151. return tableDataRows
  152. }
  153. // 自定义表头判断逻辑(根据单元格内容特征)
  154. func isHeaderRow(row *goquery.Selection) bool {
  155. // 判断条件示例 包含 theadWords 特定关键词
  156. hasAttributeKeyword := false
  157. matchNum := 0
  158. row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
  159. text := strings.TrimSpace(cell.Text())
  160. if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
  161. matchNum++
  162. hasAttributeKeyword = true
  163. } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
  164. matchNum++
  165. hasAttributeKeyword = true
  166. }
  167. //log.Println(text, matchNum, hasAttributeKeyword)
  168. })
  169. row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
  170. text := strings.TrimSpace(cell.Text())
  171. if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
  172. matchNum++
  173. hasAttributeKeyword = true
  174. } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
  175. matchNum++
  176. hasAttributeKeyword = true
  177. }
  178. //log.Println(text, matchNum, hasAttributeKeyword)
  179. })
  180. //log.Println("isHeaderRow", matchNum, hasAttributeKeyword, matchNum > 1 && hasAttributeKeyword)
  181. return matchNum > 1 && hasAttributeKeyword
  182. }
  183. func removeTables(html string) string {
  184. // 匹配<table>标签及其内容的正则表达式
  185. re := regexp.MustCompile(`(?i)<table[^>]*>[\s\S]*?</table>`)
  186. return re.ReplaceAllString(html, "")
  187. }
  188. // cleanWebText 删除包含指定关键词及其后续的所有内容
  189. func cleanWebText(input string, keywords, keywordsback []string) string {
  190. // 构建关键词正则表达式(使用OR连接)
  191. keywordPattern := strings.Join(keywordsback, "|")
  192. re, err := regexp.Compile(fmt.Sprintf(`(?s)(%s).*`, keywordPattern))
  193. if err != nil {
  194. return input // 正则编译失败时返回原始文本
  195. }
  196. input = re.ReplaceAllString(input, "")
  197. keyword := strings.Join(keywords, "|")
  198. re, err = regexp.Compile(keyword)
  199. if err != nil {
  200. return input
  201. }
  202. return re.ReplaceAllString(input, "")
  203. }
  204. // 支持中文数字(零一二三四五六七八九十)、阿拉伯数字(0-9)、罗马数字(Ⅰ-Ⅻ)
  205. func convertNumerals(input string) string {
  206. // 字符映射表
  207. chineseNumMap := map[rune]rune{
  208. '零': '0', '一': '1', '二': '2', '三': '3', '四': '4',
  209. '五': '5', '六': '6', '七': '7', '八': '8', '九': '9',
  210. '十': '1', // 仅处理个位,十位需特殊处理
  211. }
  212. romanNumMap := map[rune]rune{
  213. 'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5',
  214. 'Ⅵ': '6', 'Ⅶ': '7', 'Ⅷ': '8', 'Ⅸ': '9', 'Ⅹ': '1', // 仅处理个位
  215. 'Ⅺ': '1', 'Ⅻ': '1', // 罗马数字11和12仅处理十位
  216. }
  217. var result strings.Builder
  218. for _, char := range input {
  219. // 直接检查阿拉伯数字
  220. if char >= '0' && char <= '9' {
  221. result.WriteRune(char)
  222. continue
  223. }
  224. // 检查中文数字
  225. if num, exists := chineseNumMap[char]; exists {
  226. result.WriteRune(num)
  227. continue
  228. }
  229. // 检查罗马数字
  230. if num, exists := romanNumMap[char]; exists {
  231. result.WriteRune(num)
  232. continue
  233. }
  234. // 非数字字符保持不变
  235. result.WriteRune(char)
  236. }
  237. return result.String()
  238. }
  239. // 检查HTML文本中是否存在多层表格嵌套
  240. func hasNestedTables(htmlContent string) bool {
  241. doc, err := html.Parse(strings.NewReader(htmlContent))
  242. if err != nil {
  243. return false
  244. }
  245. var hasNested bool
  246. var checkNested func(node *html.Node, depth int)
  247. checkNested = func(node *html.Node, depth int) {
  248. if node.Type == html.ElementNode && node.Data == "table" {
  249. if depth > 0 { // 非顶层表格
  250. hasNested = true
  251. return
  252. }
  253. depth++
  254. }
  255. for c := node.FirstChild; c != nil && !hasNested; c = c.NextSibling {
  256. checkNested(c, depth)
  257. }
  258. }
  259. checkNested(doc, 0)
  260. return hasNested
  261. }
  262. // Unicode判断工具函数
  263. func isChineseRune(r rune) bool {
  264. // 基础汉字检测
  265. if r >= 0x4E00 && r <= 0x9FFF {
  266. return true
  267. }
  268. // CJK符号和标点
  269. if r >= 0x3000 && r <= 0x303F {
  270. return true
  271. }
  272. // 全角符号(过滤字母数字)
  273. if r >= 0xFF00 && r <= 0xFFEF {
  274. // 排除全角字母
  275. if (r >= 0xFF21 && r <= 0xFF3A) || // 大写字母
  276. (r >= 0xFF41 && r <= 0xFF5A) { // 小写字母
  277. return false
  278. }
  279. // 排除全角数字
  280. if r >= 0xFF10 && r <= 0xFF19 {
  281. return false
  282. }
  283. return true
  284. }
  285. // 特殊符号检测
  286. switch r {
  287. case 0x2018, 0x2019, 0x201C, 0x201D, // 引号
  288. 0x2014, 0x2026, // 破折号、省略号
  289. 0x3010, 0x3011, // 【】
  290. 0x3008, 0x3009, 0x300A, 0x300B: // 《》〈〉
  291. return true
  292. }
  293. return false
  294. }
  295. // 判断是否是联合体中标
  296. func isConsortiumKeysReg(content string) bool {
  297. return consortiumKeysReg.MatchString(content)
  298. }