util.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. // util
  2. package main
  import (
  	"fmt"
  	"log"
  	"regexp"
  	"sort"
  	"strings"
  	"time"

  	"github.com/PuerkitoBio/goquery"
  	"golang.org/x/net/html"
  )
  var (
  	// theadWords is a regex alternation of column titles (lot / section
  	// identifiers, winning bidder, supplier, project name, prices, ...).
  	// isHeaderRow matches it against the text of each cell.
  	theadWords = "(标段|标包|标包号|包号|包段|子包号|子标段名称|子项|包件号|包件代码|包件编号|分包编码|分包名称|分标编号|分标编码|合同段|包件名称|标包名称|" +
  		"中标单位|中标人|中商人|成交人|成交人名称|供应商|供应商名称|项目名称|项目地址|标的|标的名称|标项名称|采购合同|" +
  		"成交价格|中标报价|简要规格描述|预算金额|最高限价|中标、成交供应商|中标、成交金额|中标、成交结果情况说明)"
  	// theadWords_order is a second, shorter alternation that isHeaderRow tries
  	// when theadWords did not match the cell.
  	theadWords_order = "(包件号|标的|标段|候选人|供应商)"
  	theadWordsReg = regexp.MustCompile(theadWords)
  	theadWordsReg_order = regexp.MustCompile(theadWords_order)
  	// delRowKeys ("not awarded" / "reason for not winning"): getPackAgeByTable
  	// discards any header row or data row that has a cell matching it.
  	delRowKeys = "未成交|未中标原因"
  	delRowKeysReg = regexp.MustCompile(delRowKeys)
  	// Negative header keywords, used to reject tables that would otherwise
  	// interfere: getPackAgeByTable drops a header row once more than one
  	// keyword hit has been counted for the same list.
  	reverseTheadKeys = map[string][]string{
  		"bidlist": []string{"品牌", "规格型号", "数量", "单价", "报价得分", "总分"},
  		//"spotcheck": []string{"项目名称", "抽取家数"},
  	}
  	// Pattern used to detect a consortium (joint-venture) bid; see
  	// isConsortiumKeysReg.
  	consortium = "(联合体牵头人|联合体成员[:: ].{5,30}(公司|院|大学|研究所))|(中标单位[:: ].{5,60}(联合体))"
  	consortiumKeysReg = regexp.MustCompile(consortium)
  	// Noise clean-up. clearKeys holds regex patterns and clearKeysBack holds
  	// plain phrases.
  	// NOTE(review): presumably passed to cleanWebText as keywords and
  	// keywordsback respectively; the call site is not in this file section.
  	clearKeys = []string{"承包(一|二|三|四)级", "开标(\\d)(室|厅)", "\\d+\\.\\d+", "\\d+(.{1,10}).(pdf|doc|zip|rar)",
  		"([一二三四五六七八九十0-9]+次)"}
  	clearKeysBack = []string{"上一篇", "下一篇", "历史业绩", "候选人企业业绩", "候选人类似业绩", "成交候选人业绩", "企业类似项目业绩",
  		"投标人业绩", "企业业绩", "投标文件中载明的业绩情况", "质量标准:", "评审专家"}
  	// Noise replacement table (old text -> new text).
  	// NOTE(review): presumably passed to repalceString; confirm against caller.
  	replaceMap = map[string]string{
  		"标项目": "标",
  	}
  )
  // getIdFromDate converts two calendar dates in "2006-01-02" layout into a pair
  // of hexadecimal ID bounds.
  //
  // Each bound is the Unix timestamp of the date's 00:00:00 instant, printed in
  // upper-case hexadecimal and followed by sixteen '0' characters. The layout
  // has no zone, so time.Parse interprets the dates as UTC.
  // NOTE(review): the shape matches an ObjectID-style range bound; confirm with
  // the caller.
  //
  // A date that cannot be parsed is logged and its bound is returned as "".
  // Previously the parse error was discarded and a bound derived from the zero
  // time (a negative hex number) was returned instead.
  func getIdFromDate(startStr, endStr string) (string, string) {
  	toBound := func(date string) string {
  		t, err := time.Parse("2006-01-02", date)
  		if err != nil {
  			log.Printf("getIdFromDate: invalid date %q: %v", date, err)
  			return ""
  		}
  		return fmt.Sprintf("%X", t.Unix()) + "0000000000000000"
  	}
  	return toBound(startStr), toBound(endStr)
  }
  48. // 判断是否有嵌套表格
  49. func tableIsPackage(htmlContent string) (bool, int) {
  50. //判断是否有多层表格嵌套
  51. if hasNestedTables(htmlContent) {
  52. //log.Println("表格嵌套")
  53. return false, 0
  54. }
  55. ispack := false
  56. tablesMixRows := 0
  57. tablesData := getPackAgeByTable(htmlContent)
  58. for _, dataRows := range tablesData {
  59. // for k, v := range dataRows {
  60. // log.Println(i, k, v)
  61. // }
  62. if len(dataRows) > 2 {
  63. ispack = true
  64. }
  65. if tablesMixRows < len(dataRows) {
  66. tablesMixRows = len(dataRows)
  67. }
  68. }
  69. //log.Println(ispack, tablesMixRows)
  70. return ispack, tablesMixRows
  71. }
  72. // 提取疑似表格分包数据
  73. func getPackAgeByTable(htmlContent string) map[string][]map[string]string {
  74. // 解析HTML文档
  75. doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
  76. if err != nil {
  77. log.Println(err)
  78. }
  79. // 遍历所有表格
  80. tableDataRows := map[string][]map[string]string{}
  81. doc.Find("table").Each(func(i int, table *goquery.Selection) {
  82. var headers []string
  83. var rows []map[string]string
  84. // 遍历表格行
  85. table.Find("tr").Each(func(rowIdx int, row *goquery.Selection) {
  86. // 自动识别表头行(根据单元格内容特征)
  87. if isHeaderRow(row) && len(headers) < 1 {
  88. isDelHeader := false
  89. tmphead := []string{}
  90. bidTheadKeysIndexNum := map[string]int{} //记录满足剔除的表头的阈值
  91. row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
  92. text := strings.TrimSpace(cell.Text())
  93. tmphead = append(tmphead, text)
  94. if delRowKeysReg.MatchString(text) {
  95. isDelHeader = true
  96. }
  97. //如果是标的物、评分、抽查列表,剔除
  98. for k, theadKeys := range reverseTheadKeys {
  99. for _, v := range theadKeys {
  100. if strings.Contains(text, v) {
  101. bidTheadKeysIndexNum[k]++
  102. }
  103. }
  104. if bidTheadKeysIndexNum[k] > 1 { //满足一个以上的表头删除
  105. isDelHeader = true
  106. }
  107. }
  108. })
  109. //log.Println("tmphead th", tmphead)
  110. if len(tmphead) < 1 {
  111. row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
  112. text := strings.TrimSpace(cell.Text())
  113. tmphead = append(tmphead, text)
  114. if delRowKeysReg.MatchString(text) {
  115. isDelHeader = true
  116. }
  117. //如果是标的物、评分、抽查列表,剔除
  118. for k, theadKeys := range reverseTheadKeys {
  119. for _, v := range theadKeys {
  120. if strings.Contains(text, v) {
  121. bidTheadKeysIndexNum[k]++
  122. }
  123. }
  124. if bidTheadKeysIndexNum[k] > 1 {
  125. isDelHeader = true
  126. }
  127. }
  128. })
  129. }
  130. //log.Println("tmphead td", tmphead)
  131. if !isDelHeader {
  132. headers = append(headers, tmphead...)
  133. }
  134. //log.Println("headers", headers)
  135. }
  136. // 处理数据行
  137. if len(headers) > 0 {
  138. isDelRows := false //是否需要屏蔽词
  139. rowData := make(map[string]string)
  140. row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
  141. if cellIdx < len(headers) {
  142. header := headers[cellIdx]
  143. text := strings.TrimSpace(cell.Text())
  144. rowData[header] = text
  145. if delRowKeysReg.MatchString(text) {
  146. isDelRows = true
  147. }
  148. }
  149. })
  150. //log.Println(isDelRows, rowData)
  151. if !isDelRows {
  152. rows = append(rows, rowData)
  153. }
  154. }
  155. })
  156. tableDataRows[fmt.Sprint(i)] = rows
  157. })
  158. return tableDataRows
  159. }
  160. // 自定义表头判断逻辑(根据单元格内容特征)
  161. func isHeaderRow(row *goquery.Selection) bool {
  162. // 判断条件示例 包含 theadWords 特定关键词
  163. hasAttributeKeyword := false
  164. matchNum := 0
  165. row.Find("td").Each(func(cellIdx int, cell *goquery.Selection) {
  166. text := strings.TrimSpace(cell.Text())
  167. if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
  168. matchNum++
  169. hasAttributeKeyword = true
  170. } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
  171. matchNum++
  172. hasAttributeKeyword = true
  173. }
  174. //log.Println(text, matchNum, hasAttributeKeyword)
  175. })
  176. row.Find("th").Each(func(cellIdx int, cell *goquery.Selection) {
  177. text := strings.TrimSpace(cell.Text())
  178. if theadWordsReg.MatchString(text) && len([]rune(text)) < 8 {
  179. matchNum++
  180. hasAttributeKeyword = true
  181. } else if theadWordsReg_order.MatchString(text) && len([]rune(text)) < 8 {
  182. matchNum++
  183. hasAttributeKeyword = true
  184. }
  185. //log.Println(text, matchNum, hasAttributeKeyword)
  186. })
  187. //log.Println("isHeaderRow", matchNum, hasAttributeKeyword, matchNum > 1 && hasAttributeKeyword)
  188. return matchNum > 1 && hasAttributeKeyword
  189. }
  // tableBlockReg matches a <table ...> opening tag, its content and the nearest
  // following </table>, case-insensitively. It is compiled once at package
  // scope instead of on every removeTables call.
  var tableBlockReg = regexp.MustCompile(`(?i)<table[^>]*>[\s\S]*?</table>`)

  // removeTables returns html with every <table>...</table> block deleted.
  //
  // The match is non-greedy, so each block ends at the first closing tag after
  // its opening tag. An opening tag without any closing tag is left untouched.
  func removeTables(html string) string {
  	return tableBlockReg.ReplaceAllString(html, "")
  }
  // repalceString applies every old->new pair in replace to input and returns
  // the result. Every occurrence of each key is replaced.
  //
  // Keys are applied in a fixed order: longest key first, ties broken
  // lexicographically. This way a key cannot be pre-empted by a shorter key it
  // contains, and the output no longer depends on Go's randomized map
  // iteration order. A nil or empty map returns input unchanged.
  func repalceString(input string, replace map[string]string) string {
  	if len(replace) == 0 {
  		return input
  	}

  	keys := make([]string, 0, len(replace))
  	for k := range replace {
  		keys = append(keys, k)
  	}
  	sort.Slice(keys, func(i, j int) bool {
  		if len(keys[i]) != len(keys[j]) {
  			return len(keys[i]) > len(keys[j])
  		}
  		return keys[i] < keys[j]
  	})

  	for _, k := range keys {
  		input = strings.ReplaceAll(input, k, replace[k])
  	}
  	return input
  }
  // cleanWebText strips noise from input in two stages.
  //
  //  1. keywordsback: the entries are joined with "|" into one regex. The first
  //     match and everything after it, across newlines, is deleted.
  //  2. keywords: the entries are joined with "|" into one regex. Every match
  //     is deleted from what remains.
  //
  // Entries of both lists are used as regex source text and are not escaped.
  // Empty entries are ignored, and a stage whose list is empty is skipped.
  // Previously an empty keywordsback produced the pattern `(?s)().*`, which
  // matched at offset 0 and erased the whole input.
  //
  // If a stage's pattern fails to compile, the text as it stood before that
  // stage is returned.
  func cleanWebText(input string, keywords, keywordsback []string) string {
  	// joinPatterns builds an alternation from the non-empty entries.
  	joinPatterns := func(patterns []string) string {
  		kept := make([]string, 0, len(patterns))
  		for _, p := range patterns {
  			if p != "" {
  				kept = append(kept, p)
  			}
  		}
  		return strings.Join(kept, "|")
  	}

  	if tail := joinPatterns(keywordsback); tail != "" {
  		re, err := regexp.Compile("(?s)(" + tail + ").*")
  		if err != nil {
  			return input
  		}
  		input = re.ReplaceAllString(input, "")
  	}

  	if noise := joinPatterns(keywords); noise != "" {
  		re, err := regexp.Compile(noise)
  		if err != nil {
  			return input
  		}
  		input = re.ReplaceAllString(input, "")
  	}
  	return input
  }
  // convertNumerals extracts a digit string from input.
  //
  // Three candidates are built, each in input order:
  //   - the ASCII digits 0-9 found in input;
  //   - the Chinese numerals 零-九 and 十, mapped character by character;
  //   - the Roman numeral code points Ⅰ-Ⅻ plus the Latin letters A-H, J, K and
  //     a-h, j, k, mapped character by character.
  //
  // The longest candidate is returned. On a tie ASCII digits win over Chinese
  // numerals, which win over Roman numerals and letters. The result is "" when
  // input contains none of these characters.
  //
  // The mapping is positional, not arithmetic: 十, Ⅹ, Ⅺ, Ⅻ, K and k all map to
  // the single digit '1', so "二十" becomes "21" and "十" becomes "1".
  //
  // The candidates are built in one pass and compared as strings. The previous
  // version copied non-zero strings.Builder values by assignment, which the
  // strings package documents as unsupported.
  func convertNumerals(input string) string {
  	chineseNumMap := map[rune]rune{
  		'零': '0', '一': '1', '二': '2', '三': '3', '四': '4',
  		'五': '5', '六': '6', '七': '7', '八': '8', '九': '9',
  		'十': '1', // only the leading digit is kept
  	}
  	romanNumMap := map[rune]rune{
  		'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5',
  		'Ⅵ': '6', 'Ⅶ': '7', 'Ⅷ': '8', 'Ⅸ': '9', 'Ⅹ': '1',
  		'Ⅺ': '1', 'Ⅻ': '1', // 10, 11 and 12 keep only the leading digit
  		'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5',
  		'F': '6', 'G': '7', 'H': '8', 'J': '9', 'K': '1',
  		'a': '1', 'b': '2', 'c': '3', 'd': '4', 'e': '5',
  		'f': '6', 'g': '7', 'h': '8', 'j': '9', 'k': '1',
  	}

  	// The three character sets are disjoint, so one pass can feed all three
  	// candidates.
  	var arabic, chinese, roman strings.Builder
  	for _, r := range input {
  		if r >= '0' && r <= '9' {
  			arabic.WriteRune(r)
  		} else if d, ok := chineseNumMap[r]; ok {
  			chinese.WriteRune(d)
  		} else if d, ok := romanNumMap[r]; ok {
  			roman.WriteRune(d)
  		}
  	}

  	// Strict '>' keeps the earlier candidate on a tie.
  	best := arabic.String()
  	if s := chinese.String(); len(s) > len(best) {
  		best = s
  	}
  	if s := roman.String(); len(s) > len(best) {
  		best = s
  	}
  	return best
  }
  271. // 检查HTML文本中是否存在多层表格嵌套
  272. func hasNestedTables(htmlContent string) bool {
  273. doc, err := html.Parse(strings.NewReader(htmlContent))
  274. if err != nil {
  275. return false
  276. }
  277. var hasNested bool
  278. var checkNested func(node *html.Node, depth int)
  279. checkNested = func(node *html.Node, depth int) {
  280. if node.Type == html.ElementNode && node.Data == "table" {
  281. if depth > 0 { // 非顶层表格
  282. hasNested = true
  283. return
  284. }
  285. depth++
  286. }
  287. for c := node.FirstChild; c != nil && !hasNested; c = c.NextSibling {
  288. checkNested(c, depth)
  289. }
  290. }
  291. checkNested(doc, 0)
  292. return hasNested
  293. }
  // isChineseRune reports whether r is a Chinese character or Chinese-style
  // punctuation. It returns true for:
  //   - U+4E00..U+9FFF (basic CJK ideographs);
  //   - U+3000..U+303F (CJK symbols and punctuation, which includes 【】《》〈〉);
  //   - U+FF00..U+FFEF (full-width forms), except full-width Latin letters
  //     and full-width digits;
  //   - the curly quotes U+2018, U+2019, U+201C, U+201D, the em dash U+2014
  //     and the ellipsis U+2026.
  func isChineseRune(r rune) bool {
  	switch {
  	case r >= 0x4E00 && r <= 0x9FFF:
  		return true
  	case r >= 0x3000 && r <= 0x303F:
  		return true
  	case r >= 0xFF00 && r <= 0xFFEF:
  		fullwidthUpper := r >= 0xFF21 && r <= 0xFF3A
  		fullwidthLower := r >= 0xFF41 && r <= 0xFF5A
  		fullwidthDigit := r >= 0xFF10 && r <= 0xFF19
  		return !(fullwidthUpper || fullwidthLower || fullwidthDigit)
  	}

  	switch r {
  	case 0x2018, 0x2019, 0x201C, 0x201D, 0x2014, 0x2026:
  		return true
  	}
  	return false
  }
  327. // 判断是否是联合体中标
  328. func isConsortiumKeysReg(content string) bool {
  329. return consortiumKeysReg.MatchString(content)
  330. }