|
@@ -7,20 +7,21 @@ import (
|
|
"strings"
|
|
"strings"
|
|
)
|
|
)
|
|
|
|
|
|
-/**
|
|
|
|
|
|
+/*
|
|
|
|
+*
|
|
之前爬虫过来的数据对table表格的抓取异常问题
|
|
之前爬虫过来的数据对table表格的抓取异常问题
|
|
查找并修正不规则表格的字符串,只对全文做处理,块内的表格不需要修正
|
|
查找并修正不规则表格的字符串,只对全文做处理,块内的表格不需要修正
|
|
-**/
|
|
|
|
|
|
+*
|
|
|
|
+*/
|
|
var thbf = regexp.MustCompile("(?i)</?t(head|body|foot)>")
|
|
var thbf = regexp.MustCompile("(?i)</?t(head|body|foot)>")
|
|
|
|
|
|
-//需要保留thead
|
|
|
|
|
|
+// 需要保留thead
|
|
var saveThead = regexp.MustCompile("(?is)<thead>(.+?)</thead>")
|
|
var saveThead = regexp.MustCompile("(?is)<thead>(.+?)</thead>")
|
|
var clearpkg = regexp.MustCompile("(标示|标识)")
|
|
var clearpkg = regexp.MustCompile("(标示|标识)")
|
|
var clearMoneyReg1 = regexp.MustCompile("(总成交金额:[0-9.]+)[\\s ]+([((]?万元[))]?)")
|
|
var clearMoneyReg1 = regexp.MustCompile("(总成交金额:[0-9.]+)[\\s ]+([((]?万元[))]?)")
|
|
|
|
|
|
func RepairCon(con string) string {
|
|
func RepairCon(con string) string {
|
|
con = clearpkg.ReplaceAllString(con, "")
|
|
con = clearpkg.ReplaceAllString(con, "")
|
|
-
|
|
|
|
con = clearMoneyReg1.ReplaceAllString(con, "$1$2")
|
|
con = clearMoneyReg1.ReplaceAllString(con, "$1$2")
|
|
res := saveThead.FindAllStringSubmatch(con, 1)
|
|
res := saveThead.FindAllStringSubmatch(con, 1)
|
|
th := ""
|
|
th := ""
|
|
@@ -47,7 +48,7 @@ func RepairCon(con string) string {
|
|
return con
|
|
return con
|
|
}
|
|
}
|
|
|
|
|
|
-//修复表格
|
|
|
|
|
|
+// 修复表格
|
|
func findpos(con string, iLen, start int) (newcon string) {
|
|
func findpos(con string, iLen, start int) (newcon string) {
|
|
defer qutil.Catch()
|
|
defer qutil.Catch()
|
|
n := len(con)
|
|
n := len(con)
|
|
@@ -133,7 +134,7 @@ func findpos(con string, iLen, start int) (newcon string) {
|
|
return
|
|
return
|
|
}
|
|
}
|
|
|
|
|
|
-//td的值里面有一个包,并且没有冒号kv
|
|
|
|
|
|
+// td的值里面有一个包,并且没有冒号kv
|
|
func isHasOnePkgAndNoKv(v1 string) (bool, string) {
|
|
func isHasOnePkgAndNoKv(v1 string) (bool, string) {
|
|
v1s := FindVal_1.FindAllString(v1, -1)
|
|
v1s := FindVal_1.FindAllString(v1, -1)
|
|
colonCount := len(regDivision.FindAllString(v1, -1))
|
|
colonCount := len(regDivision.FindAllString(v1, -1))
|
|
@@ -146,7 +147,7 @@ func isHasOnePkgAndNoKv(v1 string) (bool, string) {
|
|
return false, v1
|
|
return false, v1
|
|
}
|
|
}
|
|
|
|
|
|
-//替换分包中混淆的词
|
|
|
|
|
|
+// 替换分包中混淆的词
|
|
func replPkgConfusion(v1 string) string {
|
|
func replPkgConfusion(v1 string) string {
|
|
v1 = PreReg.ReplaceAllString(v1, "")
|
|
v1 = PreReg.ReplaceAllString(v1, "")
|
|
v1 = PreReg1.ReplaceAllString(v1, "")
|
|
v1 = PreReg1.ReplaceAllString(v1, "")
|