|
@@ -21,7 +21,7 @@ var (
|
|
|
// Cleans whitespace, digits and similar noise contained in table keys: matches runs of whitespace/separator punctuation, a leading run of digits or Chinese numerals (with optional trailing "、"/"."), and bracketed spans.
|
|
|
tablekeyclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、.,.。、_/]+|^[\\d一二三四五六七八九十]+[、.]*|[((【\\[].*?[))】\\]]")
|
|
|
//清理表格td中的符号
|
|
|
- tabletdclear = regexp.MustCompile("[\\s\u3000\u2003\u00a0\\n、,。、_??;;~\\-#\\\\附(件|图)]|^*")
|
|
|
+ tabletdclear = regexp.MustCompile("(详?见)附(件|图)|[\\s\u3000\u2003\u00a0\\n、,。、_??;;~\\-#\\\\]*") // Matches symbol noise in a table cell plus the "see attachment/figure" phrase. The phrase branch must come first: Go regexp is leftmost-first and the class branch (with *) matches the empty string at every position, so a phrase branch listed after it can never match.
|
|
|
// Decides whether a key denotes a money amount (matches budget, fee, price, amount, scale or investment wording); per the original note, used for handling values given in units of 万元 (10,000 yuan).
|
|
|
moneyreg = regexp.MustCompile("(预算|费|价|额|规模|投资)")
|
|
|
//根据表格的内容判断是不是表头,如果含有金额则不是表头
|
|
@@ -658,8 +658,6 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
|
|
|
TR := NewTR(table)
|
|
|
tdTextIsNull := true
|
|
|
tds.Each(func(m int, selm *goquery.Selection) {
|
|
|
- // t, _ := selm.Html()
|
|
|
- // fmt.Println("t---------", t)
|
|
|
//对隐藏列不处理!!!
|
|
|
if IsHide(selm) {
|
|
|
return
|
|
@@ -667,7 +665,6 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
|
|
|
//进入每一个单元格
|
|
|
td := NewTD(selm, TR, table)
|
|
|
//num++
|
|
|
- //fmt.Println("------", td.SortKV.Keys, td.SortKV.Map)
|
|
|
TR.AddTD(td)
|
|
|
if td.Val != "" { //删除一个tr,tr中所有td是空值的
|
|
|
tdTextIsNull = false
|
|
@@ -680,6 +677,11 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
|
|
|
})
|
|
|
//重置行列
|
|
|
table.ComputeRowColSpan()
|
|
|
+ // for n, tr := range table.TRs {
|
|
|
+ // for m, td := range tr.TDs {
|
|
|
+ // qutil.Debug(td.BH, n, m, td.Text, td.StartRow, td.EndRow, td.StartCol, td.EndCol)
|
|
|
+ // }
|
|
|
+ // }
|
|
|
|
|
|
tm := []map[string]interface{}{}
|
|
|
tmk := map[string]bool{}
|
|
@@ -738,7 +740,7 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
|
|
|
//删除尾部空行
|
|
|
for len(table.TRs) > 0 {
|
|
|
npos := len(table.TRs)
|
|
|
- tailTR := table.TRs[npos-1]
|
|
|
+ tailTR := table.TRs[npos-1] //最后一个tr
|
|
|
bspace := true
|
|
|
for _, v := range tailTR.TDs {
|
|
|
if v.Val != "" || v.SonTableResult != nil || len(v.SortKV.Keys) > 0 {
|
|
@@ -782,6 +784,7 @@ func (table *Table) Analy(contactFormat *u.ContactFormat) []*Table {
|
|
|
table.TdContactFormat(contactFormat)
|
|
|
//开始查找kv,核心模块
|
|
|
table.FindKV()
|
|
|
+ qutil.Debug(table.SortKV.Map)
|
|
|
//table中抽取品牌
|
|
|
if u.IsBrandGoods {
|
|
|
table.analyBrand1()
|
|
@@ -827,16 +830,16 @@ func (table *Table) Adjust() {
|
|
|
table.RowNum = len(table.TRs)
|
|
|
// for k1, tr := range table.TRs {
|
|
|
// for k2, td := range tr.TDs {
|
|
|
- // u.Debug(k1, k2, td.Val, td.Rowspan, td.Colspan, td.ColPos, tr.RowPos)
|
|
|
+ // qutil.Debug(k1, k2, td.Val, td.Rowspan, td.Colspan, td.ColPos, tr.RowPos)
|
|
|
// }
|
|
|
// }
|
|
|
//计算行列起止位置,跨行跨列处理
|
|
|
table.ComputeRowColSpan()
|
|
|
- // for k1, tr := range table.TRs {
|
|
|
- // for k2, td := range tr.TDs {
|
|
|
- // u.Debug(k1, k2, td.Val, td.StartRow, td.EndRow, td.StartCol, td.EndCol)
|
|
|
- // }
|
|
|
+ // for k1, tr := range table.TRs {
|
|
|
+ // for k2, td := range tr.TDs {
|
|
|
+ // qutil.Debug(k1, k2, td.Val, td.StartRow, td.EndRow, td.StartCol, td.EndCol)
|
|
|
// }
|
|
|
+ // }
|
|
|
//大概计算每个起止行列的概率
|
|
|
table.GetKeyRation()
|
|
|
/*
|
|
@@ -847,7 +850,7 @@ func (table *Table) Adjust() {
|
|
|
for _, td := range v.Tdmap[v1] {
|
|
|
str += "__" + td.Val + fmt.Sprintf("%d_%d_%d_%d", td.StartRow, td.EndRow, td.StartCol, td.EndCol)
|
|
|
}
|
|
|
- u.Debug(k, k1, string(bs), v.Rationmap[v1], str)
|
|
|
+ qutil.Debug(k, k1, string(bs), v.Rationmap[v1], str)
|
|
|
}
|
|
|
}
|
|
|
*/
|
|
@@ -862,7 +865,6 @@ func (table *Table) Adjust() {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
if float32(count)/float32(table.TDNum) < 0.85 {
|
|
|
//精确计算起止行列是表头的概率
|
|
|
table.ComputeRowColIsKeyRation()
|
|
@@ -871,7 +873,7 @@ func (table *Table) Adjust() {
|
|
|
for i, tr := range table.TRs {
|
|
|
for _, td := range tr.TDs {
|
|
|
if td.BH {
|
|
|
- //u.Debug("----=====---", td.Val, len(table.TRs[len(table.TRs)-1].TDs), i, len(table.TRs)-1)
|
|
|
+ //qutil.Debug("----=====---", td.Val, len(table.TRs[len(table.TRs)-1].TDs), i, len(table.TRs)-1)
|
|
|
if i == len(table.TRs)-1 && len(table.TRs[len(table.TRs)-1].TDs) == 2 {
|
|
|
res, _, _, _, _ := CheckCommon(td.Val, "abandontable")
|
|
|
if res {
|
|
@@ -896,7 +898,7 @@ func (table *Table) ComputeRowColSpan() {
|
|
|
for k, v := range table.TRs {
|
|
|
nk := 0 //nk列的起始,k行的起始||如果有合并,起始就不是0
|
|
|
ball := true
|
|
|
- rowspans := v.TDs[0].Rowspan
|
|
|
+ rowspans := v.TDs[0].Rowspan //某一行第一个td的rowspan
|
|
|
for _, v1 := range v.TDs {
|
|
|
if v1.Rowspan != rowspans {
|
|
|
ball = false
|
|
@@ -995,16 +997,20 @@ func (table *Table) FindTag() {
|
|
|
//计算r/c_start_end的概率
|
|
|
func (table *Table) GetKeyRation() {
|
|
|
for _, vn := range table.StartAndEndRationKSort.Keys {
|
|
|
+ qutil.Debug("vn:", vn)
|
|
|
v := table.StartAndEndRation[vn]
|
|
|
for _, v1 := range v.Poss {
|
|
|
count := 0
|
|
|
n := 0
|
|
|
+ qutil.Debug("len:", len(v.Tdmap[v1]))
|
|
|
for _, td := range v.Tdmap[v1] {
|
|
|
n++
|
|
|
if td.BH {
|
|
|
+ qutil.Debug("val:", td.Val)
|
|
|
count++
|
|
|
}
|
|
|
}
|
|
|
+ qutil.Debug(float32(count), float32(n), float32(count)/float32(n))
|
|
|
v.Rationmap[v1] = float32(count) / float32(n)
|
|
|
}
|
|
|
}
|
|
@@ -1020,11 +1026,15 @@ func (table *Table) ComputeRowColIsKeyRation() {
|
|
|
checkCompute := map[string]bool{}
|
|
|
for k, tr := range table.TRs {
|
|
|
rk := fmtkey("r", tr.TDs[0].StartRow, tr.TDs[0].EndRow)
|
|
|
+ qutil.Debug("rk", rk)
|
|
|
if k == 0 { //第1行的概率
|
|
|
ck := fmtkey("c", tr.TDs[0].StartCol, tr.TDs[0].EndCol)
|
|
|
+ qutil.Debug("ck", ck)
|
|
|
//u.Debug(table.BFirstRow, "--", table.StartAndEndRation[rk], table.StartAndEndRation[ck])
|
|
|
ration1, _ := table.StartAndEndRation[rk].GetTDRation(tr.TDs[0])
|
|
|
ration2, _ := table.StartAndEndRation[ck].GetTDRation(tr.TDs[0])
|
|
|
+ qutil.Debug("ration1:", ration1, "ration2:", ration2)
|
|
|
+ qutil.Debug(len(tr.TDs) == 2 && ration2 < 0.55, len(tr.TDs) == 2 && ration1 > 0.5)
|
|
|
if (len(tr.TDs) == 2 && ration2 < 0.55) && (len(tr.TDs) == 2 && ration1 > 0.5) { //第一行为key
|
|
|
bkeyfirstrow = true
|
|
|
ball := true
|
|
@@ -1061,6 +1071,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ qutil.Debug("bkeyfirstrow:", bkeyfirstrow, "bkeyfirstcol:", bkeyfirstcol)
|
|
|
if !bkeyfirstrow && !bkeyfirstcol {
|
|
|
if len(tr.TDs) > 1 && ration1 > ration2 && ration1 > 0.5 {
|
|
|
bkeyfirstrow = true
|
|
@@ -1091,6 +1102,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
|
|
|
}
|
|
|
}
|
|
|
} else {
|
|
|
+ qutil.Debug("bkeyfirstrow", bkeyfirstrow)
|
|
|
if bkeyfirstrow {
|
|
|
//第一列的概率
|
|
|
ration1, _ := table.StartAndEndRation[rk].GetTDRation(tr.TDs[0])
|
|
@@ -1105,6 +1117,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
|
|
|
} //else {for _, td := range tr.TDs {}}
|
|
|
} else {
|
|
|
//列在起作用
|
|
|
+ qutil.Debug("bkeyfirstcol", bkeyfirstcol)
|
|
|
if bkeyfirstcol {
|
|
|
for _, td := range tr.TDs {
|
|
|
ck := fmtkey("c", td.StartCol, td.EndCol)
|
|
@@ -1142,20 +1155,25 @@ func (table *Table) ComputeRowColIsKeyRation() {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ //qutil.Debug("table.Brule", table.Brule, !bkeyfirstcol && !bkeyfirstrow)
|
|
|
if !table.Brule || (!bkeyfirstcol && !bkeyfirstrow) {
|
|
|
//断行问题,虽然同列或同行,但中间被跨行截断,表格方向调整
|
|
|
for _, k := range table.StartAndEndRationKSort.Keys {
|
|
|
+ qutil.Debug("k:", k)
|
|
|
v := table.StartAndEndRation[k]
|
|
|
//横向判断,要判断最多的方向,否则会出现不定的情况(map遍历问题)
|
|
|
k1 := k[:1]
|
|
|
for _, v2 := range v.Poss {
|
|
|
lentds := len(v.Tdmap[v2])
|
|
|
+ qutil.Debug(v2.Max, v2.Min, "len", lentds)
|
|
|
if v.Rationmap[v2] > checkval {
|
|
|
for _, td := range v.Tdmap[v2] {
|
|
|
+ qutil.Debug("td:", td.Val)
|
|
|
if td.KeyDirect == 0 && !MoneyReg.MatchString(td.Val) {
|
|
|
if k1 == "r" {
|
|
|
ck := fmtkey("c", td.StartCol, td.EndCol)
|
|
|
rt := table.StartAndEndRation[ck]
|
|
|
+ qutil.Debug("ck:", ck, "rt:", rt)
|
|
|
//clen := 0
|
|
|
var fv float32
|
|
|
var tdn []*TD
|
|
@@ -1164,6 +1182,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
|
|
|
//clen = len(tdn)
|
|
|
}
|
|
|
if lentds > 1 {
|
|
|
+ qutil.Debug((tdn != nil && v.Rationmap[v2] > fv) || tdn == nil)
|
|
|
if ((tdn != nil && v.Rationmap[v2] > fv) || tdn == nil) && td.Valtype != "BO" {
|
|
|
td.KeyDirect = 1
|
|
|
td.KVDirect = 2
|
|
@@ -1173,6 +1192,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
|
|
|
} else {
|
|
|
ck := fmtkey("r", td.StartRow, td.EndRow)
|
|
|
rt := table.StartAndEndRation[ck]
|
|
|
+ qutil.Debug("ck:", ck, "rt:", rt)
|
|
|
var fv float32
|
|
|
var tdn []*TD
|
|
|
//clen := 0
|
|
@@ -1181,6 +1201,7 @@ func (table *Table) ComputeRowColIsKeyRation() {
|
|
|
//clen = len(tdn)
|
|
|
}
|
|
|
if lentds > 1 {
|
|
|
+ qutil.Debug(tdn != nil, v.Rationmap[v2] > fv, tdn == nil)
|
|
|
if ((tdn != nil && v.Rationmap[v2] > fv) || tdn == nil) && td.Valtype != "BO" {
|
|
|
td.KeyDirect = 2
|
|
|
td.KVDirect = 1
|
|
@@ -1188,12 +1209,13 @@ func (table *Table) ComputeRowColIsKeyRation() {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
+ qutil.Debug(td.Val, td.BH, td.KeyDirect, td.KVDirect)
|
|
|
} else {
|
|
|
break
|
|
|
}
|
|
|
}
|
|
|
} else if v.Rationmap[v2] < 0.5 && len(v.Tdmap[v2]) > 3 {
|
|
|
+ qutil.Debug("================================")
|
|
|
for _, td := range v.Tdmap[v2] {
|
|
|
// u.Debug(td.Val, "-----", td.BH)
|
|
|
if td.KeyDirect == 0 && td.BH && !td.MustBH {
|
|
@@ -2755,7 +2777,7 @@ func (table *Table) analyBrand1() {
|
|
|
arrcount1 := 0 //记录key是否存在必须title(数组数据)
|
|
|
arrcount2 := 0
|
|
|
ka := make(map[string][]string) //最终存储数据
|
|
|
- //qutil.Debug(k, "aMap.Keys----", aMap.Keys)
|
|
|
+ //qutil.Debug("aMap.Keys----", aMap.Keys)
|
|
|
for _, k0 := range aMap.Keys {
|
|
|
v0 := aMap.Map[k0].([]string)
|
|
|
//qutil.Debug("k0:", k0, "v0:", v0)
|
|
@@ -3202,13 +3224,13 @@ func assembleData(m interface{}, n int) []map[string]string {
|
|
|
datas[i] = data
|
|
|
}
|
|
|
//end
|
|
|
- for _, fdv := range datas { //清除空数据和只含特殊符号的数据
|
|
|
- for fmk, fmv := range fdv {
|
|
|
- if tabletdclear.ReplaceAllString(fmv, "") == "" {
|
|
|
- delete(fdv, fmk)
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
+ // for _, fdv := range datas { //清除空数据和只含特殊符号的数据
|
|
|
+ // for fmk, fmv := range fdv {
|
|
|
+ // if tabletdclear.ReplaceAllString(fmv, "") == "" {
|
|
|
+ // delete(fdv, fmk)
|
|
|
+ // }
|
|
|
+ // }
|
|
|
+ // }
|
|
|
} else { //字符串数据
|
|
|
realTypeM := m.(map[string]string)
|
|
|
datas = append(datas, realTypeM)
|