|
@@ -87,7 +87,7 @@ var submatchreg = regexp.MustCompile(`((?:[一二三四五六七八九十0-10]+[
|
|
|
var BHKey = regexp.MustCompile(`^[^,,;:。、.]{2,8}.{0,3}[::].+$`)
|
|
|
var dwReg = regexp.MustCompile("单位[::/ \\s\u3000\u2003\u00a0\\n]*([万亿元]+)")
|
|
|
|
|
|
-func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite string) *TD {
|
|
|
+func NewTD(Goquery *goquery.Selection, tr *TR, table *Table, isSite bool, codeSite string) *TD {
|
|
|
defer qutil.Catch()
|
|
|
td := &TD{
|
|
|
ArrVal: []string{},
|
|
@@ -121,7 +121,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
|
|
|
//qutil.Debug("有子表格")
|
|
|
//格式化正文
|
|
|
txt = TextAfterRemoveTable(td.Html)
|
|
|
- td.tdHasTable(&bsontable, tr,isSite,codeSite) //处理td中的table,块标签处理,子表解析集处理
|
|
|
+ td.tdHasTable(&bsontable, tr, isSite, codeSite) //处理td中的table,块标签处理,子表解析集处理
|
|
|
} else {
|
|
|
txt = strings.TrimSpace(td.Goquery.Text())
|
|
|
}
|
|
@@ -130,7 +130,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
|
|
|
td.Text = txt //原始串
|
|
|
//处理table外内容
|
|
|
var ub []*u.Block
|
|
|
- ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock,isSite,codeSite)
|
|
|
+ ub, _ = DivideBlock("", txt, 2, table.TableResult.RuleBlock, isSite, codeSite)
|
|
|
//看是否划块
|
|
|
if len(ub) > 0 {
|
|
|
for _, bl := range ub {
|
|
@@ -143,30 +143,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
|
|
|
td.SortKV.AddKey(bl_sk, bl_sv)
|
|
|
}
|
|
|
}
|
|
|
- } else {
|
|
|
- //for _, v := range GetKVAll(txt, "", nil, 2).KvTags {
|
|
|
- //for _, vv := range v {
|
|
|
- //td.SortKV.AddKey(vv.Key, vv.Value)
|
|
|
- //}
|
|
|
- //}
|
|
|
- }
|
|
|
- ////抽取不到走正则抽
|
|
|
- //proCode := projectcodeReg.FindString(text)
|
|
|
- //if proCode != "" {
|
|
|
- // ckv := GetKVAll(proCode, "", nil, 1)
|
|
|
- // for _, v := range ckv.KvTags {
|
|
|
- // for _, vv := range v {
|
|
|
- // td.SortKV.AddKey(vv.Key, vv.Value)
|
|
|
- // }
|
|
|
- // }
|
|
|
- //} else if proCode = projectcodeReg2.FindString(text); proCode != "" {
|
|
|
- // ckv := GetKVAll(proCode, "", nil, 1)
|
|
|
- // for _, v := range ckv.KvTags {
|
|
|
- // for _, vv := range v {
|
|
|
- // td.SortKV.AddKey(vv.Key, vv.Value)
|
|
|
- // }
|
|
|
- // }
|
|
|
- //}
|
|
|
+ }
|
|
|
if proCode := jsonReg.FindString(text); proCode != "" {
|
|
|
jsonMap := make(map[string]string)
|
|
|
json.Unmarshal([]byte(proCode), &jsonMap)
|
|
@@ -175,7 +152,7 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
|
|
|
}
|
|
|
}
|
|
|
//对td单元格值判断是否是表头和根据td内容长度进行分块处理
|
|
|
- td.tdIsHb(tr, table, bsontable,isSite,codeSite)
|
|
|
+ td.tdIsHb(tr, table, bsontable, isSite, codeSite)
|
|
|
bhead := false
|
|
|
if td.TR.RowPos == 0 { //第一行
|
|
|
if td.Goquery.Closest("thead").Size() == 1 && !bsontable { //如果是thead确定为k值表头
|
|
@@ -187,12 +164,11 @@ func NewTD(Goquery *goquery.Selection, tr *TR, table *Table,isSite bool,codeSite
|
|
|
td.KeyDirect = 1 //k方向,k纵值横,k横值纵 1横 2纵
|
|
|
td.KVDirect = 2 //键-值方向,0未知,1横 2纵//指值和k的方向
|
|
|
}
|
|
|
- //u.Debug(td.BH, td.Val)
|
|
|
return td
|
|
|
}
|
|
|
|
|
|
//处理td中的table,块标签处理,子表解析集处理
|
|
|
-func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
|
|
|
+func (td *TD) tdHasTable(bsontable *bool, tr *TR, isSite bool, codeSite string) {
|
|
|
ts := td.TR.Table.TableResult
|
|
|
tabs, _ := ComputeConRatio(td.Html, 2) //计算表格占比
|
|
|
if len(tabs) > 0 {
|
|
@@ -219,7 +195,7 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
|
|
|
stag = str
|
|
|
}
|
|
|
}
|
|
|
- if strings.Contains(stag,"开标记录"){
|
|
|
+ if strings.Contains(stag, "开标记录") {
|
|
|
return
|
|
|
}
|
|
|
for _, tv := range tabs {
|
|
@@ -228,7 +204,7 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
|
|
|
}
|
|
|
sonts := NewTableResult(ts.Id, ts.Toptype, stag, td.Html, 2, td.TR.Table.TableResult.RuleBlock)
|
|
|
sonts.GoqueryTabs = tv
|
|
|
- sonts.Analy(isSite,codeSite)
|
|
|
+ sonts.Analy(isSite, codeSite)
|
|
|
|
|
|
//sonts := AnalyTableV2(tabs, ts.Toptype, stag, td.Html, 2, ts.Id, table.TableResult.RuleBlock) //又一次调用解析表格入口
|
|
|
td.BH = false
|
|
@@ -303,7 +279,7 @@ func (td *TD) tdHasTable(bsontable *bool, tr *TR,isSite bool,codeSite string) {
|
|
|
}
|
|
|
|
|
|
//对td单元格值判断是否是表头和根据td内容长度进行分块处理
|
|
|
-func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string) {
|
|
|
+func (td *TD) tdIsHb(tr *TR, table *Table, bsontable, isSite bool, codeSite string) {
|
|
|
lenval := len([]rune(td.Val)) //经过处理的td内容长度
|
|
|
//if lentxt > 9 {
|
|
|
//td.KV = GetKVAll(txt, "")
|
|
@@ -311,7 +287,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
|
|
|
//经过处理的td内容长度大于50,划块,分包
|
|
|
if lenval > 50 { //看是否划块
|
|
|
//u.Debug(txt)
|
|
|
- ub, _ = DivideBlock("", td.Text, 2, table.TableResult.RuleBlock,isSite,codeSite) //对td的原始值
|
|
|
+ ub, _ = DivideBlock("", td.Text, 2, table.TableResult.RuleBlock, isSite, codeSite) //对td的原始值
|
|
|
//看是否划块
|
|
|
if len(ub) > 0 {
|
|
|
for _, bl := range ub {
|
|
@@ -344,10 +320,10 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
|
|
|
}
|
|
|
if isFindPkg {
|
|
|
if len(ub) > 0 {
|
|
|
- blockPackage = FindPackageFromBlocks(&ub,isSite,codeSite) //从块里面找分包
|
|
|
+ blockPackage = FindPackageFromBlocks(&ub, isSite, codeSite) //从块里面找分包
|
|
|
} else {
|
|
|
- if !excludeKey2.MatchString(td.Val){
|
|
|
- blockPackage = FindPackageFromText("", td.Val,isSite,codeSite) //从正文里面找分包
|
|
|
+ if !excludeKey2.MatchString(td.Val) {
|
|
|
+ blockPackage = FindPackageFromText("", td.Val, isSite, codeSite) //从正文里面找分包
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -389,7 +365,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
|
|
|
td.SortKV.AddKey(strings.TrimSpace(td.Text[:tagindex]), strings.TrimSpace(td.Text[tagindex:])) //存放kv值
|
|
|
td.BH = true
|
|
|
}
|
|
|
- _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3,isSite,codeSite) //td冒号kv
|
|
|
+ _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 3, isSite, codeSite) //td冒号kv
|
|
|
for k, v := range resm {
|
|
|
if k != "" && v != "" {
|
|
|
td.SortKV.AddKey(k, v) //存放kv值
|
|
@@ -398,27 +374,23 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
|
|
|
//u.Debug(td.SortKV.Keys, "-------2--------------------------------")
|
|
|
// td.SortKV = FindKv(text, "") //GetKvFromtxt(text, "")
|
|
|
//resm := GetKVAll(text, "")
|
|
|
- if len(td.SortKV.Keys) > 0 {
|
|
|
- //td.KVDirect = 3 //不当头也不当值,忽略
|
|
|
- if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) {
|
|
|
- td.Val, _ = td.SortKV.Map[td.SortKV.Keys[0]].(string)
|
|
|
- td.BH = true
|
|
|
- }
|
|
|
- } else if !bsontable {
|
|
|
+ if !bsontable {
|
|
|
txt := repSpace.ReplaceAllString(td.Val, "")
|
|
|
btw, must, _, _, repl := CheckHeader(txt)
|
|
|
if lenval > 15 && !strings.Contains(txt, "采购代理机构名称、地址和联系方式") {
|
|
|
btw = false
|
|
|
}
|
|
|
- if strings.Contains(td.Val, "个项目") ||strings.Contains(td.Val, "奥图码"){
|
|
|
+ if strings.Contains(td.Val, "个项目") || strings.Contains(td.Val, "奥图码") {
|
|
|
must = false
|
|
|
btw = false
|
|
|
}
|
|
|
td.Valtype = repl
|
|
|
td.MustBH = must
|
|
|
td.BH = btw
|
|
|
- if strings.Contains(txt,"年估算额年(万元)"){
|
|
|
- td.MustBH = true
|
|
|
+ } else if len(td.SortKV.Keys) > 0 {
|
|
|
+ //td.KVDirect = 3 //不当头也不当值,忽略
|
|
|
+ if len(td.SortKV.Keys) == 1 && BHKey.MatchString(td.Val) && !MultiReg.MatchString(td.Val) {
|
|
|
+ td.Val, _ = td.SortKV.Map[td.SortKV.Keys[0]].(string)
|
|
|
td.BH = true
|
|
|
}
|
|
|
}
|
|
@@ -450,7 +422,7 @@ func (td *TD) tdIsHb(tr *TR, table *Table, bsontable,isSite bool,codeSite string
|
|
|
if len(td.TR.TDs) > 0 {
|
|
|
kvTitle = td.TR.TDs[len(td.TR.TDs)-1].Val
|
|
|
}
|
|
|
- _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2,isSite,codeSite) //获取冒号kv入口
|
|
|
+ _, resm := colonkvEntity.entrance(td.Val, kvTitle, nil, 2, isSite, codeSite) //获取冒号kv入口
|
|
|
for k, v := range resm {
|
|
|
td.SortKV.AddKey(k, v)
|
|
|
}
|
|
@@ -496,7 +468,7 @@ func (tr *TR) AddTD(td *TD) {
|
|
|
tr.TDs[len(tr.TDs)-1].RightNode = td
|
|
|
}
|
|
|
**/
|
|
|
- if tr==nil|| tr.TDs == nil{
|
|
|
+ if tr == nil || tr.TDs == nil {
|
|
|
return
|
|
|
}
|
|
|
td.ColPos = len(tr.TDs)
|
|
@@ -851,7 +823,8 @@ func CheckHeader(txt string) (res, must bool, stype, reg, repl string) {
|
|
|
con 文本
|
|
|
strtype 1全文 2块文本
|
|
|
**/
|
|
|
-var hisReg =regexp.MustCompile("类似业绩|历史业绩")
|
|
|
+var hisReg = regexp.MustCompile("类似业绩|历史业绩")
|
|
|
+
|
|
|
func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio float32) {
|
|
|
defer qutil.Catch()
|
|
|
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(con))
|
|
@@ -869,7 +842,7 @@ func ComputeConRatio(con string, strtype int) (tabs []*goquery.Selection, ratio
|
|
|
}
|
|
|
}
|
|
|
if !b {
|
|
|
- if hisReg.MatchString(tmpt.First().Text()){
|
|
|
+ if hisReg.MatchString(tmpt.First().Text()) {
|
|
|
continue
|
|
|
}
|
|
|
tabs = append(tabs, tmpt)
|