|
@@ -484,6 +484,132 @@ func AnalyStart(job *u.Job, isSite bool, codeSite string) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+// 分析方法
|
|
|
+func AnalyStartNoTable(job *u.Job, isSite bool, codeSite string) {
|
|
|
+ con := job.Content
|
|
|
+ //全文的需要修复表格
|
|
|
+ con = RepairCon(con)
|
|
|
+ //格式化正文
|
|
|
+ //con = preConReg1.ReplaceAllString(con, "${1}${2}")
|
|
|
+ hisReg1_str := hisReg1.FindString(con)
|
|
|
+ if hisReg1_str != "" && !strings.Contains(hisReg1_str, "中标候选人得分") {
|
|
|
+ con = hisReg1.ReplaceAllString(con, "${4}")
|
|
|
+ }
|
|
|
+ hisReg2_str := hisReg2.FindString(con)
|
|
|
+ if hisReg2_str != "" && !strings.Contains(hisReg2_str, "中标候选人得分") {
|
|
|
+ con = hisReg2.ReplaceAllString(con, "${6}")
|
|
|
+ }
|
|
|
+ con = formattext.ReplaceAllString(con, "${1}:${2}")
|
|
|
+ con = formattext2.ReplaceAllString(con, "${1}")
|
|
|
+ con = formattext3.ReplaceAllString(con, "")
|
|
|
+ con = formattext4.ReplaceAllString(con, "\n${1}:${2}\n")
|
|
|
+ //特殊格式-影响分包候选人抽取-候选人等识别-替换
|
|
|
+ con = formattext5.ReplaceAllString(con, "中标金额:${2}\n")
|
|
|
+ con = formattext6.ReplaceAllString(con, "$1$2")
|
|
|
+ con = formattext7.ReplaceAllString(con, "$1$2")
|
|
|
+ //改变特殊结构
|
|
|
+ con = formattext10.ReplaceAllString(con, "\n分包$3\n中标单位:$5 中标金额:$6\n")
|
|
|
+ con = formattext11.ReplaceAllString(con, "${1}\n${2}\n预算金额:${4}\n${5}\n预算金额:${7}\n${8}\n")
|
|
|
+ con = formattext12.ReplaceAllString(con, "\n${1}:${3}万元\n")
|
|
|
+ con = formattext13.ReplaceAllString(con, "\n包一\n中标单位:${1}\n中标金额:${3}\n"+"包二\n中标单位:${2}\n中标金额:${4}\n")
|
|
|
+ con = formattext14.ReplaceAllString(con, "\n包一\n中标单位:${1}\n中标金额:${2}\n"+"包二\n中标单位:${3}\n中标金额:${4}\n")
|
|
|
+ //多供应商~文本结构~重构
|
|
|
+ if m_b, m_c := dealWithMultiSuppliersText(con); m_b {
|
|
|
+ con = m_c
|
|
|
+ }
|
|
|
+ //工程业绩描述影响抽取
|
|
|
+ con = formattext20.ReplaceAllString(con, "\n")
|
|
|
+ con = formattext21.ReplaceAllString(con, "")
|
|
|
+ //指定爬虫-特殊结构-计算抽取
|
|
|
+ if codeSite == "a_zgzfcgw_zfcghtgg_new" {
|
|
|
+ str := formattext50.FindString(con)
|
|
|
+ if str != "" {
|
|
|
+ new_str := dealWithSpecStructToSpiderCode(str)
|
|
|
+ if new_str != "" {
|
|
|
+ con = new_str + con
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ con = formatText(con, "all")
|
|
|
+ job.ContentClean = HtmlToText(job.Content)
|
|
|
+ job.Content = con
|
|
|
+ job.BlockPackage = map[string]*u.BlockPackage{}
|
|
|
+ //分块+处理每块kv
|
|
|
+ blockArrays, _ := DivideBlock(job.CategorySecond, con, 1, job.RuleBlock, isSite, codeSite)
|
|
|
+ if len(blockArrays) > 0 { //有分块
|
|
|
+ //从块里面找分包-文本
|
|
|
+ if !job.IsFile {
|
|
|
+ job.BlockPackage = FindPackageFromBlocks(&blockArrays, isSite, codeSite) //从块里面找分包
|
|
|
+ }
|
|
|
+ for _, bl := range blockArrays {
|
|
|
+ FindProjectCode(bl.Text, job) //匹配项目编号
|
|
|
+ //对块行内容业绩相关进行过滤
|
|
|
+ bl.Text = tableClearTextReg.ReplaceAllString(bl.Text, "")
|
|
|
+ //新加 未分块table中未能解析到中标候选人,从正文中解析-全文匹配一次
|
|
|
+ if (job.Winnerorder == nil || len(job.Winnerorder) == 0) || len(job.Winnerorder) > 8 {
|
|
|
+ //表格没有划分时候:-纯文本匹配
|
|
|
+ tmp_text := HtmlToText(bl.Text)
|
|
|
+ bl.Winnerorder = winnerOrderEntity.Find(tmp_text, true, 1, isSite, codeSite)
|
|
|
+ if thanWinnerOrderEffective(job.Winnerorder, bl.Winnerorder) {
|
|
|
+ job.Winnerorder = bl.Winnerorder
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //无分包-附件-格式化文本处理-
|
|
|
+ if (job.BlockPackage == nil || len(job.BlockPackage) == 0) && job.IsFile {
|
|
|
+ tmp_text := HtmlToText(bl.Text)
|
|
|
+ job.BlockPackage = FindPackageFromText(job.Title, tmp_text, isSite, codeSite)
|
|
|
+ }
|
|
|
+ job.Block = append(job.Block, bl)
|
|
|
+ }
|
|
|
+ } else { //未分块,创建分块
|
|
|
+ bl := &u.Block{}
|
|
|
+ newCon := con
|
|
|
+ job.BlockPackage = FindPackageFromText(job.Title, newCon, isSite, codeSite)
|
|
|
+ bl.Text = HtmlToText(con)
|
|
|
+ FindProjectCode(bl.Text, job) //匹配项目编号 ~~ 清洗无效信息文本
|
|
|
+ if blTextReg.MatchString(bl.Text) && !unblTextReg.MatchString(bl.Text) {
|
|
|
+ if strings.Index(bl.Text, "业绩") > 1 {
|
|
|
+ //如果有采购单位信息~置前
|
|
|
+ before_arr := []string{}
|
|
|
+ if beforeTextReg.MatchString(bl.Text) {
|
|
|
+ before_arr = beforeTextReg.FindAllString(bl.Text, -1)
|
|
|
+ }
|
|
|
+ bl.Text = bl.Text[:strings.Index(bl.Text, "业绩")]
|
|
|
+ if len(before_arr) > 0 {
|
|
|
+ bl.Text = strings.Join(before_arr, "\n") + bl.Text
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //特殊-指定处理-结构转化formattext100
|
|
|
+ if formattext100.MatchString(bl.Text) {
|
|
|
+ new_str := formattext100.FindString(bl.Text)
|
|
|
+ new_str = formattext100.ReplaceAllString(new_str, "$1")
|
|
|
+ bl.Text = fmt.Sprintf("中标金额:%s万元\n", new_str) + bl.Text
|
|
|
+ }
|
|
|
+ //调用kv解析库-处理detail
|
|
|
+ bl.Text = formatText(bl.Text, "all")
|
|
|
+ //处理 :
|
|
|
+ bl.ColonKV = GetKVAll(bl.Text, "", nil, 1, isSite, codeSite)
|
|
|
+ //处理空格
|
|
|
+ bl.SpaceKV = SspacekvEntity.Entrance(bl.Text, "", nil, isSite, codeSite)
|
|
|
+ //新加 未分块table中未能解析到 中标候选人,从正文中解析
|
|
|
+ if job.Winnerorder == nil || len(job.Winnerorder) == 0 || len(job.Winnerorder) > 8 {
|
|
|
+ bl.Winnerorder = winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite)
|
|
|
+ if thanWinnerOrderEffective(job.Winnerorder, bl.Winnerorder) {
|
|
|
+ job.Winnerorder = bl.Winnerorder
|
|
|
+ }
|
|
|
+ } else { //table里面识别出单位候选人-未识别金额...
|
|
|
+ if onlyExistsWinEntName(job.Winnerorder) {
|
|
|
+ new_winorder := winnerOrderEntity.Find(bl.Text, true, 1, isSite, codeSite)
|
|
|
+ if thanExistsNewWinOrder(job.Winnerorder, new_winorder) {
|
|
|
+ job.Winnerorder = new_winorder
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ job.Block = append(job.Block, bl)
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
// 是否有效分包
|
|
|
func isUsefulPackage(pkg map[string]*u.BlockPackage) bool {
|
|
|
if pkg == nil || len(pkg) == 0 {
|