|
@@ -6,10 +6,19 @@ import (
|
|
"data_ai/ul"
|
|
"data_ai/ul"
|
|
log "github.com/donnie4w/go-logger/logger"
|
|
log "github.com/donnie4w/go-logger/logger"
|
|
qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
qu "jygit.jydev.jianyu360.cn/data_processing/common_utils"
|
|
|
|
+ "regexp"
|
|
"strings"
|
|
"strings"
|
|
"unicode/utf8"
|
|
"unicode/utf8"
|
|
)
|
|
)
|
|
|
|
|
|
|
|
+func FilterDetail(con string) string {
|
|
|
|
+ return Reg.ReplaceAllString(Filter.ReplaceAllString(con, ""), "")
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
// Package-level patterns, compiled once at start-up.
var (
	// Reg matches one or more consecutive characters that are NOT an ASCII
	// digit, an ASCII letter, or a code point in U+4E00..U+9FA5.
	Reg = regexp.MustCompile("[^0-9A-Za-z\u4e00-\u9fa5]+")

	// Filter matches either a tag-like span ("<", the shortest run of
	// non-">" characters, then ">") or a single whitespace character: the
	// regexp \s class, U+3000, U+2003, or U+00A0.
	Filter = regexp.MustCompile("<[^>]*?>|[\\s\u3000\u2003\u00a0]")

	// SpecialTextReg matches a fixed list of Chinese phrases that refer the
	// reader to the original web page or to an attachment (e.g. "see
	// attachment", "download attachment", "view/visit the source site").
	SpecialTextReg = regexp.MustCompile("(原网页|见附件|下载附件|(查看|访问)(源网|原网)|详情请下载附件!|详情请访问原网页!)")
)
|
|
|
|
+
|
|
// 确认抽取范围
|
|
// 确认抽取范围
|
|
func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
|
|
func ConfrimExtractInfo(q map[string]interface{}) map[string]interface{} {
|
|
dict := map[string]interface{}{}
|
|
dict := map[string]interface{}{}
|
|
@@ -94,10 +103,11 @@ func getPurList(v map[string]interface{}, detail string, f_info map[string]inter
|
|
|
|
|
|
// 过滤信息规则···
|
|
// 过滤信息规则···
|
|
func NotInProgressInfo(title string, detail string, v map[string]interface{}) bool {
|
|
func NotInProgressInfo(title string, detail string, v map[string]interface{}) bool {
|
|
- dl := utf8.RuneCountInString(detail) //文本长度
|
|
|
|
if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil {
|
|
if strings.Contains(title, "开标记录") || v["jyfb_data"] != nil {
|
|
return true
|
|
return true
|
|
}
|
|
}
|
|
|
|
+ detail = FilterDetail(detail) //只保留文本内容
|
|
|
|
+ dl := utf8.RuneCountInString(detail) //文本长度
|
|
if dl <= 20 || (dl <= 50 && ul.SpecialTextReg.MatchString(detail)) {
|
|
if dl <= 20 || (dl <= 50 && ul.SpecialTextReg.MatchString(detail)) {
|
|
return true
|
|
return true
|
|
}
|
|
}
|