package main import ( log "github.com/donnie4w/go-logger/logger" "github.com/go-ego/gse" qu "qfw/util" "regexp" "strings" "unicode" "unicode/utf8" ) //单位 var specHeadReg *regexp.Regexp = regexp.MustCompile("^([a-zA-Z]{1,2}[\u4e00-\u9fa5]{6,}|某部|州|自治区|自治州|街道|名称|省|市|县|区|业绩|资格|中标|项目|预算单位)") var unHanHeadReg *regexp.Regexp = regexp.MustCompile("^([\u4e00-\u9fa5])") var unConReg *regexp.Regexp = regexp.MustCompile("(园|政府|集团|公司|有限|合伙|企|院|学|局|处|校)") var unEndReg *regexp.Regexp = regexp.MustCompile("^.*(公司|学(校)?|博物馆|联合社|合作社|监狱|办公厅|电视台|集团|机构|企业|办公室|委员会|实验室|联社|厂|场|院|所|店|小|台|中心|局|站|城|馆|厅|处|行|科|部|队|联合(会|体)|工作室)$") var unenableReg1 *regexp.Regexp = regexp.MustCompile("^([\u4e00-\u9fa5]{1,2}(责任|有限|有限股份|有限责任|实业)公司|.*(某部|先生|女士|小姐)|工程技术处)$") var unenableReg2 *regexp.Regexp = regexp.MustCompile("(\\?|?|单位|#|xxxx|\\*\\*|%|万元|设计企业|免费|代表|代码标识|盖电子|测试测试|删除|错误|吊销|注销|发起人|待清理|&#|护照号|身份证号|\" +\n\t\"法人| |国家拨入|借款|积累资金|认股人|--|、|&|`|美元)") //分词 var GSE *gse.Segmenter = &gse.Segmenter{} //编号 var codeUnConReg *regexp.Regexp = regexp.MustCompile("(null|勘察|测试|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天)") var codeUnLenReg *regexp.Regexp = regexp.MustCompile("([\u4e00-\u9fa5]{9,})") var classMoneyScope map[string]map[string]interface{} func init() { log.Debug("初始化,切词") GSE.LoadDict("./dictionary.txt") //t>d>p classMoneyScope = map[string]map[string]interface{}{ "建筑工程": {"min":10000,"max":10000000000}, "行政办公": {"min":100,"max":100000000}, "医疗卫生": {"min":1000,"max":100000000}, "服务采购": {"min":10,"max":100000000}, "机械设备": {"min":1000,"max":1000000000}, "水利水电": {"min":1000,"max":1000000000}, "能源化工": {"min":1000,"max":1000000000}, "弱电安防": {"min":1000,"max":1000000000}, "信息技术": {"min":100,"max":100000000}, "交通工程": {"min":1000,"max":10000000000}, "市政设施": {"min":1000,"max":10000000000}, "农林牧渔": {"min":100,"max":10000000}, } } //行业金额校验 func checkingClassMoney(money float64,class string) bool { data :=classMoneyScope[class] if data!=nil { min := qu.Float64All(data["min"]) max := qu.Float64All(data["max"]) if money>min && money 0 } //是否含中文 func isHan(str string) bool { var count int for _, v := range str { if unicode.Is(unicode.Han, v) { count++ break } } return count > 0 } //符号数量 func isCharCount(str string) []int { //中文,英文,数字,其他 c1,c2,c3,c4:=0,0,0,0 for _, v := range str { if unicode.Is(unicode.Han, v) { c1++ }else if unicode.IsLetter(v){ c2++ } else if unicode.IsNumber(v){ c3++ }else { c4++ } } return []int{c1,c2,c3,c4} } //中文比例-1:3 func isHanLenToLittle(str string) bool { var count int len := utf8.RuneCountInString(str) min_count := len/3 for _, v := range str { if unicode.Is(unicode.Han, v) { count++ if count>=min_count { return true } } } return false } //是否含字母数字 func isAlphanumeric(str string) bool { var count int for _, v := range str { if unicode.IsNumber(v) || unicode.IsLetter(v) { count++ break } } return count > 0 } //连续数字 func isRegTimeDateCode(str string) bool { reg:=`\d{8}` regx,_ := regexp.Compile(reg) if regx.FindString(str)!="" { return false } if utf8.RuneCountInString(str)==8 { return true } return false } //配置字段初始分 func dealWithFieldSourceScore(source map[string]interface{}) map[string]int64 { fieldArr := []string{"buyer","s_winner","budget","bidamount","projectname","projectcode"} score := make(map[string]int64,0) for _,v := range fieldArr{ score[v] = int64(100) } for _,key := range fieldArr { ext := *qu.ObjToMap(source[key]) if ext!=nil{ ext_from:=qu.ObjToString(ext["ext_from"]) ext_type:=qu.ObjToString(ext["ext_type"]) //规范ext_from ext_from = normalizedExtFromName(ext_from) if ext_from=="winnerorder" || ext_from=="package" || ext_from=="jsondata" || ext_type=="" { dataLock.Lock() score[key] = qu.Int64All(Ext_From[ext_from]) dataLock.Unlock() }else { dataLock.Lock() s := qu.Int64All(Ext_From[ext_from])+qu.Int64All(Ext_Type[ext_type]) score[key] = s/2 dataLock.Unlock() } } } return score } //规范-抽取来源字符串 func normalizedExtFromName(str string) string { if strings.Contains(str,"order") { str = "winnerorder" }else if strings.Contains(str,"JsonData") { str = "jsondata" }else { } return str }