123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237 |
- package main
- import (
- log "github.com/donnie4w/go-logger/logger"
- "github.com/go-ego/gse"
- qu "qfw/util"
- "regexp"
- "strings"
- "unicode"
- "unicode/utf8"
- )
// Unit (organization) name patterns.
var (
	// specHeadReg matches a name that starts with one or two ASCII letters
	// followed by at least six CJK characters, or with one of the listed
	// prefixes (administrative-division words, "名称", "项目", ...).
	specHeadReg = regexp.MustCompile("^([a-zA-Z]{1,2}[\u4e00-\u9fa5]{6,}|某部|州|自治区|自治州|街道|名称|省|市|县|区|业绩|资格|中标|项目|预算单位)")

	// unHanHeadReg matches a name whose first character is in U+4E00..U+9FA5.
	unHanHeadReg = regexp.MustCompile("^([\u4e00-\u9fa5])")

	// unConReg matches a name containing any of the listed organization keywords.
	unConReg = regexp.MustCompile("(园|政府|集团|公司|有限|合伙|企|院|学|局|处|校)")

	// unEndReg matches a name that ends with one of the listed organization suffixes.
	unEndReg = regexp.MustCompile("^.*(公司|学(校)?|博物馆|联合社|合作社|监狱|办公厅|电视台|集团|机构|企业|办公室|委员会|实验室|联社|厂|场|院|所|店|小|台|中心|局|站|城|馆|厅|处|行|科|部|队|联合(会|体)|工作室)$")

	// unenableReg1 matches whole names of the form "<1-2 CJK chars> + company
	// form + 公司", names ending in 某部/先生/女士/小姐, or exactly "工程技术处".
	unenableReg1 = regexp.MustCompile("^([\u4e00-\u9fa5]{1,2}(责任|有限|有限股份|有限责任|实业)公司|.*(某部|先生|女士|小姐)|工程技术处)$")

	// unenableReg2 matches a name containing any of the listed tokens
	// (placeholders, punctuation, registry-status words, ...).
	//
	// Two repairs were made to the literal:
	//   - The second alternative was a bare "?", which is a repetition operator
	//     with no operand and makes MustCompile panic at package init. It is now
	//     the full-width question mark "？".
	//     NOTE(review): presumably the original character; confirm against
	//     version control.
	//   - A collapsed string concatenation had left the text `" +\n\t"` inside
	//     the pattern, so "法人" could only match when preceded by that junk.
	//     The two halves are joined properly again.
	unenableReg2 = regexp.MustCompile("(\\?|？|单位|#|xxxx|\\*\\*|%|万元|设计企业|免费|代表|代码标识|盖电子|测试测试|删除|错误|吊销|注销|发起人|待清理|&#|护照号|身份证号|" +
		"法人| |国家拨入|借款|积累资金|认股人|--|、|&|`|美元)")
)
- //分词
- var GSE *gse.Segmenter = &gse.Segmenter{}
// Project-code patterns and the per-industry amount table.
var (
	// codeUnConReg matches a code containing any of the listed words
	// ("null", survey/test/design terms, date units, ...).
	codeUnConReg = regexp.MustCompile("(null|勘察|测试|设计|设备|项目|标段|工程|监理|范围|分包|月|日|天)")

	// codeUnLenReg matches a code containing nine or more consecutive
	// characters in U+4E00..U+9FA5.
	codeUnLenReg = regexp.MustCompile("([\u4e00-\u9fa5]{9,})")

	// classMoneyScope maps an industry name to its "min"/"max" amount bounds.
	// It is nil until init populates it.
	classMoneyScope map[string]map[string]interface{}
)
- func init() {
- log.Debug("初始化,切词")
- GSE.LoadDict("./dictionary.txt")
- //t>d>p
- classMoneyScope = map[string]map[string]interface{}{
- "建筑工程": {"min":10000,"max":10000000000},
- "行政办公": {"min":100,"max":100000000},
- "医疗卫生": {"min":1000,"max":100000000},
- "服务采购": {"min":10,"max":100000000},
- "机械设备": {"min":1000,"max":1000000000},
- "水利水电": {"min":1000,"max":1000000000},
- "能源化工": {"min":1000,"max":1000000000},
- "弱电安防": {"min":1000,"max":1000000000},
- "信息技术": {"min":100,"max":100000000},
- "交通工程": {"min":1000,"max":10000000000},
- "市政设施": {"min":1000,"max":10000000000},
- "农林牧渔": {"min":100,"max":10000000},
- }
- }
- //行业金额校验
- func checkingClassMoney(money float64,class string) bool {
- data :=classMoneyScope[class]
- if data!=nil {
- min := qu.Float64All(data["min"])
- max := qu.Float64All(data["max"])
- if money>min && money<max {
- return true
- }
- }
- return false
- }
- //企业库检测
- func qyNameIsExistsQYXY(name string) bool{
- q := map[string]interface{}{
- "company_name": name,
- }
- data :=qy_mgo.FindOne(qy_coll_name,q)
- if data==nil || len(data)<2{
- return false
- }
- return true
- }
- //采购单位库
- func buyerNameIsExists(name string) bool{
- q := map[string]interface{}{
- "buyer_name": name,
- }
- data :=qy_mgo.FindOne("buyer_enterprise",q)
- if data==nil || len(data)<2{
- return false
- }
- return true
- }
// isUnHan reports whether str contains at least one rune that is not a Han
// character. The empty string yields false.
func isUnHan(str string) bool {
	for _, r := range str {
		if !unicode.Is(unicode.Han, r) {
			return true
		}
	}
	return false
}
// isHan reports whether str contains at least one Han character.
// The empty string yields false.
func isHan(str string) bool {
	for _, r := range str {
		if unicode.Is(unicode.Han, r) {
			return true
		}
	}
	return false
}
// isCharCount classifies every rune of str and returns four counts, in order:
// Han characters, other letters, numbers, and everything else.
// Han is tested first, so Han runes are never counted as letters.
func isCharCount(str string) []int {
	const (
		idxHan = iota
		idxLetter
		idxNumber
		idxOther
	)
	tally := make([]int, 4)
	for _, r := range str {
		switch {
		case unicode.Is(unicode.Han, r):
			tally[idxHan]++
		case unicode.IsLetter(r):
			tally[idxLetter]++
		case unicode.IsNumber(r):
			tally[idxNumber]++
		default:
			tally[idxOther]++
		}
	}
	return tally
}
// isHanLenToLittle reports whether the number of Han characters in str reaches
// one third of its rune count (integer division). The threshold is only
// checked after a Han character has been seen, so a string without any Han
// character, including the empty string, always yields false.
func isHanLenToLittle(str string) bool {
	threshold := utf8.RuneCountInString(str) / 3
	hanSeen := 0
	for _, r := range str {
		if !unicode.Is(unicode.Han, r) {
			continue
		}
		hanSeen++
		if hanSeen >= threshold {
			return true
		}
	}
	return false
}
// isAlphanumeric reports whether str contains at least one rune that Unicode
// classifies as a letter or a number. Han characters count as letters here.
// The empty string yields false.
func isAlphanumeric(str string) bool {
	for _, r := range str {
		if unicode.IsLetter(r) || unicode.IsNumber(r) {
			return true
		}
	}
	return false
}
// eightDigitRunReg matches any run of eight consecutive ASCII digits.
// It is compiled once here instead of on every call.
var eightDigitRunReg = regexp.MustCompile(`\d{8}`)

// isRegTimeDateCode returns false when str contains a run of eight consecutive
// digits. Otherwise it returns true exactly when str is eight runes long.
//
// NOTE(review): the name suggests "is an 8-digit date code", but the digit-run
// test returns false. Behavior is kept as found; confirm the intended polarity
// against the callers.
func isRegTimeDateCode(str string) bool {
	if eightDigitRunReg.MatchString(str) {
		return false
	}
	return utf8.RuneCountInString(str) == 8
}
- //配置字段初始分
- func dealWithFieldSourceScore(source map[string]interface{}) map[string]int64 {
- fieldArr := []string{"buyer","s_winner","budget","bidamount","projectname","projectcode"}
- score := make(map[string]int64,0)
- for _,v := range fieldArr{
- score[v] = int64(100)
- }
- for _,key := range fieldArr {
- ext := *qu.ObjToMap(source[key])
- if ext!=nil{
- ext_from:=qu.ObjToString(ext["ext_from"])
- ext_type:=qu.ObjToString(ext["ext_type"])
- //规范ext_from
- ext_from = normalizedExtFromName(ext_from)
- if ext_from=="winnerorder" || ext_from=="package" ||
- ext_from=="jsondata" || ext_type=="" {
- dataLock.Lock()
- score[key] = qu.Int64All(Ext_From[ext_from])
- dataLock.Unlock()
- }else {
- dataLock.Lock()
- s := qu.Int64All(Ext_From[ext_from])+qu.Int64All(Ext_Type[ext_type])
- score[key] = s/2
- dataLock.Unlock()
- }
- }
- }
- return score
- }
// normalizedExtFromName canonicalizes an extraction-source name: anything
// containing "order" becomes "winnerorder", anything else containing
// "JsonData" becomes "jsondata", and every other value is returned unchanged.
func normalizedExtFromName(str string) string {
	switch {
	case strings.Contains(str, "order"):
		return "winnerorder"
	case strings.Contains(str, "JsonData"):
		return "jsondata"
	default:
		return str
	}
}
|