+ if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) || c == '(' || c == ')' || c == '(' || c == ')' || c == '《' || c == '》' || c == '·' || c == '“' || c == '”' {
+ continue
+ } else {
+ return false
+ }
+ }
+ return true
+}
+func chooseUnEnableData() {
+ var unstart *regexp.Regexp = regexp.MustCompile("^([\u4e00-\u9fa5])")
+ var uncon *regexp.Regexp = regexp.MustCompile("(园|政府|集团|公司|有限|合伙|企|院|学|局|处)")
+ var unlen *regexp.Regexp = regexp.MustCompile("^(.{4,})$")
+ var unReg *regexp.Regexp = regexp.MustCompile("^.*(公司|学(校)?|博物馆|联合社|合作社|监狱|办公厅|电视台|集团|机构|企业|办公室|委员会|联社|厂|场|院|所|店|中心|局|站|城|处|行|科|部|队|联合(会|体)|工作室)$")
+ var unhead *regexp.Regexp = regexp.MustCompile("^(某部|省|市|区|县|州|自治区|自治州|街道|名称)")
+
+ var unenableReg1 *regexp.Regexp = regexp.MustCompile("^([\u4e00-\u9fa5]{1,2}(责任|有限|有限股份|有限责任|实业)公司|.*(某部|先生|女士|小姐)|工程技术处)$")
+ var unenableReg2 *regexp.Regexp = regexp.MustCompile("(\\?|?|单位|#|xxxx|\\*\\*|%|万元|设计企业|免费|代表|代码标识|盖电子|测试测试|删除|错误|吊销|注销|发起人|待清理|&#|护照号|身份证号|\" +\n\t\"法人| |国家拨入|借款|积累资金|认股人|--|、|&|`|美元)")
+ sess := save_mgo.GetMgoConn()
+ defer save_mgo.DestoryMongoConn(sess)
+ q, total, isok := map[string]interface{}{}, 0, 0
+ pool := make(chan bool, 1)
+ wg := &sync.WaitGroup{}
+ collection := "zk_sensitive_buyer_new_err"
+ it := sess.DB(save_mgo.DbName).C(collection).Find(&q).Iter()
+ for tmp := make(map[string]interface{}); it.Next(&tmp); total++ {
+ if total%10000 == 0 {
+ log.Debug("current index ", total, isok)
+ }
+ name := qu.ObjToString(tmp["name"])
+ tmpid := BsonTOStringId(tmp["_id"])
+ pool <- true //开线程
+ wg.Add(1)
+ go func(name string, tmpid string) {
+ defer func() {
+ <-pool
+ wg.Done()
+ }()
+ //if isHan(name) {
+ // if unenableReg1.MatchString(name)||unhead.MatchString(name) {
+ data := postBaiDuYun("https://aip.baidubce.com/rpc/2.0/ai_custom/v1/entity_xtr/allbuyer?access_token=24.595a79beb92df28ae44081d8c069e32c.2592000.1627033355.282335-24414386",
+ body, "application/json")
+ //fmt.Println("post...end")
+ if results, ok := data["results"].([]interface{}); ok {
+ re, _ = regexp.Compile("\\<style[\\S\\s]+?\\</style\\>")
+ src = re.ReplaceAllString(src, "")
+ //去除SCRIPT
+ re, _ = regexp.Compile("\\<script[\\S\\s]+?\\</script\\>")
+ src = re.ReplaceAllString(src, "")
+ //去除所有尖括号内的HTML代码,并换成换行符
+ re, _ = regexp.Compile("\\<[\\S\\s]+?\\>")
+ src = re.ReplaceAllString(src, "\n")
+ //去除连续的换行符
+ re, _ = regexp.Compile("\\s{2,}")
+ src = re.ReplaceAllString(src, "\n")
+ return strings.TrimSpace(src)
+}
+func escape(s string) string {
+ news := ""
+ for _, c := range s {
+ if unicode.Is(unicode.Han, c) || unicode.IsNumber(c) || unicode.IsLetter(c) {
+ news = news + string(c)
+ } else if c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' || c == '/' || c == '#' || c == '@' || c == '(' || c == ')' || c == '>' || c == '<' || c == '“' || c == '”' || c == '?' || c == '、' || c == '.' {