12345678910111213141516171819202122232425262728293031323334353637383940414243 |
- package main
- import (
- "github.com/go-xweb/log"
- qu "qfw/util"
- "regexp"
- "unicode/utf8"
- )
// effective matches a whole name that consists of one or more characters in
// the range U+4E00–U+9FA5, optionally preceded by a run of ASCII letters
// (for example "B超" or "abs管材"). Names containing digits, spaces, or
// letters after the Chinese characters do not match.
//
// The pattern is kept in an interpreted string literal on purpose: the \u
// escapes are resolved by the Go compiler, not by the regexp engine.
var effective = regexp.MustCompile("^(([A-Za-z]+)?[\u4e00-\u9fa5]+)$")
- func resourceCleanWordsInfo() {
- datas := *MysqlDevTool.Find("seo_resource", nil, "", "id", -1, -1)
- log.Println("最终清洗阶段~", len(datas))
- isok := 0
- /*
- 1、保留全汉字的标的物
- 2、保留标的物开头为英文字母的词,比如dsp信号发生器、abs管材、B超
- 3、长度均在2-6
- */
- for k, v := range datas {
- if k%1000 == 0 {
- log.Println("cur index ", k, "~", isok)
- }
- name := qu.ObjToString(v["name"])
- letter := qu.ObjToString(v["letter"])
- l := utf8.RuneCountInString(name)
- if l < 2 || l > 6 || letter == "" {
- continue
- }
- //是否全中文
- if !effective.MatchString(name) {
- continue
- }
- isok++
- info := v
- delete(info, "id")
- InsertMysqlDevData("seo_resource_copy", info, qu.ObjToString(v["id"]))
- Source_Mgo.Save("seo_resource_words", info)
- }
- log.Println("is over ", len(datas), "~", isok)
- }
|