words_resource.go 1.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. package main
  2. import (
  3. "github.com/go-xweb/log"
  4. qu "qfw/util"
  5. "regexp"
  6. "unicode/utf8"
  7. )
  8. var effective *regexp.Regexp = regexp.MustCompile("^(([A-Za-z]+)?[\u4e00-\u9fa5]+)$")
  9. func resourceCleanWordsInfo() {
  10. datas := *MysqlDevTool.Find("seo_resource", nil, "", "id", -1, -1)
  11. log.Println("最终清洗阶段~", len(datas))
  12. isok := 0
  13. /*
  14. 1、保留全汉字的标的物
  15. 2、保留标的物开头为英文字母的词,比如dsp信号发生器、abs管材、B超
  16. 3、长度均在2-6
  17. */
  18. for k, v := range datas {
  19. if k%1000 == 0 {
  20. log.Println("cur index ", k, "~", isok)
  21. }
  22. name := qu.ObjToString(v["name"])
  23. letter := qu.ObjToString(v["letter"])
  24. l := utf8.RuneCountInString(name)
  25. if l < 2 || l > 6 || letter == "" {
  26. continue
  27. }
  28. //是否全中文
  29. if !effective.MatchString(name) {
  30. continue
  31. }
  32. isok++
  33. info := v
  34. delete(info, "id")
  35. InsertMysqlDevData("seo_resource_copy", info, qu.ObjToString(v["id"]))
  36. Source_Mgo.Save("seo_resource_words", info)
  37. }
  38. log.Println("is over ", len(datas), "~", isok)
  39. }