123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203 |
- package main
- import (
- "fmt"
- mgo "mgoutil/mongodb"
- qu "qfw/util"
- "regexp"
- "strings"
- "sync"
- "github.com/cron"
- "github.com/donnie4w/go-logger/logger"
- )
- //匹配数字类型
- //var Reg2 = regexp.MustCompile(`(((http|https)[::]//(www.)?|www.|WWW.)[0-9\.]{7,})[::]{0,}`)
- //固定后缀
- //var Reg2 = regexp.MustCompile(`((http|https)[::]//(www.)?|www.|WWW.)([0-9A-Za-z_]+[-\.]{0,})+\.(cn|asia|hn|citic|ltd|tv|shop|com|mo|co|net|cnpc|CN|CC|cc|pro|aero|coop|hk|tw|me|rec|arts|store|firm|int|info|org|top|wang|ren|xyz|xin|pub|tech|ink|biz|red|gov|vip|art|edu)+`)
// Patterns and the substitution table used by StartTask to pull links out of
// free text and reduce them to a domain.
var (
	// Reg1 finds link-like text: "http"/"https" followed by a colon and "//"
	// (optionally "www."), or a bare "www."/"WWW." prefix, then one or more
	// runs of URL characters. The runs may be separated by whitespace
	// (ASCII whitespace, U+3000, U+2003, U+00A0), so links broken up by
	// spaces are still captured as one match.
	Reg1 = regexp.MustCompile("((http|https)[::]//(www\\.)?|www\\.|WWW\\.)([\\s\u3000\u2003\u00a0]{0,}[-A-Za-z0-9&@$??#/%=~_|.::,]+)+([\\s\u3000\u2003\u00a0]{0,}(com|cn|net))?[-A-Za-z0-9&@$??#/%=~_|.::,]+")

	// Earlier Reg1 without whitespace support, kept for reference:
	//var Reg1 = regexp.MustCompile(`((http|https)[::]//(www\.)?|www\.|WWW\.)[-A-Za-z0-9&@$??#/%=~_|.::,]+`)

	// Reg2 captures the domain part of a link: the same prefix as Reg1
	// followed by word-character runs joined by "-", "." or whitespace.
	Reg2 = regexp.MustCompile("((http|https)[::]//(www\\.)?|www\\.|WWW\\.)(\\w+[-.\\s\u3000\u2003\u00a0]{0,})+")

	// Clear1 matches a string that ends in one of the listed suffixes
	// immediately followed by digits (optionally containing dots), e.g.
	// "...189.cn6.7". Capture group 2 holds that numeric tail.
	Clear1 = regexp.MustCompile(".*(cn|com|org|net|co|mo|vn|en)((\\d)+[.]{0,}(\\d){0,})$")

	// RegSpace matches runs of whitespace, including U+3000, U+2003 and
	// U+00A0.
	RegSpace = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")

	// Replace lists the character substitutions applied to an extracted
	// domain.
	// NOTE(review): as written the ":" entry maps the character to itself;
	// the keys may originally have been full-width punctuation — confirm
	// against the upstream file.
	Replace = map[string]string{
		":": ":",
		",": ".",
		"。": ".",
	}
)
- //定时任务
- func TimeTask() {
- defer qu.Catch()
- //StartTask()
- c := cron.New()
- cronstr := "0 */" + fmt.Sprint(TaskTime) + " * * * ?" //每TaskTime小时执行一次
- c.AddFunc(cronstr, func() { StartTask() })
- c.Start()
- }
- //开始任务
- func StartTask() {
- fmt.Println("开始任务...")
- defer qu.Catch()
- sess := Mgo.GetMgoConn()
- defer Mgo.DestoryMongoConn(sess)
- q := map[string]interface{}{
- "_id": map[string]interface{}{
- "$gt": mgo.StringTOBsonId(LatestId),
- },
- }
- endId, ok := GetEndId(q) //获取bidding表最后一个数据id
- if !ok || endId == "" {
- return
- }
- q = map[string]interface{}{
- "_id": map[string]interface{}{
- "$gt": mgo.StringTOBsonId(LatestId),
- "$lte": mgo.StringTOBsonId(endId),
- },
- }
- LatestId = endId //替换起始id
- field := map[string]interface{}{"detail": 1}
- logger.Debug("query:", q)
- it := sess.DB("qfw").C("bidding").Find(q).Select(field).Sort("_id").Iter()
- count := Mgo.Count("test", q)
- fmt.Println("共加载数据", count)
- sum := 0
- wg := &sync.WaitGroup{}
- //lock_bid := &sync.Mutex{}
- lock_dmn := &sync.Mutex{}
- save := []map[string]interface{}{}
- //arr := [][]map[string]interface{}{}
- ch := make(chan bool, 20)
- for tmp := make(map[string]interface{}); it.Next(&tmp); sum++ {
- ch <- true
- wg.Add(1)
- go func(d map[string]interface{}) {
- defer func() {
- <-ch
- wg.Done()
- }()
- id := mgo.BsonIdToSId(d["_id"])
- detail := qu.ObjToString(d["detail"])
- hrefArr := Reg1.FindAllString(detail, -1) //匹配detail
- for _, href := range hrefArr {
- if len(href) < 13 {
- continue
- }
- hrefTmp := RegSpace.ReplaceAllString(href, "") //去除空格
- hrefTmp = strings.ToLower(hrefTmp) //转小写
- for {
- if strings.HasSuffix(hrefTmp, ".") || strings.HasSuffix(hrefTmp, "-") {
- hrefTmp = hrefTmp[:len(hrefTmp)-1]
- } else {
- break
- }
- }
- dmName := Reg2.FindString(hrefTmp) //匹配域名
- for k, v := range Replace { //替换字符
- dmName = strings.ReplaceAll(dmName, k, v)
- }
- if dmName == "" {
- continue
- }
- //特殊情况处理 https://cloudmeeting.189.cn6.7 清理6.7
- text := ""
- apos := Clear1.FindAllStringSubmatchIndex(dmName, -1)
- if len(apos) > 0 {
- for _, pos := range apos {
- if len(pos) > 4 {
- text = dmName[pos[4]:pos[5]] //6.7
- }
- }
- }
- if text != "" {
- lastIndex := strings.LastIndex(dmName, text)
- dmName = dmName[:lastIndex] //https://cloudmeeting.189.cn
- }
- lock_dmn.Lock()
- if !DomainNameMap[dmName] { //不在白名单
- tmpMap := map[string]interface{}{"id": id, "domainame": dmName, "href": href, "detail": detail, "clear": href}
- if text != "" {
- lastIndex := strings.LastIndex(href, text)
- href = href[:lastIndex]
- tmpMap["clear"] = href
- }
- save = append(save, tmpMap)
- if len(save) > 500 {
- tmps := save
- Mgo.SaveBulk("domainlog", tmps...)
- save = []map[string]interface{}{}
- }
- //清理detail
- // detail = strings.ReplaceAll(detail, href, "")
- // query := map[string]interface{}{"_id": d["_id"]}
- // set := map[string]interface{}{
- // "$set": map[string]interface{}{
- // "detail": detail,
- // },
- // }
- // update := []map[string]interface{}{}
- // update = append(update, query)
- // update = append(update, set)
- // arr = append(arr, update)
- // if len(arr) > 500 {
- // tmps := arr
- // Mgo.UpdateBulk("test1", tmps...)
- // arr = [][]map[string]interface{}{}
- // }
- }
- lock_dmn.Unlock()
- }
- }(tmp)
- if sum%100 == 0 {
- fmt.Println("current:", sum)
- }
- tmp = map[string]interface{}{}
- }
- wg.Wait()
- lock_dmn.Lock()
- if len(save) > 0 {
- Mgo.SaveBulk("domainlog", save...)
- save = []map[string]interface{}{}
- }
- // if len(arr) > 0 {
- // Mgo.UpdateBulk("test1", arr...)
- // arr = [][]map[string]interface{}{}
- // }
- lock_dmn.Unlock()
- fmt.Println("本轮任务结束")
- }
- //加载域名信息
- func InitDomainName() {
- defer qu.Catch()
- fmt.Println("初始化域名...")
- if DomainNameMap == nil {
- DomainNameMap = make(map[string]bool)
- }
- list, _ := Mgo.Find("domainame", nil, nil, nil, false, -1, -1)
- for _, l := range *list {
- href := qu.ObjToString(l["href"])
- DomainNameMap[href] = true
- }
- fmt.Println("域名初始化完毕...", len(DomainNameMap))
- }
- //获取最后endId
- func GetEndId(query map[string]interface{}) (string, bool) {
- endId := ""
- ok := false
- list, _ := Mgo.Find("bidding", query, `{"_id":-1}`, `{_id:1}`, false, 0, 1)
- if len(*list) == 1 {
- endId = mgo.BsonIdToSId((*list)[0]["_id"])
- if endId >= LatestId {
- return endId, true
- }
- }
- return endId, ok
- }
|