package main

import (
	"fmt"
	mgo "mgoutil/mongodb"
	qu "qfw/util"
	"regexp"
	"strings"
	"sync"

	"github.com/cron"
	"github.com/donnie4w/go-logger/logger"
)

// Earlier variant that matched numeric (IP-style) hosts only:
//var Reg2 = regexp.MustCompile(`(((http|https)[::]//(www.)?|www.|WWW.)[0-9\.]{7,})[::]{0,}`)
// Earlier variant with a fixed list of suffixes:
//var Reg2 = regexp.MustCompile(`((http|https)[::]//(www.)?|www.|WWW.)([0-9A-Za-z_]+[-\.]{0,})+\.(cn|asia|hn|citic|ltd|tv|shop|com|mo|co|net|cnpc|CN|CC|cc|pro|aero|coop|hk|tw|me|rec|arts|store|firm|int|info|org|top|wang|ren|xyz|xin|pub|tech|ink|biz|red|gov|vip|art|edu)+`)

// Reg1 matches a link inside free text. It tolerates whitespace (including
// U+3000, U+2003 and U+00A0) between the pieces of the link.
var Reg1 = regexp.MustCompile("((http|https)[::]//(www\\.)?|www\\.|WWW\\.)([\\s\u3000\u2003\u00a0]{0,}[-A-Za-z0-9&@$??#/%=~_|.::,]+)+([\\s\u3000\u2003\u00a0]{0,}(com|cn|net))?[-A-Za-z0-9&@$??#/%=~_|.::,]+")

// Earlier variant of Reg1 without whitespace support:
//var Reg1 = regexp.MustCompile(`((http|https)[::]//(www\.)?|www\.|WWW\.)[-A-Za-z0-9&@$??#/%=~_|.::,]+`)

// Reg2 extracts the scheme/host portion (the "domain name") from a link that
// has already been normalized.
var Reg2 = regexp.MustCompile("((http|https)[::]//(www\\.)?|www\\.|WWW\\.)(\\w+[-.\\s\u3000\u2003\u00a0]{0,})+")

// Clear1 detects digits glued directly behind a known suffix, e.g. the "6.7"
// in "https://cloudmeeting.189.cn6.7". Capture group 2 is the numeric tail.
var Clear1 = regexp.MustCompile(".*(cn|com|org|net|co|mo|vn|en)((\\d)+[.]{0,}(\\d){0,})$")

// RegSpace matches runs of whitespace, including U+3000, U+2003 and U+00A0.
var RegSpace = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")

// Replace lists the character substitutions applied to an extracted domain.
// NOTE(review): as written the first entry maps ":" to itself; presumably the
// keys were meant to be full-width punctuation - confirm against the original
// file encoding.
var Replace = map[string]string{
	":": ":",
	",": ".",
	"。": ".",
}

// TimeTask registers StartTask with a cron scheduler and starts the scheduler.
//
// NOTE(review): the original comment said the task runs every TaskTime hours.
// If the cron package uses a seconds-first six-field spec, the second field is
// minutes and this spec fires every TaskTime minutes instead - confirm.
func TimeTask() {
	defer qu.Catch()
	//StartTask()
	c := cron.New()
	cronstr := "0 */" + fmt.Sprint(TaskTime) + " * * * ?"
	c.AddFunc(cronstr, func() {
		StartTask()
	})
	c.Start()
}

// StartTask scans the "bidding" documents whose _id lies in (LatestId, endId],
// extracts links from each document's "detail" field and stores every link
// whose domain is not present in DomainNameMap into the "domainlog"
// collection. LatestId is advanced to endId before the scan starts.
func StartTask() {
	fmt.Println("开始任务...")
	defer qu.Catch()
	sess := Mgo.GetMgoConn()
	defer Mgo.DestoryMongoConn(sess)

	q := map[string]interface{}{
		"_id": map[string]interface{}{
			"$gt": mgo.StringTOBsonId(LatestId),
		},
	}
	endId, ok := GetEndId(q) // newest _id currently in the bidding collection
	if !ok || endId == "" {
		return
	}
	q = map[string]interface{}{
		"_id": map[string]interface{}{
			"$gt":  mgo.StringTOBsonId(LatestId),
			"$lte": mgo.StringTOBsonId(endId),
		},
	}
	LatestId = endId // the next run starts after this id
	field := map[string]interface{}{"detail": 1}
	logger.Debug("query:", q)
	it := sess.DB("qfw").C("bidding").Find(q).Select(field).Sort("_id").Iter()
	// NOTE(review): this counts the "test" collection although the iterator
	// above reads "bidding"; the number is only printed - confirm intent.
	count := Mgo.Count("test", q)
	fmt.Println("共加载数据", count)

	sum := 0
	wg := &sync.WaitGroup{}
	saveMu := &sync.Mutex{} // guards save and the DomainNameMap lookup
	save := []map[string]interface{}{}
	//arr := [][]map[string]interface{}{}
	ch := make(chan bool, 20) // at most 20 documents are processed at once
	for tmp := make(map[string]interface{}); it.Next(&tmp); sum++ {
		ch <- true
		wg.Add(1)
		go func(d map[string]interface{}) {
			defer func() {
				<-ch
				wg.Done()
			}()
			// The qu.Catch deferred at the top of StartTask runs in another
			// goroutine and cannot cover this one, so it is deferred here too
			// (presumably it recovers panics, as it is used that way
			// throughout this file).
			defer qu.Catch()

			id := mgo.BsonIdToSId(d["_id"])
			detail := qu.ObjToString(d["detail"])
			for _, href := range Reg1.FindAllString(detail, -1) {
				if len(href) < 13 {
					continue
				}
				dmName, text := normalizeDomain(href)
				if dmName == "" {
					continue
				}
				// "clear" is the raw link without the numeric tail. The tail
				// was found in the normalized form, so it may not occur
				// verbatim in the raw link (e.g. "cn6. 7"); in that case the
				// link is kept unchanged instead of slicing with index -1.
				clearHref := href
				if text != "" {
					if idx := strings.LastIndex(href, text); idx >= 0 {
						clearHref = href[:idx]
					}
				}
				func() {
					saveMu.Lock()
					defer saveMu.Unlock() // released even if SaveBulk panics
					if DomainNameMap[dmName] {
						return // domain is on the white list
					}
					save = append(save, map[string]interface{}{
						"id":        id,
						"domainame": dmName,
						"href":      href,
						"detail":    detail,
						"clear":     clearHref,
					})
					if len(save) > 500 {
						Mgo.SaveBulk("domainlog", save...)
						// Start a fresh slice rather than reusing the backing
						// array, in case SaveBulk keeps a reference to it.
						save = []map[string]interface{}{}
					}
					// Disabled: strip the link from detail and write it back.
					// detail = strings.ReplaceAll(detail, href, "")
					// query := map[string]interface{}{"_id": d["_id"]}
					// set := map[string]interface{}{
					// 	"$set": map[string]interface{}{
					// 		"detail": detail,
					// 	},
					// }
					// update := []map[string]interface{}{}
					// update = append(update, query)
					// update = append(update, set)
					// arr = append(arr, update)
					// if len(arr) > 500 {
					// 	tmps := arr
					// 	Mgo.UpdateBulk("test1", tmps...)
					// 	arr = [][]map[string]interface{}{}
					// }
				}()
			}
		}(tmp)
		if sum%100 == 0 {
			fmt.Println("current:", sum)
		}
		// Each worker owns its document; decode the next one into a new map.
		tmp = map[string]interface{}{}
	}
	// Close reports any error that ended the iteration early.
	if err := it.Close(); err != nil {
		logger.Error("iterate bidding:", err)
	}
	wg.Wait()

	saveMu.Lock()
	if len(save) > 0 {
		Mgo.SaveBulk("domainlog", save...)
		save = []map[string]interface{}{}
	}
	// if len(arr) > 0 {
	// 	Mgo.UpdateBulk("test1", arr...)
	// 	arr = [][]map[string]interface{}{}
	// }
	saveMu.Unlock()
	fmt.Println("本轮任务结束")
}

// normalizeDomain turns one raw link match into its domain name. It removes
// whitespace, lower-cases the link, strips trailing "." and "-", extracts the
// domain with Reg2 and applies the Replace substitutions. When Clear1 finds a
// numeric tail behind the suffix, the tail is cut from the domain and returned
// as text. Both results are empty when no domain could be extracted.
func normalizeDomain(href string) (dmName, text string) {
	cleaned := strings.ToLower(RegSpace.ReplaceAllString(href, ""))
	cleaned = strings.TrimRight(cleaned, ".-")
	dmName = Reg2.FindString(cleaned)
	for k, v := range Replace {
		dmName = strings.ReplaceAll(dmName, k, v)
	}
	if dmName == "" {
		return "", ""
	}
	// Special case: https://cloudmeeting.189.cn6.7 -> drop the "6.7".
	// Clear1 is anchored at the end of the text, so it matches at most once.
	if pos := Clear1.FindStringSubmatchIndex(dmName); len(pos) >= 6 && pos[4] >= 0 {
		text = dmName[pos[4]:pos[5]]
		dmName = dmName[:pos[4]]
	}
	return dmName, text
}

// InitDomainName fills DomainNameMap with the "href" value of every document
// in the "domainame" collection, creating the map first when it is nil.
func InitDomainName() {
	defer qu.Catch()
	fmt.Println("初始化域名...")
	if DomainNameMap == nil {
		DomainNameMap = make(map[string]bool)
	}
	list, _ := Mgo.Find("domainame", nil, nil, nil, false, -1, -1)
	if list != nil { // nothing to load when the query returned no list
		for _, l := range *list {
			href := qu.ObjToString(l["href"])
			DomainNameMap[href] = true
		}
	}
	fmt.Println("域名初始化完毕...", len(DomainNameMap))
}

// GetEndId returns the _id of the newest "bidding" document matching query.
// The boolean is true only when exactly one document was found and its id is
// not smaller than LatestId; when no single document is found the id is empty.
func GetEndId(query map[string]interface{}) (string, bool) {
	list, _ := Mgo.Find("bidding", query, `{"_id":-1}`, `{_id:1}`, false, 0, 1)
	if list == nil || len(*list) != 1 {
		return "", false
	}
	endId := mgo.BsonIdToSId((*list)[0]["_id"])
	return endId, endId >= LatestId
}