|
@@ -5,22 +5,19 @@ package spider
|
|
|
|
|
|
import (
|
|
import (
|
|
"crypto/sha1"
|
|
"crypto/sha1"
|
|
- "crypto/sha256"
|
|
|
|
"fmt"
|
|
"fmt"
|
|
"io"
|
|
"io"
|
|
"log"
|
|
"log"
|
|
- "math/big"
|
|
|
|
- "math/rand"
|
|
|
|
mu "mfw/util"
|
|
mu "mfw/util"
|
|
mgo "mongodb"
|
|
mgo "mongodb"
|
|
qu "qfw/util"
|
|
qu "qfw/util"
|
|
"strconv"
|
|
"strconv"
|
|
|
|
+ "strings"
|
|
"sync"
|
|
"sync"
|
|
|
|
|
|
es "qfw/util/elastic"
|
|
es "qfw/util/elastic"
|
|
"regexp"
|
|
"regexp"
|
|
util "spiderutil"
|
|
util "spiderutil"
|
|
- "strings"
|
|
|
|
"sync/atomic"
|
|
"sync/atomic"
|
|
"time"
|
|
"time"
|
|
|
|
|
|
@@ -267,6 +264,7 @@ func (s *Spider) GetLastPublishTime() (errs interface{}) {
|
|
//下载列表
|
|
//下载列表
|
|
func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
defer mu.Catch()
|
|
defer mu.Catch()
|
|
|
|
+ s.AlreadyGetPageHeart = map[int]bool{} //重置记录
|
|
start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //起始页、最大页
|
|
start, max := s.GetIntVar("spiderStartPage"), s.GetIntVar("spiderMaxPage") //起始页、最大页
|
|
tmpMax := max //临时记录最大页
|
|
tmpMax := max //临时记录最大页
|
|
repeatAllNum := 0 //本轮采集tmpMax页总的重复个数
|
|
repeatAllNum := 0 //本轮采集tmpMax页总的重复个数
|
|
@@ -345,7 +343,7 @@ func (s *Spider) DownListPageItem() (errs interface{}) {
|
|
//atomic.AddInt32(&s.TotalDowncount, 1)
|
|
//atomic.AddInt32(&s.TotalDowncount, 1)
|
|
href := fmt.Sprint(tmp["href"])
|
|
href := fmt.Sprint(tmp["href"])
|
|
if len(href) > 5 { //有效数据
|
|
if len(href) > 5 { //有效数据
|
|
- hashHref := HexText(href)
|
|
|
|
|
|
+ hashHref := util.HexText(href)
|
|
util.RedisClusterSet(hashHref, "", -1) //全量redis
|
|
util.RedisClusterSet(hashHref, "", -1) //全量redis
|
|
list = append(list, tmp)
|
|
list = append(list, tmp)
|
|
}
|
|
}
|
|
@@ -489,7 +487,7 @@ func (s *Spider) HistoricalMendDownloadDetailItem(p interface{}) {
|
|
if len(href) <= 5 { //无效数据
|
|
if len(href) <= 5 { //无效数据
|
|
return
|
|
return
|
|
}
|
|
}
|
|
- hashHref := HexText(href)
|
|
|
|
|
|
+ hashHref := util.HexText(href)
|
|
isExist := util.RedisClusterExists(hashHref) //全量redis判重
|
|
isExist := util.RedisClusterExists(hashHref) //全量redis判重
|
|
//logger.Debug("full href:", href, " isExist:", isExist)
|
|
//logger.Debug("full href:", href, " isExist:", isExist)
|
|
if !s.IsMustDownload { //非强制下载
|
|
if !s.IsMustDownload { //非强制下载
|
|
@@ -580,7 +578,7 @@ func (s *Spider) DownloadDetailItem(p interface{}, num *int) {
|
|
*num++ //视为已采集
|
|
*num++ //视为已采集
|
|
return
|
|
return
|
|
}
|
|
}
|
|
- hashHref := HexText(href)
|
|
|
|
|
|
+ hashHref := util.HexText(href)
|
|
id := "" //记录spider_listdata中保存的数据id,便于下载成功后更新状态
|
|
id := "" //记录spider_listdata中保存的数据id,便于下载成功后更新状态
|
|
if util.Config.Modal == 1 || (util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history") { //除7410、7500、7510、7700节点外所有节点只采集列表页信息
|
|
if util.Config.Modal == 1 || (util.Config.IsHistoryEvent && s.GetVar("spiderType") == "history") { //除7410、7500、7510、7700节点外所有节点只采集列表页信息
|
|
isExist := util.RedisClusterExists(hashHref) //全量信息中已采集
|
|
isExist := util.RedisClusterExists(hashHref) //全量信息中已采集
|
|
@@ -704,7 +702,7 @@ func (s *Spider) DownloadDetailByNames(p interface{}) {
|
|
func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
|
|
func (s *Spider) DownloadDetailPage(param map[string]interface{}, data map[string]interface{}) (map[string]interface{}, interface{}) {
|
|
defer mu.Catch()
|
|
defer mu.Catch()
|
|
s.LastHeartbeat = time.Now().Unix()
|
|
s.LastHeartbeat = time.Now().Unix()
|
|
- util.TimeSleepFunc((time.Duration(s.SleepBase+GetRandMath(s.SleepRand)))*time.Millisecond, TimeSleepChan)
|
|
|
|
|
|
+ util.TimeSleepFunc((time.Duration(s.SleepBase+util.GetRandMath(s.SleepRand)))*time.Millisecond, TimeSleepChan)
|
|
tab := s.L.NewTable()
|
|
tab := s.L.NewTable()
|
|
for k, v := range param {
|
|
for k, v := range param {
|
|
if val, ok := v.(string); ok {
|
|
if val, ok := v.(string); ok {
|
|
@@ -892,7 +890,7 @@ func (s *Spider) DownloadDetail(reload bool, isHistory bool) {
|
|
_id := tmp["_id"]
|
|
_id := tmp["_id"]
|
|
query := map[string]interface{}{"_id": _id}
|
|
query := map[string]interface{}{"_id": _id}
|
|
href := qu.ObjToString(tmp["href"])
|
|
href := qu.ObjToString(tmp["href"])
|
|
- hashHref := HexText(href)
|
|
|
|
|
|
+ hashHref := util.HexText(href)
|
|
update := []map[string]interface{}{}
|
|
update := []map[string]interface{}{}
|
|
//由于目前列表页redis判重是href+code可能导致同一条href有多条不同code采集的数据存在
|
|
//由于目前列表页redis判重是href+code可能导致同一条href有多条不同code采集的数据存在
|
|
//为了避免重复下载,进行全量redis判重
|
|
//为了避免重复下载,进行全量redis判重
|
|
@@ -1093,12 +1091,6 @@ func AllThreadLog() {
|
|
time.AfterFunc(1*time.Minute, AllThreadLog)
|
|
time.AfterFunc(1*time.Minute, AllThreadLog)
|
|
}
|
|
}
|
|
|
|
|
|
-//获取随机数
|
|
|
|
-func GetRandMath(num int) int {
|
|
|
|
- r := rand.New(rand.NewSource(time.Now().UnixNano()))
|
|
|
|
- return r.Intn(num)
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
//获取hascode
|
|
//获取hascode
|
|
func GetHas1(data string) string {
|
|
func GetHas1(data string) string {
|
|
t := sha1.New()
|
|
t := sha1.New()
|
|
@@ -1109,22 +1101,3 @@ func GetHas1(data string) string {
|
|
}
|
|
}
|
|
return hf + fmt.Sprintf("%x", t.Sum(nil))
|
|
return hf + fmt.Sprintf("%x", t.Sum(nil))
|
|
}
|
|
}
|
|
-
|
|
|
|
-//对href哈希取模
|
|
|
|
-func HexToBigIntMod(href string) int {
|
|
|
|
- //取哈希值
|
|
|
|
- t := sha256.New()
|
|
|
|
- io.WriteString(t, href)
|
|
|
|
- hex := fmt.Sprintf("%x", t.Sum(nil))
|
|
|
|
- //取模
|
|
|
|
- n := new(big.Int)
|
|
|
|
- n, _ = n.SetString(hex[2:], 16)
|
|
|
|
- return int(n.Mod(n, big.NewInt(16)).Int64())
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-//求hash
|
|
|
|
-func HexText(href string) string {
|
|
|
|
- h := sha256.New()
|
|
|
|
- h.Write([]byte(href))
|
|
|
|
- return fmt.Sprintf("%x", h.Sum(nil))
|
|
|
|
-}
|
|
|