|
@@ -25,7 +25,6 @@ var (
|
|
|
Requestthread chan bool //请求线程数
|
|
|
Brequestbody int //是否解析请求正文
|
|
|
wg = &sync.WaitGroup{}
|
|
|
- msave = []map[string]interface{}{}
|
|
|
lock *sync.Mutex = new(sync.Mutex)
|
|
|
checkcoll string
|
|
|
totalcoll string
|
|
@@ -57,7 +56,6 @@ type spiderobj struct {
|
|
|
Cuser string
|
|
|
Muser string //修改人
|
|
|
Mtime int64 //修改时间
|
|
|
- Status int //爬虫状态
|
|
|
I_old int //是否是老爬虫{luacontent:{$exists:1}}
|
|
|
ResponseCode int //响应码
|
|
|
ResponseStr string //响应码串
|
|
@@ -157,14 +155,13 @@ func checktask() {
|
|
|
MAP_site_stop = map[string]int{}
|
|
|
MAP_site_error = map[string][]*spiderobj{}
|
|
|
//加载所有爬虫代码,站点名称、代码、列表url、状态、作者、修改时间
|
|
|
- res, b := mgo.Find("luaconfig", nil, nil, `{"param_common":1,"modifytime":1,"createuser":1,"modifyuser":1,"code":1,"iupload":1,"luacontent":1}`, false, -1, -1)
|
|
|
+ res, b := mgo.Find("luaconfig", `{"state":5}`, nil, `{"param_common":1,"modifytime":1,"createuser":1,"modifyuser":1,"code":1,"state":1,"luacontent":1}`, false, -1, -1)
|
|
|
Spiders = []*spiderobj{}
|
|
|
stopspidercount = 0
|
|
|
if b && res != nil && (*res) != nil && len(*res) > 0 {
|
|
|
for _, spider := range *res {
|
|
|
defer util.Catch()
|
|
|
sp := &spiderobj{}
|
|
|
- sp.Status = util.IntAll(spider["iupload"])
|
|
|
sp.Cuser = util.ObjToString(spider["createuser"])
|
|
|
if spider["param_common"] != nil {
|
|
|
pc := spider["param_common"].([]interface{})
|
|
@@ -181,63 +178,51 @@ func checktask() {
|
|
|
continue
|
|
|
}
|
|
|
}
|
|
|
- if sp.Status == 1 {
|
|
|
- sp.Id = util.BsonIdToSId(spider["_id"])
|
|
|
- if spider["luacontent"] != nil {
|
|
|
- sp.I_old = 1
|
|
|
- //从脚本中取
|
|
|
- con := spider["luacontent"].(string)
|
|
|
- sr := strings.NewReader(con)
|
|
|
- br := bufio.NewReader(sr)
|
|
|
- n := 0
|
|
|
- siteUrl := ""
|
|
|
- for n < 150 {
|
|
|
- n++
|
|
|
- str, e := br.ReadString('\n')
|
|
|
- if e == nil {
|
|
|
- if strings.HasPrefix(str, "local siteUrl") {
|
|
|
- siteUrl = str[strings.Index(str, `"`)+1 : strings.LastIndex(str, `"`)]
|
|
|
- } else if strings.HasPrefix(str, "spiderTargetChannelUrl") {
|
|
|
- if strings.Index(str, "siteUrl") > 0 {
|
|
|
- sp.ListUrl = siteUrl
|
|
|
- } else {
|
|
|
- s1, s2 := strings.Index(str, `"`), strings.LastIndex(str, `"`)
|
|
|
- sp.ListUrl = str[s1+1 : s2]
|
|
|
- }
|
|
|
- break
|
|
|
+
|
|
|
+ sp.Id = util.BsonIdToSId(spider["_id"])
|
|
|
+ if spider["luacontent"] != nil {
|
|
|
+ sp.I_old = 1
|
|
|
+ //从脚本中取
|
|
|
+ con := spider["luacontent"].(string)
|
|
|
+ sr := strings.NewReader(con)
|
|
|
+ br := bufio.NewReader(sr)
|
|
|
+ n := 0
|
|
|
+ siteUrl := ""
|
|
|
+ for n < 150 {
|
|
|
+ n++
|
|
|
+ str, e := br.ReadString('\n')
|
|
|
+ if e == nil {
|
|
|
+ if strings.HasPrefix(str, "local siteUrl") {
|
|
|
+ siteUrl = str[strings.Index(str, `"`)+1 : strings.LastIndex(str, `"`)]
|
|
|
+ } else if strings.HasPrefix(str, "spiderTargetChannelUrl") {
|
|
|
+ if strings.Index(str, "siteUrl") > 0 {
|
|
|
+ sp.ListUrl = siteUrl
|
|
|
+ } else {
|
|
|
+ s1, s2 := strings.Index(str, `"`), strings.LastIndex(str, `"`)
|
|
|
+ sp.ListUrl = str[s1+1 : s2]
|
|
|
}
|
|
|
- } else if e != nil {
|
|
|
break
|
|
|
}
|
|
|
+ } else if e != nil {
|
|
|
+ break
|
|
|
}
|
|
|
}
|
|
|
- sp.Mtime = util.Int64All(spider["modifytime"])
|
|
|
- sp.Muser = util.ObjToString(spider["modifyuser"])
|
|
|
- sp.Code = util.ObjToString(spider["code"])
|
|
|
- if sp.ListUrl != "" {
|
|
|
- if !strings.HasPrefix(sp.ListUrl, "http") {
|
|
|
- sp.ListUrl = "http://" + sp.ListUrl
|
|
|
- }
|
|
|
- Spiders = append(Spiders, sp)
|
|
|
+ }
|
|
|
+ sp.Mtime = util.Int64All(spider["modifytime"])
|
|
|
+ sp.Muser = util.ObjToString(spider["modifyuser"])
|
|
|
+ sp.Code = util.ObjToString(spider["code"])
|
|
|
+ if sp.ListUrl != "" {
|
|
|
+ if !strings.HasPrefix(sp.ListUrl, "http") {
|
|
|
+ sp.ListUrl = "http://" + sp.ListUrl
|
|
|
}
|
|
|
- MAP_site_run[sp.Site]++
|
|
|
- MAP_site_all[sp.Site]++
|
|
|
- } else {
|
|
|
- stopspidercount++
|
|
|
- MAP_site_stop[sp.Site]++
|
|
|
- MAP_site_all[sp.Site]++
|
|
|
+ Spiders = append(Spiders, sp)
|
|
|
}
|
|
|
+ MAP_site_run[sp.Site]++
|
|
|
+ MAP_site_all[sp.Site]++
|
|
|
}
|
|
|
}
|
|
|
log.Println("load url size:", len(Spiders), "stopped spider count:", stopspidercount)
|
|
|
- tn := time.Now()
|
|
|
- now := tn.Unix()
|
|
|
- year := tn.Year()
|
|
|
- mon := tn.Month()
|
|
|
- day := tn.Day()
|
|
|
- hour := tn.Hour()
|
|
|
- minute := tn.Minute()
|
|
|
- reqn := 0
|
|
|
+
|
|
|
MAP_STATUS = map[int][]*spiderobj{}
|
|
|
//根据站点打乱爬虫顺序
|
|
|
NewSP := make(map[string]chan *spiderobj)
|
|
@@ -263,7 +248,7 @@ func checktask() {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
- for _, sp1 := range Newspiders {
|
|
|
+ for k, sp1 := range Newspiders {
|
|
|
Requestthread <- true
|
|
|
wg.Add(1)
|
|
|
go func(sp *spiderobj) {
|
|
@@ -307,27 +292,7 @@ func checktask() {
|
|
|
}
|
|
|
sp.Requesttime = time.Now().Unix()
|
|
|
sp.ResponseBody = restr
|
|
|
- m := map[string]interface{}{
|
|
|
- "s_spiderid": sp.Id,
|
|
|
- "l_time": now,
|
|
|
- "l_modifytime": sp.Mtime,
|
|
|
- "s_modifyuser": sp.Muser,
|
|
|
- "s_listurl": sp.ListUrl,
|
|
|
- "s_site": sp.Site,
|
|
|
- "s_channel": sp.Channel,
|
|
|
- "i_res_code": sp.ResponseCode,
|
|
|
- "s_res_codestr": sp.ResponseStr,
|
|
|
- "s_res_body": sp.ResponseBody,
|
|
|
- "s_code": sp.Code,
|
|
|
- "l_requesttime": sp.Requesttime,
|
|
|
- "i_oldspider": sp.I_old,
|
|
|
- "i_err": sp.I_err,
|
|
|
- "year": year,
|
|
|
- "month": mon,
|
|
|
- "day": day,
|
|
|
- "hour": hour,
|
|
|
- "minute": minute,
|
|
|
- }
|
|
|
+
|
|
|
lock.Lock()
|
|
|
ss := MAP_STATUS[sp.ResponseCode]
|
|
|
if ss == nil {
|
|
@@ -335,13 +300,7 @@ func checktask() {
|
|
|
}
|
|
|
ss = append(ss, sp)
|
|
|
MAP_STATUS[sp.ResponseCode] = ss
|
|
|
- msave = append(msave, m)
|
|
|
- if len(msave) >= 100 {
|
|
|
- reqn += len(msave)
|
|
|
- //go mgo.SaveBulk(checkcoll, msave...)
|
|
|
- msave = []map[string]interface{}{}
|
|
|
- log.Println("save...", reqn)
|
|
|
- }
|
|
|
+
|
|
|
if sp.ResponseCode != 200 {
|
|
|
if sp.Channel == "" {
|
|
|
sp.Channel = sp.Site
|
|
@@ -369,18 +328,12 @@ func checktask() {
|
|
|
)
|
|
|
}
|
|
|
lock.Unlock()
|
|
|
+ log.Println(k, sp.Site, sp.Channel, sp.Code, sp.ResponseCode)
|
|
|
}(sp1)
|
|
|
+
|
|
|
time.Sleep(150 * time.Millisecond)
|
|
|
}
|
|
|
wg.Wait()
|
|
|
- lock.Lock()
|
|
|
- if len(msave) > 0 {
|
|
|
- reqn += len(msave)
|
|
|
- //go mgo.SaveBulk(checkcoll, msave...)
|
|
|
- msave = []map[string]interface{}{}
|
|
|
- log.Println("save...", reqn)
|
|
|
- }
|
|
|
- lock.Unlock()
|
|
|
log.Println("request over...")
|
|
|
//报警
|
|
|
alarmtask()
|