
Data changed; adjusted the checker to stay in sync

zhangjinkun 7 years ago
parent
commit
46a437f0b3
2 changed files with 44 additions and 91 deletions
  1. src/spidersitecheck/config.json (3 additions, 3 deletions)
  2. src/spidersitecheck/main.go (41 additions, 88 deletions)

+ 3 - 3
src/spidersitecheck/config.json

@@ -1,7 +1,7 @@
 {
     "mongodbServers": "192.168.3.207:27080",
     "mongodbPoolSize": "5",
-    "mongodbName": "spider",
+    "mongodbName": "editor",
     "influxdb": "https://jianyu:Topnet@20150501@wxlmjy.qmx.top:443",
     "checkcoll": "spidersitecheck",
     "totalcoll": "spidersitetotal",
@@ -10,7 +10,7 @@
         23
     ],
     "useremail": [
-        "renzheng@topnet.net.cn"
+        "zhangjinkun@topnet.net.cn"
     ],
     "smtp": {
         "addr": "smtp.exmail.qq.com",
@@ -22,7 +22,7 @@
     },
     "taskduration": 40,
     "maxalarmcount": 1,
-    "requestthread": 65,
+    "requestthread": 5,
     "brequestbody": 0,
     "requestretry": 2,
     "reqduration": 16,

+ 41 - 88
src/spidersitecheck/main.go

@@ -25,7 +25,6 @@ var (
 	Requestthread   chan bool              //number of request threads
 	Brequestbody    int                    //whether to parse the request body
 	wg                                     = &sync.WaitGroup{}
-	msave                                  = []map[string]interface{}{}
 	lock            *sync.Mutex            = new(sync.Mutex)
 	checkcoll       string
 	totalcoll       string
@@ -57,7 +56,6 @@ type spiderobj struct {
 	Cuser        string
 	Muser        string //modified by
 	Mtime        int64  //modification time
-	Status       int    //spider status
 	I_old        int    //whether this is an old spider {luacontent:{$exists:1}}
 	ResponseCode int    //response code
 	ResponseStr  string //response code string
@@ -157,14 +155,13 @@ func checktask() {
 		MAP_site_stop = map[string]int{}
 		MAP_site_error = map[string][]*spiderobj{}
 		//load all spider code: site name, code, list url, state, author, modification time
-		res, b := mgo.Find("luaconfig", nil, nil, `{"param_common":1,"modifytime":1,"createuser":1,"modifyuser":1,"code":1,"iupload":1,"luacontent":1}`, false, -1, -1)
+		res, b := mgo.Find("luaconfig", `{"state":5}`, nil, `{"param_common":1,"modifytime":1,"createuser":1,"modifyuser":1,"code":1,"state":1,"luacontent":1}`, false, -1, -1)
 		Spiders = []*spiderobj{}
 		stopspidercount = 0
 		if b && res != nil && (*res) != nil && len(*res) > 0 {
 			for _, spider := range *res {
 				defer util.Catch()
 				sp := &spiderobj{}
-				sp.Status = util.IntAll(spider["iupload"])
 				sp.Cuser = util.ObjToString(spider["createuser"])
 				if spider["param_common"] != nil {
 					pc := spider["param_common"].([]interface{})
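
The key change in this hunk is that filtering moves into the query: instead of fetching every luaconfig document and checking iupload in memory, only documents with {"state":5} are loaded, and the projection swaps iupload for state. The project's mgo package is a custom wrapper that takes JSON strings; purely as an assumed stand-in, the same query against the stock gopkg.in/mgo.v2 driver would look roughly like this (database name editor taken from the new config):

package main

import (
	"log"

	mgo "gopkg.in/mgo.v2"
	"gopkg.in/mgo.v2/bson"
)

// loadRunningSpiders filters state=5 in the database rather than loading
// everything and checking a status field in memory, matching the new query.
func loadRunningSpiders(session *mgo.Session) ([]map[string]interface{}, error) {
	var res []map[string]interface{}
	err := session.DB("editor").C("luaconfig").
		Find(bson.M{"state": 5}).
		Select(bson.M{"param_common": 1, "modifytime": 1, "createuser": 1,
			"modifyuser": 1, "code": 1, "state": 1, "luacontent": 1}).
		All(&res)
	return res, err
}

func main() {
	session, err := mgo.Dial("192.168.3.207:27080")
	if err != nil {
		log.Fatal(err)
	}
	defer session.Close()
	spiders, err := loadRunningSpiders(session)
	if err != nil {
		log.Fatal(err)
	}
	log.Println("loaded spiders:", len(spiders))
}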
@@ -181,63 +178,51 @@ func checktask() {
 						continue
 					}
 				}
-				if sp.Status == 1 {
-					sp.Id = util.BsonIdToSId(spider["_id"])
-					if spider["luacontent"] != nil {
-						sp.I_old = 1
-						//extract from the script
-						con := spider["luacontent"].(string)
-						sr := strings.NewReader(con)
-						br := bufio.NewReader(sr)
-						n := 0
-						siteUrl := ""
-						for n < 150 {
-							n++
-							str, e := br.ReadString('\n')
-							if e == nil {
-								if strings.HasPrefix(str, "local siteUrl") {
-									siteUrl = str[strings.Index(str, `"`)+1 : strings.LastIndex(str, `"`)]
-								} else if strings.HasPrefix(str, "spiderTargetChannelUrl") {
-									if strings.Index(str, "siteUrl") > 0 {
-										sp.ListUrl = siteUrl
-									} else {
-										s1, s2 := strings.Index(str, `"`), strings.LastIndex(str, `"`)
-										sp.ListUrl = str[s1+1 : s2]
-									}
-									break
+
+				sp.Id = util.BsonIdToSId(spider["_id"])
+				if spider["luacontent"] != nil {
+					sp.I_old = 1
+					//extract from the script
+					con := spider["luacontent"].(string)
+					sr := strings.NewReader(con)
+					br := bufio.NewReader(sr)
+					n := 0
+					siteUrl := ""
+					for n < 150 {
+						n++
+						str, e := br.ReadString('\n')
+						if e == nil {
+							if strings.HasPrefix(str, "local siteUrl") {
+								siteUrl = str[strings.Index(str, `"`)+1 : strings.LastIndex(str, `"`)]
+							} else if strings.HasPrefix(str, "spiderTargetChannelUrl") {
+								if strings.Index(str, "siteUrl") > 0 {
+									sp.ListUrl = siteUrl
+								} else {
+									s1, s2 := strings.Index(str, `"`), strings.LastIndex(str, `"`)
+									sp.ListUrl = str[s1+1 : s2]
 								}
-							} else if e != nil {
 								break
 							}
+						} else if e != nil {
+							break
 						}
 					}
-					sp.Mtime = util.Int64All(spider["modifytime"])
-					sp.Muser = util.ObjToString(spider["modifyuser"])
-					sp.Code = util.ObjToString(spider["code"])
-					if sp.ListUrl != "" {
-						if !strings.HasPrefix(sp.ListUrl, "http") {
-							sp.ListUrl = "http://" + sp.ListUrl
-						}
-						Spiders = append(Spiders, sp)
+				}
+				sp.Mtime = util.Int64All(spider["modifytime"])
+				sp.Muser = util.ObjToString(spider["modifyuser"])
+				sp.Code = util.ObjToString(spider["code"])
+				if sp.ListUrl != "" {
+					if !strings.HasPrefix(sp.ListUrl, "http") {
+						sp.ListUrl = "http://" + sp.ListUrl
 					}
-					MAP_site_run[sp.Site]++
-					MAP_site_all[sp.Site]++
-				} else {
-					stopspidercount++
-					MAP_site_stop[sp.Site]++
-					MAP_site_all[sp.Site]++
+					Spiders = append(Spiders, sp)
 				}
+				MAP_site_run[sp.Site]++
+				MAP_site_all[sp.Site]++
 			}
 		}
 		log.Println("load url size:", len(Spiders), "stopped spider count:", stopspidercount)
-		tn := time.Now()
-		now := tn.Unix()
-		year := tn.Year()
-		mon := tn.Month()
-		day := tn.Day()
-		hour := tn.Hour()
-		minute := tn.Minute()
-		reqn := 0
+
 		MAP_STATUS = map[int][]*spiderobj{}
 		//shuffle spider order by site
 		NewSP := make(map[string]chan *spiderobj)
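
With the Status branch gone, the luacontent parsing now runs for every spider returned by the query. The extraction scans at most the first 150 lines of the Lua script, remembers the value assigned to local siteUrl, and takes the list url from the spiderTargetChannelUrl line (either the remembered siteUrl or a quoted literal). A self-contained sketch of that same logic:

package main

import (
	"bufio"
	"fmt"
	"strings"
)

// extractListUrl mirrors the loop above: read up to 150 lines of a Lua
// script, remember the value of `local siteUrl`, and return the target
// channel url (either siteUrl itself or a quoted literal).
func extractListUrl(luacontent string) string {
	br := bufio.NewReader(strings.NewReader(luacontent))
	siteUrl := ""
	for n := 0; n < 150; n++ {
		str, err := br.ReadString('\n')
		if strings.HasPrefix(str, "local siteUrl") {
			siteUrl = str[strings.Index(str, `"`)+1 : strings.LastIndex(str, `"`)]
		} else if strings.HasPrefix(str, "spiderTargetChannelUrl") {
			if strings.Index(str, "siteUrl") > 0 {
				return siteUrl
			}
			s1, s2 := strings.Index(str, `"`), strings.LastIndex(str, `"`)
			return str[s1+1 : s2]
		}
		if err != nil {
			break
		}
	}
	return ""
}

func main() {
	lua := "local siteUrl = \"http://example.com\"\nspiderTargetChannelUrl = siteUrl\n"
	fmt.Println(extractListUrl(lua)) // prints http://example.com
}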
@@ -263,7 +248,7 @@ func checktask() {
 				}
 			}
 		}
-		for _, sp1 := range Newspiders {
+		for k, sp1 := range Newspiders {
 			Requestthread <- true
 			wg.Add(1)
 			go func(sp *spiderobj) {
@@ -307,27 +292,7 @@ func checktask() {
 				}
 				sp.Requesttime = time.Now().Unix()
 				sp.ResponseBody = restr
-				m := map[string]interface{}{
-					"s_spiderid":    sp.Id,
-					"l_time":        now,
-					"l_modifytime":  sp.Mtime,
-					"s_modifyuser":  sp.Muser,
-					"s_listurl":     sp.ListUrl,
-					"s_site":        sp.Site,
-					"s_channel":     sp.Channel,
-					"i_res_code":    sp.ResponseCode,
-					"s_res_codestr": sp.ResponseStr,
-					"s_res_body":    sp.ResponseBody,
-					"s_code":        sp.Code,
-					"l_requesttime": sp.Requesttime,
-					"i_oldspider":   sp.I_old,
-					"i_err":         sp.I_err,
-					"year":          year,
-					"month":         mon,
-					"day":           day,
-					"hour":          hour,
-					"minute":        minute,
-				}
+
 				lock.Lock()
 				ss := MAP_STATUS[sp.ResponseCode]
 				if ss == nil {
@@ -335,13 +300,7 @@ func checktask() {
 				}
 				ss = append(ss, sp)
 				MAP_STATUS[sp.ResponseCode] = ss
-				msave = append(msave, m)
-				if len(msave) >= 100 {
-					reqn += len(msave)
-					//go mgo.SaveBulk(checkcoll, msave...)
-					msave = []map[string]interface{}{}
-					log.Println("save...", reqn)
-				}
+
 				if sp.ResponseCode != 200 {
 					if sp.Channel == "" {
 						sp.Channel = sp.Site
@@ -369,18 +328,12 @@ func checktask() {
 					)
 				}
 				lock.Unlock()
+				log.Println(k, sp.Site, sp.Channel, sp.Code, sp.ResponseCode)
 			}(sp1)
+
 			time.Sleep(150 * time.Millisecond)
 		}
 		wg.Wait()
-		lock.Lock()
-		if len(msave) > 0 {
-			reqn += len(msave)
-			//go mgo.SaveBulk(checkcoll, msave...)
-			msave = []map[string]interface{}{}
-			log.Println("save...", reqn)
-		}
-		lock.Unlock()
 		log.Println("request over...")
 		//send alarms
 		alarmtask()
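
The request loop throttles itself with the Requestthread channel as a counting semaphore, a WaitGroup, and a 150 ms pause between launches; the removed msave batching (and the time-bucketed record built around it) is replaced by the new per-request log line. One caveat: that log line reads the loop variable k from inside the goroutine, which is racy before Go 1.22. A minimal sketch of the pattern, with k passed as a parameter and hypothetical names:

package main

import (
	"log"
	"sync"
	"time"
)

func main() {
	const requestthread = 5 // matches the new config value
	sem := make(chan bool, requestthread)
	wg := &sync.WaitGroup{}

	urls := []string{"http://a.example", "http://b.example", "http://c.example"}
	for k, u := range urls {
		sem <- true // block while requestthread goroutines are in flight
		wg.Add(1)
		go func(k int, u string) {
			defer func() { <-sem; wg.Done() }()
			// ... perform the HTTP check here ...
			log.Println(k, u)
		}(k, u) // pass k explicitly to avoid the loop-variable race
		time.Sleep(150 * time.Millisecond) // pace launches, as in checktask
	}
	wg.Wait()
	log.Println("request over...")
}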