
Add heartbeat to the download(adv) methods

maxiaoshan, 3 years ago
commit 6c763d7bc1
2 changed files with 68 additions and 1 deletion
  1. src/spider/script.go  +63 -0
  2. src/spider/util.go    +5 -1

+ 63 - 0
src/spider/script.go

@@ -120,6 +120,33 @@ func (s *Script) LoadScript(code, script_file string, newstate bool) string {
 			charset = s.Encoding
 		}
 		ret := Download(s.Downloader, url, "get", util.GetTable(head), charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
+		// remove the heartbeat code below once the sendListNum method is fully implemented in later code
+		//result := "Ret Null"
+		ok := false
+		if ret != "" && ret != "[]" {
+			ret = SpaceReg.ReplaceAllString(ret, "")
+			tmpArr := []interface{}{}
+			tmpMap := map[string]interface{}{}
+			if err := json.Unmarshal([]byte(ret), &tmpMap); err == nil && len(tmpMap) >= 2 {
+				//result = "Map Ok"
+				ok = true
+			} else if err := json.Unmarshal([]byte(ret), &tmpArr); err == nil && len(tmpArr) >= 2 {
+				//result = "Arr Ok"
+				ok = true
+			} else if htmlArr := HtmlReg.FindAllString(ret, -1); len(htmlArr) >= 4 {
+				//result = "Html Ok"
+				ok = true
+			} else if textArr := HanReg.FindAllString(ret, -1); len(textArr) > 0 {
+				text := strings.Join(textArr, "")
+				if len([]rune(text)) >= 50 {
+					//result = "Text Ok"
+					ok = true
+				}
+			}
+		}
+		if ok {
+			UpdateHeart("", "", code, "", "findlist") // heartbeat recording the actual amount of data collected from list pages
+		}
 		S.Push(lua.LString(ret))
 		atomic.AddInt32(&s.ToDayRequestNum, 1)
 		atomic.AddInt32(&s.TotalRequestNum, 1)
@@ -159,6 +186,33 @@ func (s *Script) LoadScript(code, script_file string, newstate bool) string {
 		} else {
 			ret, retcookie = DownloadAdv(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, charset, s.Userproxy, ishttps, s.SCode, s.Timeout)
 		}
+		// remove the heartbeat code below once the sendListNum method is fully implemented in later code
+		//result := "Ret Null"
+		ok := false
+		if ret != "" && ret != "[]" {
+			ret = SpaceReg.ReplaceAllString(ret, "")
+			tmpArr := []interface{}{}
+			tmpMap := map[string]interface{}{}
+			if err := json.Unmarshal([]byte(ret), &tmpMap); err == nil && len(tmpMap) >= 2 {
+				//result = "Map Ok"
+				ok = true
+			} else if err := json.Unmarshal([]byte(ret), &tmpArr); err == nil && len(tmpArr) >= 2 {
+				//result = "Arr Ok"
+				ok = true
+			} else if htmlArr := HtmlReg.FindAllString(ret, -1); len(htmlArr) >= 4 {
+				//result = "Html Ok"
+				ok = true
+			} else if textArr := HanReg.FindAllString(ret, -1); len(textArr) > 0 {
+				text := strings.Join(textArr, "")
+				if len([]rune(text)) >= 50 {
+					//result = "Text Ok"
+					ok = true
+				}
+			}
+		}
+		if ok {
+			UpdateHeart("", "", code, "", "findlist") // heartbeat recording the actual amount of data collected from list pages
+		}
 		S.Push(lua.LString(ret))
 		scookie, _ := json.Marshal(retcookie)
 		S.Push(lua.LString(scookie))
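
The same response-validity check appears verbatim in both the download and downloadAdv wrappers above. As a non-authoritative sketch of what the heuristic amounts to (and of how it might be factored out once the temporary heartbeat code is removed, as the inline comment suggests), the logic could live in a single package-level helper. The name looksLikeListData is hypothetical and not part of this commit; it relies on the SpaceReg, HtmlReg and HanReg variables added to util.go below.

// looksLikeListData reports whether a downloaded list page appears to contain
// real data: a JSON object or array with at least two entries, at least four
// HTML tags, or at least 50 Chinese characters of plain text.
func looksLikeListData(ret string) bool {
	if ret == "" || ret == "[]" {
		return false
	}
	ret = SpaceReg.ReplaceAllString(ret, "")
	tmpMap := map[string]interface{}{}
	if err := json.Unmarshal([]byte(ret), &tmpMap); err == nil && len(tmpMap) >= 2 {
		return true
	}
	tmpArr := []interface{}{}
	if err := json.Unmarshal([]byte(ret), &tmpArr); err == nil && len(tmpArr) >= 2 {
		return true
	}
	if len(HtmlReg.FindAllString(ret, -1)) >= 4 {
		return true
	}
	text := strings.Join(HanReg.FindAllString(ret, -1), "")
	return len([]rune(text)) >= 50
}
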
@@ -241,6 +295,15 @@ func (s *Script) LoadScript(code, script_file string, newstate bool) string {
 		S.Push(ret)
 		return 1
 	}))
+	// push the volume of data downloaded from list pages
+	s.L.SetGlobal("sendListNum", s.L.NewFunction(func(S *lua.LState) int {
+		table := S.ToTable(-1)
+		list := util.TableToMap(table)
+		if len(list) > 0 {
+			UpdateHeart("", "", code, "", "findlist") // heartbeat recording the actual amount of data collected from list pages
+		}
+		return 1
+	}))
 	// s.L.SetGlobal("findMgoData", s.L.NewFunction(func(S *lua.LState) int {
 	// 	update := [][]map[string]interface{}{}
 	// 	query := map[string]interface{}{"state": 0}
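
The new sendListNum global is meant to be called from the Lua spider scripts themselves, so the list-page volume is reported explicitly rather than inferred from the downloaded body. A rough usage sketch from the Go side, assuming the Script's embedded state s.L; the table shape is illustrative only, since this diff does not pin down what util.TableToMap expects beyond "any non-empty table triggers the heartbeat":

// callSendListNum is a hypothetical illustration of invoking the new global
// from a Lua chunk run on the same state; the table contents are made up.
func callSendListNum(s *Script) error {
	return s.L.DoString(`
		local rows = { count = 2, site = "example" }
		sendListNum(rows)
	`)
}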

+ 5 - 1
src/spider/util.go

@@ -2,8 +2,13 @@ package spider
 
 import (
 	qu "qfw/util"
+	"regexp"
 )
 
+var SpaceReg = regexp.MustCompile("[\\s\u3000\u2003\u00a0]+")
+var HtmlReg = regexp.MustCompile("<[^>]*?>")
+var HanReg = regexp.MustCompile("[\u4e00-\u9fa5]+")
+
 // initialize the set of sites with delayed collection
 func InitOther() {
 	defer qu.Catch()
@@ -14,5 +19,4 @@ func InitOther() {
 		delayTime := qu.IntAll(l["delaytime"])
 		DelaySites[site] = delayTime
 	}
-	qu.Debug(DelaySites)
 }
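
For reference, the three new package-level regular expressions added to util.go behave roughly as follows. The sample string is invented for illustration, and qu.Debug is used only because the package already imports it:

// demoRegexps is a hypothetical illustration, not part of the commit.
func demoRegexps() {
	sample := "<div> 　招标 公告　</div>"           // mixes ASCII and full-width (U+3000) spaces
	qu.Debug(SpaceReg.ReplaceAllString(sample, "")) // "<div>招标公告</div>"
	qu.Debug(HtmlReg.FindAllString(sample, -1))     // [<div> </div>]
	qu.Debug(HanReg.FindAllString(sample, -1))      // [招标 公告]
}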