mxs 1 年前
コミット
d00358fe68
5 ファイル変更、53 行追加、65 行削除
  1. 8 1
      src/front/heart.go
  2. 8 17
      src/front/spider.go
  3. 12 28
      src/luacheck/luacheck.go
  4. 24 18
      src/spider/script.go
  5. 1 1
      src/util/util.go

+ 8 - 1
src/front/heart.go

@@ -39,10 +39,16 @@ func (f *Front) Heart() {
 		qu.Debug("query:", query, "sort:", sort)
 		list, _ := u.MgoS.Find("spider_heart", query, sort, nil, false, start, limit)
 		count := u.MgoS.Count("spider_heart", query)
+		result := []map[string]interface{}{}
 		for _, l := range *list {
 			code := qu.ObjToString(l["code"])
 			//d, _ := u.MgoE.FindOneByField("luaconfig", map[string]interface{}{"code": code}, map[string]interface{}{"state": 1, "param_common": 1, "str_list": 1, "type_list": 1})
 			d, _ := u.MgoEB.FindOneByField("luaconfig", map[string]interface{}{"code": code}, map[string]interface{}{"state": 1, "param_common": 1, "str_list": 1, "type_list": 1, "pendtime": 1})
+			if len(*d) == 0 {
+				qu.Debug("爬虫:", code, "不存在")
+
+				continue
+			}
 			l["state"] = (*d)["state"]
 			l["param_common"] = (*d)["param_common"]
 			if ft := qu.Int64All(l["firstpage"]); ft != 0 {
@@ -84,9 +90,10 @@ func (f *Front) Heart() {
 			//if typeList == 1 && strings.Contains(strList, "findListHtml") {
 			//	l["isfindlist"] = "是"
 			//}
+			result = append(result, l)
 		}
 
-		f.ServeJson(map[string]interface{}{"draw": draw, "data": list, "recordsFiltered": count, "recordsTotal": count})
+		f.ServeJson(map[string]interface{}{"draw": draw, "data": result, "recordsFiltered": count, "recordsTotal": count})
 	} else {
 		events := []string{}
 		for k, _ := range sp.Config.Uploadevents {

+ 8 - 17
src/front/spider.go

@@ -743,30 +743,21 @@ func LuaTextCheck(infoformat int, param map[string]interface{}, param_list_chrom
 				warnmsg += `item["channel"]的值与模板不一致;`
 			}
 		}
+
 		//校验列表页area、city、distric
-		if !strings.Contains(list, "area") {
-			errmsg += `模板item["area"]不存在;`
-		}
-		if !strings.Contains(list, "city") {
-			errmsg += `模板item["city"]不存在;`
-		}
-		if !strings.Contains(list, "district") {
-			errmsg += `模板item["district"]不存在;`
-		}
 		area := qu.ObjToString(model["area"])
 		city := qu.ObjToString(model["city"])
 		district := qu.ObjToString(model["district"])
-		if area == "" {
-			errmsg += `模板省份信息不能为空;`
-		} else if !strings.Contains(list, area) {
-			warnmsg += "省份信息与模板不一致;"
+		if !strings.Contains(list, fmt.Sprintf(u.CheckText_Area+`="%s"`, area)) {
+			errmsg += `省份信息与模板不一致`
 		}
-		if city != "" && !strings.Contains(list, city) {
-			warnmsg += "城市信息与模板不一致;"
+		if !strings.Contains(list, fmt.Sprintf(u.CheckText_City+`="%s"`, city)) {
+			errmsg += `城市信息与模板不一致`
 		}
-		if district != "" && !strings.Contains(list, district) {
-			warnmsg += "区/县信息与模板不一致;"
+		if !strings.Contains(list, fmt.Sprintf(u.CheckText_District+`="%s"`, district)) {
+			errmsg += `区/县信息与模板不一致`
 		}
+
 		if infoformat == 2 && !strings.Contains(detail, "projectname") {
 			errmsg += "拟建/审批数据缺少projectname字段;"
 		}

+ 12 - 28
src/luacheck/luacheck.go

@@ -1,7 +1,6 @@
 package luacheck
 
 import (
-	"encoding/json"
 	"fmt"
 	"github.com/PuerkitoBio/goquery"
 	lua "github.com/yuin/gopher-lua"
@@ -138,18 +137,19 @@ func LuaCheck(lua map[string]interface{}, err, warn map[string]string) {
 
 	//2、列表页
 	list := qu.ObjToString(lua["str_list"])
+	listTmp := util.SymbolReg.ReplaceAllString(list, "")
 	//2.1 基本参数是否一致
 	//site、channel、code
 	code := qu.ObjToString(param_common[0])
 	site := qu.ObjToString(param_common[1])
 	channel := qu.ObjToString(param_common[2])
-	if !strings.Contains(list, fmt.Sprintf(util.CheckText_Site, site)) {
+	if !strings.Contains(listTmp, fmt.Sprintf(util.CheckText_Site, site)) {
 		err["站点模板"] = `站点的值与模板不一致`
 	}
-	if !strings.Contains(list, fmt.Sprintf(util.CheckText_Channel, channel)) {
+	if !strings.Contains(listTmp, fmt.Sprintf(util.CheckText_Channel, channel)) {
 		err["栏目模板"] = `栏目的值与模板不一致`
 	}
-	if !strings.Contains(list, fmt.Sprintf(util.CheckText_Code, code)) {
+	if !strings.Contains(listTmp, fmt.Sprintf(util.CheckText_Code, code)) {
 		err["爬虫代码模板"] = `爬虫代码的值与模板不一致`
 	}
 	//area、city、district
@@ -157,19 +157,13 @@ func LuaCheck(lua map[string]interface{}, err, warn map[string]string) {
 	district := qu.ObjToString(model["district"])
 
 	//校验列表页area、city、distric
-	if !strings.Contains(list, util.CheckText_Area) {
-		err["省份模板"] = util.CheckText_Area + "模板不存在"
-	} else if area != "" && !strings.Contains(list, fmt.Sprintf(util.CheckText_Area+`="%s"`, area)) {
+	if !strings.Contains(listTmp, fmt.Sprintf(util.CheckText_Area+`="%s"`, area)) {
 		err["省份模板"] = `省份信息与模板不一致`
 	}
-	if !strings.Contains(list, util.CheckText_City) {
-		err["城市模板"] = util.CheckText_City + "模板不存在"
-	} else if city != "" && !strings.Contains(list, fmt.Sprintf(util.CheckText_City+`="%s"`, city)) {
+	if !strings.Contains(listTmp, fmt.Sprintf(util.CheckText_City+`="%s"`, city)) {
 		err["城市模板"] = `城市信息与模板不一致`
 	}
-	if !strings.Contains(list, util.CheckText_District) {
-		err["区/县模板"] = util.CheckText_District + "模板不存在"
-	} else if district != "" && !strings.Contains(list, fmt.Sprintf(util.CheckText_District+`="%s"`, district)) {
+	if !strings.Contains(listTmp, fmt.Sprintf(util.CheckText_District+`="%s"`, district)) {
 		err["区/县模板"] = `区/县信息与模板不一致`
 	}
 	//2.2 sendListNum
@@ -246,11 +240,12 @@ func luaDownload(pageNum int, lua map[string]interface{}, err, warn map[string]s
 			}
 		}
 	}
-
-	if downloadNum == 0 {
+	if downloadNum == 0 && s.PageOneTextHash == "" && s.PageTwoTextHash == "" { //列表页过滤完,downloadNum无法准确判断采集为0
 		err["列表页下载异常"] = "列表页下载量为0"
-		return
+	} else if s.PageOneTextHash != "" && s.PageOneTextHash == s.PageTwoTextHash {
+		err["前两页数据一致"] = "前两页数据一致"
 	}
+
 	listResultCheck(pageNum, result, err, warn, s) //列表页下载结果校验
 	s.L.Close()
 	//校验
@@ -270,12 +265,9 @@ func listResultCheck(pageNum int, lists map[int][]map[string]interface{}, err, w
 	wg := &sync.WaitGroup{}
 	lock := &sync.Mutex{}
 	ch := make(chan bool, 10)
-	hrefs := make([][]string, len(lists)) //记录前两页数据链接
 	var i, num int
 	for _, list := range lists {
-		pageHrefs := []string{}
 		for _, l := range list {
-			pageHrefs = append(pageHrefs, qu.ObjToString(l["href"]))
 			if num > 50 {
 				break
 			}
@@ -324,18 +316,10 @@ func listResultCheck(pageNum int, lists map[int][]map[string]interface{}, err, w
 				lock.Unlock()
 			}(l)
 		}
-		hrefs[i] = pageHrefs
 		i++
 	}
 	wg.Wait()
 
-	if len(hrefs) >= 2 {
-		hrefByte1, _ := json.Marshal(hrefs[0])
-		hrefByte2, _ := json.Marshal(hrefs[1])
-		if sp.GetHashKey(hrefByte1) == sp.GetHashKey(hrefByte2) {
-			err["前两页数据一致"] = "前两页数据一致"
-		}
-	}
 	if fileNum == 0 {
 		warn["未下载任何附件"] = "未下载任何附件"
 	}
@@ -414,7 +398,7 @@ func detailResultCheck(data map[string]interface{}, warn, err map[string]string)
 	href := qu.ObjToString(data["href"])
 	//校验s_title
 	if data["s_title"] == nil {
-		err["s_title缺失"] = "详情页s_title缺失"
+		err["s_title缺失"] = "详情页无s_title信息(" + href + ")"
 	} else {
 		//校验s_title
 		s_title := qu.ObjToString(data["s_title"])

+ 24 - 18
src/spider/script.go

@@ -47,10 +47,12 @@ type Script struct {
 	Downloader        string //下载器
 	Timeout           int64  //超时时间秒
 	L                 *lua.LState
-	Test_luareqcount  int //脚本请求次数
-	Test_goreqtime    int //go发起次数(时间)
-	Test_goreqlist    int //go发起次数(列表)
-	Test_goreqcon     int //go发起次数(正文)
+	Test_luareqcount  int    //脚本请求次数
+	Test_goreqtime    int    //go发起次数(时间)
+	Test_goreqlist    int    //go发起次数(列表)
+	Test_goreqcon     int    //go发起次数(正文)
+	PageOneTextHash   string //爬虫第一页页面内容hash
+	PageTwoTextHash   string //爬虫第二页页面内容hash
 }
 
 // 加载文件
@@ -193,21 +195,25 @@ func (s *Script) LoadScript(site *string, downloadnode, script string, isfile ..
 	}))
 	//推送列表页下载数据量
 	s.L.SetGlobal("sendListNum", s.L.NewFunction(func(S *lua.LState) int {
-		pageno := S.ToInt(-3) //当前页
-		index := S.ToInt(-2)  //第几条数据
+		pageno := S.ToInt(-2)
 		table := S.ToTable(-1)
-		qu.Debug(s.SCode, index, table.Len())
-		if index == 1 {
-			if pageno == 1 { //第一页数据
-				oneMap := sp.TableToMap(table)
-				text, _ := json.Marshal(oneMap)
-				hashText := sp.HexTextByte(text)
-				qu.Debug("第一页:", hashText)
-			} else if pageno == 2 { //第一页数据
-				twoMap := sp.TableToMap(table)
-				text, _ := json.Marshal(twoMap)
-				hashText := sp.HexTextByte(text)
-				qu.Debug("第二页:", hashText)
+		qu.Debug("页码信息:", s.SCode, pageno, table.Len())
+		if table.Len() > 0 {
+			//爬虫翻页
+			if pageno == 1 && s.PageOneTextHash == "" { //记录第一页数据的hash
+				textMap := sp.TableToMap(table)
+				textByte, err := json.Marshal(textMap)
+				text := string(textByte)
+				if err == nil && text != "" {
+					s.PageOneTextHash = sp.HexText(text)
+				}
+			} else if pageno == 2 && s.PageTwoTextHash == "" {
+				textMap := sp.TableToMap(table)
+				textByte, err := json.Marshal(textMap)
+				text := string(textByte)
+				if err == nil && text != "" {
+					s.PageTwoTextHash = sp.HexText(text)
+				}
 			}
 		}
 		return 1

+ 1 - 1
src/util/util.go

@@ -28,7 +28,7 @@ var (
 	TitleHanReg          = regexp.MustCompile(`[\p{Han}]`)
 	TitleFilterReg       = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数|查看详情|转发|<[^>]*?>|(\.){3,6})`)
 	DetailFilterReg      = regexp.MustCompile(`((上|下)一(页|篇)|阅读次数|浏览次数|扫一扫|分享|区块链存证)`)
-	ContentHtmlFilterReg = regexp.MustCompile(`(iframe|img)`)
+	ContentHtmlFilterReg = regexp.MustCompile(`<(img|iframe)[^>]+>`)
 	Area                 []string //省份
 	DomainReg            = regexp.MustCompile(`(?://).+?(?:[::/])`)
 	SymbolReg            = regexp.MustCompile("[,,\\s\u3000\u2003\u00a0]+")