Selaa lähdekoodia

验证码支持

maxiaoshan 3 vuotta sitten
vanhempi
commit
0d54027795
5 muutettua tiedostoa jossa 274 lisäystä ja 7 poistoa
  1. 2 1
      src/config.json
  2. 154 0
      src/logs/spider.log
  3. 5 3
      src/main.go
  4. 55 0
      src/spider/download.go
  5. 58 3
      src/spider/script.go

+ 2 - 1
src/config.json

@@ -9,7 +9,7 @@
     "msgserveraddr": "spdata.jianyu360.com:801",
     "msgserveraddrfile": "spdata.jianyu360.com:802",
 	"isdelay":false,
-    "working": 1,
+    "working": 0,
     "chansize": 4,
     "detailchansize": 20,
     "uploadevent": 7100,
@@ -19,6 +19,7 @@
     "ishistoryevent": false,
     "threadbasenum": 50,
     "threadupperlimit": 10,
+    "serveraddress": "127.0.0.1:8030",
     "tesseractadd": "http://test.qmx.top:1688",
     "testdir": "res/test/spider_test.lua",
     "redisservers": "title_repeat_judgement=192.168.3.207:2679,title_repeat_fulljudgement=192.168.3.207:2679,title_repeat_listpagehref=192.168.3.207:1679",

+ 154 - 0
src/logs/spider.log

@@ -7429,3 +7429,157 @@ stack traceback:
 2022/05/26 15:27:26 handler.go:256: warn  2022-05-26 15:27:26 :下载三级页执行死循环 初始化脚本数量: 1
 2022/05/26 15:27:26 handler.go:270: info  Code: a_zgzfcgw_zygg_new Is Downloading Detail: false
 2022/05/26 15:27:26 spider.go:881: info  Thread Info:	Code: a_zgzfcgw_zygg_new 	count: 112 	thread num: 2
+2022/06/07 17:30:27 main.go:132: debug  7100
+2022/06/07 17:30:27 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:30:27 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:30:27 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:30:27 spider.go:182: debug  js_jsszfcgw_cgggjs 江苏省政府采购网 频率: 30 , 150
+2022/06/07 17:30:27 handler.go:138: info  高性能模式:LUA加载完成
+2022/06/07 17:30:27 handler.go:144: info  总共加载脚本数: 1
+2022/06/07 17:30:29 spider.go:299: error  列表页采集报错 1 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:30:29 spider.go:299: error  列表页采集报错 1 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:32:09 main.go:132: debug  7100
+2022/06/07 17:32:09 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:32:09 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:32:09 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:32:09 spider.go:182: debug  js_jsszfcgw_cgggjs 江苏省政府采购网 频率: 30 , 150
+2022/06/07 17:32:09 handler.go:138: info  高性能模式:LUA加载完成
+2022/06/07 17:32:09 handler.go:144: info  总共加载脚本数: 1
+2022/06/07 17:32:26 main.go:132: debug  7100
+2022/06/07 17:32:26 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:32:26 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:32:26 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:32:26 spider.go:182: debug  js_jsszfcgw_cgggjs 江苏省政府采购网 频率: 30 , 150
+2022/06/07 17:32:26 handler.go:138: info  高性能模式:LUA加载完成
+2022/06/07 17:32:26 handler.go:144: info  总共加载脚本数: 1
+2022/06/07 17:34:15 main.go:132: debug  7100
+2022/06/07 17:34:15 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:34:15 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:34:15 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:34:15 spider.go:182: debug  js_jsszfcgw_cgggjs 江苏省政府采购网 频率: 30 , 150
+2022/06/07 17:34:16 handler.go:138: info  高性能模式:LUA加载完成
+2022/06/07 17:34:16 handler.go:144: info  总共加载脚本数: 1
+2022/06/07 17:35:16 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:35:16 spider.go:299: error  列表页采集报错 1 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:36:18 main.go:132: debug  7100
+2022/06/07 17:36:18 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:36:18 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:36:18 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:36:18 spider.go:182: debug  js_jsszfcgw_cgggjs 江苏省政府采购网 频率: 30 , 150
+2022/06/07 17:36:18 handler.go:138: info  高性能模式:LUA加载完成
+2022/06/07 17:36:18 handler.go:144: info  总共加载脚本数: 1
+2022/06/07 17:36:37 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:36:37 main.go:132: debug  7100
+2022/06/07 17:36:37 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:36:37 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:36:37 spider.go:182: debug  js_jsszfcgw_cgggjs 江苏省政府采购网 频率: 30 , 150
+2022/06/07 17:36:38 handler.go:138: info  高性能模式:LUA加载完成
+2022/06/07 17:36:38 handler.go:144: info  总共加载脚本数: 1
+2022/06/07 17:40:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:40:04 main.go:135: debug  7100
+2022/06/07 17:40:04 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:40:04 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:40:04 spider.go:182: debug  js_jsszfcgw_cgggjs 江苏省政府采购网 频率: 30 , 150
+2022/06/07 17:40:04 handler.go:138: info  高性能模式:LUA加载完成
+2022/06/07 17:40:04 handler.go:144: info  总共加载脚本数: 1
+2022/06/07 17:40:33 spider.go:299: error  列表页采集报错 8 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:41:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:42:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:43:03 download.go:129: error  js_jsszfcgw_cgggjs方法DownloadAdv,url:http://www.ccgp-jiangsu.gov.cn/pss/jsp/search_cggg.jsp?cgr=&xmbh=&pqy=&sd=1654531200000&ed=1654617599000&dljg=&cglx=&bt=&code=8bwa&nr=&cgfs=&page=8,err:timeout 150
+2022/06/07 17:43:03 spider.go:299: error  列表页采集报错 8 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:43:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:44:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:45:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:46:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:46:10 download.go:129: error  js_jsszfcgw_cgggjs方法DownloadAdv,url:http://www.ccgp-jiangsu.gov.cn/pss/jsp/search_cggg.jsp?cgr=&xmbh=&pqy=&sd=1654531200000&ed=1654617599000&dljg=&cglx=&bt=&code=S9W5&nr=&cgfs=&page=20,err:timeout 150
+2022/06/07 17:46:10 spider.go:299: error  列表页采集报错 20 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:46:15 spider.go:299: error  列表页采集报错 21 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:47:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:48:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:48:49 download.go:129: error  js_jsszfcgw_cgggjs方法DownloadAdv,url:http://www.ccgp-jiangsu.gov.cn/pss/jsp/search_cggg.jsp?cgr=&xmbh=&pqy=&sd=1654531200000&ed=1654617599000&dljg=&cglx=&bt=&code=xfs5&nr=&cgfs=&page=23,err:timeout 150
+2022/06/07 17:48:49 spider.go:299: error  列表页采集报错 23 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:49:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:50:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:51:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:52:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:53:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:53:49 spider.go:299: error  列表页采集报错 52 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:54:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:54:10 spider.go:299: error  列表页采集报错 52 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:55:04 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 17:55:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:56:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:56:48 download.go:129: error  js_jsszfcgw_cgggjs方法DownloadAdv,url:http://www.ccgp-jiangsu.gov.cn/pss/jsp/search_cggg.jsp?cgr=&xmbh=&pqy=&sd=1654531200000&ed=1654617599000&dljg=&cglx=&bt=&code=xsud&nr=&cgfs=&page=58,err:timeout 150
+2022/06/07 17:56:48 spider.go:299: error  列表页采集报错 58 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:57:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:57:09 spider.go:299: error  列表页采集报错 60 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:58:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 17:58:33 spider.go:299: error  列表页采集报错 61 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:58:41 spider.go:299: error  列表页采集报错 61 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 17:59:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 18:00:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 18:01:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 18:01:59 download.go:129: error  js_jsszfcgw_cgggjs方法DownloadAdv,url:http://www.ccgp-jiangsu.gov.cn/pss/jsp/search_cggg.jsp?cgr=&xmbh=&pqy=&sd=1654531200000&ed=1654617599000&dljg=&cglx=&bt=&code=r3bt&nr=&cgfs=&page=62,err:timeout 150
+2022/06/07 18:01:59 spider.go:299: error  列表页采集报错 62 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 18:02:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 18:03:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 18:04:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 18:04:34 download.go:129: error  js_jsszfcgw_cgggjs方法DownloadAdv,url:http://www.ccgp-jiangsu.gov.cn/pss/jsp/search_cggg.jsp?cgr=&xmbh=&pqy=&sd=1654531200000&ed=1654617599000&dljg=&cglx=&bt=&code=vqnf&nr=&cgfs=&page=62,err:timeout 150
+2022/06/07 18:04:34 spider.go:299: error  列表页采集报错 62 js_jsszfcgw_cgggjs,<string>:58: attempt to index a non-table object(nil)
+stack traceback:
+	<string>:58: in main chunk
+	[G]: ?
+2022/06/07 18:05:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 18:06:04 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 18:06:56 main.go:135: debug  7100
+2022/06/07 18:06:56 spider.go:1092: info  Detail Download All Thread: 0
+2022/06/07 18:06:56 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 18:06:56 handler.go:407: info  节点 7100 脚本文件爬虫数 0
+2022/06/07 18:06:56 spider.go:182: debug  js_jsszfcgw_cgggjs 江苏省政府采购网 频率: 30 , 150
+2022/06/07 18:06:56 handler.go:138: info  高性能模式:LUA加载完成
+2022/06/07 18:06:56 handler.go:144: info  总共加载脚本数: 1

+ 5 - 3
src/main.go

@@ -38,7 +38,9 @@ func init() {
 		DbName:      "spider",
 	}
 	spider.Mgo.InitPool()
-	spider.InitOther()             //加载其他信息
+	spider.InitOther() //加载其他信息
+	//验证码识别client
+	//codegrpc.InitCodeGrpcClient()
 	InitRedis(Config.Redisservers) //初始化Redis
 	//	if Config.Redistype == "0" {
 	//		redis.InitRedis(Config.Redisservers)
@@ -115,12 +117,12 @@ func main() {
 	go spider.GcCount()
 	//定时重载脚本文件
 	go spider.ReloadSpiderFile()
+	//内存信息
+	go heapprint()
 	//爬虫信息提交编辑器
 	go spider.SpiderInfoSend()
 	//处理心跳信息
 	go spider.SaveHeartInfo()
-	//内存信息
-	go heapprint()
 	//查列表页信息采集三级页
 	go spider.DetailData()
 	//批量保存错误数据

+ 55 - 0
src/spider/download.go

@@ -251,6 +251,61 @@ func DownloadFile(retLen *int64, downloaderid, url, method string, reqparam, hea
 		return nil
 	}
 }
+// NewDownloadFile asks a file-download node to fetch url and returns the
+// decoded file bytes.
+//
+// When downloaderid is empty the request is sent with
+// mu.SENDTO_TYPE_RAND_RECIVER; otherwise it is sent point-to-point to that
+// downloader, and nil is returned immediately if isAvailableFile reports it is
+// not available. The timeout argument is multiplied by 10 before it is passed
+// to MsclientFile.Call. If head is empty, a random "common" User-Agent is
+// added to it. The code argument is only used as a prefix of error log lines.
+//
+// The reply is expected to be a JSON object whose "code" field is the string
+// "200" and whose "content" field holds the base64-encoded file. nil is
+// returned on a call error, on any other reply, or when "content" is missing
+// or is not valid base64.
+func NewDownloadFile(downloaderid, url, method string, reqparam, head map[string]interface{}, mycookie []*http.Cookie, encoding string, useproxy, ishttps bool, code string, timeout int64, noredirect bool) []byte {
+	defer mu.Catch()
+	timeout = timeout * 10
+	msgid := mu.UUID(8)
+	if len(head) < 1 {
+		// A nil map cannot be written to; allocate one before adding the default header.
+		if head == nil {
+			head = map[string]interface{}{}
+		}
+		// rand.Intn panics on a non-positive bound, so skip when no agents are configured.
+		if agents := agent.UserAgents["common"]; len(agents) > 0 {
+			r := rand.New(rand.NewSource(time.Now().UnixNano()))
+			head["User-Agent"] = agents[r.Intn(len(agents))]
+		}
+	}
+	// Both routing modes send the same payload; only the target and send type differ.
+	param := map[string]interface{}{
+		"url":        url,
+		"method":     method,
+		"head":       head,
+		"reqparam":   reqparam,
+		"cookie":     mycookie,
+		"encoding":   encoding,
+		"useproxy":   useproxy,
+		"ishttps":    ishttps,
+		"new":        true,
+		"noredirect": noredirect,
+	}
+	var ret []byte
+	var err error
+	if downloaderid == "" {
+		ret, err = MsclientFile.Call("", msgid, mu.SERVICE_DOWNLOAD, mu.SENDTO_TYPE_RAND_RECIVER, param, timeout)
+	} else if isAvailableFile(downloaderid) {
+		ret, err = MsclientFile.Call(downloaderid, msgid, mu.SERVICE_DOWNLOAD, mu.SENDTO_TYPE_P2P, param, timeout)
+	} else {
+		return nil
+	}
+	if err != nil {
+		str := code + "方法DownloadFile,url:" + url + ",err:" + err.Error()
+		logger.Error(str, timeout)
+		return nil
+	}
+	tmp := map[string]interface{}{}
+	if err = json.Unmarshal(ret, &tmp); err != nil {
+		logger.Error(code + "方法DownloadFile,url:" + url + ",err:" + err.Error())
+		return nil
+	}
+	if v, ok := tmp["code"].(string); !ok || v != "200" {
+		return nil
+	}
+	// Checked assertion: a reply without a string "content" must not panic.
+	content, ok := tmp["content"].(string)
+	if !ok {
+		return nil
+	}
+	bs, err := base64.StdEncoding.DecodeString(content)
+	if err != nil {
+		// Do not hand back partially decoded bytes as if they were the file.
+		logger.Error(code + "方法DownloadFile,url:" + url + ",err:" + err.Error())
+		return nil
+	}
+	return bs
+}
 
 //下载点是否可用
 func isAvailable(code string) bool {

+ 58 - 3
src/spider/script.go

@@ -7,19 +7,19 @@ LUA中公共的方法需要抽出来,主脚本文件加载LUA公共文件
 package spider
 
 import (
+	codegrpc "analysiscode"
 	"bytes"
 	"compress/gzip"
 	"crypto/aes"
 	"encoding/base64"
 	"encoding/json"
+	"github.com/shopspring/decimal"
 	"io/ioutil"
 	mu "mfw/util"
 	"net/http"
 	"net/url"
 	"path"
 
-	"github.com/shopspring/decimal"
-
 	qu "qfw/util"
 	_ "qfw/util/redis"
 	"regexp"
@@ -867,7 +867,62 @@ func (s *Script) LoadScript(site, channel, user *string, code, script_file strin
 		S.Push(lua.LString(decimalNum.String()))
 		return 1
 	}))
-
+	// Get the captcha code: registers the Lua global getCodeByPath(path, stype, head, cookie), which returns three values (code, response-head table, response cookie).
+	s.L.SetGlobal("getCodeByPath", s.L.NewFunction(func(S *lua.LState) int {
+		cookie := S.ToString(-1) // arguments are read from the top of the Lua stack, last argument first
+		head := S.ToTable(-2)
+		stype := S.ToString(-3)
+		path := S.ToString(-4) // local string; shadows the imported "path" package inside this closure
+		headMap := util.GetTable(head)
+		//qu.Debug("cookie----------", cookie)
+		//qu.Debug("headMap----------", headMap)
+		headJsonStr := "" // stays empty when the head table cannot be marshalled to JSON
+		headByte, err := json.Marshal(headMap)
+		if err == nil {
+			headJsonStr = string(headByte)
+		}
+		code, respHead, respCookie := codegrpc.GetCodeByPath(path, stype, headJsonStr, cookie)
+		//qu.Debug("code====", code)
+		//qu.Debug("respHead====", respHead)
+		//qu.Debug("respCookie====", respCookie)
+		S.Push(lua.LString(code)) // 1st return value
+		respHeadMap := map[string]interface{}{}
+		json.Unmarshal([]byte(respHead), &respHeadMap) // error ignored: an unparsable respHead leaves the map empty
+		hTable := util.MapToLuaTable(S, respHeadMap)
+		S.Push(hTable)                  // 2nd return value
+		S.Push(lua.LString(respCookie)) // 3rd return value
+		return 3
+	}))
+	// Registers the Lua global newDownloadFile(fileName, url, method, param, head, cookie).
+	// It downloads url through NewDownloadFile, passes the bytes to util.UploadFile and
+	// returns five strings to Lua: url, name, size, ftype, fid.
+	s.L.SetGlobal("newDownloadFile", s.L.NewFunction(func(S *lua.LState) int {
+		// Arguments are read from the top of the Lua stack, last argument first.
+		cookie := S.ToString(-1)
+		head := S.ToTable(-2)
+		param := S.ToTable(-3)
+		method := S.ToString(-4)
+		url := strings.TrimSpace(S.ToString(-5))
+		fileName := strings.TrimSpace(S.ToString(-6))
+		// Decide on the scheme prefix only: a plain-http URL that merely contains
+		// "https" somewhere (e.g. in its query string) must not be flagged as HTTPS.
+		ishttps := strings.HasPrefix(url, "https")
+		var mycookie []*http.Cookie
+		if cookie != "{}" {
+			// Best effort: an unparsable cookie string leaves mycookie nil.
+			json.Unmarshal([]byte(cookie), &mycookie)
+		} else {
+			mycookie = make([]*http.Cookie, 0)
+		}
+		ret := NewDownloadFile(s.Downloader, url, method, util.GetTable(param), util.GetTable(head), mycookie, s.Encoding, false, ishttps, "", s.Timeout, false)
+		url, name, size, ftype, fid := util.UploadFile(s.SCode, fileName, url, ret)
+		// Fall back to the file name's extension (without the dot) when no type was reported.
+		if strings.TrimSpace(ftype) == "" {
+			if ext := path.Ext(name); len(ext) > 0 {
+				ftype = ext[1:]
+			}
+		}
+		S.Push(lua.LString(url))
+		S.Push(lua.LString(name))
+		S.Push(lua.LString(size))
+		S.Push(lua.LString(ftype))
+		S.Push(lua.LString(fid))
+		return 5
+	}))
 	return ""
 }
 func dealHref(pageListUrl, href string) string {