Ver código fonte

新增工具类以及采集相关调整

maxiaoshan 1 ano atrás
pai
commit
65da19fa57

+ 29 - 0
src/spiderutil/aes.go

@@ -1,7 +1,9 @@
 package spiderutil
 
 import (
+	"bytes"
 	"crypto/aes"
+	"crypto/cipher"
 )
 
 //aes ecb加密
@@ -37,6 +39,33 @@ func AesECBDecrypter(data, key []byte) []byte {
 	return decrypted[:trim]
 }
 
+// AesCBCEncrypt encrypts data with AES in CBC mode using key and iv and returns the ciphertext. NOTE(review): the error from aes.NewCipher is discarded below — confirm callers always pass a valid AES key length.
+func AesCBCEncrypt(data, key, iv []byte) []byte {
+	// Create the AES block cipher from the supplied key (the returned error is ignored).
+	block, _ := aes.NewCipher(key)
+	// Wrap the block cipher in a CBC-mode encrypter seeded with iv.
+	mode := cipher.NewCBCEncrypter(block, iv)
+	// Pad data up to a multiple of aes.BlockSize; every pad byte holds the pad length, and a full extra block is added when data is already aligned.
+	padding := aes.BlockSize - len(data)%aes.BlockSize
+	padtext := append(data, bytes.Repeat([]byte{byte(padding)}, padding)...)
+	// Encrypt the padded plaintext into a new buffer of the same length.
+	encrypted := make([]byte, len(padtext))
+	mode.CryptBlocks(encrypted, padtext)
+	return encrypted
+}
+
+// AesCBCDecrypter decrypts AES-CBC ciphertext with key and iv and strips the trailing padding. NOTE(review): the aes.NewCipher error is discarded and the padding byte is not validated — confirm inputs are always non-empty, block-aligned and correctly padded.
+func AesCBCDecrypter(data, key, iv []byte) []byte {
+	block, _ := aes.NewCipher(key)
+	decrypter := cipher.NewCBCDecrypter(block, iv)
+	// Decrypt into a new buffer of the same length as data.
+	decrypted := make([]byte, len(data))
+	decrypter.CryptBlocks(decrypted, data)
+	// Remove the padding: the last byte is read as the number of pad bytes to drop.
+	unpadding := int(decrypted[len(decrypted)-1])
+	return decrypted[:len(decrypted)-unpadding]
+}
+
 func generateKey(key []byte) (genKey []byte) {
 	genKey = make([]byte, 16)
 	copy(genKey, key)

+ 217 - 0
src/spiderutil/chromedputil.go

@@ -0,0 +1,217 @@
+package spiderutil
+
+type ChromeActions struct { // ChromeActions holds one action entry: the action name plus its param and selector strings.
+	Action   string `json:"action"`   //action to perform
+	Param    string `json:"param"`    //selector statement
+	Selector string `json:"selector"` //selector
+}
+
+type ChromeTask struct { // ChromeTask groups an action list with its timeout and flow flags, plus a second timeout/action list for detail pages.
+	Flow     bool            `json:"flow"`     //whether this is sequential collection
+	RunRedis bool            `json:"runredis"` //whether to run the redis duplicate check (sequential collection only)
+	TimeOut  int64           `json:"timeout"`  //timeout
+	Actions  []ChromeActions `json:"actions"`  //action set
+	//sequential collection also needs the detail-page collection parameters below
+	OtherTimeOut int64           `json:"othertimeout"` //timeout
+	OtherActions []ChromeActions `json:"otheractions"` //action set
+}
+
+//const (
+//	ActionTypeNavigate    string = "navigate"
+//	ActionTypeClick       string = "click"
+//	ActionTypeOuterHTML   string = "outerhtml"
+//	ActionTypeEvaluate    string = "evaluate"
+//	ActionTypeWaitReady   string = "waitready"
+//	ActionTypeWaitVisible string = "waitvisible"
+//	ActionTypeSleep       string = "wait"
+//
+//	//ActionTypeInput  string = "input"
+//	//ActionTypeScroll string = "scroll"
+//	//ActionTypeAssert string = "assert"
+//	//ActionTypeClose  string = "close"
+//)
+//
+//type ChromedpCase struct {
+//	TimeOunt int64
+//	IsProxy  bool
+//	Actions  []Action
+//	//Flags    map[string]interface{}
+//	//Options []Option
+//}
+//
+//type Action struct {
+//	Type   string                 //动作类型
+//	Params map[string]interface{} //参数
+//
+//	//Name         string //动作名称
+//	//Wait         int64  //延时
+//	//WaitReady    string //页面等待加载
+//	//FailContinue bool   //检索失败是否继续
+//	//Checks       *Check //检查点
+//}
+//
+//func GetChromedpCase(timeout int64, isProxy bool, task []map[string]interface{}) *ChromedpCase {
+//	chromeCase := &ChromedpCase{
+//		TimeOunt: timeout,
+//		IsProxy:  isProxy,
+//		Actions:  GetActions(task),
+//		//Flags:    map[string]interface{}{},
+//	}
+//	//if blink != "" {
+//	//	chromeCase.Flags = map[string]interface{}{"disable-blink-features": "AutomationControlled"}
+//	//}
+//	return chromeCase
+//}
+//
+//func GetActions(task []map[string]interface{}) (acts []Action) {
+//	for _, method_param := range task {
+//		for method, param := range method_param {
+//			switch method {
+//			case ActionTypeNavigate: //打开网页
+//				acts = append(acts, Action{
+//					Type:   method,
+//					Params: map[string]interface{}{"url": param},
+//				})
+//			case ActionTypeClick: //点击
+//				acts = append(acts, Action{
+//					Type:   method,
+//					Params: map[string]interface{}{"selector": param},
+//				})
+//			case ActionTypeOuterHTML: //输出html
+//				acts = append(acts, Action{
+//					Type:   method,
+//					Params: map[string]interface{}{"selector": param},
+//				})
+//			case ActionTypeEvaluate: //执行javascript
+//				acts = append(acts, Action{
+//					Type:   method,
+//					Params: map[string]interface{}{"selector": param},
+//				})
+//			case ActionTypeWaitReady: //等待元素加载完毕
+//				acts = append(acts, Action{
+//					Type:   method,
+//					Params: map[string]interface{}{"selector": param},
+//				})
+//			case ActionTypeWaitVisible: //等待元素可见
+//				acts = append(acts, Action{
+//					Type:   method,
+//					Params: map[string]interface{}{"selector": param},
+//				})
+//			case ActionTypeSleep:
+//				acts = append(acts, Action{
+//					Type:   method,
+//					Params: map[string]interface{}{"duration": qu.Int64All(param)},
+//				})
+//			}
+//
+//		}
+//	}
+//	return
+//}
+//
+//func DownloadByChromedp(chromeCase *ChromedpCase) (resultHtml []string) {
+//	if chromeCase != nil {
+//		//1、设置浏览器
+//		options := []chromedp.ExecAllocatorOption{
+//			chromedp.Flag("headless", false),
+//			chromedp.Flag("disable-blink-features", "AutomationControlled"),                     //headless参数是用来控制Chrome/Chromium是否以无头模式运行的
+//			chromedp.Flag("disable-gpu", true),                                                  //关闭gpu
+//			chromedp.Flag("disable-dev-shm-usage", true),                                        //chromedp禁用系统文件存储/dev/shm
+//			chromedp.Flag("default-browser-check", true),                                        //禁用默认浏览器检查
+//			chromedp.Flag("disable-plugins", true),                                              //禁用插件
+//			chromedp.Flag("ignore-certificate-errors", true),                                    //忽略错误
+//			chromedp.Flag("disable-web-security", true),                                         //禁用网络安全标志
+//			chromedp.Flag("mute-audio", true),                                                   // 关闭声音
+//			chromedp.Flag("accept-language", `zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-TW;q=0.6`), //
+//			//chromedp.Flag("blink-settings", "imageEnable=false"),//禁用页面图片
+//			//chromedp.Flag("user-agent", ""), //客户端的类型和版本号
+//		}
+//		//其他设置
+//		if chromeCase.IsProxy {
+//			proxyAddr := "http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch"
+//			proxyauthor := "Basic amlhbnl1MDAxOjEyM3F3ZSFB"
+//			proxyIp := GetProxyAddr(proxyAddr, proxyauthor)
+//			options = append(options, chromedp.ProxyServer(proxyIp))
+//		}
+//		//for k, v := range chromeCase.Flags {
+//		//	if vv, ok := v.(string); ok {
+//		//		options = append(options, chromedp.Flag(k, vv))
+//		//	} else if vv, ok := v.(bool); ok {
+//		//		options = append(options, chromedp.Flag(k, vv))
+//		//	}
+//		//}
+//		options = append(chromedp.DefaultExecAllocatorOptions[:], options...)
+//		//2、创建上下文
+//		var ctx context.Context
+//		var cancel context.CancelFunc
+//		ctx, cancel = chromedp.NewExecAllocator(context.Background(), options...)
+//		ctx, cancel = chromedp.NewContext(ctx)
+//		if chromeCase.TimeOunt > 0 { //设置页面打开时长
+//			ctx, cancel = context.WithTimeout(ctx, time.Duration(chromeCase.TimeOunt)*time.Second)
+//		} else { //设置默认打开时长
+//			ctx, cancel = context.WithTimeout(ctx, 30*time.Second)
+//		}
+//		defer cancel() //关闭浏览器
+//		//3、执行动作集
+//		act := []chromedp.Action{}
+//		for _, action := range chromeCase.Actions {
+//			switch action.Type {
+//			case ActionTypeNavigate: //打开网页
+//				act = append(act, chromedp.Navigate(action.Params["url"].(string)))
+//			case ActionTypeClick: //点击
+//				act = append(act, chromedp.Click(action.Params["selector"].(string)))
+//			case ActionTypeOuterHTML: //输出html
+//				//act = append(act, chromedp.OuterHTML(action.Params["selector"].(string), nil))
+//				act = append(act, OuterHTMLFunc(action.Params["selector"].(string), &resultHtml))
+//			case ActionTypeEvaluate: //执行javascript
+//				//act = append(act, chromedp.Evaluate(action.Params["selector"].(string), nil))
+//				act = append(act, EvaluateFunc(action.Params["selector"].(string), &resultHtml))
+//			case ActionTypeWaitReady: //等待元素加载完毕
+//				act = append(act, chromedp.WaitReady(action.Params["selector"].(string)))
+//			case ActionTypeWaitVisible: //等待元素可见
+//				act = append(act, chromedp.WaitVisible(action.Params["selector"].(string)))
+//			case ActionTypeSleep:
+//				act = append(act, CdpSleep(action.Params["duration"].(int64)))
+//			}
+//		}
+//		err := chromedp.Run(ctx, act...)
+//		if err != nil {
+//			fmt.Println("Chromedp Run Error :", err)
+//		}
+//
+//		return
+//	}
+//	return
+//}
+//
+//func CdpSleep(sleep int64) chromedp.Action {
+//	if sleep < 1 {
+//		sleep = 1
+//	}
+//	return chromedp.Sleep(time.Duration(sleep) * time.Second)
+//}
+//
+////OuterHTML获取html
+//func OuterHTMLFunc(sel string, result *[]string) chromedp.ActionFunc {
+//	return func(ctx context.Context) (err error) {
+//		var html string
+//		//chromedp.OuterHTML(sel, &html).Do(ctx)
+//		chromedp.OuterHTML(sel, &html, chromedp.ByQuery).Do(ctx)
+//		if html != "" {
+//			*result = append(*result, html)
+//		}
+//		return
+//	}
+//}
+//
+////Evaluate获取js执行结果
+//func EvaluateFunc(sel string, result *[]string) chromedp.ActionFunc {
+//	return func(ctx context.Context) (err error) {
+//		var res string
+//		chromedp.Evaluate(sel, &res).Do(ctx)
+//		if res != "" {
+//			*result = append(*result, res)
+//		}
+//		return
+//	}
+//}

+ 2 - 3
src/spiderutil/clean.go

@@ -8,15 +8,14 @@ package spiderutil
 
 import (
 	"bytes"
+	"github.com/disintegration/imaging"
+	"github.com/donnie4w/go-logger/logger"
 	"image"
 	"image/color"
 	"math"
 	"os"
 	qu "qfw/util"
 	"strings"
-
-	"github.com/disintegration/imaging"
-	"github.com/donnie4w/go-logger/logger"
 )
 
 const (

+ 38 - 36
src/spiderutil/sysconfig.go

@@ -9,42 +9,44 @@ import (
 
 //系统配置
 type config struct {
-	Webport           string                     `json:"webport"`
-	Mongodb_spider    string                     `json:"mongodb_spider"`
-	Spider_dbsize     int                        `json:"spider_dbsize"`
-	Mongodb_editor    string                     `json:"mongodb_editor"`
-	Editor_dbsize     int                        `json:"editor_dbsize"`
-	Mongodb_dbsize    int                        `json:"mongodb_dbsize"`
-	Msgname           string                     `json:"msgname"`
-	Msgserveraddr     string                     `json:"msgserveraddr"`
-	MsgserveraddrFile string                     `json:"msgserveraddrfile"`
-	Editoraddr        string                     `json:"editoraddr"`
-	Tesseractadd      string                     `json:"tesseractadd"`
-	BidEditor         dbInfo                     `json:"bideditor"`
-	Bidding           dbInfo                     `json:"bidding"`
-	Testdir           string                     `json:"testdir"`
-	Uploadevent       int                        `json:"uploadevent"`
-	Redistype         string                     `json:"redistype"`
-	Redisservers      string                     `json:"redisservers"`
-	BloomRedisservers string                     `json:"bloomredisservers"`
-	Redishosts        []string                   `json:"redishosts"`
-	FileServer        string                     `json:"fileServer"`
-	Luadisablelib     map[string]map[string]bool `json:"luadisablelib"`
-	Working           int                        `json:"working"` //0高效模式,1节能模式
-	Chansize          int                        `json:"chansize"`
-	DetailChansize    int                        `json:"detailchansize"` //下载三级页的线程数
-	LogLevel          int                        `json:"logLevel"`       //日志基本1debug 2info 3warn
-	DayNum            int                        `json:"daynum"`
-	Modal             int                        `json:"Modal"`             //1列表页三级页分开采集,0原始采完列表采三级页(7000,7700)
-	IsHistoryEvent    bool                       `json:"ishistoryevent"`    //只有7000为true
-	SiteType          map[string][]string        `json:"sitetype"`          //网站类型
-	SiteColl          string                     `json:"sitecoll"`          //网站表名
-	ThreadBaseNum     int                        `json:"threadbasenum"`     //开启线程的数据基数
-	ThreadUpperLimit  int                        `json:"threadupperlimit"`  //总线程上限
-	RedisClusterAddrs []string                   `json:"redisclusteraddrs"` //redis集群地址
-	ProxyAddr         string                     `json:"proxyaddr"`         //代理地址
-	ProxyAuthor       string                     `json:"proxyauthor"`       //代理作者
-	RenderAddr        string                     `json:"renderaddr"`        //页面渲染服务地址
+	Webport                   string                     `json:"webport"`
+	Mongodb_spider            string                     `json:"mongodb_spider"`
+	Spider_dbsize             int                        `json:"spider_dbsize"`
+	Mongodb_editor            string                     `json:"mongodb_editor"`
+	Editor_dbsize             int                        `json:"editor_dbsize"`
+	Mongodb_dbsize            int                        `json:"mongodb_dbsize"`
+	Msgname                   string                     `json:"msgname"`
+	Msgserveraddr             string                     `json:"msgserveraddr"`
+	MsgserveraddrFile         string                     `json:"msgserveraddrfile"`
+	MsgserveraddrChromedp     string                     `json:"msgserveraddrchromedp"`
+	MsgserveraddrChromedpTest string                     `json:"msgserveraddrchromedptest"`
+	Editoraddr                string                     `json:"editoraddr"`
+	Tesseractadd              string                     `json:"tesseractadd"`
+	BidEditor                 dbInfo                     `json:"bideditor"`
+	Bidding                   dbInfo                     `json:"bidding"`
+	Testdir                   string                     `json:"testdir"`
+	Uploadevent               int                        `json:"uploadevent"`
+	Redistype                 string                     `json:"redistype"`
+	Redisservers              string                     `json:"redisservers"`
+	BloomRedisservers         string                     `json:"bloomredisservers"`
+	Redishosts                []string                   `json:"redishosts"`
+	FileServer                string                     `json:"fileServer"`
+	Luadisablelib             map[string]map[string]bool `json:"luadisablelib"`
+	Working                   int                        `json:"working"` //0高效模式,1节能模式
+	Chansize                  int                        `json:"chansize"`
+	DetailChansize            int                        `json:"detailchansize"` //下载三级页的线程数
+	LogLevel                  int                        `json:"logLevel"`       //日志基本1debug 2info 3warn
+	DayNum                    int                        `json:"daynum"`
+	Modal                     int                        `json:"Modal"`             //1列表页三级页分开采集,0原始采完列表采三级页(7000,7700)
+	IsHistoryEvent            bool                       `json:"ishistoryevent"`    //只有7000为true
+	SiteType                  map[string][]string        `json:"sitetype"`          //网站类型
+	SiteColl                  string                     `json:"sitecoll"`          //网站表名
+	ThreadBaseNum             int                        `json:"threadbasenum"`     //开启线程的数据基数
+	ThreadUpperLimit          int                        `json:"threadupperlimit"`  //总线程上限
+	RedisClusterAddrs         []string                   `json:"redisclusteraddrs"` //redis集群地址
+	ProxyAddr                 string                     `json:"proxyaddr"`         //代理地址
+	ProxyAuthor               string                     `json:"proxyauthor"`       //代理作者
+	RenderAddr                string                     `json:"renderaddr"`        //页面渲染服务地址
 
 	//补漏
 	Checkmaxpage  map[string]int `json:"checkmaxpage"`

+ 2 - 2
src/spiderutil/upload.go

@@ -16,8 +16,8 @@ import (
 )
 
 var weedclClient *weedcl.Client
-var fileTypeReg = regexp.MustCompile("^(xlsx|xls|bmp|mdb|docx|gif|avi|chm|dbx|jpg|mp4|bat|bmp|psd|eml|rtf|mpg|ini|wpd|pwl|flv|doc|pdf|wmv|mid|mxp|qdf|ps|mp3|wav|sql|rar|torrent|png|bmp|jar|pst|css|xml|rmvb|tif|dwg|zip|mov|mf|properties|js|gz|ram|exe)$")
-var commUsedReg = regexp.MustCompile("^(docx|gif|jpg|doc|pdf|rar|png|zip|gz|swf|xlsx|xls)$")
+var fileTypeReg = regexp.MustCompile("^(xlsx|xls|bmp|mdb|docx|gif|avi|chm|dbx|jpg|jpeg|mp4|bat|bmp|psd|eml|rtf|mpg|ini|wpd|pwl|flv|doc|pdf|wmv|mid|mxp|qdf|ps|mp3|wav|sql|rar|torrent|png|bmp|jar|pst|css|xml|rmvb|tif|dwg|zip|mov|mf|properties|js|gz|ram|exe)$")
+var commUsedReg = regexp.MustCompile("^(docx|gif|jpg|jpeg|doc|pdf|rar|png|zip|gz|swf|xlsx|xls)$")
 var filterTypeReg = regexp.MustCompile("^(html|htm|xml|json)$")
 
 func InitWeedcl() {

+ 2 - 0
src/spiderutil/util.go

@@ -107,6 +107,8 @@ func MapToTable(l *lua.LState, obj []interface{}) *lua.LTable {
 				tb.RawSet(lua.LString(k), lua.LString(v))
 			}
 			listtb.Insert((i + 1), tb)
+		} else if tmp, ok := obj[i].(string); ok {
+			listtb.Insert(i, lua.LString(tmp))
 		}
 	}
 	return listtb