瀏覽代碼

工具类更新

mxs 1 年之前
父節點
當前提交
c610b11069

+ 12 - 0
src/spiderutil/clearHtml.go

@@ -20,6 +20,9 @@ type Cut struct {
 	styletag      *regexp.Regexp
 	colstag       *regexp.Regexp
 	rowstag       *regexp.Regexp
+	gttag         *regexp.Regexp
+	lttag         *regexp.Regexp
+	quotag        *regexp.Regexp
 	display       *regexp.Regexp
 	multiCR       *regexp.Regexp
 	replBlankLine *regexp.Regexp
@@ -42,6 +45,9 @@ func NewCut() *Cut {
 	rows, _ := regexp.Compile(`rowspan="\d+"`)
 	border, _ := regexp.Compile(`(border="(\d+)")|(cellpadding="(\d+)")|(cellspacing="(\d+)")`)
 	dis, _ := regexp.Compile(`display:none`)
+	gt := regexp.MustCompile("&gt;")
+	lt := regexp.MustCompile("&lt;")
+	quo := regexp.MustCompile("&quot;")
 	return &Cut{
 		annotate:      at,
 		tag:           t,
@@ -51,6 +57,9 @@ func NewCut() *Cut {
 		colstag:       cols,
 		isborder:      border,
 		rowstag:       rows,
+		gttag:         gt,
+		lttag:         lt,
+		quotag:        quo,
 		display:       dis,
 		multiCR:       m,
 		replBlankLine: regexp.MustCompile("\\s+[\r\n]"),
@@ -148,6 +157,9 @@ func (c *Cut) ClearHtml(src string) string {
 	src = c.replBlankLine.ReplaceAllString(src, "\n")
 	//清除多余换行
 	c.multiCR.ReplaceAllString(src, "\n")
+	src = c.gttag.ReplaceAllString(src, ">")
+	src = c.lttag.ReplaceAllString(src, "<")
+	src = c.quotag.ReplaceAllString(src, `"`)
 	return strings.Replace(src, "\n", "<br/>", -1)
 }
 

+ 0 - 142
src/spiderutil/nats.go

@@ -1,142 +0,0 @@
-package spiderutil
-
-import (
-	"context"
-	"github.com/gogf/gf/v2/encoding/gcompress"
-	"github.com/gogf/gf/v2/frame/g"
-	"github.com/nats-io/nats.go"
-	"time"
-)
-
-/*
-Messages published while there is no consumer are dropped.
-1. Use the reply mechanism and let the application handle it.
-2. A subscription must exist first, and the subscriber must not disconnect.
-*/
-
-// Jnats bundles a NATS server address with the connection opened to it.
-type Jnats struct {
-	Addr string // NATS server address
-	Nc   *nats.Conn
-}
-
-// NewJnats creates a Jnats for addr and calls ReConnect before returning it.
-// NOTE(review): ReConnect keeps retrying while nats.Connect fails, so this
-// constructor appears to block while the server is unreachable — confirm.
-func NewJnats(addr string) *Jnats {
-	js := &Jnats{
-		Addr: addr,
-	}
-	js.ReConnect()
-	return js
-}
-
-// ReConnect connects, configures and retries: it makes sure j.Nc is a
-// connected NATS connection and returns true once it is.
-//   - Nc nil or closed: dial j.Addr with the options below; on failure log,
-//     sleep one second and retry recursively.
-//   - Nc connected: return true.
-//   - any other state: flush, drain and close Nc, sleep one second and retry.
-//
-// NOTE(review): every branch returns or recurses, so the trailing
-// "return false" looks unreachable — confirm.
-// NOTE(review): Error is called with printf-style format strings; if it does
-// not format its arguments, "%v" is logged literally — check whether Errorf
-// was intended.
-func (j *Jnats) ReConnect() bool {
-	var err error
-	opts := []nats.Option{
-		//nats.Name(c.Name), sets the client name
-		// nats.SetCustomDialer(n),
-		nats.MaxReconnects(86400),
-		nats.ReconnectWait(time.Second), // default is two seconds
-		nats.ReconnectBufSize(83886080), // message buffer while the link is busy, default 8M (83886080 bytes = 80 MiB here)
-		nats.ErrorHandler(func(_ *nats.Conn, _ *nats.Subscription, err error) {
-			g.Log().Error(context.Background(), "NATS error: %v", err)
-		}),
-		nats.DisconnectHandler(func(c *nats.Conn) {
-			g.Log().Info(context.Background(), "Disconnected from NATS")
-		}),
-		nats.ClosedHandler(func(c *nats.Conn) {
-			g.Log().Info(context.Background(), "NATS connection is closed")
-		}),
-	}
-	if j.Nc == nil || j.Nc.IsClosed() {
-		j.Nc, err = nats.Connect(j.Addr, opts...)
-		if err != nil {
-			g.Log().Error(context.Background(), "NATS connect error: %v", err)
-			time.Sleep(time.Second)
-			return j.ReConnect()
-		} else {
-			return true
-		}
-	} else if j.Nc.IsConnected() { // connected
-		return true
-	} else { // abnormal state
-		j.Nc.Flush()
-		j.Nc.Drain()
-		j.Nc.Close()
-		time.Sleep(time.Second)
-		return j.ReConnect()
-	}
-	return false
-}
-
-// Pub publishes a message: msg is converted to bytes with
-// g.NewVar(msg).Bytes() and published on subject sub.
-// It returns the error from Nc.Publish.
-func (j *Jnats) Pub(sub string, msg any) error {
-	return j.Nc.Publish(sub, g.NewVar(msg).Bytes())
-}
-
-// PubZip publishes a compressed message: msg is converted to bytes,
-// compressed with gcompress.Zlib and published on subject sub.
-// It returns the compression error, or otherwise the error from Nc.Publish.
-func (j *Jnats) PubZip(sub string, msg any) error {
-	res, err := gcompress.Zlib(g.NewVar(msg).Bytes())
-	if err != nil {
-		return err
-	} else {
-		return j.Nc.Publish(sub, res)
-	}
-}
-
-// Sub consumes messages directly: it subscribes handle to subject sub.
-// A subscribe error is only logged, and the returned subscription is
-// discarded, so callers cannot unsubscribe through this method.
-func (j *Jnats) Sub(sub string, handle func(msg *nats.Msg)) {
-	_, err := j.Nc.Subscribe(sub, handle)
-	if err != nil {
-		g.Log().Error(context.Background(), err)
-	}
-}
-
-// SubZip consumes compressed messages directly: it subscribes to sub and
-// replaces each non-empty payload with its gcompress.UnZlib result before
-// calling handle. If decompression fails the error is logged and handle
-// still receives the message with its original data.
-// A subscribe error is only logged.
-func (j *Jnats) SubZip(sub string, handle func(msg *nats.Msg)) {
-	_, err := j.Nc.Subscribe(sub, func(msg *nats.Msg) {
-		v := msg.Data
-		if len(v) > 0 {
-			res, err := gcompress.UnZlib(v)
-			if err != nil {
-				g.Log().Error(context.Background(), "NATS gcompress error: %v", err, msg)
-			} else {
-				msg.Data = res
-			}
-		}
-		handle(msg)
-	})
-	if err != nil {
-		g.Log().Error(context.Background(), err)
-	}
-}
-
-// QueueSubZip consumes compressed messages as a load-balanced queue group:
-// it calls Nc.QueueSubscribe with subject sub and group queue, and replaces
-// each non-empty payload with its gcompress.UnZlib result before calling
-// handle. If decompression fails the error is logged and handle still
-// receives the message with its original data.
-// A subscribe error is only logged.
-func (j *Jnats) QueueSubZip(sub, queue string, handle func(msg *nats.Msg)) {
-	_, err := j.Nc.QueueSubscribe(sub, queue, func(msg *nats.Msg) {
-		v := msg.Data
-		if len(v) > 0 {
-			res, err := gcompress.UnZlib(v)
-			if err != nil {
-				g.Log().Error(context.Background(), "NATS gcompress error: %v", err, msg)
-			} else {
-				msg.Data = res
-			}
-		}
-		handle(msg)
-	})
-	if err != nil {
-		g.Log().Error(context.Background(), err)
-	}
-}
-
-// Queue messages are not used for now.
-
-// Reply mechanism.
-// PubReqZip publishes a compressed message and requests a reply: msg is
-// converted to bytes, compressed with gcompress.Zlib and passed to
-// Nc.Request together with sub and timeout. The reply message is returned
-// as received; it is not decompressed here.
-func (j *Jnats) PubReqZip(sub string, msg any, timeout time.Duration) (*nats.Msg, error) {
-	res, err := gcompress.Zlib(g.NewVar(msg).Bytes())
-	if err != nil {
-		return nil, err
-	} else {
-		return j.Nc.Request(sub, res, timeout)
-	}
-}

+ 0 - 55
src/spiderutil/pythonserver.go

@@ -8,12 +8,8 @@ package spiderutil
 
 import (
 	"encoding/json"
-	"fmt"
 	"github.com/imroc/req"
-	"io"
-	"net/http"
 	qu "qfw/util"
-	"strings"
 	"time"
 )
 
@@ -62,54 +58,3 @@ func DownloadByRender(href string) (contenthtml string) {
 	}
 	return
 }
-
-// DownloadByDataIntercept POSTs a JSON request to the address built by
-// fmt.Sprintf(Config.PwAddr, url) and returns the "headers" object found in
-// the response's "r" field when the response's "msg" is "success".
-//
-// The JSON body carries request_params (user agent, timeout, wait_until and,
-// when proxy is true, a proxy address from GetProxyAddr) and
-// intercept_params (url_regex, url_regexes, event "request").
-// The request is attempted up to three times. An empty map is returned when
-// the request cannot be built or no attempt yields a successful result.
-//
-// NOTE(review): when "r" is present but "headers" is missing or not an
-// object, the nil map from the failed type assertion is returned rather
-// than an empty map.
-// NOTE(review): the same request (and its strings.Reader body) is reused for
-// every attempt, and resp.Body.Close is deferred inside the loop, so bodies
-// stay open until the function returns — confirm both are acceptable.
-func DownloadByDataIntercept(url, url_regex string, timeout int, proxy bool) map[string]interface{} {
-	defer qu.Catch()
-	request_params := map[string]interface{}{
-		"user_agent": "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.0 Safari/532.2",
-		"timeout":    timeout,
-		"wait_until": "networkidle",
-	}
-	if proxy {
-		request_params["proxy"] = GetProxyAddr(Config.ProxyAddr, Config.ProxyAuthor)
-	}
-	intercept_params := map[string]interface{}{
-		"url_regex":   url_regex,
-		"url_regexes": []string{url_regex},
-		"event":       "request",
-	}
-	param := map[string]interface{}{
-		"request_params":   request_params,
-		"intercept_params": intercept_params,
-	}
-	param_json, _ := json.Marshal(param)
-	var data = strings.NewReader(string(param_json))
-	client := &http.Client{}
-	req, err := http.NewRequest("POST", fmt.Sprintf(Config.PwAddr, url), data)
-	if err != nil {
-		return map[string]interface{}{}
-	}
-	req.Header.Set("accept", "application/json")
-	req.Header.Set("Content-Type", "application/json")
-	for i := 1; i <= 3; i++ {
-		resp, err := client.Do(req)
-		if err != nil {
-			continue
-		}
-		defer resp.Body.Close()
-		bodyText, err := io.ReadAll(resp.Body)
-		if err != nil {
-			continue
-		}
-		result := map[string]interface{}{}
-		if json.Unmarshal(bodyText, &result) == nil {
-			if msg, ok := result["msg"].(string); ok && msg == "success" {
-				if r, ok := result["r"].(map[string]interface{}); ok && r != nil {
-					headers, _ := r["headers"].(map[string]interface{})
-					return headers
-				}
-			}
-		}
-	}
-	return map[string]interface{}{}
-}

+ 41 - 0
src/spiderutil/sortmapintval.go

@@ -0,0 +1,41 @@
+package spiderutil
+
+import "sort"
+
+// IntValSorter holds the keys and int values of a map[string]int as two
+// parallel slices: Keys[i] and Vals[i] belong to the same map entry.
+// Its Len, Less and Swap methods let sort.Sort order the pairs by value.
+type IntValSorter struct {
+	Keys []string
+	Vals []int
+}
+
+// MapIntValueSort copies the entries of m into a new IntValSorter, sorts
+// them by value in ascending order and returns the sorter.
+// The relative order of entries with equal values is unspecified: map
+// iteration order is random and sort.Sort is not stable.
+func MapIntValueSort(m map[string]int) *IntValSorter {
+	ivs := NewIntValSorter(m)
+	ivs.Sort()
+	return ivs
+}
+
+// NewIntValSorter builds an unsorted IntValSorter from m. Both slices are
+// allocated with capacity len(m) and filled in map iteration order, so
+// Keys[i] and Vals[i] always describe the same entry.
+func NewIntValSorter(m map[string]int) *IntValSorter {
+	ivs := &IntValSorter{
+		Keys: make([]string, 0, len(m)),
+		Vals: make([]int, 0, len(m)),
+	}
+	for k, v := range m {
+		ivs.Keys = append(ivs.Keys, k)
+		ivs.Vals = append(ivs.Vals, v)
+	}
+	return ivs
+}
+
+// Sort reorders the key/value pairs in place by ascending value via sort.Sort.
+func (ivs *IntValSorter) Sort() {
+	sort.Sort(ivs)
+}
+
+// Len returns the number of stored values.
+func (ivs *IntValSorter) Len() int {
+	return len(ivs.Vals)
+}
+// Less reports whether the value at index i is smaller than the value at index j.
+func (ivs *IntValSorter) Less(i, j int) bool {
+	return ivs.Vals[i] < ivs.Vals[j]
+}
+// Swap exchanges entries i and j in both Vals and Keys so that each key
+// stays paired with its value.
+func (ivs *IntValSorter) Swap(i, j int) {
+	ivs.Vals[i], ivs.Vals[j] = ivs.Vals[j], ivs.Vals[i]
+	ivs.Keys[i], ivs.Keys[j] = ivs.Keys[j], ivs.Keys[i]
+}

+ 0 - 0
src/spiderutil/sortmapval.go → src/spiderutil/sortmapstrval.go


+ 2 - 1
src/spiderutil/sysconfig.go

@@ -41,6 +41,8 @@ type config struct {
 	PageTurnInfo              pageTurnInfo               `json:"pageturninfo"`      //翻页相关配置
 	IsHistoryEvent            bool                       `json:"ishistoryevent"`    //只有7000为true
 	SiteType                  map[string][]string        `json:"sitetype"`          //网站类型
+	SiteType2                 map[string][]string        `json:"sitetype2"`         //网站类型
+	PlatType                  map[string]bool            `json:"plattype"`          //平台类型
 	SiteColl                  string                     `json:"sitecoll"`          //网站表名
 	ThreadBaseNum             int                        `json:"threadbasenum"`     //开启线程的数据基数
 	ThreadUpperLimit          int                        `json:"threadupperlimit"`  //总线程上限
@@ -48,7 +50,6 @@ type config struct {
 	ProxyAddr                 string                     `json:"proxyaddr"`         //代理地址
 	ProxyAuthor               string                     `json:"proxyauthor"`       //代理作者
 	RenderAddr                string                     `json:"renderaddr"`        //页面渲染服务地址
-	PwAddr                    string                     `json:"pwaddr"`            //瑞数采集服务地址
 	//补漏
 	Checkmaxpage  map[string]int `json:"checkmaxpage"`
 	CheckHourTime int            `json:"checkhourtime"`

+ 1 - 1
src/spiderutil/template.go

@@ -114,7 +114,7 @@ function downloadDetailPage(data)
 			return data
 		else
 			timeSleep(60)--延时60秒再次请求
-			if i==5 then
+			if i==3 then
 				--print("下载失败")
 				saveErrLog(spiderCode,spiderName,data["href"],err)
 			end

+ 19 - 2
src/spiderutil/util.go

@@ -17,11 +17,15 @@ import (
 	"os"
 	"qfw/util"
 	"regexp"
+	"strings"
 	"time"
 )
 
-var Reg = regexp.MustCompile("[^0-9A-Za-z\u4e00-\u9fa5]+")
-var Filter = regexp.MustCompile("<[^>]*?>|[\\s\u3000\u2003\u00a0]")
+var (
+	// Reg matches runs of characters that are not ASCII digits, ASCII letters
+	// or in the range U+4E00..U+9FA5.
+	Reg      = regexp.MustCompile("[^0-9A-Za-z\u4e00-\u9fa5]+")
+	// Filter matches an HTML tag (<...>) or a single whitespace character,
+	// including U+3000, U+2003 and U+00A0.
+	Filter   = regexp.MustCompile("<[^>]*?>|[\\s\u3000\u2003\u00a0]")
+	// InputReg matches an <input ...> tag up to the closing quote of its
+	// value="..." attribute; capture group 1 is the value text.
+	InputReg = regexp.MustCompile(`<input[^>]*?\svalue="([^"]*?)"`)
+)
 
 //time.AfterFunc 加锁
 
@@ -213,6 +217,19 @@ func HexTextByte(text []byte) string {
 }
 
+// Sha returns the lowercase hex SHA-256 digest of con after normalization.
+// Normalization: each fragment matched by InputReg is replaced by its value
+// attribute text, then everything matched by Filter and by Reg is removed.
+// NOTE(review): InputReg's match ends at the closing quote of value, so any
+// attributes after value and the closing '>' stay in con, and their letters
+// and digits survive the Filter/Reg passes — confirm this is intended.
 func Sha(con string) string {
+	h := sha256.New()
+	// Collect every <input ... value="..."> match together with its value capture.
+	matches := InputReg.FindAllStringSubmatch(con, -1)
+	// Replace the first remaining occurrence of each matched fragment with its value text.
+	for _, match := range matches {
+		con = strings.Replace(con, match[0], match[1], 1)
+	}
+	con = Reg.ReplaceAllString(Filter.ReplaceAllString(con, ""), "")
+	h.Write([]byte(con))
+	return fmt.Sprintf("%x", h.Sum(nil))
+}
+
+func Sha_back(con string) string {
 	h := sha256.New()
 	con = Reg.ReplaceAllString(Filter.ReplaceAllString(con, ""), "")
 	h.Write([]byte(con))