123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345 |
- package main
- import (
- "bytes"
- "encoding/json"
- "flag"
- "fmt"
- "io"
- "io/ioutil"
- "log"
- "net/http"
- "os"
- "regexp"
- "sort"
- "strconv"
- "strings"
- "sync"
- "time"
- "github.com/cespare/xxhash/v2"
- "golang.org/x/net/proxy"
- )
// Attach describes one attachment record decoded from a line of the data file.
type Attach struct {
	// Department is read from the "department" JSON key.
	Department string `json:"department"`
	// Href is read from the "attach_href" JSON key.
	Href string `json:"attach_href"`
	// Title is read from the "info_title" JSON key.
	Title string `json:"info_title"`
	// Raw carries no JSON tag.
	Raw string
}

// Attaches is a list of attachment records.
type Attaches []*Attach
- var (
- df = flag.String("df", "./data.dat", "datafile 数据文件")
- save2dir = flag.String("d", "./attaches", "save to dir 保存到目录")
- areadir = flag.String("ad", "河南/郑州市/政府", "地域目录")
- defaultYear = flag.String("y", "", "year 默认年份,注意:只有整个下载清单是同一个年份才可以指定")
- threads = flag.Int("ts", 4, "threads 线程数")
- sleep = flag.Int64("s", 0, "sleep 下载间歇")
- filter = flag.String("ft", "true", "filter title 是否过滤标题,标题必须包含年份、预算或者预决算")
- defaultExt = flag.String("de", "pdf", "default ext默认扩展名")
- headFile = flag.String("hf", "", "headfile 下载请求头")
- logFile = flag.String("lf", "./error.dat", "log file日志文件")
- proxyStr = flag.String("p", "", "proxy 代理,使用代理时,只能开一个线程")
- connectTimeout = flag.Int64("cto", 30, "超时设定")
- defaultYearInt int
- fileIndex = map[string]int{}
- allowFileExt = map[string]bool{
- "doc": true,
- "docx": true,
- "xls": true,
- "xlsx": true,
- "pdf": true,
- "zip": true,
- "rar": true,
- "txt": true,
- "png": true,
- "jpg": true,
- "jpeg": true,
- "ppt": true,
- "pptx": true,
- "wps": true,
- }
- reg, _ = regexp.Compile("(\\d{4})")
- reg4Title1, _ = regexp.Compile("(202\\d+)年度?(.*?)((部门)*(预|决)算)")
- reg4Title2, _ = regexp.Compile("(.*?)(202\\d+)年度?((部门)*(预|决)算)")
- downloadLog *SpiderLog
- header = map[string]string{}
- fnLock = new(sync.RWMutex)
- proxyUri []string
- )
- // init
- func init() {
- flag.Parse()
- var err error
- defaultYearInt, _ = strconv.Atoi(*defaultYear)
- //TODO 目录检查并创建目录 3年的
- for _, year := range []int{2022, 2023, 2024} {
- path := fmt.Sprintf("%s/%d年/%s", *save2dir, year, *areadir)
- if _, err = os.Stat(path); err != nil {
- os.MkdirAll(path, os.ModeAppend|os.ModePerm)
- }
- }
- if *headFile != "" {
- bs, err := ioutil.ReadFile(*headFile)
- if err == nil {
- content := string(bs)
- for _, l := range strings.Split(content, "\n") {
- if strings.HasPrefix(l, "#") {
- continue
- }
- pos := strings.Index(l, ":")
- if pos > 0 {
- key := l[:pos]
- value := l[pos+1:]
- header[key] = value
- }
- }
- }
- }
- log.Println(header)
- if *logFile != "" {
- downloadLog, err = NewSpiderLog(*logFile)
- if err != nil {
- log.Fatal(err)
- }
- }
- proxyUri = strings.Split(*proxyStr, ";")
- }
- // id
- func id(text string) uint64 {
- has := xxhash.New()
- has.WriteString(text)
- return has.Sum64()
- }
// changeIp sends a GET request for http://pleasechangemyip.com through
// client, discards the response body and closes it.
//
// NOTE(review): presumably the upstream proxy intercepts this host name to
// rotate its exit address - confirm with the proxy provider.
//
// The call is best-effort: a failure is logged and otherwise ignored.
func changeIp(client *http.Client) {
	resp, err := client.Get("http://pleasechangemyip.com")
	if err != nil {
		log.Println("changeIp:", err)
		return
	}
	defer resp.Body.Close()

	// Drain the body without buffering it in memory.
	if _, err := io.Copy(io.Discard, resp.Body); err != nil {
		log.Println("changeIp: reading response:", err)
	}
	//time.Sleep(1 * time.Second)
}
- // httpClient
- func httpClient(index int) *http.Client {
- if *proxyStr != "" {
- pos := index % len(proxyUri)
- dialer, err := proxy.SOCKS5("tcp", proxyUri[pos], nil, proxy.Direct)
- if err != nil {
- log.Println(err.Error())
- }
- // setup a http client
- httpTransport := &http.Transport{}
- httpTransport.Dial = dialer.Dial
- httpClient := &http.Client{Transport: httpTransport, Timeout: time.Duration(*connectTimeout) * time.Second}
- changeIp(httpClient)
- return httpClient
- } else {
- return &http.Client{Timeout: time.Duration(*connectTimeout) * time.Second}
- }
- }
- // readDataFile
- func readDataFile() Attaches {
- ret := make(Attaches, 0, 0)
- bs, err := ioutil.ReadFile(*df)
- if err != nil {
- log.Fatal(err)
- }
- content := string(bs)
- for _, s := range strings.Split(content, "\n") {
- if len(s) == 0 {
- continue
- }
- var data string = s
- if strings.HasPrefix(s, ",") {
- data = s[1:]
- }
- var attach = new(Attach)
- err = json.Unmarshal([]byte(data), attach)
- if err == nil {
- ret = append(ret, attach)
- attach.Raw = s
- }
- }
- return ret
- }
// getExt returns whatever follows the last "." in text, or "" when text
// contains no "." at all.
func getExt(text string) string {
	dot := strings.LastIndex(text, ".")
	if dot < 0 {
		return ""
	}
	return text[dot+1:]
}
- // 分解title
- func extractTitle(organ, title, href string) (year int, department, ext string) {
- if v := strings.ToLower(getExt(title)); allowFileExt[v] {
- ext = v
- } else if v := getExt(href); allowFileExt[v] {
- ext = v
- }
- if ext == "" {
- ext = *defaultExt
- }
- if *filter == "true" {
- check := strings.Contains(organ, "预算") || strings.Contains(organ, "预决算")
- if !check && !strings.Contains(title, "预算") && !strings.Contains(title, "预决算") {
- log.Print("失败(文件不包含预算|决算)", organ, title)
- return
- }
- }
- var matchObj []string
- if reg.MatchString(title) {
- matchObj = reg.FindAllString(title, -1)
- } else if reg.MatchString(organ) {
- matchObj = reg.FindAllString(organ, -1)
- }
- //年排序
- if matchObj != nil && len(matchObj) > 0 {
- yearArr := make([]int, len(matchObj))
- for i, v := range matchObj {
- yearArr[i], _ = strconv.Atoi(v)
- }
- sort.Slice(yearArr, func(i, j int) bool { return yearArr[i] > yearArr[j] })
- year = yearArr[0]
- }
- if department == "" {
- if len(organ) > len(title) {
- department = organ
- } else {
- department = title
- }
- }
- return
- }
- // 拼装文件名
- func createFileName(organ, title, href string) (year int, filename string) {
- fnLock.Lock()
- defer fnLock.Unlock()
- year, department, ext := extractTitle(organ, title, href)
- if year < 2022 || year > 2024 {
- log.Println("无法生成有效的文件名", organ, title, href)
- return 0, ""
- }
- //修正文件名,去掉特殊符号
- department = strings.Map(func(r rune) rune {
- switch r {
- case '\n', '.', '-', '?', '=', '&', '*', '#', '$', '%', '^', '(', ')', '|':
- return -1
- default:
- return r
- }
- }, department)
- key := fmt.Sprintf("%d_%s", year, department)
- if v, ok := fileIndex[key]; ok {
- fileIndex[key] = v + 1
- } else {
- fileIndex[key] = 1
- }
- filename = fmt.Sprintf("%d_%s_%d.%s",
- year, department, fileIndex[key], ext)
- return year, filename
- }
- // download
- func download(attach *Attach, index int) {
- //生成本地文件
- year, newFileName := createFileName(attach.Department, attach.Title, attach.Href)
- if newFileName == "" {
- log.Println("失败(无法生成文件名)", attach.Title, attach.Href)
- downloadLog.Log(attach)
- return
- }
- log.Println("dowload and save to ", attach.Title, attach.Department, newFileName)
- //TODO 下载文件
- req, err := http.NewRequest("GET", attach.Href, nil)
- if err != nil {
- log.Println("失败(请求头)", attach.Title, attach.Href)
- downloadLog.Log(attach)
- return
- }
- for k, v := range header {
- req.Header.Set(k, v)
- }
- client := httpClient(index)
- resp, err := client.Do(req)
- if err != nil {
- log.Println("失败(网络问题)", attach.Title, attach.Href)
- downloadLog.Log(attach)
- return
- }
- buf := bytes.NewBuffer(nil)
- _, err = io.Copy(buf, resp.Body)
- if err != nil {
- log.Println("失败(网络问题)", attach.Title, attach.Href)
- downloadLog.Log(attach)
- return
- }
- resp.Body.Close()
- //TODO 保存到目录
- outputPaht := fmt.Sprintf("%s/%d年/%s/%s", *save2dir, year, *areadir, newFileName)
- //log.Println(outputPaht)
- err = os.WriteFile(outputPaht, buf.Bytes(), os.ModeAppend|os.ModePerm)
- if err != nil {
- log.Println("失败(写入文件)", attach.Title, attach.Href)
- downloadLog.Log(attach)
- return
- }
- if *sleep > 0 {
- time.Sleep(time.Duration(*sleep) * time.Second)
- }
- }
- // main
- func main() {
- checkHrefRepeat := map[uint64]bool{}
- lock := make(chan bool, *threads)
- wg := new(sync.WaitGroup)
- fn := func(attach *Attach, index int) {
- defer func() {
- <-lock
- wg.Done()
- }()
- download(attach, index)
- }
- attches := readDataFile()
- attchesLen := len(attches)
- for index, attach := range attches {
- _id := id(attach.Href)
- if _, ok := checkHrefRepeat[_id]; ok {
- log.Println("失败(URL地址重复)", attach.Title, attach.Href)
- continue
- }
- checkHrefRepeat[_id] = true
- lock <- true
- wg.Add(1)
- go fn(attach, index)
- log.Printf("当前下载进度%.2f%%\n", float32(index)/float32(attchesLen)*100)
- }
- wg.Wait()
- log.Println("下载完成,请关闭本窗口")
- close(lock)
- time.Sleep(20 * time.Second)
- }
|