package main

import (
	"bytes"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/cespare/xxhash/v2"
	"golang.org/x/net/proxy"
)

type (
	// Attach is one downloadable attachment (附件) from the crawl list.
	Attach struct {
		Department string `json:"department"`
		Href       string `json:"attach_href"`
		Title      string `json:"info_title"`
		Raw        string
	}
	Attaches []*Attach
)

var (
	df             = flag.String("df", "./data.dat", "datafile 数据文件")
	save2dir       = flag.String("d", "./attaches", "save to dir 保存到目录")
	areadir        = flag.String("ad", "河南/郑州市/政府", "地域目录")
	defaultYear    = flag.String("y", "", "year 默认年份,注意:只有整个下载清单是同一个年份才可以指定")
	threads        = flag.Int("ts", 4, "threads 线程数")
	sleep          = flag.Int64("s", 0, "sleep 下载间歇")
	filter         = flag.String("ft", "true", "filter title 是否过滤标题,标题必须包含年份、预算或者预决算")
	defaultExt     = flag.String("de", "pdf", "default ext 默认扩展名")
	headFile       = flag.String("hf", "", "headfile 下载请求头")
	logFile        = flag.String("lf", "./error.dat", "log file 日志文件")
	proxyStr       = flag.String("p", "", "proxy 代理,使用代理时,只能开一个线程")
	connectTimeout = flag.Int64("cto", 30, "超时设定")

	defaultYearInt int
	fileIndex      = map[string]int{}
	allowFileExt   = map[string]bool{
		"doc": true, "docx": true, "xls": true, "xlsx": true, "pdf": true,
		"zip": true, "rar": true, "txt": true, "png": true, "jpg": true,
		"jpeg": true, "ppt": true, "pptx": true, "wps": true,
	}

	// reg extracts any four-digit year; reg4Title1/reg4Title2 describe
	// "<year>年[度]<department>(部门)预算/决算" titles but are not used yet.
	reg        = regexp.MustCompile(`(\d{4})`)
	reg4Title1 = regexp.MustCompile(`(202\d+)年度?(.*?)((部门)*(预|决)算)`)
	reg4Title2 = regexp.MustCompile(`(.*?)(202\d+)年度?((部门)*(预|决)算)`)

	downloadLog *SpiderLog
	header      = map[string]string{}
	fnLock      = new(sync.RWMutex)
	proxyUri    []string
)

// init parses the flags, creates the per-year output directories, loads the
// optional request-header file and opens the error log.
func init() {
	flag.Parse()
	var err error
	defaultYearInt, _ = strconv.Atoi(*defaultYear)

	// create the output directories for the three supported years
	for _, year := range []int{2022, 2023, 2024} {
		path := fmt.Sprintf("%s/%d年/%s", *save2dir, year, *areadir)
		if _, err = os.Stat(path); err != nil {
			os.MkdirAll(path, os.ModePerm)
		}
	}

	// optional header file: one "Key: Value" per line, "#" starts a comment line
	if *headFile != "" {
		bs, err := os.ReadFile(*headFile)
		if err == nil {
			for _, l := range strings.Split(string(bs), "\n") {
				if strings.HasPrefix(l, "#") {
					continue
				}
				pos := strings.Index(l, ":")
				if pos > 0 {
					key := l[:pos]
					value := strings.TrimSpace(l[pos+1:])
					header[key] = value
				}
			}
		}
	}
	log.Println(header)

	if *logFile != "" {
		downloadLog, err = NewSpiderLog(*logFile)
		if err != nil {
			log.Fatal(err)
		}
	}
	proxyUri = strings.Split(*proxyStr, ";")
}

// id hashes text with xxhash; used to de-duplicate download URLs.
func id(text string) uint64 {
	h := xxhash.New()
	h.WriteString(text)
	return h.Sum64()
}

// changeIp requests a trigger URL, presumably so the upstream proxy rotates
// its exit IP before the next download.
func changeIp(client *http.Client) {
	resp, err := client.Get("http://pleasechangemyip.com")
	if err == nil {
		io.ReadAll(resp.Body)
		resp.Body.Close()
	}
	//time.Sleep(1 * time.Second)
}

// httpClient builds an HTTP client; when -p is set the client is routed
// through one of the configured SOCKS5 proxies, chosen by worker index.
func httpClient(index int) *http.Client {
	if *proxyStr == "" {
		return &http.Client{Timeout: time.Duration(*connectTimeout) * time.Second}
	}
	pos := index % len(proxyUri)
	dialer, err := proxy.SOCKS5("tcp", proxyUri[pos], nil, proxy.Direct)
	if err != nil {
		// fall back to a direct client instead of dereferencing a nil dialer
		log.Println(err.Error())
		return &http.Client{Timeout: time.Duration(*connectTimeout) * time.Second}
	}
	httpTransport := &http.Transport{Dial: dialer.Dial}
	client := &http.Client{Transport: httpTransport, Timeout: time.Duration(*connectTimeout) * time.Second}
	changeIp(client)
	return client
}

// readDataFile loads the data file: one JSON-encoded Attach per line,
// optionally prefixed with a leading comma.
func readDataFile() Attaches {
	ret := make(Attaches, 0)
	bs, err := os.ReadFile(*df)
	if err != nil {
		log.Fatal(err)
	}
	for _, s := range strings.Split(string(bs), "\n") {
		if len(s) == 0 {
			continue
		}
		data := s
		if strings.HasPrefix(s, ",") {
			data = s[1:]
		}
		attach := new(Attach)
		if err = json.Unmarshal([]byte(data), attach); err == nil {
			attach.Raw = s
			ret = append(ret, attach)
		}
	}
	return ret
}
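// An illustrative data.dat line as readDataFile expects it (the field names
// come from the Attach struct tags; the department, URL and title below are
// made-up examples, not real entries):
//
//	{"department":"某某市财政局","attach_href":"http://example.com/files/a.pdf","info_title":"2023年部门预算"}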
// getExt returns the part after the last "." in text, or "" if there is no dot.
func getExt(text string) string {
	if !strings.Contains(text, ".") {
		return ""
	}
	tmp := strings.Split(text, ".")
	return tmp[len(tmp)-1]
}

// extractTitle derives the year, department and file extension from the
// organ name, the attachment title and the download URL.
func extractTitle(organ, title, href string) (year int, department, ext string) {
	// prefer the extension from the title, then from the URL, then the default
	if v := strings.ToLower(getExt(title)); allowFileExt[v] {
		ext = v
	} else if v := getExt(href); allowFileExt[v] {
		ext = v
	}
	if ext == "" {
		ext = *defaultExt
	}

	// optional filter: the organ or title must mention 预算/预决算
	if *filter == "true" {
		check := strings.Contains(organ, "预算") || strings.Contains(organ, "预决算")
		if !check && !strings.Contains(title, "预算") && !strings.Contains(title, "预决算") {
			log.Print("失败(文件不包含预算|决算)", organ, title)
			return
		}
	}

	// collect all four-digit years from the title (or the organ) and keep the latest
	var matchObj []string
	if reg.MatchString(title) {
		matchObj = reg.FindAllString(title, -1)
	} else if reg.MatchString(organ) {
		matchObj = reg.FindAllString(organ, -1)
	}
	if len(matchObj) > 0 {
		yearArr := make([]int, len(matchObj))
		for i, v := range matchObj {
			yearArr[i], _ = strconv.Atoi(v)
		}
		sort.Slice(yearArr, func(i, j int) bool { return yearArr[i] > yearArr[j] })
		year = yearArr[0]
	}
	// fall back to the -y default year when no year could be extracted
	if year == 0 {
		year = defaultYearInt
	}

	// use the longer of organ/title as the department part of the file name
	if department == "" {
		if len(organ) > len(title) {
			department = organ
		} else {
			department = title
		}
	}
	return
}

// createFileName builds the local file name "<year>_<department>_<n>.<ext>".
// The per-key counter in fileIndex keeps the names unique, so the whole
// function runs under fnLock.
func createFileName(organ, title, href string) (year int, filename string) {
	fnLock.Lock()
	defer fnLock.Unlock()

	year, department, ext := extractTitle(organ, title, href)
	if year < 2022 || year > 2024 {
		log.Println("无法生成有效的文件名", organ, title, href)
		return 0, ""
	}

	// strip characters that are not safe in file names
	department = strings.Map(func(r rune) rune {
		switch r {
		case '\n', '.', '-', '?', '=', '&', '*', '#', '$', '%', '^', '(', ')', '|':
			return -1
		default:
			return r
		}
	}, department)

	key := fmt.Sprintf("%d_%s", year, department)
	fileIndex[key]++
	filename = fmt.Sprintf("%d_%s_%d.%s", year, department, fileIndex[key], ext)
	return year, filename
}

// download fetches one attachment and writes it into the per-year directory.
func download(attach *Attach, index int) {
	// build the local file name
	year, newFileName := createFileName(attach.Department, attach.Title, attach.Href)
	if newFileName == "" {
		log.Println("失败(无法生成文件名)", attach.Title, attach.Href)
		downloadLog.Log(attach)
		return
	}
	log.Println("download and save to ", attach.Title, attach.Department, newFileName)

	// download the file
	req, err := http.NewRequest("GET", attach.Href, nil)
	if err != nil {
		log.Println("失败(请求头)", attach.Title, attach.Href)
		downloadLog.Log(attach)
		return
	}
	for k, v := range header {
		req.Header.Set(k, v)
	}
	client := httpClient(index)
	resp, err := client.Do(req)
	if err != nil {
		log.Println("失败(网络问题)", attach.Title, attach.Href)
		downloadLog.Log(attach)
		return
	}
	defer resp.Body.Close()

	buf := bytes.NewBuffer(nil)
	if _, err = io.Copy(buf, resp.Body); err != nil {
		log.Println("失败(网络问题)", attach.Title, attach.Href)
		downloadLog.Log(attach)
		return
	}

	// save to the target directory
	outputPath := fmt.Sprintf("%s/%d年/%s/%s", *save2dir, year, *areadir, newFileName)
	//log.Println(outputPath)
	err = os.WriteFile(outputPath, buf.Bytes(), os.ModePerm)
	if err != nil {
		log.Println("失败(写入文件)", attach.Title, attach.Href)
		downloadLog.Log(attach)
		return
	}

	if *sleep > 0 {
		time.Sleep(time.Duration(*sleep) * time.Second)
	}
}
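// An illustrative output path produced by createFileName/download, assuming
// the default -d ./attaches and -ad 河南/郑州市/政府 flags (department and
// counter are made-up examples):
//
//	./attaches/2023年/河南/郑州市/政府/2023_某某市财政局_1.pdf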
// main de-duplicates the download list by URL hash and downloads the
// attachments with a bounded number of concurrent workers.
func main() {
	checkHrefRepeat := map[uint64]bool{}
	lock := make(chan bool, *threads)
	wg := new(sync.WaitGroup)

	fn := func(attach *Attach, index int) {
		defer func() {
			<-lock
			wg.Done()
		}()
		download(attach, index)
	}

	attaches := readDataFile()
	attachesLen := len(attaches)
	for index, attach := range attaches {
		_id := id(attach.Href)
		if _, ok := checkHrefRepeat[_id]; ok {
			log.Println("失败(URL地址重复)", attach.Title, attach.Href)
			continue
		}
		checkHrefRepeat[_id] = true

		lock <- true
		wg.Add(1)
		go fn(attach, index)
		log.Printf("当前下载进度%.2f%%\n", float32(index)/float32(attachesLen)*100)
	}
	wg.Wait()
	log.Println("下载完成,请关闭本窗口")
	close(lock)
	time.Sleep(20 * time.Second)
}
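// Illustrative usage (the paths and header values below are assumptions, not
// part of the program):
//
//	go run . -df ./data.dat -d ./attaches -ad 河南/郑州市/政府 -ts 4 -s 1 -hf ./headers.txt
//
// headers.txt is read by init(): lines starting with "#" are ignored and each
// remaining line is split at the first ":" into a request header, e.g.
//
//	# extra request headers
//	User-Agent: Mozilla/5.0
//	Referer: http://example.com/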