batch_download.go 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. package main
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "flag"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "log"
  10. "net/http"
  11. "os"
  12. "regexp"
  13. "sort"
  14. "strconv"
  15. "strings"
  16. "sync"
  17. "time"
  18. "github.com/cespare/xxhash/v2"
  19. "golang.org/x/net/proxy"
  20. )
  // Attach is one attachment entry decoded from a line of the data file.
  type Attach struct {
  	// Department is read from the JSON key "department".
  	Department string `json:"department"`
  	// Href is read from the JSON key "attach_href"; it is the URL that gets downloaded.
  	Href string `json:"attach_href"`
  	// Title is read from the JSON key "info_title".
  	Title string `json:"info_title"`
  	// Raw is the source line the entry was decoded from (assigned by readDataFile).
  	Raw string
  }

  // Attaches is a list of attachment entries.
  type Attaches []*Attach
  31. var (
  32. df = flag.String("df", "./data.dat", "datafile 数据文件")
  33. save2dir = flag.String("d", "./attaches", "save to dir 保存到目录")
  34. areadir = flag.String("ad", "河南/郑州市/政府", "地域目录")
  35. defaultYear = flag.String("y", "", "year 默认年份,注意:只有整个下载清单是同一个年份才可以指定")
  36. threads = flag.Int("ts", 4, "threads 线程数")
  37. sleep = flag.Int64("s", 0, "sleep 下载间歇")
  38. filter = flag.String("ft", "true", "filter title 是否过滤标题,标题必须包含年份、预算或者预决算")
  39. defaultExt = flag.String("de", "pdf", "default ext默认扩展名")
  40. headFile = flag.String("hf", "", "headfile 下载请求头")
  41. logFile = flag.String("lf", "./error.dat", "log file日志文件")
  42. proxyStr = flag.String("p", "", "proxy 代理,使用代理时,只能开一个线程")
  43. connectTimeout = flag.Int64("cto", 30, "超时设定")
  44. defaultYearInt int
  45. fileIndex = map[string]int{}
  46. allowFileExt = map[string]bool{
  47. "doc": true,
  48. "docx": true,
  49. "xls": true,
  50. "xlsx": true,
  51. "pdf": true,
  52. "zip": true,
  53. "rar": true,
  54. "txt": true,
  55. "png": true,
  56. "jpg": true,
  57. "jpeg": true,
  58. "ppt": true,
  59. "pptx": true,
  60. "wps": true,
  61. }
  62. reg, _ = regexp.Compile("(\\d{4})")
  63. reg4Title1, _ = regexp.Compile("(202\\d+)年度?(.*?)((部门)*(预|决)算)")
  64. reg4Title2, _ = regexp.Compile("(.*?)(202\\d+)年度?((部门)*(预|决)算)")
  65. downloadLog *SpiderLog
  66. header = map[string]string{}
  67. fnLock = new(sync.RWMutex)
  68. proxyUri []string
  69. )
  70. // init
  71. func init() {
  72. flag.Parse()
  73. var err error
  74. defaultYearInt, _ = strconv.Atoi(*defaultYear)
  75. //TODO 目录检查并创建目录 3年的
  76. for _, year := range []int{2022, 2023, 2024} {
  77. path := fmt.Sprintf("%s/%d年/%s", *save2dir, year, *areadir)
  78. if _, err = os.Stat(path); err != nil {
  79. os.MkdirAll(path, os.ModeAppend|os.ModePerm)
  80. }
  81. }
  82. if *headFile != "" {
  83. bs, err := ioutil.ReadFile(*headFile)
  84. if err == nil {
  85. content := string(bs)
  86. for _, l := range strings.Split(content, "\n") {
  87. if strings.HasPrefix(l, "#") {
  88. continue
  89. }
  90. pos := strings.Index(l, ":")
  91. if pos > 0 {
  92. key := l[:pos]
  93. value := l[pos+1:]
  94. header[key] = value
  95. }
  96. }
  97. }
  98. }
  99. log.Println(header)
  100. if *logFile != "" {
  101. downloadLog, err = NewSpiderLog(*logFile)
  102. if err != nil {
  103. log.Fatal(err)
  104. }
  105. }
  106. proxyUri = strings.Split(*proxyStr, ";")
  107. }
  108. // id
  109. func id(text string) uint64 {
  110. has := xxhash.New()
  111. has.WriteString(text)
  112. return has.Sum64()
  113. }
  // changeIp sends a GET for http://pleasechangemyip.com through client and
  // discards the response.
  // NOTE(review): presumably the proxy intercepts this host and rotates its
  // exit IP — confirm against the proxy's documentation.
  //
  // The call is best-effort: a failure is logged and otherwise ignored.
  func changeIp(client *http.Client) {
  	resp, err := client.Get("http://pleasechangemyip.com")
  	if err != nil {
  		log.Println("changeIp:", err)
  		return
  	}
  	defer resp.Body.Close()

  	// Drain without buffering; the payload itself is of no interest.
  	if _, err := io.Copy(io.Discard, resp.Body); err != nil {
  		log.Println("changeIp:", err)
  	}
  	//time.Sleep(1 * time.Second)
  }
  123. // httpClient
  124. func httpClient(index int) *http.Client {
  125. if *proxyStr != "" {
  126. pos := index % len(proxyUri)
  127. dialer, err := proxy.SOCKS5("tcp", proxyUri[pos], nil, proxy.Direct)
  128. if err != nil {
  129. log.Println(err.Error())
  130. }
  131. // setup a http client
  132. httpTransport := &http.Transport{}
  133. httpTransport.Dial = dialer.Dial
  134. httpClient := &http.Client{Transport: httpTransport, Timeout: time.Duration(*connectTimeout) * time.Second}
  135. changeIp(httpClient)
  136. return httpClient
  137. } else {
  138. return &http.Client{Timeout: time.Duration(*connectTimeout) * time.Second}
  139. }
  140. }
  141. // readDataFile
  142. func readDataFile() Attaches {
  143. ret := make(Attaches, 0, 0)
  144. bs, err := ioutil.ReadFile(*df)
  145. if err != nil {
  146. log.Fatal(err)
  147. }
  148. content := string(bs)
  149. for _, s := range strings.Split(content, "\n") {
  150. if len(s) == 0 {
  151. continue
  152. }
  153. var data string = s
  154. if strings.HasPrefix(s, ",") {
  155. data = s[1:]
  156. }
  157. var attach = new(Attach)
  158. err = json.Unmarshal([]byte(data), attach)
  159. if err == nil {
  160. ret = append(ret, attach)
  161. attach.Raw = s
  162. }
  163. }
  164. return ret
  165. }
  // getExt returns the text after the last "." in text (the extension),
  // or "" when text contains no ".".
  func getExt(text string) string {
  	dot := strings.LastIndex(text, ".")
  	if dot < 0 {
  		return ""
  	}
  	return text[dot+1:]
  }
  175. // 分解title
  176. func extractTitle(organ, title, href string) (year int, department, ext string) {
  177. if v := strings.ToLower(getExt(title)); allowFileExt[v] {
  178. ext = v
  179. } else if v := getExt(href); allowFileExt[v] {
  180. ext = v
  181. }
  182. if ext == "" {
  183. ext = *defaultExt
  184. }
  185. if *filter == "true" {
  186. check := strings.Contains(organ, "预算") || strings.Contains(organ, "预决算")
  187. if !check && !strings.Contains(title, "预算") && !strings.Contains(title, "预决算") {
  188. log.Print("失败(文件不包含预算|决算)", organ, title)
  189. return
  190. }
  191. }
  192. var matchObj []string
  193. if reg.MatchString(title) {
  194. matchObj = reg.FindAllString(title, -1)
  195. } else if reg.MatchString(organ) {
  196. matchObj = reg.FindAllString(organ, -1)
  197. }
  198. //年排序
  199. if matchObj != nil && len(matchObj) > 0 {
  200. yearArr := make([]int, len(matchObj))
  201. for i, v := range matchObj {
  202. yearArr[i], _ = strconv.Atoi(v)
  203. }
  204. sort.Slice(yearArr, func(i, j int) bool { return yearArr[i] > yearArr[j] })
  205. year = yearArr[0]
  206. }
  207. if department == "" {
  208. if len(organ) > len(title) {
  209. department = organ
  210. } else {
  211. department = title
  212. }
  213. }
  214. return
  215. }
  216. // 拼装文件名
  217. func createFileName(organ, title, href string) (year int, filename string) {
  218. fnLock.Lock()
  219. defer fnLock.Unlock()
  220. year, department, ext := extractTitle(organ, title, href)
  221. if year < 2022 || year > 2024 {
  222. log.Println("无法生成有效的文件名", organ, title, href)
  223. return 0, ""
  224. }
  225. //修正文件名,去掉特殊符号
  226. department = strings.Map(func(r rune) rune {
  227. switch r {
  228. case '\n', '.', '-', '?', '=', '&', '*', '#', '$', '%', '^', '(', ')', '|':
  229. return -1
  230. default:
  231. return r
  232. }
  233. }, department)
  234. key := fmt.Sprintf("%d_%s", year, department)
  235. if v, ok := fileIndex[key]; ok {
  236. fileIndex[key] = v + 1
  237. } else {
  238. fileIndex[key] = 1
  239. }
  240. filename = fmt.Sprintf("%d_%s_%d.%s",
  241. year, department, fileIndex[key], ext)
  242. return year, filename
  243. }
  244. // download
  245. func download(attach *Attach, index int) {
  246. //生成本地文件
  247. year, newFileName := createFileName(attach.Department, attach.Title, attach.Href)
  248. if newFileName == "" {
  249. log.Println("失败(无法生成文件名)", attach.Title, attach.Href)
  250. downloadLog.Log(attach)
  251. return
  252. }
  253. log.Println("dowload and save to ", attach.Title, attach.Department, newFileName)
  254. //TODO 下载文件
  255. req, err := http.NewRequest("GET", attach.Href, nil)
  256. if err != nil {
  257. log.Println("失败(请求头)", attach.Title, attach.Href)
  258. downloadLog.Log(attach)
  259. return
  260. }
  261. for k, v := range header {
  262. req.Header.Set(k, v)
  263. }
  264. client := httpClient(index)
  265. resp, err := client.Do(req)
  266. if err != nil {
  267. log.Println("失败(网络问题)", attach.Title, attach.Href)
  268. downloadLog.Log(attach)
  269. return
  270. }
  271. buf := bytes.NewBuffer(nil)
  272. _, err = io.Copy(buf, resp.Body)
  273. if err != nil {
  274. log.Println("失败(网络问题)", attach.Title, attach.Href)
  275. downloadLog.Log(attach)
  276. return
  277. }
  278. resp.Body.Close()
  279. //TODO 保存到目录
  280. outputPaht := fmt.Sprintf("%s/%d年/%s/%s", *save2dir, year, *areadir, newFileName)
  281. //log.Println(outputPaht)
  282. err = os.WriteFile(outputPaht, buf.Bytes(), os.ModeAppend|os.ModePerm)
  283. if err != nil {
  284. log.Println("失败(写入文件)", attach.Title, attach.Href)
  285. downloadLog.Log(attach)
  286. return
  287. }
  288. if *sleep > 0 {
  289. time.Sleep(time.Duration(*sleep) * time.Second)
  290. }
  291. }
  292. // main
  293. func main() {
  294. checkHrefRepeat := map[uint64]bool{}
  295. lock := make(chan bool, *threads)
  296. wg := new(sync.WaitGroup)
  297. fn := func(attach *Attach, index int) {
  298. defer func() {
  299. <-lock
  300. wg.Done()
  301. }()
  302. download(attach, index)
  303. }
  304. attches := readDataFile()
  305. attchesLen := len(attches)
  306. for index, attach := range attches {
  307. _id := id(attach.Href)
  308. if _, ok := checkHrefRepeat[_id]; ok {
  309. log.Println("失败(URL地址重复)", attach.Title, attach.Href)
  310. continue
  311. }
  312. checkHrefRepeat[_id] = true
  313. lock <- true
  314. wg.Add(1)
  315. go fn(attach, index)
  316. log.Printf("当前下载进度%.2f%%\n", float32(index)/float32(attchesLen)*100)
  317. }
  318. wg.Wait()
  319. log.Println("下载完成,请关闭本窗口")
  320. close(lock)
  321. time.Sleep(20 * time.Second)
  322. }