spider_details.py

# -*- coding: utf-8 -*-
"""
Created on 2024-01-08
---------
@summary: 湖北省政府采购网上商城 (Hubei Provincial Government Procurement Online Mall) - detail pages
---------
@author: lzz
"""
import feapder
from feapder.utils.tools import log
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import get_proxy
from tools import ocr_captcha


class Spider(feapder.BiddingDetailSpider):

    __custom_setting__ = dict(
        SPIDER_MAX_RETRY_TIMES=10
    )

    def start_callback(self):
        self.proxy = get_proxy()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Pragma": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
        }
        # File extensions treated as downloadable attachments.
        self.file_types = [
            'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
            'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps', 'ofd'
        ]
        self.downloader = AttachmentDownloader()
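
    # Note: get_proxy() (from the project-local untils.tools module) is assumed
    # to return a requests-style proxies mapping, e.g.
    # {"http": "http://host:port", "https": "http://host:port"}; the actual
    # shape is defined by that helper, not in this file.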

    def start_requests(self):
        data_list = self.get_tasks_by_rabbitmq(limit=50)
        for item in data_list:
            log.debug(item)
            # Guard against tasks without request_params, which would make the
            # ** expansion below raise a TypeError.
            request_params = item.get("request_params") or {}
            yield feapder.Request(
                url=item.get("parse_url"),
                callback=eval(item.get("parse")),  # callback stored as a string, e.g. "self.detail_get"
                deal_detail=item.get("deal_detail"),
                verify=False,
                item=item,
                **request_params,
            )
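
    # A task record pulled from RabbitMQ is expected to look roughly like the
    # sketch below. The field names are inferred from the accesses above; the
    # exact schema is produced by the upstream listing spider, not defined here:
    #
    #   {
    #       "parse_url": "https://...",   # detail-page URL to fetch
    #       "parse": "self.detail_get",   # callback name, resolved via eval()
    #       "deal_detail": [...],         # extraction rules carried on the request
    #       "request_params": {},         # extra kwargs for feapder.Request
    #       ...                           # remaining fields become DataBakItem fields
    #   }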

    def download_midware(self, request):
        captcha, cookies = ocr_captcha(self.headers, self.proxy)
        request.params = {"captcha": captcha}
        request.cookies = cookies
        request.headers = self.headers
        request.proxies = self.proxy
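
    # ocr_captcha (from the project-local tools module) is assumed to fetch the
    # site's captcha image through the given proxy, OCR it, and return the pair
    # (captcha_text, session_cookies); each detail request then carries the
    # solved captcha as a query parameter together with the cookies from the
    # same captcha session.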

    def validate(self, request, response):
        html = response.xpath('//div[@class="jggs_main clearfix"]').extract_first()
        if html is None:
            raise ValueError("Detail page data is empty!")
        return True

    def detail_get(self, request, response):
        items = request.item
        data_item = DataBakItem(**items)
        html = response.xpath('//div[@class="jggs_main clearfix"]').extract_first()
        data_item.contenthtml = html

        attachments = {}
        file_list = response.xpath('//div[@class="jm_textcon"]//a[@href]')
        for info in file_list:
            file_url = info.xpath('./@href').extract_first()
            file_name = info.xpath('./text()').extract_first()
            if not file_name or not file_url:
                continue
            file_name = file_name.strip()
            # Take the extension from the link text; fall back to the URL if
            # the text does not end in a known extension.
            file_type = file_name.split('.')[-1].lower()
            if file_type not in self.file_types:
                file_type = file_url.split('.')[-1].lower()
            if file_type in self.file_types and "file" in file_url:
                attachment = self.downloader.fetch_attachment(
                    file_name=file_name,
                    file_type=file_type,
                    download_url=file_url,
                )
                attachments[str(len(attachments) + 1)] = attachment
        if attachments:
            data_item.projectinfo = {"attachments": attachments}
        yield data_item
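
    # AttachmentDownloader.fetch_attachment is assumed to download the file and
    # return a metadata dict; attachments are keyed "1", "2", ... in download
    # order, presumably the convention expected by the DataBakItem consumer.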

    def exception_request(self, request, response):
        # On a failed request, rotate to a fresh proxy and requeue the request.
        self.proxy = get_proxy()
        yield request


if __name__ == "__main__":
    Spider(redis_key="lzz:Hbszfcgwssc").start()
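
# Run directly with `python spider_details.py`. This assumes the project's
# feapder settings supply the RabbitMQ and Redis connection details consumed by
# get_tasks_by_rabbitmq and the redis_key above; those live outside this file.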