spider_details.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2024-01-08
  4. ---------
  5. @summary: 湖北省政府采购网上商城 - 详情
  6. ---------
  7. @author: lzz
  8. """
  9. import feapder
  10. from items.spider_item import DataBakItem
  11. from untils.attachment import AttachmentDownloader
  12. from feapder.utils.tools import log
  13. import requests
  14. from untils.tools import get_proxy
  15. import re
  16. from untils.get_imgcode import jy_ocr
  17. def ocr_captcha(headers, proxies=False, max_retries=10):
  18. session = requests.session()
  19. session.proxies = proxies
  20. s = re.compile("'src', '(.*?)'", flags=re.S) # src
  21. href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
  22. code = ''
  23. for _ in range(max_retries):
  24. resp1 = session.get(href, headers=headers, timeout=30, verify=False)
  25. text = resp1.content.decode()
  26. img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
  27. resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
  28. code = jy_ocr(image=resp2.content)
  29. if code and len(code) == 6:
  30. break
  31. return code, session.cookies.get_dict()
class Details(feapder.BiddingDetailSpider):
    """Detail-page spider for the Hubei government procurement online mall
    (wssc.hubeigp.gov.cn): pulls tasks from RabbitMQ, solves the site's
    captcha per request, extracts the announcement HTML and attachments.
    """

    __custom_setting__ = dict(
        SPIDER_MAX_RETRY_TIMES=10
    )

    def start_callback(self):
        # One-time setup before crawling: a proxy, shared browser-like
        # headers, and the attachment extensions worth downloading.
        self.proxy = get_proxy()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Pragma": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
        }
        # Whitelist of file extensions considered downloadable attachments.
        self.file_types = [
            'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
            'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps', 'ofd'
        ]

    def start_requests(self):
        """Pull up to 50 detail tasks from RabbitMQ and yield one Request each."""
        data_list = self.get_tasks_by_rabbitmq(limit=50)
        for item in data_list:
            log.debug(item)
            request_params = item.get("request_params")
            # NOTE(review): eval() resolves the callback name from task data —
            # safe only if the RabbitMQ queue is fully trusted; consider
            # getattr(self, name) instead.
            yield feapder.Request(url=item.get("parse_url"),
                                  callback=eval(item.get("parse")),
                                  deal_detail=item.get("deal_detail"),
                                  verify=False,
                                  item=item,
                                  **request_params)

    def download_midware(self, request):
        # Solve a fresh captcha for every request and attach the resulting
        # code + session cookies so the detail page is served.
        captcha, cookies = ocr_captcha(self.headers, self.proxy)
        params = {
            "captcha": captcha
        }
        request.params = params
        request.cookies = cookies
        request.headers = self.headers
        request.proxies = self.proxy

    def validate(self, request, response):
        # Reject responses that lack the detail container (e.g. a failed
        # captcha) so feapder retries the request.
        html = response.xpath('//div[@class="jggs_main clearfix"]').extract_first()
        if html is None:
            raise ValueError('详情数据为空!')
        return True

    def detail_get(self, request, response):
        """Parse the detail page: store its HTML and download any attachments."""
        items = request.item
        data_item = DataBakItem(**items)
        html = response.xpath('//div[@class="jggs_main clearfix"]').extract_first()
        data_item.contenthtml = html
        attachments = {}
        file_list = response.xpath('//div[@class="jm_textcon"]//a[@href]')
        for info in file_list:
            file_url = info.xpath('./@href').extract_first()
            file_name = info.xpath('./text()').extract_first()
            if not file_name or not file_url:
                continue
            file_name = file_name.strip()
            # Prefer the extension from the link text; fall back to the URL.
            file_type = file_name.split('.')[-1].lower()
            if file_type not in self.file_types:
                file_type = file_url.split('.')[-1].lower()
            # "file" in the URL distinguishes real download links on this site
            # — assumption inferred from usage; confirm against live pages.
            if file_type in self.file_types and "file" in file_url:
                attachment = AttachmentDownloader().fetch_attachment(
                    file_name=file_name,
                    file_type=file_type,
                    download_url=file_url
                )
                # Keys are 1-based insertion-order indices, as strings.
                attachments[str(len(attachments) + 1)] = attachment
        if len(attachments) > 0:
            data_item.projectinfo = {"attachments": attachments}
        yield data_item

    def exception_request(self, request, response):
        # On failure, rotate the proxy and re-queue the same request.
        self.proxy = get_proxy()
        yield request
if __name__ == "__main__":
    # Entry point: run the detail spider against its shared redis task key.
    Details(redis_key="lzz:Hbszfcgwssc").start()