# gtcgpt_details.py
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2025-04-09
  4. ---------
  5. @summary: 国铁采购平台
  6. ---------
  7. @author: lzz
  8. """
  9. import re
  10. import feapder
  11. import feapder.utils.tools as tools
  12. from items.spider_item import DataBakItem
  13. from untils.tools import get_proxy
  14. from fingerprint import get_fingerprint, fetch_alteon_pcgmh, check_fingerprint
  15. class Spider(feapder.BiddingDetailSpider):
  16. def start_callback(self):
  17. self.alteon_pcgmh = None
  18. self.cookies = None
  19. self.proxy = get_proxy()
  20. def start_requests(self):
  21. data_list = self.get_tasks_by_rabbitmq(limit=100)
  22. for item in data_list:
  23. request_params = item.get("request_params")
  24. yield feapder.Request(url=item.get("parse_url"),
  25. proxies=False,
  26. item=item,
  27. deal_detail=item.get("deal_detail"),
  28. **request_params)
  29. def download_midware(self, request):
  30. headers = {
  31. 'Accept': '*/*',
  32. 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
  33. 'Cache-Control': 'no-cache',
  34. 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
  35. 'Origin': 'https://cg.95306.cn',
  36. 'Referer': request.item.get('href'),
  37. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
  38. 'X-Requested-With': 'XMLHttpRequest',
  39. }
  40. if self.alteon_pcgmh is None:
  41. self.alteon_pcgmh = fetch_alteon_pcgmh(self.proxy)
  42. if self.cookies is None:
  43. self.cookies = {
  44. 'AlteonPcgmh': self.alteon_pcgmh,
  45. 'mhId': request.params['mhId'],
  46. }
  47. request.headers = headers
  48. request.proxies = self.proxy
  49. request.cookies = self.cookies
  50. def validate(self, request, response):
  51. if response.json['code'] == '0-0203':
  52. referer = request.item.get('href')
  53. check_fingerprint(request.params['mhId'], self.cookies, referer, self.proxy)
  54. request.callback = self.request_retry
  55. return True
  56. elif response.json.get('data'):
  57. request.callback = tools.resolve_method(self, request.item['parse'])
  58. return True
  59. else:
  60. raise ValueError('数据不能为空!')
  61. def request_retry(self, request, response):
  62. if 'parse' not in request.item:
  63. raise AttributeError('request.item not attribute "parse"')
  64. yield request
  65. def detail_get(self, request, response):
  66. item = request.item
  67. data_item = DataBakItem(**item)
  68. html = response.json.get('data').get('noticeContent').get('notCont')
  69. html = re.sub('data:image(.*?) ', '', html, flags=re.S | re.M)
  70. data_item.contenthtml = html
  71. yield data_item
  72. def exception_request(self, request, response):
  73. self.alteon_pcgmh = None
  74. self.cookies = None
  75. self.proxy = get_proxy()
  76. request.params['mhId'] = get_fingerprint()
  77. yield request
  78. if __name__ == "__main__":
  79. Spider(redis_key="lzz:Gtcgpt").start()