gtcgpt_details.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2025-04-09
  4. ---------
  5. @summary: 国铁采购平台
  6. ---------
  7. @author: lzz
  8. """
  9. import re
  10. import feapder
  11. from items.spider_item import DataBakItem
  12. from untils.tools import get_proxy
  13. from fingerprint import get_fingerprint
  class Spider(feapder.BiddingDetailSpider):
      """Detail-page spider for the procurement site served from cg.95306.cn.

      Tasks are fetched with ``get_tasks_by_rabbitmq``; for each task one request
      is issued and the notice HTML found in the JSON response is stored on a
      ``DataBakItem``.

      Instance state:
          cookies: cookie dict shared by all requests, or ``None`` when it has
              to be rebuilt on the next download.
          proxy: proxy settings obtained from ``get_proxy()``.
      """

      def start_callback(self):
          """Initialise the shared cookie cache and fetch an initial proxy."""
          self.cookies = None
          self.proxy = get_proxy()

      def start_requests(self):
          """Yield one ``feapder.Request`` per task pulled from the queue.

          Each task dict provides ``parse_url`` (request URL), ``parse`` (an
          expression naming the callback), ``deal_detail`` and, optionally,
          ``request_params`` whose entries are forwarded as extra keyword
          arguments of the request.
          """
          data_list = self.get_tasks_by_rabbitmq(limit=100)
          for item in data_list:
              # A task without "request_params" yields None, and unpacking
              # None with ** raises TypeError; fall back to an empty mapping.
              request_params = item.get("request_params") or {}
              # SECURITY NOTE(review): eval() executes whatever expression the
              # task carries in "parse". This is only safe while the queue
              # content is fully trusted -- confirm, or switch to an explicit
              # lookup of allowed callbacks.
              callback = eval(item.get("parse"))
              yield feapder.Request(
                  url=item.get("parse_url"),
                  proxies=False,
                  callback=callback,
                  item=item,
                  deal_detail=item.get("deal_detail"),
                  **request_params,
              )

      def download_midware(self, request):
          """Attach headers, proxy and cookies to ``request`` before download.

          The cookie dict is built only when ``self.cookies`` is ``None`` and is
          then reused, so the ``mhId`` of the request that triggered the rebuild
          is sent with every later request until the cache is reset.
          """
          headers = {
              'Accept': '*/*',
              'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
              'Cache-Control': 'no-cache',
              'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
              'Origin': 'https://cg.95306.cn',
              # The task's "href" value is sent as the referring page.
              'Referer': request.item.get('href'),
              "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
              'X-Requested-With': 'XMLHttpRequest',
          }
          if self.cookies is None:
              self.cookies = {
                  'AlteonPcgmh': '0a03b7f3bb36ad3f1f41',
                  'mhId': request.params['mhId'],
              }

          request.headers = headers
          request.proxies = self.proxy
          request.cookies = self.cookies

      def validate(self, request, response):
          """Check that the JSON response carries a non-empty ``data`` entry.

          Returns:
              ``True`` when ``data`` is present and truthy.

          Raises:
              ValueError: when ``data`` is missing or empty.
          """
          data = response.json.get('data')
          if not data:
              raise ValueError('数据不能为空!')
          return True

      def detail_get(self, request, response):
          """Extract the notice HTML from the response and yield a data item.

          The HTML is read from ``data -> noticeContent -> notCont`` of the JSON
          body. Every ``data:image...`` run up to and including the next space
          is removed before the HTML is stored in ``contenthtml``.
          """
          items = request.item
          html = response.json.get('data').get('noticeContent').get('notCont')
          html = re.sub('data:image(.*?) ', '', html, flags=re.S | re.M)

          data_item = DataBakItem(**items)
          data_item.contenthtml = html
          yield data_item

      def exception_request(self, request, response):
          """Reset session state and re-yield the request.

          Clears the cached cookies so they are rebuilt on the next download,
          fetches a new proxy and replaces the request's ``mhId`` parameter with
          a new value from ``get_fingerprint()``.
          """
          self.cookies = None
          self.proxy = get_proxy()
          request.params['mhId'] = get_fingerprint()
          yield request
  if __name__ == "__main__":
      # Script entry point: build the spider with its redis key, then run it.
      spider = Spider(redis_key="lzz:Gtcgpt")
      spider.start()