# -*- coding: utf-8 -*-
"""
Created on 2025-04-09
---------
@summary: China Railway procurement platform (cg.95306.cn) detail spider
---------
@author: lzz
"""
import re

import feapder
import feapder.utils.tools as tools
from items.spider_item import DataBakItem
from untils.tools import get_proxy
from fingerprint import get_fingerprint, fetch_alteon_pcgmh, check_fingerprint

# Matches an inline "data:image..." payload up to (and including) the next
# space. Compiled once at import time instead of on every detail page.
_INLINE_IMAGE_RE = re.compile(r'data:image(.*?) ', flags=re.S | re.M)


class Spider(feapder.BiddingDetailSpider):
    """Detail-page spider that fetches notice content as JSON.

    Session state (proxy, ``AlteonPcgmh`` cookie value and the cookie dict)
    is kept on the instance, shared by all requests, and reset in
    ``exception_request``.
    """

    def start_callback(self):
        """Initialise the shared session state and pick a first proxy."""
        self.alteon_pcgmh = None
        self.cookies = None
        self.proxy = get_proxy()

    def start_requests(self):
        """Yield one ``feapder.Request`` per task pulled from RabbitMQ.

        Up to 100 tasks are fetched per call. Each task's ``request_params``
        mapping is expanded into keyword arguments of the request.
        """
        for item in self.get_tasks_by_rabbitmq(limit=100):
            # ``item.get`` returns None when the key is absent; unpacking
            # None with ``**`` raises TypeError inside this generator and
            # would abort every remaining task of the batch.
            request_params = item.get("request_params") or {}
            yield feapder.Request(
                url=item.get("parse_url"),
                proxies=False,
                item=item,
                deal_detail=item.get("deal_detail"),
                **request_params,
            )

    def download_midware(self, request):
        """Attach headers, proxy and session cookies to ``request``.

        The ``AlteonPcgmh`` value and the cookie dict are created lazily on
        first use and then reused until ``exception_request`` clears them.
        Note that the cached cookie dict keeps the ``mhId`` of the request
        that created it.
        """
        headers = {
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'https://cg.95306.cn',
            'Referer': request.item.get('href'),
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
            'X-Requested-With': 'XMLHttpRequest',
        }

        if self.alteon_pcgmh is None:
            self.alteon_pcgmh = fetch_alteon_pcgmh(self.proxy)

        if self.cookies is None:
            self.cookies = {
                'AlteonPcgmh': self.alteon_pcgmh,
                'mhId': request.params['mhId'],
            }

        request.headers = headers
        request.proxies = self.proxy
        request.cookies = self.cookies

    def validate(self, request, response):
        """Inspect the JSON response and choose the callback for ``request``.

        * code ``'0-0203'``: call ``check_fingerprint`` for the current
          ``mhId`` and route the request to ``request_retry``
          (NOTE(review): presumably this code means the fingerprint has to
          be re-verified -- confirm against the site's API).
        * non-empty ``data``: route to the parser method named by
          ``request.item['parse']``.

        Returns:
            True in both cases above.

        Raises:
            ValueError: if the response carries neither the special code nor
                any ``data``.
        """
        payload = response.json

        # ``.get`` keeps a response without a "code" key from raising
        # KeyError before the "data" branch has been considered.
        if payload.get('code') == '0-0203':
            referer = request.item.get('href')
            check_fingerprint(request.params['mhId'], self.cookies, referer, self.proxy)
            request.callback = self.request_retry
            return True

        if payload.get('data'):
            request.callback = tools.resolve_method(self, request.item['parse'])
            return True

        raise ValueError('数据不能为空!')

    def request_retry(self, request, response):
        """Re-yield ``request`` so that it is downloaded again.

        Raises:
            AttributeError: if ``request.item`` has no ``'parse'`` entry,
                since ``validate`` needs it to pick the parser afterwards.
        """
        if 'parse' not in request.item:
            raise AttributeError('request.item not attribute "parse"')

        yield request

    def detail_get(self, request, response):
        """Build a ``DataBakItem`` from the task and the notice HTML.

        The HTML is read from ``data.noticeContent.notCont`` of the JSON
        response; inline ``data:image`` payloads are stripped before it is
        stored in ``contenthtml``.
        """
        data_item = DataBakItem(**request.item)

        notice = response.json.get('data').get('noticeContent')
        html = notice.get('notCont')
        data_item.contenthtml = _INLINE_IMAGE_RE.sub('', html)

        yield data_item

    def exception_request(self, request, response):
        """Reset the session state and re-yield ``request``.

        Clears the cached cookie data, picks a new proxy and assigns a new
        fingerprint to ``request.params['mhId']`` before retrying.
        """
        self.alteon_pcgmh = None
        self.cookies = None
        self.proxy = get_proxy()
        request.params['mhId'] = get_fingerprint()
        yield request


if __name__ == "__main__":
    Spider(redis_key="lzz:Gtcgpt").start()