12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879 |
- # -*- coding: utf-8 -*-
- """
- Created on 2025-04-09
- ---------
- @summary: 国铁采购平台
- ---------
- @author: lzz
- """
- import re
- import feapder
- from items.spider_item import DataBakItem
- from untils.tools import get_proxy
- from fingerprint import get_fingerprint
class Spider(feapder.BiddingDetailSpider):
    """Detail-page spider for the China Railway procurement site (cg.95306.cn).

    Each task supplies the URL to fetch, the name of the parse callback and
    optional extra request arguments.  All requests share one proxy and one
    cookie dict, both of which are discarded and rebuilt by
    ``exception_request``.
    """

    def start_callback(self):
        """Initialise the shared cookie cache and fetch an initial proxy."""
        # Cookies are built lazily by download_midware on the first request.
        self.cookies = None
        self.proxy = get_proxy()

    def start_requests(self):
        """Yield one ``feapder.Request`` for every task in the current batch.

        Tasks come from ``self.get_tasks_by_rabbitmq(limit=100)``.  Each task
        is a mapping that is read for the keys ``parse_url``, ``parse``,
        ``deal_detail`` and ``request_params``.
        """
        data_list = self.get_tasks_by_rabbitmq(limit=100)
        for item in data_list:
            # A task without "request_params" (or with a null value) gives
            # None here; ``**None`` raises TypeError and would abort the
            # whole batch, so fall back to an empty mapping.
            request_params = item.get("request_params") or {}
            # SECURITY(review): eval() executes whatever string the task
            # carries in "parse".  This is only safe while the queue is fully
            # trusted; presumably the value looks like "self.detail_get" --
            # confirm, and consider resolving it with getattr() instead.
            callback = eval(item.get("parse"))
            yield feapder.Request(url=item.get("parse_url"),
                                  proxies=False,
                                  callback=callback,
                                  item=item,
                                  deal_detail=item.get("deal_detail"),
                                  **request_params)

    def download_midware(self, request):
        """Attach headers, proxy and cookies to ``request`` before download.

        The cookie dict is built once, from the ``mhId`` of whichever request
        arrives first while ``self.cookies`` is ``None``, and is then reused
        for every following request until ``exception_request`` clears it.

        Raises:
            KeyError: if the cookie dict must be built and ``request.params``
                has no ``mhId`` entry.
        """
        headers = {
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'https://cg.95306.cn',
            # The task's "href" is sent as the Referer.
            'Referer': request.item.get('href'),
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
            'X-Requested-With': 'XMLHttpRequest',
        }
        if self.cookies is None:
            self.cookies = {
                'AlteonPcgmh': '0a03b7f3bb36ad3f1f41',
                'mhId': request.params['mhId'],
            }

        request.headers = headers
        # Overrides the ``proxies=False`` set when the request was created.
        request.proxies = self.proxy
        request.cookies = self.cookies

    def validate(self, request, response):
        """Accept the response only if its JSON body has a non-empty ``data``.

        Returns:
            ``True`` when ``data`` is present and truthy.

        Raises:
            ValueError: when ``data`` is missing or empty.
        """
        data = response.json.get('data')
        if not data:
            raise ValueError('数据不能为空!')
        return True

    def detail_get(self, request, response):
        """Extract the notice HTML and yield it as a ``DataBakItem``.

        The HTML is read from ``data.noticeContent.notCont`` in the JSON body.
        Every span from ``data:image`` up to the next space is removed,
        presumably to drop inline base64 images -- confirm against real
        payloads.
        """
        items = request.item
        # NOTE(review): a missing "noticeContent" or "notCont" raises here
        # (AttributeError / TypeError); validate() only guarantees "data".
        html = response.json.get('data').get('noticeContent').get('notCont')
        html = re.sub('data:image(.*?) ', '', html, flags=re.S | re.M)
        data_item = DataBakItem(**items)
        data_item.contenthtml = html
        yield data_item

    def exception_request(self, request, response):
        """Reset session state and re-yield ``request`` after a failure.

        Clears the cached cookies, fetches a new proxy and replaces the
        request's ``mhId`` parameter with a fresh fingerprint, so the next
        ``download_midware`` call rebuilds the cookies from the new value.
        """
        self.cookies = None
        self.proxy = get_proxy()
        request.params['mhId'] = get_fingerprint()
        yield request
if __name__ == "__main__":
    # Script entry point: build the spider with its redis key and run it.
    spider = Spider(redis_key="lzz:Gtcgpt")
    spider.start()
|