# -*- coding: utf-8 -*-
"""
Created on 2024-01-08
---------
@summary: 湖北省政府采购网上商城 - 详情
          (Hubei Province government procurement online mall - detail pages)
---------
@author: lzz
"""
import feapder
from feapder.utils.tools import log
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import get_proxy
from tools import ocr_captcha


class Spider(feapder.BiddingDetailSpider):
    """Detail spider: pulls tasks from RabbitMQ, solves the site captcha,
    fetches each detail page, and extracts content HTML plus attachments."""

    __custom_setting__ = dict(
        SPIDER_MAX_RETRY_TIMES=10
    )

    def start_callback(self):
        """Initialise per-run state: proxy, browser-like headers, the set of
        attachment extensions we download, and the attachment downloader."""
        self.proxy = get_proxy()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Pragma": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
        }
        # File extensions (lower-case) treated as downloadable attachments.
        self.file_types = [
            'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg', 'png',
            'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps', 'ofd'
        ]
        self.downloader = AttachmentDownloader()

    def start_requests(self):
        """Pull up to 50 detail tasks from RabbitMQ and yield one request each."""
        data_list = self.get_tasks_by_rabbitmq(limit=50)
        for item in data_list:
            log.debug(item)
            # Tasks may omit request_params; default to an empty mapping so
            # the **-expansion below does not raise TypeError on None.
            request_params = item.get("request_params") or {}
            # SECURITY NOTE(review): `parse` comes from queue data and is
            # resolved via eval() (conventionally a string like
            # "self.detail_get"). Only trusted producers may enqueue tasks;
            # do not expose this queue to untrusted input.
            yield feapder.Request(url=item.get("parse_url"),
                                  callback=eval(item.get("parse")),
                                  deal_detail=item.get("deal_detail"),
                                  verify=False,
                                  item=item,
                                  **request_params)

    def download_midware(self, request):
        """Attach a freshly solved captcha, its session cookies, the shared
        headers, and the current proxy to every outgoing request."""
        captcha, cookies = ocr_captcha(self.headers, self.proxy)
        params = {
            "captcha": captcha
        }
        request.params = params
        request.cookies = cookies
        request.headers = self.headers
        request.proxies = self.proxy

    def validate(self, request, response):
        """Reject (and retry) responses whose main content container is
        missing — typically a captcha failure or an empty page."""
        html = response.xpath('//div[@class="jggs_main clearfix"]').extract_first()
        if html is None:
            raise ValueError('详情数据为空!')
        return True

    def detail_get(self, request, response):
        """Extract the detail page body and download any linked attachments.

        Yields a DataBakItem carrying the content HTML and, when present,
        a {"attachments": {...}} mapping keyed by 1-based ordinal strings.
        """
        items = request.item
        data_item = DataBakItem(**items)

        html = response.xpath('//div[@class="jggs_main clearfix"]').extract_first()
        data_item.contenthtml = html

        attachments = {}
        file_list = response.xpath('//div[@class="jm_textcon"]//a[@href]')
        for info in file_list:
            file_url = info.xpath('./@href').extract_first()
            file_name = info.xpath('./text()').extract_first()
            if not file_name or not file_url:
                continue
            file_name = file_name.strip()
            # Prefer the extension from the link text; fall back to the URL
            # when the text does not end in a known attachment type.
            file_type = file_name.split('.')[-1].lower()
            if file_type not in self.file_types:
                file_type = file_url.split('.')[-1].lower()
            if file_type in self.file_types and "file" in file_url:
                attachment = self.downloader.fetch_attachment(
                    file_name=file_name,
                    file_type=file_type,
                    download_url=file_url
                )
                attachments[str(len(attachments) + 1)] = attachment

        if attachments:
            data_item.projectinfo = {"attachments": attachments}

        yield data_item

    def exception_request(self, request, response):
        """On failure, rotate the proxy and re-queue the request for retry."""
        self.proxy = get_proxy()
        yield request


if __name__ == "__main__":
    Spider(redis_key="lzz:Hbszfcgwssc").start()