# -*- coding: utf-8 -*-
"""
Created on 2024-01-08
---------
@summary: 湖北省政府采购网上商城 - 详情
          (Hubei Provincial Government Procurement Online Mall - detail pages)
---------
@author: lzz
"""
import re

import requests

import feapder
from feapder.utils.tools import log
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.get_imgcode import jy_ocr
from untils.tools import get_proxy

# Compiled once at module level (was re-compiled on every ocr_captcha call).
# Extracts the captcha image path from the page's inline JS, e.g.
# setAttribute('src', '/simple_captcha/...').
_CAPTCHA_SRC_RE = re.compile("'src', '(.*?)'", flags=re.S)


def ocr_captcha(headers, proxies=False, max_retries=10):
    """Fetch the site captcha and OCR it into a 6-character code.

    Retries up to ``max_retries`` times until OCR yields a code of length 6.
    NOTE: if every attempt fails, the last (possibly wrong-length) code is
    still returned — downstream request retries are expected to handle that.

    :param headers: HTTP headers used for both the page and image requests.
    :param proxies: requests-style proxies mapping (False disables proxying).
    :param max_retries: maximum number of captcha fetch/OCR attempts.
    :return: tuple ``(code, cookies_dict)`` — the OCR result and the cookies
             of the captcha session (the server ties the captcha to them).
    """
    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
    code = ''
    session = requests.session()
    try:
        session.proxies = proxies
        for _ in range(max_retries):
            # verify=False: the site presents an invalid TLS chain.
            page = session.get(href, headers=headers, timeout=30, verify=False)
            img_path = "".join(_CAPTCHA_SRC_RE.findall(page.content.decode()))
            img_url = "http://wssc.hubeigp.gov.cn" + img_path
            img = session.get(img_url, headers=headers, timeout=30, verify=False)
            code = jy_ocr(image=img.content)
            if code and len(code) == 6:
                break
        return code, session.cookies.get_dict()
    finally:
        # fix: the session was previously leaked (never closed).
        session.close()


class Details(feapder.BiddingDetailSpider):
    """Detail-page spider for the Hubei procurement online mall."""

    __custom_setting__ = dict(
        SPIDER_MAX_RETRY_TIMES=10
    )

    def start_callback(self):
        """Initialise the shared proxy, headers and attachment whitelist."""
        self.proxy = get_proxy()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Pragma": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
        }
        # Set (was a list) — only used for membership tests below.
        self.file_types = {
            'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb',
            'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg',
            'wps', 'ofd'
        }

    def start_requests(self):
        """Pull detail tasks from RabbitMQ and turn each into a Request."""
        data_list = self.get_tasks_by_rabbitmq(limit=50)
        for item in data_list:
            log.debug(item)
            request_params = item.get("request_params")
            # NOTE(security): eval() resolves the callback name from the task
            # payload. Tasks come from our own queue, but this still executes
            # arbitrary expressions — getattr(self, name) would be safer.
            yield feapder.Request(url=item.get("parse_url"),
                                  callback=eval(item.get("parse")),
                                  deal_detail=item.get("deal_detail"),
                                  verify=False,
                                  item=item,
                                  **request_params)

    def download_midware(self, request):
        """Attach a freshly solved captcha + its session cookies to the request."""
        captcha, cookies = ocr_captcha(self.headers, self.proxy)
        request.params = {
            "captcha": captcha
        }
        # The captcha is only valid together with the cookies of the session
        # that fetched it, so both travel on the same request.
        request.cookies = cookies
        request.headers = self.headers
        request.proxies = self.proxy

    def validate(self, request, response):
        """Reject responses whose detail container is missing (bad captcha etc.)."""
        html = response.xpath('//div[@class="jggs_main clearfix"]').extract_first()
        if html is None:
            raise ValueError('详情数据为空!')
        return True

    def detail_get(self, request, response):
        """Parse the detail page: store its HTML and download attachments."""
        items = request.item
        data_item = DataBakItem(**items)

        html = response.xpath('//div[@class="jggs_main clearfix"]').extract_first()
        data_item.contenthtml = html

        attachments = {}
        # fix: downloader hoisted out of the loop (was re-instantiated per file).
        downloader = AttachmentDownloader()
        file_list = response.xpath('//div[@class="jm_textcon"]//a[@href]')
        for info in file_list:
            file_url = info.xpath('./@href').extract_first()
            file_name = info.xpath('./text()').extract_first()
            if not file_name or not file_url:
                continue
            file_name = file_name.strip()
            # Prefer the extension from the link text; fall back to the URL.
            file_type = file_name.split('.')[-1].lower()
            if file_type not in self.file_types:
                file_type = file_url.split('.')[-1].lower()

            if file_type in self.file_types and "file" in file_url:
                attachment = downloader.fetch_attachment(
                    file_name=file_name,
                    file_type=file_type,
                    download_url=file_url
                )
                attachments[str(len(attachments) + 1)] = attachment

        if len(attachments) > 0:
            data_item.projectinfo = {"attachments": attachments}

        yield data_item

    def exception_request(self, request, response):
        """On failure, rotate the proxy and re-queue the request."""
        self.proxy = get_proxy()
        yield request


if __name__ == "__main__":
    Details(redis_key="lzz:Hbszfcgwssc").start()