# -*- coding: utf-8 -*-
"""
Created on 2023-7-22
---------
@summary: 云采通高校采购联盟 - result announcements
---------
@author: lzz
"""
from urllib.parse import urljoin

import feapder
from feapder.network.selector import Selector
from feapder.utils.log import log
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import remove_htmldata


class Spider(feapder.BiddingDetailSpider):

    def start_requests(self):
        data_list = self.get_tasks_by_rabbitmq(limit=30)
        for item in data_list:
            log.debug(item)
            request_params = item.get("request_params") or {}
            yield feapder.Request(url=item.get("parse_url"),
                                  render=True,
                                  render_time=item.get("render_time"),
                                  # "parse" holds the callback method name, e.g. "detail_get"
                                  callback=eval(item.get("parse")),
                                  item=item,
                                  files_info=item.get("files"),
                                  deal_detail=item.get("deal_detail"),
                                  **request_params)

    def detail_get(self, request, response):
        # The notice body may be embedded in an iframe; if so, switch to it
        # and rebuild the selector from the iframe's HTML.
        iframe = response.xpath('//iframe[@id="contentFrame"]').extract_first()
        if iframe:
            content_frame = response.browser.tab('#contentFrame')
            response = Selector(content_frame.html)

        items = request.item
        data_item = DataBakItem(**items)

        # Try each known detail-container xpath in turn; fall back to the whole body.
        html = ''
        detail_path = [
            '//table[@class="MsoNormalTable"]',
            '//div[@class="project-details positionrl"]',
            '//div[@class="content"]',
            '//div[@class="project-war medium"]',
            '/html/body'
        ]
        for xpath in detail_path:
            html = response.xpath(xpath).extract_first()  # detailed tender content
            if html is not None:
                break

        # Strip navigation/print widgets and other page chrome from the detail HTML.
        rm_list = [
            '//div[@class="project-bid print-hide"]',
            '//div[@class="select-item ml10"]',
            '//div[@id="guide8"]',
            '打印公告'
        ]
        data_item.contenthtml = remove_htmldata(rm_list, html, response)

        attachments = {}
        if request.files_info:
            files_info = request.files_info
            files = response.xpath(files_info.get("list_xpath"))
            for info in files:
                file_url = info.xpath(files_info.get("url_xpath")).extract_first()
                file_name = info.xpath(files_info.get("name_xpath")).extract_first()
                if not file_url:
                    # extract_first() may return None; skip entries without a link
                    continue
                file_name = (file_name or data_item.title).strip()

                # Resolve relative attachment links against the configured host.
                if 'http' not in file_url and files_info.get("host"):
                    file_url = urljoin(files_info.get("host"), file_url)

                # If no file type is configured, infer it from the URL suffix,
                # falling back to the file name's suffix.
                if not files_info.get("file_type"):
                    file_type = file_url.split(".")[-1].lower()
                    if file_type not in files_info.get("files_type"):
                        file_type = file_name.split(".")[-1].lower()
                else:
                    file_type = files_info.get("file_type")

                if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
                    # Avoid a doubled extension in the stored file name.
                    if f'.{file_type}' in file_name:
                        file_name = file_name.replace(f'.{file_type}', '').strip()
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name,
                        file_type=file_type,
                        download_url=file_url,
                        proxies=request.get_proxies())
                    attachments[str(len(attachments) + 1)] = attachment

        if len(attachments) > 0:
            data_item.projectinfo = {"attachments": attachments}

        yield data_item


if __name__ == "__main__":
    Spider(redis_key="lzz:yctgxcglm_jggg").start()
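
# A minimal sketch of the task payload start_requests() expects from RabbitMQ.
# The field names below are taken from the code above; the concrete values are
# illustrative assumptions, not real task data:
#
# {
#     "parse_url": "https://example.com/notice/detail?id=123",  # hypothetical URL
#     "render_time": 3,
#     "parse": "detail_get",          # resolved via eval() to the callback method
#     "request_params": {},
#     "deal_detail": [],
#     "files": {
#         "list_xpath": '//div[@class="attachments"]//a',       # illustrative xpath
#         "url_xpath": "./@href",
#         "name_xpath": "./text()",
#         "host": "https://example.com",
#         "files_type": ["pdf", "doc", "docx", "xls", "xlsx", "zip", "rar"],
#         "url_key": "http",
#     },
# }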