# -*- coding: utf-8 -*-
"""
Created on 2025-04-23
---------
@summary: 上海市公共资源交易中心
---------
@author: lzz
"""

import re
import time
import random

import requests
import feapder
from collections import namedtuple
from feapder.network.selector import Selector
from feapder.utils.webdriver import WebDriver
from items.spider_item import DataBakItem
from untils.tools import extract_file_type, get_proxy
from untils.attachment import AttachmentDownloader


def get_html(url):
    """Render the detail-page iframe in a browser and return its inner HTML."""
    _kwargs = {}
    _kwargs.setdefault("load_images", False)
    _kwargs.setdefault("headless", False)
    _kwargs.setdefault("driver_type", "CHROME")
    _kwargs.setdefault("render_time", 3)
    _kwargs.setdefault("usages_local_driver", False)
    with WebDriver(**_kwargs) as browser:
        try:
            browser.get(url)
            time.sleep(5)
            iframe_html = Selector(browser.page_source).xpath('//div[@id="app"]').extract_first("")
            return iframe_html
        except Exception as e:
            print(f"failed to fetch iframe content: {e}")
            return ""


def get_iframe(xmid, proxies=None):
    """Query the suaee.com project-preview API and return its attachment list (xgfj)."""
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Referer": "https://www.suaee.com/suaeeHome/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "projectType": "suaeeHome",
        "sourcecode": "SUAEE"
    }
    url = "https://www.suaee.com/manageproject/foreign/projectPreview/detail"
    params = {"xmid": xmid}
    try:
        response = requests.get(url, headers=headers, params=params, timeout=30,
                                proxies=proxies, verify=False)
        file_list = response.json().get('data').get('xgfj')
        return file_list
    except Exception:
        return []


class Shsggzyjyzx(feapder.BiddingListSpider):

    def start_callback(self):
        Menu = namedtuple('Menu', ['channel', 'code', 'typeone', 'crawl_page'])

        self.site = "上海市公共资源交易中心"

        self.menus = [
            Menu('综合采购', 'sh_shsggzyjyzx_wzcg', 'jyxxwzcg', 1),
        ]

        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Referer": "https://www.shggzy.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        }
        self.count = 0
        self.proxies = get_proxy()

    def start_requests(self):
        for menu in self.menus:
            start_url = f'https://www.shggzy.com/{menu.typeone}.jhtml'
            yield feapder.Request(url=start_url, item=menu._asdict(), use_session=True,
                                  random_user_agent=False, page=1, proxies=False)

    def download_midware(self, request):
        menu = request.item
        typeone = menu.get('typeone')
        if request.session is None:
            request.session = requests.session()
        # A priming request to the column page returns a "cExt" token in the response
        # headers, which must be sent back as a query parameter.
        url = f"https://www.shggzy.com/{typeone}"
        res = request.session.get(url, headers=self.headers, proxies=self.proxies,
                                  timeout=30, verify=False)
        params = {"cExt": f"{res.headers.get('cExt')}"}
        request.params = params
        request.headers = self.headers
        request.proxies = self.proxies
        if request.page > 1:
            request.url = f'https://www.shggzy.com/{typeone}_{request.page}.jhtml'

    def exception_request(self, request, response):
        # Rotate the proxy and retry the failed request.
        self.proxies = get_proxy()
        yield request

    def parse(self, request, response):
        if self.count > 5:
            return
        if isinstance(response.text, bytes):
            # Response body came back as raw bytes (page not decoded); retry a limited number of times.
            self.count += 1
            yield request
        else:
            self.count = 0
            menu = request.item
            info_list = response.xpath('//div[@class="gui-title-bottom"]/ul/li')
            for info in info_list:
                href_param = info.xpath('./@onclick').extract_first().strip()
                href = "https://www.shggzy.com" + "".join(re.findall(r"window\.open\('(.*?)'", href_param, re.S)).strip()
                project_code = info.xpath('./span[last()-1]/text()').extract_first("").strip()
                dedup_params = href.split('?')[0] + project_code
                title = info.xpath('./span[@class="cs-span2"]/text()').extract_first().strip()
                create_time = info.xpath('./span[last()]/text()').extract_first().strip()

                area = "上海"    # province
                city = "上海市"  # city

                data_item = DataBakItem()                 # item that carries the scraped data
                data_item.href = href                     # detail page link
                data_item.unique_key = (dedup_params,)    # dedup key: detail url (no query) + project code
                data_item.channel = menu.get("channel")   # channel defined in self.menus above
                data_item.spidercode = menu.get("code")   # spider code defined in self.menus above
                data_item.title = title                   # announcement title
                data_item.publishtime = create_time       # publish time
                data_item.site = self.site
                data_item.area = area                     # province; defaults to "全国" if not set
                data_item.city = city                     # city; defaults to empty if not set
                if menu.get('code') in ['sh_shsggzyjyzx_nyys', 'sh_shsggzyjyzx_sfpm']:
                    data_item.infoformat = 3

                time.sleep(random.randint(3, 5))
                res = requests.get(href, headers=self.headers, proxies=self.proxies,
                                   verify=False, timeout=30)
                if res.text:
                    attachments = {}
                    iframe_html = ""
                    root = Selector(res.text)
                    html = root.xpath('//div[@class="table_1"]').extract_first()  # announcement detail content
                    if not html:
                        html = root.xpath('//div[@class="content"]').extract_first()

                    # Some detail pages embed the content in an iframe hosted on suaee.com.
                    iframe_url = root.xpath('//iframe[@id="frame-content"]/@src').extract_first("")
                    if iframe_url:
                        iframe_html = get_html(iframe_url)
                        xmid = "".join(re.findall('xmid=(.*?)&', iframe_url))
                        file_list = get_iframe(xmid=xmid, proxies=self.proxies)
                        if file_list:
                            for ff in file_list:
                                f_name = ff.get('fileName')
                                fileType = ff.get('fileType')
                                filePath = ff.get('filePath')
                                f_url = (f"https://www.suaee.com/manageserver/fileDow?type={fileType}"
                                         f"&filePath={filePath.replace('/', '%2F')}&fileName={f_name}")
                                f_type = extract_file_type(f_name, f_url)
                                if f_type:
                                    attachment = AttachmentDownloader().fetch_attachment(
                                        file_name=f_name, file_type=f_type, download_url=f_url)
                                    attachments[str(len(attachments) + 1)] = attachment

                    # Attachments linked directly in the page body.
                    files = root.xpath('//div[@class="content-box"]//div[@class="content"]//a[@href]')
                    for file_info in files:
                        file_url = file_info.xpath('./@href').extract_first()
                        file_name = file_info.xpath('./text()').extract_first()
                        file_type = extract_file_type(file_name, file_url)
                        if file_type and 'file' in file_url:
                            attachment = AttachmentDownloader().fetch_attachment(
                                file_name=file_name, file_type=file_type, download_url=file_url)
                            attachments[str(len(attachments) + 1)] = attachment

                    if attachments:
                        data_item.projectinfo = {"attachments": attachments}

                    # Strip the section list block from the content html.
                    extra_html = root.xpath('//div[@id="sectionListDiv"]').extract_first()
                    if extra_html and html:
                        html = html.replace(extra_html, '')

                    data_item.contenthtml = (html or "") + iframe_html

                    yield data_item
                time.sleep(random.randint(3, 5))

        # infinite pagination
        request = self.infinite_pages(request, response)
        yield request


if __name__ == "__main__":
    Shsggzyjyzx(redis_key="lzz:shsggzyjyzx_wzcg").start()