# -*- coding: utf-8 -*-
"""
Created on 2025-04-23
---------
@summary: 上海市公共资源交易中心
---------
@author: lzz
"""
import re
import time
import random
from collections import namedtuple

import requests
import feapder
from feapder.network.selector import Selector
from feapder.utils.webdriver import WebDriver

from items.spider_item import DataBakItem
from untils.tools import extract_file_type, get_proxy
from untils.attachment import AttachmentDownloader


def get_html(url):
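    """Render ``url`` in a feapder WebDriver and return the HTML of the
    ``div#app`` container, or an empty string if rendering fails."""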
    _kwargs = {}
    _kwargs.setdefault("load_images", False)
    _kwargs.setdefault("headless", False)
    _kwargs.setdefault("driver_type", "CHROME")
    _kwargs.setdefault("render_time", 3)
    _kwargs.setdefault("usages_local_driver", False)
    with WebDriver(**_kwargs) as browser:
        try:
            browser.get(url)
            time.sleep(5)
            iframe_html = Selector(browser.page_source).xpath('//div[@id="app"]').extract_first("")
            return iframe_html
        except Exception as e:
            print(f"failed to fetch iframe content: {e}")
            return ""


def get_iframe(xmid, proxies=None):
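    """Call the suaee project-preview API for project ``xmid`` and return its
    attachment list (``xgfj``); returns an empty list on any error."""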
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Referer": "https://www.suaee.com/suaeeHome/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "projectType": "suaeeHome",
        "sourcecode": "SUAEE"
    }
    url = "https://www.suaee.com/manageproject/foreign/projectPreview/detail"
    params = {
        "xmid": xmid
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=30, proxies=proxies, verify=False)
        file_list = response.json().get('data').get('xgfj')
        return file_list
    except Exception:
        return []


class Shsggzyjyzx(feapder.BiddingListSpider):
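    """List spider for 上海市公共资源交易中心: walks the configured announcement
    channels, fetches each detail page, downloads attachments and yields
    ``DataBakItem`` records."""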

    def start_callback(self):
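        """Initialise crawl menus, request headers, retry counter and proxy."""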
        Menu = namedtuple('Menu', ['channel', 'code', 'typeone', 'crawl_page'])
        self.site = "上海市公共资源交易中心"
        self.menus = [
            Menu('碳排放权', 'sh_shsggzyjyzx_tpfq', 'jyxxtpf', 1),
        ]
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Referer": "https://www.shggzy.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        }
        self.count = 0
        self.proxies = get_proxy()

    def start_requests(self):
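        """Yield one session-based list request per configured menu."""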
        for menu in self.menus:
            start_url = f'https://www.shggzy.com/{menu.typeone}.jhtml'
            yield feapder.Request(url=start_url, item=menu._asdict(), use_session=True,
                                  random_user_agent=False, page=1, proxies=False)

    def download_midware(self, request):
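        """Warm up the session to obtain the ``cExt`` response header, attach it
        as a query parameter, and rewrite the URL for pages beyond the first."""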
        menu = request.item
        typeone = menu.get('typeone')
        if request.session is None:
            request.session = requests.session()
        url = f"https://www.shggzy.com/{typeone}"
        res = request.session.get(url, headers=self.headers, proxies=self.proxies, timeout=30, verify=False)
        params = {"cExt": f"{res.headers.get('cExt')}"}
        request.params = params
        request.headers = self.headers
        request.proxies = self.proxies
        if request.page > 1:
            urll = f'https://www.shggzy.com/{typeone}_{request.page}.jhtml'
            request.url = urll

    def exception_request(self, request, response):
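        """Rotate the proxy on a failed request and retry it."""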
        self.proxies = get_proxy()
        yield request

    def parse(self, request, response):
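        """Parse the list page, fetch each detail page, collect inline and
        iframe attachments, and yield one ``DataBakItem`` per announcement."""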
        if self.count > 5:
            return
        if isinstance(response.text, bytes):
            self.count += 1
            yield request
        else:
            self.count = 0
            menu = request.item
            info_list = response.xpath('//div[@class="gui-title-bottom"]/ul/li')
            for info in info_list:
                href_param = info.xpath('./@onclick').extract_first().strip()
                href = "https://www.shggzy.com" + "".join(re.findall(r"window\.open\('(.*?)'", href_param, re.S)).strip()
                project_code = info.xpath('./span[last()-1]/text()').extract_first("").strip()
                dedup_params = href.split('?')[0] + project_code
                title = info.xpath('./span[@class="cs-span2"]/text()').extract_first().strip()
                create_time = info.xpath('./span[last()]/text()').extract_first().strip()
                area = "上海"  # province
                city = "上海市"  # city
                data_item = DataBakItem()  # container for one announcement record
                data_item.href = href  # detail page URL
                data_item.unique_key = (dedup_params,)
                data_item.channel = menu.get("channel")  # channel defined in the menus above
                data_item.spidercode = menu.get("code")  # spider code defined in the menus above
                data_item.title = title  # announcement title
                data_item.publishtime = create_time  # publish time
                data_item.site = self.site
                data_item.area = area  # province, defaults to "全国"
                data_item.city = city  # city, defaults to empty

                if menu.get('code') in ['sh_shsggzyjyzx_nyys', 'sh_shsggzyjyzx_sfpm']:
                    data_item.infoformat = 3
                time.sleep(random.randint(3, 5))
                res = requests.get(href, headers=self.headers, proxies=self.proxies, verify=False, timeout=30)
                if res.text:
                    attachments = {}
                    iframe_html = ""
                    root = Selector(res.text)
                    html = root.xpath('//div[@class="table_1"]').extract_first()  # announcement detail content
                    if not html:
                        html = root.xpath('//div[@class="content"]').extract_first("")
                    iframe_url = root.xpath('//iframe[@id="frame-content"]/@src').extract_first("")
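                    # The detail body may be rendered inside an iframe; fetch its
                    # rendered HTML and its attachment list via the suaee API.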
                    if iframe_url:
                        iframe_html = get_html(iframe_url)
                        xmid = "".join(re.findall('xmid=(.*?)&', iframe_url))
                        file_list = get_iframe(xmid=xmid, proxies=self.proxies)
                        if file_list:
                            for ff in file_list:
                                f_name = ff.get('fileName')
                                fileType = ff.get('fileType')
                                filePath = ff.get('filePath')
                                f_url = f"https://www.suaee.com/manageserver/fileDow?type={fileType}&filePath={filePath.replace('/', '%2F')}&fileName={f_name}"
                                f_type = extract_file_type(f_name, f_url)
                                if f_type:
                                    attachment = AttachmentDownloader().fetch_attachment(
                                        file_name=f_name, file_type=f_type, download_url=f_url)
                                    attachments[str(len(attachments) + 1)] = attachment
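                    # Attachments may also be linked directly in the announcement body.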
                    files = root.xpath('//div[@class="content-box"]//div[@class="content"]//a[@href]')
                    if len(files) > 0:
                        for index, info in enumerate(files):
                            file_url = info.xpath('./@href').extract_first()
                            file_name = info.xpath('./text()').extract_first()
                            file_type = extract_file_type(file_name, file_url)
                            if file_type and 'file' in file_url:
                                attachment = AttachmentDownloader().fetch_attachment(
                                    file_name=file_name, file_type=file_type, download_url=file_url)
                                attachments[str(len(attachments) + 1)] = attachment
                    if attachments:
                        data_item.projectinfo = {"attachments": attachments}
                    extra_html = root.xpath('//div[@id="sectionListDiv"]').extract_first()
                    if extra_html and html:
                        html = html.replace(extra_html, '')
                    data_item.contenthtml = html + iframe_html
                    yield data_item
                time.sleep(random.randint(3, 5))
            # infinite pagination
            request = self.infinite_pages(request, response)
            yield request


if __name__ == "__main__":
    Shsggzyjyzx(redis_key="lzz:shsggzyjyzx_tpfq").start()