土地出让.py

# -*- coding: utf-8 -*-
"""
Created on 2025-04-23
---------
@summary: 上海市公共资源交易中心 (Shanghai Public Resource Trading Center)
---------
@author: lzz
"""
import re
import time
import random
from collections import namedtuple

import requests
import urllib3

import feapder
from feapder.network.selector import Selector
from feapder.utils.webdriver import WebDriver
from items.spider_item import DataBakItem
from untils.tools import extract_file_type, get_proxy
from untils.attachment import AttachmentDownloader

# verify=False is used on every request below; silence the TLS warnings.
urllib3.disable_warnings()


def get_html(url):
    """Render a detail-page iframe in a real browser and return its HTML."""
    _kwargs = {}
    _kwargs.setdefault("load_images", False)
    _kwargs.setdefault("headless", False)
    _kwargs.setdefault("driver_type", "CHROME")
    _kwargs.setdefault("render_time", 3)
    _kwargs.setdefault("usages_local_driver", False)
    with WebDriver(**_kwargs) as browser:
        try:
            browser.get(url)
            time.sleep(5)
            return Selector(browser.page_source).xpath('//div[@id="app"]').extract_first("")
        except Exception:
            print("iframe fetch failed")
            return ""


def get_iframe(xmid, proxies=None):
    """Fetch the related-attachment list (xgfj) for a project from suaee."""
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Referer": "https://www.suaee.com/suaeeHome/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "projectType": "suaeeHome",
        "sourcecode": "SUAEE"
    }
    url = "https://www.suaee.com/manageproject/foreign/projectPreview/detail"
    params = {"xmid": xmid}
    try:
        response = requests.get(url, headers=headers, params=params,
                                timeout=30, proxies=proxies, verify=False)
        return (response.json().get('data') or {}).get('xgfj') or []
    except Exception:
        return []
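
# Shape of the payload consumed above, inferred from how parse() uses the
# list below (not an official schema):
#   {"data": {"xgfj": [
#       {"fileName": "...", "fileType": "...", "filePath": "..."}, ...]}}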


class Shsggzyjyzx(feapder.BiddingListSpider):

    def start_callback(self):
        Menu = namedtuple('Menu', ['channel', 'code', 'typeone', 'crawl_page'])
        self.site = "上海市公共资源交易中心"
        self.menus = [
            Menu('土地出让', 'sh_shsggzyjyzx_tdcr', 'jyxxtd', 1),
        ]
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Referer": "https://www.shggzy.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        }
        self.count = 0
        self.proxies = get_proxy()
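
    # Menu fields, for reference: channel is the column label stored on each
    # item, code is the spider code, typeone is the list page's URL path
    # segment (jyxxtd -> /jyxxtd.jhtml), and crawl_page is the page budget
    # (presumably consumed by infinite_pages; BiddingListSpider's internals
    # are not shown here, so that part is an inference).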

    def start_requests(self):
        for menu in self.menus:
            start_url = f'https://www.shggzy.com/{menu.typeone}.jhtml'
            yield feapder.Request(url=start_url, item=menu._asdict(), use_session=True,
                                  random_user_agent=False, page=1, proxies=False)

    def download_midware(self, request):
        menu = request.item
        typeone = menu.get('typeone')
        if request.session is None:
            request.session = requests.session()
        # Prime the session: a GET on the channel root returns the cExt
        # anti-crawl token in the response headers, which the list page
        # expects back as a query parameter.
        url = f"https://www.shggzy.com/{typeone}"
        res = request.session.get(url, headers=self.headers, proxies=self.proxies,
                                  timeout=30, verify=False)
        request.params = {"cExt": res.headers.get('cExt')}
        request.headers = self.headers
        request.proxies = self.proxies
        if request.page > 1:
            request.url = f'https://www.shggzy.com/{typeone}_{request.page}.jhtml'
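
    # The cExt handshake in isolation (a sketch; the token's semantics are
    # inferred from the code above, not from a documented API):
    #   sess = requests.session()
    #   res = sess.get("https://www.shggzy.com/jyxxtd", verify=False)
    #   sess.get("https://www.shggzy.com/jyxxtd.jhtml",
    #            params={"cExt": res.headers.get("cExt")})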

    def exception_request(self, request, response):
        # On failure, rotate the proxy and retry the request.
        self.proxies = get_proxy()
        yield request

    def parse(self, request, response):
        if self.count > 5:
            return
        if isinstance(response.text, bytes):
            # Body failed to decode (likely an anti-crawl page); retry,
            # giving up after 5 consecutive failures.
            self.count += 1
            yield request
        else:
            self.count = 0
            menu = request.item
            info_list = response.xpath('//div[@class="gui-title-bottom"]/ul/li')
            for info in info_list:
                href_param = info.xpath('./@onclick').extract_first().strip()
                href = "https://www.shggzy.com" + "".join(
                    re.findall(r"window\.open\('(.*?)'", href_param, re.S)).strip()
                project_code = info.xpath('./span[last()-1]/text()').extract_first("").strip()
                dedup_params = href.split('?')[0] + project_code
                title = info.xpath('./span[@class="cs-span2"]/text()').extract_first().strip()
                create_time = info.xpath('./span[last()]/text()').extract_first().strip()
                area = "上海"    # province
                city = "上海市"  # city

                data_item = DataBakItem()                # data pipeline item
                data_item.href = href                    # detail page link
                data_item.unique_key = (dedup_params,)   # dedup key
                data_item.channel = menu.get("channel")  # channel from the menus above
                data_item.spidercode = menu.get("code")  # spider code from the menus above
                data_item.title = title                  # announcement title
                data_item.publishtime = create_time      # publish time
                data_item.site = self.site
                data_item.area = area
                data_item.city = city
                if menu.get('code') in ['sh_shsggzyjyzx_nyys', 'sh_shsggzyjyzx_sfpm']:
                    data_item.infoformat = 3

                time.sleep(random.randint(3, 5))
                res = requests.get(href, headers=self.headers, proxies=self.proxies,
                                   verify=False, timeout=30)
                if res.text:
                    attachments = {}
                    iframe_html = ""
                    root = Selector(res.text)
                    # Announcement body: try the two known container layouts.
                    html = root.xpath('//div[@class="table_1"]').extract_first()
                    if not html:
                        html = root.xpath('//div[@class="content"]').extract_first()
                    iframe_url = root.xpath('//iframe[@id="frame-content"]/@src').extract_first("")
                    if iframe_url:
                        # Detail body lives in a suaee iframe: render it and
                        # pull its attachment list via the API.
                        iframe_html = get_html(iframe_url)
                        xmid = "".join(re.findall('xmid=(.*?)&', iframe_url))
                        file_list = get_iframe(xmid=xmid, proxies=self.proxies)
                        for ff in file_list:
                            f_name = ff.get('fileName')
                            file_type_param = ff.get('fileType')
                            file_path = ff.get('filePath')
                            f_url = (f"https://www.suaee.com/manageserver/fileDow"
                                     f"?type={file_type_param}"
                                     f"&filePath={file_path.replace('/', '%2F')}"
                                     f"&fileName={f_name}")
                            f_type = extract_file_type(f_name, f_url)
                            if f_type:
                                attachment = AttachmentDownloader().fetch_attachment(
                                    file_name=f_name, file_type=f_type, download_url=f_url)
                                attachments[str(len(attachments) + 1)] = attachment
                    # Attachments linked directly in the announcement body.
                    files = root.xpath('//div[@class="content-box"]//div[@class="content"]//a[@href]')
                    for file_node in files:
                        file_url = file_node.xpath('./@href').extract_first()
                        file_name = file_node.xpath('./text()').extract_first()
                        file_type = extract_file_type(file_name, file_url)
                        if file_type and 'file' in file_url:
                            attachment = AttachmentDownloader().fetch_attachment(
                                file_name=file_name, file_type=file_type, download_url=file_url)
                            attachments[str(len(attachments) + 1)] = attachment
                    if attachments:
                        data_item.projectinfo = {"attachments": attachments}

                    # Strip the bid-section list block from the stored HTML.
                    extra_html = root.xpath('//div[@id="sectionListDiv"]').extract_first()
                    if extra_html and html:
                        html = html.replace(extra_html, '')
                    data_item.contenthtml = (html or "") + iframe_html
                    yield data_item

                time.sleep(random.randint(3, 5))

            # Infinite paging: re-queue this request for the next list page.
            request = self.infinite_pages(request, response)
            yield request


if __name__ == "__main__":
    Shsggzyjyzx(redis_key="lzz:shsggzyjyzx_tdcr").start()
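
# Running this spider (a sketch): start it directly -- `python 土地出让.py`.
# The redis_key above namespaces the task queue, which assumes a Redis
# instance configured in feapder's settings (not shown in this file).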