# data_spider / platform-spiders
							# -*- coding: utf-8 -*-
"""
Created on 2025-05-24
---------
@summary: 茂名市公共资源交易平台
---------
@author: lzz
"""
import re
import feapder
from feapder.network.selector import Selector
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import extract_file_type
from gdmm_utils import *


class Details(feapder.BiddingDetailSpider):
    """Detail spider: turns a notice's JSON detail into HTML and collects attachments.

    ``start_requests`` builds one request per queued task, ``download_midware``
    attaches the proxy and signed headers, and ``parse`` assembles
    ``contenthtml`` plus the attachment map on the yielded item.
    """

    def start_requests(self):
        """Yield a detail request for every task returned by ``get_tasks_by_rabbitmq``."""
        data_list = self.get_tasks_by_rabbitmq(limit=20)
        for item in data_list:
            # ``item.get`` returns None when the key is absent and ``**None``
            # raises TypeError, so fall back to an empty mapping.
            request_params = item.get("request_params") or {}
            yield feapder.Request(url=item.get("parse_url"), item=item, proxies=False,
                                  deal_detail=item.get("deal_detail"), **request_params,
                                  callback='parse')

    def download_midware(self, request):
        """Set the proxy and the signed ``X-Dgi-Req-*`` headers on ``request``.

        ``get_enstr`` and ``get_proxy`` are not defined in this file; they are
        presumably provided by the ``gdmm_utils`` star import.
        """
        # Signature values are derived from the request's query parameters.
        en_str = get_enstr(request.params)
        request.proxies = get_proxy(socks5h=True)

        request.headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Referer": "https://ygp.gdzwfw.gov.cn/",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
            "X-Dgi-Req-App": en_str.get('X-Dgi-Req-App'),
            "X-Dgi-Req-Nonce": en_str.get('X-Dgi-Req-Nonce'),
            "X-Dgi-Req-Signature": en_str.get('X-Dgi-Req-Signature'),
            "X-Dgi-Req-Timestamp": en_str.get('X-Dgi-Req-Timestamp'),
        }

    def parse(self, request, response):
        """Build the item's ``contenthtml`` and attachments from the JSON response.

        The response's ``data.tradingNoticeColumnModelList`` is read as a list
        of columns: the first supplies the key/value notice table, the second
        (or third) the rich-text body, and the last the attached-file list.
        Attachments are gathered from that file list and from the iframe,
        ``<a href>`` and ``<img src>`` elements inside the rich text.

        Yields:
            The ``DataBakItem`` built from ``request.item``, with
            ``contenthtml`` set and ``projectinfo`` set when any attachment
            was collected.
        """
        attachments = {}
        items = request.item
        list_item = DataBakItem(**items)

        detail_info = response.json.get('data').get('tradingNoticeColumnModelList')
        ggxx_info = detail_info[0].get('multiKeyValueTableList')[0]

        # Notice-information table: one row per key/value pair.
        tphtml = ""
        if ggxx_info:
            for gd in ggxx_info:
                temps = f'''
                    <tr>
                        <th colspan="1"><span>{gd.get('key')}</span></th>
                        <td colspan="3"><span>{gd.get('value', '无')}</span>
                        </td>
                    </tr>
                    '''
                tphtml += temps

        ggxx_html = f'''
            <section>
                <h2 id="公告信息" class="subtitle">公告信息</h2>
                <div class="mt-2">
                    <div>
                        <div>
                            <table>
                                <tbody>
                                {tphtml}
                                </tbody>
                            </table>
                        </div>
                    </div>
                </div>
            </section>
            '''

        ggnr_html = detail_info[1].get('richtext') or ""
        if not ggnr_html:
            try:
                # Layout A: second column is a key/value table and the rich
                # text lives in the third column.
                ggnr_html = detail_info[2].get('richtext') or ""
                htxx_info = detail_info[1].get('multiKeyValueTableList')[0]
                hhh = ""
                for hd in htxx_info:
                    ttp = f'''
                    <tr>
                        <th colspan="1"><span>{hd.get('key')}</span></th>
                        <td colspan="3"><span>{hd.get('value')}</span></td>
                    </tr>
                    '''
                    hhh += ttp

                htxx_html = f'''
                <h2 id="合同信息" class="subtitle">合同信息</h2>
                <table>
                    <tbody>
                    {hhh}
                    </tbody>
                </table>
                '''
            except Exception:
                # Layout B (fallback when layout A cannot be read): the second
                # column carries a header list plus data rows. ``Exception``
                # rather than a bare ``except`` so KeyboardInterrupt/SystemExit
                # still propagate.
                ggnr_html = ""
                kb_info = detail_info[1].get('tradingNoticeTableColumnModel')
                title_info = kb_info.get('columnHeaderList')
                nr_info = kb_info.get('dataList')
                tt_html = ""
                for hd in title_info:
                    tmp1 = f'''
                    <th>{hd.get('name')}</th>
                    '''
                    tt_html += tmp1
                nr_html = ""
                # ``idx`` is the 1-based row number shown in the first cell.
                for idx, nr in enumerate(nr_info, 1):
                    tmp2 = f'''
                    <tr>
                        <td style="width: 60px;">
                            <div>{idx}</div>
                        </td>
                        <td>
                            <div><span>{nr.get('bidderName')}</span></div>
                        </td>
                        <td>
                            <div><span>{nr.get('bidderOrgCode')}</span></div>
                        </td>
                        <td>
                            <div><span>{nr.get('bidManager')}</span></div>
                        </td>
                        <td>
                            <div><span>{nr.get('isCommitMargin')}</span></div>
                        </td>
                        <td>
                            <div><span>{nr.get('checkinTime')}</span></div>
                        </td>
                    </tr>
                    '''
                    nr_html += tmp2

                htxx_html = f'''
                <table>
                    <thead>
                    <tr>
                        <th style="width: 60px;">序号</th>
                        {tt_html}
                    </tr>
                    </thead>
                    <tbody>
                        {nr_html}
                    </tbody>
                </table>
                '''
        else:
            htxx_html = ""

        # Attached files are read from the last column; treat any failure to
        # read it as "no files".
        try:
            f_list = detail_info[-1].get('noticeFileBOList')
        except Exception:
            f_list = None
        if f_list:
            ff_html = ""
            # The path segment after "new/jygg/" in the item's href is reused
            # in every download URL, so extract it once.
            version = "".join(re.findall(r'new/jygg/(.*?)/', list_item.href))
            for index, file_info in enumerate(f_list):
                f_id = file_info.get('rowGuid')
                f_url = f"https://ygp.gdzwfw.gov.cn/ggzy-portal/base/sys-file/download/{version}/{f_id}"
                f_name = file_info.get('fileName').strip()
                temp = f'''
                <li>
                    <span>附件名称 {index+1}</span>
                    <div>
                        <div>
                            <a href="{f_url}">{f_name}</a>
                        </div>
                    </div>
                </li>
                '''
                ff_html += temp
                f_type = extract_file_type(f_name, f_url, ['zbs'])
                if f_type:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=f_name, file_type=f_type, download_url=f_url,
                        proxies=request.proxies)
                    attachments[str(len(attachments) + 1)] = attachment

            file_html = f'''
            <div class="fileList">
                <h2 id="相关附件" class="subtitle">相关附件</h2>
                <ul>
                    {ff_html}
                </ul>
            </div>
            '''
        else:
            file_html = ""

        list_item.contenthtml = ggxx_html + htxx_html + ggnr_html + file_html

        # Parse the rich text once and reuse it for the three lookups below.
        body = Selector(ggnr_html)

        # An embedded iframe, when present, is downloaded as the notice body.
        iframe_url = body.xpath('//iframe/@src').extract_first()
        # Without an iframe there is nothing to classify or download.
        fm_type = extract_file_type('公告内容', iframe_url) if iframe_url else None
        if fm_type:
            attachmentf = AttachmentDownloader().fetch_attachment(
                file_name='公告内容', file_type=fm_type, download_url=iframe_url,
                proxies=request.proxies)
            attachments[str(len(attachments) + 1)] = attachmentf

        # NOTE(review): the inline-link and image downloads below do not pass
        # ``proxies`` while the two downloads above do — confirm this is intended.
        for info in body.xpath('//a[@href]'):
            file_name = "".join(info.xpath('.//text()').extract()).strip()
            file_url = info.xpath('./@href').extract_first()
            file_type = extract_file_type(file_name, file_url, ['zbs'])
            if file_type:
                attachment = AttachmentDownloader().fetch_attachment(
                    file_name=file_name, file_type=file_type, download_url=file_url)
                attachments[str(len(attachments) + 1)] = attachment

        # Only images whose src contains "download" are fetched; they are
        # named by their 1-based position among all images.
        for position, info in enumerate(body.xpath('//img[@src]'), 1):
            fi_name = str(position)
            fi_url = info.xpath('./@src').extract_first()
            if "download" in fi_url:
                attachment = AttachmentDownloader().fetch_attachment(
                    file_name=fi_name, file_type='jpg', download_url=fi_url)
                attachments[str(len(attachments) + 1)] = attachment

        if attachments:
            list_item.projectinfo = {"attachments": attachments}

        yield list_item


if __name__ == '__main__':
    # Script entry point: build the spider with its redis key, then start it.
    detail_spider = Details(redis_key="lzz:mmsggzyjypt_gcjs")
    detail_spider.start()