123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252 |
# -*- coding: utf-8 -*-
"""
Created on 2024-01-04
---------
@summary: Guangdong Provincial Public Resource Trading Platform (广东省公共资源交易平台)
---------
@author: lzz
"""
- import re
- import feapder
- from feapder.network.selector import Selector
- from items.spider_item import DataBakItem
- from untils.attachment import AttachmentDownloader
- from untils.tools import extract_file_type
- from gd_utils import *
class Spider(feapder.BiddingDetailSpider):
    """Detail-page spider for the Guangdong public resource trading portal.

    Each task supplies a detail API URL (``parse_url``) plus request
    parameters. The JSON response is converted into an HTML document
    (``contenthtml``) made of up to four parts: the notice key/value table,
    an optional contract or bidder table, the rich-text notice body and the
    list of attached files. Attachments are downloaded along the way and
    recorded under ``projectinfo["attachments"]``.
    """

    def start_callback(self):
        """Create the attachment downloader used by :meth:`parse`."""
        self._downloader = AttachmentDownloader()

    def start_requests(self):
        """Yield one detail request per task from ``get_tasks_by_rabbitmq``.

        The task dict travels with the request as ``item``; its
        ``request_params`` mapping is expanded into ``feapder.Request``
        keyword arguments.
        """
        data_list = self.get_tasks_by_rabbitmq(limit=30)
        for item in data_list:
            request_params = item.get("request_params")
            yield feapder.Request(url=item.get("parse_url"),
                                  item=item,
                                  # The proxy is assigned per request in
                                  # download_midware instead.
                                  proxies=False,
                                  deal_detail=item.get("deal_detail"),
                                  **request_params)

    def download_midware(self, request):
        """Attach a proxy and the signed ``X-Dgi-Req-*`` headers in place.

        The signature values come from ``get_enstr(request.params)``
        (presumably provided by ``gd_utils`` -- confirm), so they are
        recomputed for every request.
        """
        en_str = get_enstr(request.params)
        request.proxies = get_proxy(socks5h=True)
        request.headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Referer": "https://ygp.gdzwfw.gov.cn/ggzy-portal/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
            "X-Dgi-Req-App": en_str.get('X-Dgi-Req-App'),
            "X-Dgi-Req-Nonce": en_str.get('X-Dgi-Req-Nonce'),
            "X-Dgi-Req-Signature": en_str.get('X-Dgi-Req-Signature'),
            "X-Dgi-Req-Timestamp": en_str.get('X-Dgi-Req-Timestamp'),
        }

    def parse(self, request, response):
        """Build the detail item from the JSON response and yield it.

        Layout of ``data.tradingNoticeColumnModelList`` as read here:

        * element 0 holds the notice key/value rows;
        * element 1 holds either the rich-text body, or (when it has no
          rich text) a contract key/value table or a bidder table;
        * element 2 holds the rich-text body in the contract case;
        * the last element may hold ``noticeFileBOList`` (attached files).

        Links found inside the rich-text body are downloaded as attachments
        as well.
        """
        attachments = {}
        data_item = DataBakItem(**request.item)
        detail_info = response.json.get('data').get('tradingNoticeColumnModelList')

        ggxx_html = self._render_notice_info(
            detail_info[0].get('multiKeyValueTableList')[0]
        )

        ggnr_html = detail_info[1].get('richtext') or ""
        if ggnr_html:
            htxx_html = ""
        else:
            try:
                # Contract layout: body in element 2, contract rows in element 1.
                ggnr_html = detail_info[2].get('richtext') or ""
                htxx_html = self._render_contract_info(
                    detail_info[1].get('multiKeyValueTableList')[0]
                )
            except Exception:
                # Deliberate best-effort fallback: any failure of the contract
                # layout means element 1 is treated as a bidder table instead.
                # ``Exception`` (not a bare except) keeps KeyboardInterrupt and
                # SystemExit propagating.
                ggnr_html = ""
                htxx_html = self._render_bidder_table(
                    detail_info[1].get('tradingNoticeTableColumnModel')
                )

        try:
            f_list = detail_info[-1].get('noticeFileBOList')
        except Exception:
            f_list = None

        if f_list:
            file_html = self._render_listed_files(
                f_list, data_item.href, request, attachments
            )
        else:
            file_html = ""

        data_item.contenthtml = ggxx_html + htxx_html + ggnr_html + file_html

        # Files linked from inside the rich-text body.
        file_list = Selector(ggnr_html).xpath('//a[@href]')
        for info in file_list or []:
            file_name = "".join(info.xpath('.//text()').extract()).strip()
            file_url = info.xpath('./@href').extract_first()
            file_type = extract_file_type(file_name, file_url)
            if file_type:
                attachment = self._downloader.fetch_attachment(
                    file_name=file_name,
                    file_type=file_type,
                    download_url=file_url
                )
                attachments[str(len(attachments) + 1)] = attachment

        if attachments:
            data_item.projectinfo = {"attachments": attachments}

        yield data_item

    @staticmethod
    def _render_notice_info(rows):
        """Render the notice key/value rows as the "公告信息" HTML section.

        A missing or empty ``rows`` still produces the section (with an empty
        table body) so the caller can always assemble ``contenthtml``.
        """
        row_parts = []
        for row in rows or []:
            value = row.get('value')
            if value is None:
                # Covers both a missing key and an explicit JSON null, which
                # would otherwise be rendered as the text "None".
                value = '无'
            row_parts.append(f'''
<tr>
<th colspan="1"><span>{row.get('key')}</span></th>
<td colspan="3"><span>{value}</span>
</td>
</tr>
''')
        tphtml = "".join(row_parts)
        return f'''
<section>
<h2 id="公告信息" class="subtitle">公告信息</h2>
<div class="mt-2">
<div>
<div>
<table>
<tbody>
{tphtml}
</tbody>
</table>
</div>
</div>
</div>
</section>
'''

    @staticmethod
    def _render_contract_info(rows):
        """Render contract key/value rows as the "合同信息" HTML table."""
        htxx_dict = {row.get('key'): row.get('value') for row in rows}
        return f'''
<h2 id="合同信息" class="subtitle">合同信息</h2>
<table>
<tbody>
<tr>
<th colspan="1"><span>合同名称</span></th>
<td colspan="3"><span>{htxx_dict.get('合同名称')}</span></td>
</tr>
<tr>
<th colspan="1"><span>招标人名称</span></th>
<td colspan="1"><span>{htxx_dict.get('招标人名称')}</span></td>
<th colspan="1"><span>中标人名称</span></th>
<td colspan="1"><span>{htxx_dict.get('中标人名称')}</span></td>
</tr>
<tr>
<th colspan="1"><span>合同期限</span></th>
<td colspan="1"><span>{htxx_dict.get('合同期限')}</span></td>
<th colspan="1"><span>合同签署时间</span></th>
<td colspan="1"><span>{htxx_dict.get('合同签署时间')}</span></td>
</tr>
<tr>
<th colspan="1"><span>合同金额</span></th>
<td colspan="1"><span>{htxx_dict.get('合同金额')}</span></td>
<th colspan="1"><span>其它形式合同报价</span></th>
<td colspan="1"><span>{htxx_dict.get('其它形式合同报价')}</span></td>
</tr>
</tbody>
</table>
'''

    @staticmethod
    def _render_bidder_table(kb_info):
        """Render ``tradingNoticeTableColumnModel`` as a numbered HTML table.

        Header cells come from ``columnHeaderList``; each ``dataList`` entry
        becomes one row prefixed with its 1-based sequence number.
        """
        title_info = kb_info.get('columnHeaderList')
        nr_info = kb_info.get('dataList')

        tt_html = "".join(
            f'''
<th>{hd.get('name')}</th>
'''
            for hd in title_info
        )

        nr_html = "".join(
            f'''
<tr>
<td style="width: 60px;">
<div>{idx}</div>
</td>
<td>
<div><span>{nr.get('bidderName')}</span></div>
</td>
<td>
<div><span>{nr.get('bidderOrgCode')}</span></div>
</td>
<td>
<div><span>{nr.get('bidManager')}</span></div>
</td>
<td>
<div><span>{nr.get('isCommitMargin')}</span></div>
</td>
<td>
<div><span>{nr.get('checkinTime')}</span></div>
</td>
</tr>
'''
            for idx, nr in enumerate(nr_info, start=1)
        )

        return f'''
<table>
<thead>
<tr>
<th style="width: 60px;">序号</th>
{tt_html}
</tr>
</thead>
<tbody>
{nr_html}
</tbody>
</table>
'''

    def _render_listed_files(self, f_list, href, request, attachments):
        """Render the "相关附件" list and download each listed file.

        Downloaded files are added to ``attachments`` (mutated in place)
        under the next 1-based string key. Returns the HTML for the list.
        """
        # The path segment after "new/jygg/" in the detail URL is reused in
        # every download URL, so it is extracted once rather than per file.
        version = "".join(re.findall(r'new/jygg/(.*?)/', href))

        item_parts = []
        for index, f in enumerate(f_list, start=1):
            f_id = f.get('rowGuid')
            f_url = f"https://ygp.gdzwfw.gov.cn/ggzy-portal/base/sys-file/download/{version}/{f_id}"
            f_name = f.get('fileName').strip()
            item_parts.append(f'''
<li>
<span>附件名称 {index}</span>
<div>
<div>
<a href="{f_url}">{f_name}</a>
</div>
</div>
</li>
''')
            f_type = extract_file_type(f_name, f_url)
            if f_type:
                attachment = self._downloader.fetch_attachment(
                    file_name=f_name,
                    file_type=f_type,
                    download_url=f_url,
                    proxies=request.get_proxies()
                )
                attachments[str(len(attachments) + 1)] = attachment

        ff_html = "".join(item_parts)
        return f'''
<div class="fileList">
<h2 id="相关附件" class="subtitle">相关附件</h2>
<ul>
{ff_html}
</ul>
</div>
'''
if __name__ == "__main__":
    # Script entry point: build the spider for its task key and run it.
    spider = Spider(redis_key="lzz:gdsggzyjypt_gcjs")
    spider.start()
|