排污权-详情页.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2024-01-04
  4. ---------
  5. @summary: 广东省公共资源交易平台
  6. ---------
  7. @author: lzz
  8. """
  9. import re
  10. import feapder
  11. from feapder.network.selector import Selector
  12. from items.spider_item import DataBakItem
  13. from untils.attachment import AttachmentDownloader
  14. from untils.tools import extract_file_type
  15. from gd_utils import *
  16. class Details(feapder.BiddingDetailSpider):
  17. def start_requests(self):
  18. data_list = self.get_tasks_by_rabbitmq(limit=20)
  19. for item in data_list:
  20. request_params = item.get("request_params")
  21. yield feapder.Request(url=item.get("parse_url"), item=item,proxies=False,
  22. deal_detail=item.get("deal_detail"), **request_params,
  23. callback='parse')
  24. def download_midware(self, request):
  25. en_str = get_enstr(request.params)
  26. request.proxies = get_proxy(socks5h=True)
  27. request.headers = {
  28. "Accept": "application/json, text/plain, */*",
  29. "Accept-Language": "zh-CN,zh;q=0.9",
  30. "Cache-Control": "no-cache",
  31. "Connection": "keep-alive",
  32. "Pragma": "no-cache",
  33. "Referer": "https://ygp.gdzwfw.gov.cn/ggzy-portal/",
  34. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
  35. "X-Dgi-Req-App": en_str.get('X-Dgi-Req-App'),
  36. "X-Dgi-Req-Nonce": en_str.get('X-Dgi-Req-Nonce'),
  37. "X-Dgi-Req-Signature": en_str.get('X-Dgi-Req-Signature'),
  38. "X-Dgi-Req-Timestamp": en_str.get('X-Dgi-Req-Timestamp'),
  39. }
  40. def parse(self, request, response):
  41. attachments = {}
  42. items = request.item
  43. list_item = DataBakItem(**items)
  44. detail_info = response.json.get('data').get('tradingNoticeColumnModelList')
  45. ggxx_info = detail_info[0].get('multiKeyValueTableList')[0]
  46. tphtml = ""
  47. if ggxx_info:
  48. for gd in ggxx_info:
  49. temps = f'''
  50. <tr>
  51. <th colspan="1"><span>{gd.get('key')}</span></th>
  52. <td colspan="3"><span>{gd.get('value', '无')}</span>
  53. </td>
  54. </tr>
  55. '''
  56. tphtml += temps
  57. ggxx_html = f'''
  58. <section>
  59. <h2 id="公告信息" class="subtitle">公告信息</h2>
  60. <div class="mt-2">
  61. <div>
  62. <div>
  63. <table>
  64. <tbody>
  65. {tphtml}
  66. </tbody>
  67. </table>
  68. </div>
  69. </div>
  70. </div>
  71. </section>
  72. '''
  73. ggnr_html = detail_info[1].get('richtext') or ""
  74. try:
  75. f_list = detail_info[-1].get('noticeFileBOList')
  76. except:
  77. f_list = None
  78. if f_list:
  79. ff_html = ""
  80. index = 1
  81. for f in f_list:
  82. f_id = f.get('rowGuid')
  83. version = "".join(re.findall('new/jygg/(.*?)/',list_item.href))
  84. f_url = f"https://ygp.gdzwfw.gov.cn/ggzy-portal/base/sys-file/download/{version}/{f_id}"
  85. f_name = f.get('fileName').strip()
  86. temp = f'''
  87. <li>
  88. <span>附件名称 {index}</span>
  89. <div>
  90. <div>
  91. <a href="{f_url}">{f_name}</a>
  92. </div>
  93. </div>
  94. </li>
  95. '''
  96. index += 1
  97. ff_html += temp
  98. f_type = extract_file_type(f_name, f_url)
  99. if f_type:
  100. attachment = AttachmentDownloader().fetch_attachment(
  101. file_name=f_name, file_type=f_type, download_url=f_url,
  102. proxies=request.get_proxies())
  103. attachments[str(len(attachments) + 1)] = attachment
  104. file_html = f'''
  105. <div class="fileList">
  106. <h2 id="相关附件" class="subtitle">相关附件</h2>
  107. <ul>
  108. {ff_html}
  109. </ul>
  110. </div>
  111. '''
  112. else:
  113. file_html = ""
  114. list_item.contenthtml = ggxx_html + ggnr_html + file_html
  115. iframe_url = Selector(ggnr_html).xpath('//iframe/@src').extract_first()
  116. fm_type = extract_file_type('公告内容', iframe_url)
  117. if fm_type:
  118. attachmentf = AttachmentDownloader().fetch_attachment(
  119. file_name='公告内容', file_type=fm_type, download_url=iframe_url,
  120. proxies=request.get_proxies())
  121. attachments[str(len(attachments) + 1)] = attachmentf
  122. file_list = Selector(ggnr_html).xpath('//a[@href]')
  123. if file_list:
  124. for info in file_list:
  125. file_name = "".join(info.xpath('.//text()').extract()).strip()
  126. file_url = info.xpath('./@href').extract_first()
  127. file_type = extract_file_type(file_name,file_url)
  128. if file_type:
  129. attachment = AttachmentDownloader().fetch_attachment(
  130. file_name=file_name, file_type=file_type, download_url=file_url,
  131. proxies=request.get_proxies())
  132. attachments[str(len(attachments) + 1)] = attachment
  133. if attachments:
  134. list_item.projectinfo = {"attachments": attachments}
  135. yield list_item
  136. if __name__ == '__main__':
  137. Details(redis_key="lzz:gdsggzyjypt_pwq_jygg").start()