土地矿业-详情页.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
# -*- coding: utf-8 -*-
"""
Created on 2025-05-22
---------
@summary: Detail-page spider for the Zhaoqing Public Resource Trading Platform (肇庆市公共资源交易平台)
---------
@author: lzz
"""
  9. import re
  10. import feapder
  11. from feapder.network.selector import Selector
  12. from items.spider_item import DataBakItem
  13. from untils.attachment import AttachmentDownloader
  14. from untils.tools import extract_file_type
  15. from gdzq_utils import *
  16. class Details(feapder.BiddingDetailSpider):
  17. def start_requests(self):
  18. data_list = self.get_tasks_by_rabbitmq(limit=20)
  19. for item in data_list:
  20. request_params = item.get("request_params")
  21. yield feapder.Request(url=item.get("parse_url"), item=item, proxies=False,
  22. deal_detail=item.get("deal_detail"), **request_params,
  23. callback='parse')
  24. def download_midware(self, request):
  25. en_str = get_enstr(request.params)
  26. request.proxies = get_proxy(socks5h=True)
  27. request.headers = {
  28. "Accept": "application/json, text/plain, */*",
  29. "Accept-Language": "zh-CN,zh;q=0.9",
  30. "Cache-Control": "no-cache",
  31. "Connection": "keep-alive",
  32. "Pragma": "no-cache",
  33. "Referer": "https://ygp.gdzwfw.gov.cn/",
  34. "Sec-Fetch-Dest": "empty",
  35. "Sec-Fetch-Mode": "cors",
  36. "Sec-Fetch-Site": "same-origin",
  37. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
  38. "X-Dgi-Req-App": en_str.get('X-Dgi-Req-App'),
  39. "X-Dgi-Req-Nonce": en_str.get('X-Dgi-Req-Nonce'),
  40. "X-Dgi-Req-Signature": en_str.get('X-Dgi-Req-Signature'),
  41. "X-Dgi-Req-Timestamp": en_str.get('X-Dgi-Req-Timestamp'),
  42. }
  43. def parse(self, request, response):
  44. attachments = {}
  45. items = request.item
  46. list_item = DataBakItem(**items)
  47. detail_info = response.json.get('data').get('tradingNoticeColumnModelList')
  48. ggxx_info = detail_info[0].get('multiKeyValueTableList')[0]
  49. tphtml = ""
  50. if ggxx_info:
  51. for gd in ggxx_info:
  52. temps = f'''
  53. <tr>
  54. <th colspan="1"><span>{gd.get('key')}</span></th>
  55. <td colspan="3"><span>{gd.get('value', '无')}</span>
  56. </td>
  57. </tr>
  58. '''
  59. tphtml += temps
  60. ggxx_html = f'''
  61. <section>
  62. <h2 id="公告信息" class="subtitle">公告信息</h2>
  63. <div class="mt-2">
  64. <div>
  65. <div>
  66. <table>
  67. <tbody>
  68. {tphtml}
  69. </tbody>
  70. </table>
  71. </div>
  72. </div>
  73. </div>
  74. </section>
  75. '''
  76. ggnr_html = detail_info[1].get('richtext') or ""
  77. if not ggnr_html:
  78. try:
  79. ggnr_html = detail_info[2].get('richtext') or ""
  80. htxx_info = detail_info[1].get('multiKeyValueTableList')[0]
  81. hhh = ""
  82. for hd in htxx_info:
  83. ttp = f'''
  84. <tr>
  85. <th colspan="1"><span>{hd.get('key')}</span></th>
  86. <td colspan="3"><span>{hd.get('value')}</span></td>
  87. </tr>
  88. '''
  89. hhh += ttp
  90. htxx_html = f'''
  91. <h2 id="合同信息" class="subtitle">合同信息</h2>
  92. <table>
  93. <tbody>
  94. {hhh}
  95. </tbody>
  96. </table>
  97. '''
  98. except:
  99. ggnr_html = ""
  100. kb_info = detail_info[1].get('tradingNoticeTableColumnModel')
  101. title_info = kb_info.get('columnHeaderList')
  102. nr_info = kb_info.get('dataList')
  103. tt_html = ""
  104. for hd in title_info:
  105. tmp1 = f'''
  106. <th>{hd.get('name')}</th>
  107. '''
  108. tt_html += tmp1
  109. nr_html = ""
  110. idx = 1
  111. for nr in nr_info:
  112. tmp2 = f'''
  113. <tr>
  114. <td style="width: 60px;">
  115. <div>{idx}</div>
  116. </td>
  117. <td>
  118. <div><span>{nr.get('bidderName')}</span></div>
  119. </td>
  120. <td>
  121. <div><span>{nr.get('bidderOrgCode')}</span></div>
  122. </td>
  123. <td>
  124. <div><span>{nr.get('bidManager')}</span></div>
  125. </td>
  126. <td>
  127. <div><span>{nr.get('isCommitMargin')}</span></div>
  128. </td>
  129. <td>
  130. <div><span>{nr.get('checkinTime')}</span></div>
  131. </td>
  132. </tr>
  133. '''
  134. nr_html += tmp2
  135. idx += 1
  136. htxx_html = f'''
  137. <table>
  138. <thead>
  139. <tr>
  140. <th style="width: 60px;">序号</th>
  141. {tt_html}
  142. </tr>
  143. </thead>
  144. <tbody>
  145. {nr_html}
  146. </tbody>
  147. </table>
  148. '''
  149. else:
  150. htxx_html = ""
  151. try:
  152. f_list = detail_info[-1].get('noticeFileBOList')
  153. except:
  154. f_list = None
  155. if f_list:
  156. ff_html = ""
  157. for index,f in enumerate(f_list):
  158. f_id = f.get('rowGuid')
  159. version = "".join(re.findall('new/jygg/(.*?)/', list_item.href))
  160. f_url = f"https://ygp.gdzwfw.gov.cn/ggzy-portal/base/sys-file/download/{version}/{f_id}"
  161. f_name = f.get('fileName').strip()
  162. temp = f'''
  163. <li>
  164. <span>附件名称 {index+1}</span>
  165. <div>
  166. <div>
  167. <a href="{f_url}">{f_name}</a>
  168. </div>
  169. </div>
  170. </li>
  171. '''
  172. ff_html += temp
  173. f_type = extract_file_type(f_name, f_url,['zbs'])
  174. if f_type:
  175. attachment = AttachmentDownloader().fetch_attachment(
  176. file_name=f_name, file_type=f_type, download_url=f_url,
  177. proxies=request.proxies)
  178. attachments[str(len(attachments) + 1)] = attachment
  179. file_html = f'''
  180. <div class="fileList">
  181. <h2 id="相关附件" class="subtitle">相关附件</h2>
  182. <ul>
  183. {ff_html}
  184. </ul>
  185. </div>
  186. '''
  187. else:
  188. file_html = ""
  189. list_item.contenthtml = ggxx_html + htxx_html + ggnr_html + file_html
  190. iframe_url = Selector(ggnr_html).xpath('//iframe/@src').extract_first()
  191. fm_type = extract_file_type('公告内容', iframe_url)
  192. if fm_type:
  193. attachmentf = AttachmentDownloader().fetch_attachment(
  194. file_name='公告内容', file_type=fm_type, download_url=iframe_url,
  195. proxies=request.proxies)
  196. attachments[str(len(attachments) + 1)] = attachmentf
  197. file_list = Selector(ggnr_html).xpath('//a[@href]')
  198. if file_list:
  199. for info in file_list:
  200. file_name = "".join(info.xpath('.//text()').extract()).strip()
  201. file_url = info.xpath('./@href').extract_first()
  202. file_type = extract_file_type(file_name, file_url,['zbs'])
  203. if file_type:
  204. attachment = AttachmentDownloader().fetch_attachment(
  205. file_name=file_name, file_type=file_type, download_url=file_url)
  206. attachments[str(len(attachments) + 1)] = attachment
  207. fi_list = Selector(ggnr_html).xpath('//img[@src]')
  208. if fi_list:
  209. for idx,info in enumerate(fi_list):
  210. fi_name = str(idx+1)
  211. fi_url = info.xpath('./@src').extract_first()
  212. if "download" in fi_url:
  213. attachment = AttachmentDownloader().fetch_attachment(
  214. file_name=fi_name, file_type='jpg', download_url=fi_url)
  215. attachments[str(len(attachments) + 1)] = attachment
  216. if attachments:
  217. list_item.projectinfo = {"attachments": attachments}
  218. yield list_item
  219. if __name__ == '__main__':
  220. Details(redis_key="lzz:zqsggzyjypt_gcjs").start()