工程建设-详情页.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2024-01-04
  4. ---------
  5. @summary: 广东省公共资源交易平台
  6. ---------
  7. @author: lzz
  8. """
  9. import re
  10. import feapder
  11. from feapder.network.selector import Selector
  12. from items.spider_item import DataBakItem
  13. from untils.attachment import AttachmentDownloader
  14. from untils.tools import extract_file_type
  15. from gd_utils import *
  16. class Spider(feapder.BiddingDetailSpider):
  17. def start_callback(self):
  18. self._downloader = AttachmentDownloader()
  19. def start_requests(self):
  20. data_list = self.get_tasks_by_rabbitmq(limit=30)
  21. for item in data_list:
  22. request_params = item.get("request_params")
  23. yield feapder.Request(url=item.get("parse_url"),
  24. item=item,
  25. proxies=False,
  26. deal_detail=item.get("deal_detail"),
  27. **request_params)
  28. def download_midware(self, request):
  29. en_str = get_enstr(request.params)
  30. request.proxies = get_proxy(socks5h=True)
  31. request.headers = {
  32. "Accept": "application/json, text/plain, */*",
  33. "Accept-Language": "zh-CN,zh;q=0.9",
  34. "Cache-Control": "no-cache",
  35. "Connection": "keep-alive",
  36. "Pragma": "no-cache",
  37. "Referer": "https://ygp.gdzwfw.gov.cn/ggzy-portal/",
  38. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
  39. "X-Dgi-Req-App": en_str.get('X-Dgi-Req-App'),
  40. "X-Dgi-Req-Nonce": en_str.get('X-Dgi-Req-Nonce'),
  41. "X-Dgi-Req-Signature": en_str.get('X-Dgi-Req-Signature'),
  42. "X-Dgi-Req-Timestamp": en_str.get('X-Dgi-Req-Timestamp'),
  43. }
  44. def parse(self, request, response):
  45. attachments = {}
  46. items = request.item
  47. data_item = DataBakItem(**items)
  48. detail_info = response.json.get('data').get('tradingNoticeColumnModelList')
  49. ggxx_info = detail_info[0].get('multiKeyValueTableList')[0]
  50. tphtml = ""
  51. if ggxx_info:
  52. for gd in ggxx_info:
  53. temps = f'''
  54. <tr>
  55. <th colspan="1"><span>{gd.get('key')}</span></th>
  56. <td colspan="3"><span>{gd.get('value', '无')}</span>
  57. </td>
  58. </tr>
  59. '''
  60. tphtml += temps
  61. ggxx_html = f'''
  62. <section>
  63. <h2 id="公告信息" class="subtitle">公告信息</h2>
  64. <div class="mt-2">
  65. <div>
  66. <div>
  67. <table>
  68. <tbody>
  69. {tphtml}
  70. </tbody>
  71. </table>
  72. </div>
  73. </div>
  74. </div>
  75. </section>
  76. '''
  77. ggnr_html = detail_info[1].get('richtext') or ""
  78. if not ggnr_html:
  79. try:
  80. ggnr_html = detail_info[2].get('richtext') or ""
  81. htxx_info = detail_info[1].get('multiKeyValueTableList')[0]
  82. htxx_dict = {}
  83. for hd in htxx_info:
  84. htxx_dict[hd.get('key')] = hd.get('value')
  85. htxx_html = f'''
  86. <h2 id="合同信息" class="subtitle">合同信息</h2>
  87. <table>
  88. <tbody>
  89. <tr>
  90. <th colspan="1"><span>合同名称</span></th>
  91. <td colspan="3"><span>{htxx_dict.get('合同名称')}</span></td>
  92. </tr>
  93. <tr>
  94. <th colspan="1"><span>招标人名称</span></th>
  95. <td colspan="1"><span>{htxx_dict.get('招标人名称')}</span></td>
  96. <th colspan="1"><span>中标人名称</span></th>
  97. <td colspan="1"><span>{htxx_dict.get('中标人名称')}</span></td>
  98. </tr>
  99. <tr>
  100. <th colspan="1"><span>合同期限</span></th>
  101. <td colspan="1"><span>{htxx_dict.get('合同期限')}</span></td>
  102. <th colspan="1"><span>合同签署时间</span></th>
  103. <td colspan="1"><span>{htxx_dict.get('合同签署时间')}</span></td>
  104. </tr>
  105. <tr>
  106. <th colspan="1"><span>合同金额</span></th>
  107. <td colspan="1"><span>{htxx_dict.get('合同金额')}</span></td>
  108. <th colspan="1"><span>其它形式合同报价</span></th>
  109. <td colspan="1"><span>{htxx_dict.get('其它形式合同报价')}</span></td>
  110. </tr>
  111. </tbody>
  112. </table>
  113. '''
  114. except:
  115. ggnr_html = ""
  116. kb_info = detail_info[1].get('tradingNoticeTableColumnModel')
  117. title_info = kb_info.get('columnHeaderList')
  118. nr_info = kb_info.get('dataList')
  119. tt_html = ""
  120. for hd in title_info:
  121. tmp1 = f'''
  122. <th>{hd.get('name')}</th>
  123. '''
  124. tt_html += tmp1
  125. nr_html = ""
  126. idx = 1
  127. for nr in nr_info:
  128. tmp2 = f'''
  129. <tr>
  130. <td style="width: 60px;">
  131. <div>{idx}</div>
  132. </td>
  133. <td>
  134. <div><span>{nr.get('bidderName')}</span></div>
  135. </td>
  136. <td>
  137. <div><span>{nr.get('bidderOrgCode')}</span></div>
  138. </td>
  139. <td>
  140. <div><span>{nr.get('bidManager')}</span></div>
  141. </td>
  142. <td>
  143. <div><span>{nr.get('isCommitMargin')}</span></div>
  144. </td>
  145. <td>
  146. <div><span>{nr.get('checkinTime')}</span></div>
  147. </td>
  148. </tr>
  149. '''
  150. nr_html += tmp2
  151. idx += 1
  152. htxx_html = f'''
  153. <table>
  154. <thead>
  155. <tr>
  156. <th style="width: 60px;">序号</th>
  157. {tt_html}
  158. </tr>
  159. </thead>
  160. <tbody>
  161. {nr_html}
  162. </tbody>
  163. </table>
  164. '''
  165. else:
  166. htxx_html = ""
  167. try:
  168. f_list = detail_info[-1].get('noticeFileBOList')
  169. except:
  170. f_list = None
  171. if f_list:
  172. ff_html = ""
  173. index = 1
  174. for f in f_list:
  175. f_id = f.get('rowGuid')
  176. version = "".join(re.findall('new/jygg/(.*?)/', data_item.href))
  177. f_url = f"https://ygp.gdzwfw.gov.cn/ggzy-portal/base/sys-file/download/{version}/{f_id}"
  178. f_name = f.get('fileName').strip()
  179. temp = f'''
  180. <li>
  181. <span>附件名称 {index}</span>
  182. <div>
  183. <div>
  184. <a href="{f_url}">{f_name}</a>
  185. </div>
  186. </div>
  187. </li>
  188. '''
  189. index += 1
  190. ff_html += temp
  191. f_type = extract_file_type(f_name, f_url)
  192. if f_type:
  193. attachment = self._downloader.fetch_attachment(
  194. file_name=f_name,
  195. file_type=f_type,
  196. download_url=f_url,
  197. proxies=request.get_proxies()
  198. )
  199. attachments[str(len(attachments) + 1)] = attachment
  200. file_html = f'''
  201. <div class="fileList">
  202. <h2 id="相关附件" class="subtitle">相关附件</h2>
  203. <ul>
  204. {ff_html}
  205. </ul>
  206. </div>
  207. '''
  208. else:
  209. file_html = ""
  210. data_item.contenthtml = ggxx_html + htxx_html + ggnr_html + file_html
  211. file_list = Selector(ggnr_html).xpath('//a[@href]')
  212. if file_list:
  213. for info in file_list:
  214. file_name = "".join(info.xpath('.//text()').extract()).strip()
  215. file_url = info.xpath('./@href').extract_first()
  216. file_type = extract_file_type(file_name, file_url)
  217. if file_type:
  218. attachment = self._downloader.fetch_attachment(
  219. file_name=file_name,
  220. file_type=file_type,
  221. download_url=file_url
  222. )
  223. attachments[str(len(attachments) + 1)] = attachment
  224. if len(attachments) > 0:
  225. data_item.projectinfo = {"attachments": attachments}
  226. yield data_item
  227. if __name__ == '__main__':
  228. Spider(redis_key="lzz:gdsggzyjypt_gcjs").start()