# 政府采购-省级平台-兰州新区-详情页.py
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2025-04-19
  4. ---------
  5. @summary: 甘肃省公共资源交易网
  6. ---------
  7. @author: lzz
  8. """
  9. import time
  10. import feapder
  11. from feapder.utils.tools import log
  12. from items.spider_item import DataBakItem
  13. from untils.attachment import AttachmentDownloader
  14. import requests
  15. from untils.tools import text_search,extract_file_type
  16. from feapder.network.selector import Selector
  17. import re,random
  18. headers = {
  19. "Accept": "*/*",
  20. "Accept-Language": "zh-CN,zh;q=0.9",
  21. "Cache-Control": "no-cache",
  22. "Connection": "keep-alive",
  23. "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
  24. "Origin": "http://47.110.59.239:9207",
  25. "Pragma": "no-cache",
  26. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
  27. "X-Requested-With": "XMLHttpRequest"
  28. }
  29. def get_bdxx(hid,ptp):
  30. url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowBidpackage"
  31. data = {
  32. "tenderprojectid": hid,
  33. "bidpackages": "",
  34. "projectType": ptp
  35. }
  36. response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
  37. data_info = Selector(response.text).xpath('//div[@class="sAblock"]').extract_first()
  38. return data_info
  39. def get_ggxx(hid,area):
  40. url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
  41. data = {
  42. "bidpackages": "",
  43. "tenderprojectid": hid,
  44. "index": "1",
  45. "area": area
  46. }
  47. response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
  48. root = Selector(response.text)
  49. data_info = root.xpath('//div[@class="jxGonggaoInformationDetail "]').extract_first()
  50. if data_info == None:
  51. return "",""
  52. file_list = root.xpath('//div[@class="jxGonggaoInformationDetail "][1]//a')
  53. if text_search(data_info).total < 10:
  54. data_info = "详情请访问原网页!"
  55. return data_info,file_list
  56. def get_kpbxx(hid,area):
  57. url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
  58. data = {
  59. "bidpackages": "",
  60. "tenderprojectid": hid,
  61. "index": "4",
  62. "area": area
  63. }
  64. response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
  65. root = Selector(response.text)
  66. data_info = root.xpath('//div[@class="xTouBiaoTable"]').extract_first()
  67. if data_info == None:
  68. return "",""
  69. file_list = root.xpath('//div[@class="xTouBiaoTable"]//a')
  70. if text_search(data_info).total < 10:
  71. data_info = "详情请访问原网页!"
  72. return data_info,file_list
  73. def get_zbjg(hid,area):
  74. url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
  75. data = {
  76. "bidpackages": "",
  77. "tenderprojectid": hid,
  78. "index": "5",
  79. "area": area
  80. }
  81. response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
  82. root = Selector(response.text)
  83. data_info = root.xpath('//div[@class="jxTradingPublicDetail"]').extract_first()
  84. if data_info == None:
  85. return "",""
  86. file_list = root.xpath('//div[@class="jxTradingPublicDetail"]//a')
  87. if text_search(data_info).total < 10:
  88. data_info = "详情请访问原网页!"
  89. return data_info,file_list
  90. class FirefoxDetails(feapder.BiddingDetailSpider):
  91. def start_requests(self):
  92. data_list = self.get_tasks_by_rabbitmq(limit=10)
  93. for item in data_list:
  94. # log.debug(item)
  95. request_params = item.get("request_params")
  96. yield feapder.Request(url=item.get("parse_url"),item=item,files_info=item.get("files"),
  97. deal_detail=item.get("deal_detail"),**request_params,
  98. callback=eval(item.get("parse")),proxies=False)
  99. def detail_get(self,request,response):
  100. hid = request.data.get('tenderprojectid')
  101. area = request.data.get('area')
  102. items = request.item
  103. list_item = DataBakItem(**items)
  104. list_item.title += "_项目信息"
  105. list_item.href += f"?t={int(time.time())}"
  106. html1 = response.xpath('//div[@class="jxTenderObjMain"]').extract_first("")
  107. ptp = "".join(re.findall('projectType: "(.*?)"',response.text))
  108. html2 = get_bdxx(hid,ptp)
  109. list_item.contenthtml = html1 + html2
  110. yield list_item
  111. items = request.item
  112. list_item = DataBakItem(**items)
  113. list_item.title += "_公告信息"
  114. list_item.href += f"?t={int(time.time())}"
  115. html,file_list = get_ggxx(hid,area)
  116. if html:
  117. list_item.contenthtml = html
  118. attachments = {}
  119. if file_list:
  120. for f1 in file_list:
  121. file_url = f1.xpath('./@href').extract_first("")
  122. file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
  123. file_type = extract_file_type(file_name=file_name,file_url=file_url,file_type_list=['html'])
  124. if file_type and file_url:
  125. attachment = AttachmentDownloader().fetch_attachment(
  126. file_name=file_name, file_type=file_type, download_url=file_url)
  127. attachments[str(len(attachments) + 1)] = attachment
  128. if attachments:
  129. list_item.projectinfo = {"attachments": attachments}
  130. yield list_item
  131. items = request.item
  132. list_item = DataBakItem(**items)
  133. list_item.title += "_开评标信息"
  134. list_item.href += f"?t={int(time.time())}"
  135. html, file_list = get_kpbxx(hid,area)
  136. if html:
  137. list_item.contenthtml = html
  138. attachments = {}
  139. if file_list:
  140. for f1 in file_list:
  141. file_url = f1.xpath('./@href').extract_first("")
  142. file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
  143. file_type = extract_file_type(file_name=file_name, file_url=file_url,file_type_list=['html'])
  144. if file_type and file_url:
  145. attachment = AttachmentDownloader().fetch_attachment(
  146. file_name=file_name, file_type=file_type, download_url=file_url)
  147. attachments[str(len(attachments) + 1)] = attachment
  148. if attachments:
  149. list_item.projectinfo = {"attachments": attachments}
  150. yield list_item
  151. items = request.item
  152. list_item = DataBakItem(**items)
  153. list_item.title += "_中标结果信息"
  154. list_item.href += f"?t={int(time.time())}"
  155. html, file_list = get_zbjg(hid,area)
  156. if html:
  157. list_item.contenthtml = html
  158. attachments = {}
  159. if file_list:
  160. for f1 in file_list:
  161. file_url = f1.xpath('./@href').extract_first("")
  162. file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
  163. file_type = extract_file_type(file_name=file_name, file_url=file_url,file_type_list=['html'])
  164. if file_type and file_url:
  165. attachment = AttachmentDownloader().fetch_attachment(
  166. file_name=file_name, file_type=file_type, download_url=file_url)
  167. attachments[str(len(attachments) + 1)] = attachment
  168. if attachments:
  169. list_item.projectinfo = {"attachments": attachments}
  170. yield list_item
  171. time.sleep(random.randint(3, 6))
  172. if __name__ == "__main__":
  173. FirefoxDetails(redis_key="lzz:gssggzyjyw_zfcg_lzxq").start()