# 工程建设-详情页.py
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2025-04-19
  4. ---------
  5. @summary: 甘肃省公共资源交易网
  6. ---------
  7. @author: lzz
  8. """
  9. import time
  10. import feapder
  11. from feapder.utils.tools import log
  12. from items.spider_item import DataBakItem
  13. from untils.attachment import AttachmentDownloader
  14. import requests
  15. from untils.tools import text_search,extract_file_type
  16. from feapder.network.selector import Selector
  17. import re,random
# Shared HTTP headers for every POST to the Gansu public-resource trading
# backend (47.110.59.239:9207). Mimics Chrome submitting an AJAX form
# (X-Requested-With + urlencoded Content-Type) so the server returns the
# HTML fragments the detail parsers expect.
headers = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "http://47.110.59.239:9207",
    "Pragma": "no-cache",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}
  29. def get_bdxx(hid,ptp):
  30. url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowBidpackage"
  31. data = {
  32. "tenderprojectid": hid,
  33. "bidpackages": "",
  34. "projectType": ptp
  35. }
  36. response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
  37. data_info = Selector(response.text).xpath('//div[@class="sAblock"]').extract_first()
  38. return data_info
  39. def get_ggxx(hid,area):
  40. url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
  41. data = {
  42. "bidpackages": "",
  43. "tenderprojectid": hid,
  44. "index": "1",
  45. "area": area
  46. }
  47. response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
  48. root = Selector(response.text)
  49. data_info = root.xpath('//div[@class="jxGonggaoInformationDetail "]').extract_first()
  50. if data_info == None:
  51. return "",""
  52. file_list = root.xpath('//div[@class="jxGonggaoInformationDetail "][1]//a')
  53. if text_search(data_info).total < 10:
  54. data_info = "详情请访问原网页!"
  55. return data_info,file_list
  56. def get_kpbxx(hid,area):
  57. url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
  58. data = {
  59. "bidpackages": "",
  60. "tenderprojectid": hid,
  61. "index": "4",
  62. "area": area
  63. }
  64. response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
  65. root = Selector(response.text)
  66. data_info = root.xpath('//div[@class="xTouBiaoTable"]').extract_first()
  67. if data_info == None:
  68. return "",""
  69. file_list = root.xpath('//div[@class="xTouBiaoTable"]//a')
  70. if text_search(data_info).total < 10:
  71. data_info = "详情请访问原网页!"
  72. return data_info,file_list
  73. def get_zbjg(hid,area):
  74. url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
  75. data = {
  76. "bidpackages": "",
  77. "tenderprojectid": hid,
  78. "index": "5",
  79. "area": area
  80. }
  81. response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
  82. root = Selector(response.text)
  83. data_info = root.xpath('//div[@class="jxTradingPublicDetail"]').extract_first()
  84. if data_info == None:
  85. return "",""
  86. file_list = root.xpath('//div[@class="jxTradingPublicDetail"]//a')
  87. if text_search(data_info).total < 10:
  88. data_info = "详情请访问原网页!"
  89. return data_info,file_list
  90. class FirefoxDetails(feapder.BiddingDetailSpider):
  91. def start_requests(self):
  92. data_list = self.get_tasks_by_rabbitmq(limit=10)
  93. for item in data_list:
  94. # log.debug(item)
  95. request_params = item.get("request_params")
  96. yield feapder.Request(url=item.get("parse_url"),item=item,files_info=item.get("files"),
  97. deal_detail=item.get("deal_detail"),**request_params,
  98. callback=eval(item.get("parse")),proxies=False)
  99. def detail_get(self,request,response):
  100. yield_list = []
  101. hid = request.data.get('tenderprojectid')
  102. area = request.data.get('area')
  103. items = request.item
  104. list_item = DataBakItem(**items)
  105. list_item.title += "_项目信息"
  106. list_item.href += f"?t={int(time.time())}"
  107. html1 = response.xpath('//div[@class="jxTenderObjMain"]').extract_first("")
  108. ptp = "".join(re.findall('projectType: "(.*?)"',response.text))
  109. html2 = get_bdxx(hid,ptp)
  110. list_item.contenthtml = html1 + html2
  111. yield_list.append(list_item)
  112. items = request.item
  113. list_item = DataBakItem(**items)
  114. list_item.title += "_公告信息"
  115. list_item.href += f"?t={int(time.time())}"
  116. html,file_list = get_ggxx(hid,area)
  117. if html:
  118. list_item.contenthtml = html
  119. attachments = {}
  120. if file_list:
  121. for f1 in file_list:
  122. file_url = f1.xpath('./@href').extract_first("")
  123. file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
  124. file_type = extract_file_type(file_name=file_name,file_url=file_url,file_type_list=['html'])
  125. if file_type and file_url:
  126. attachment = AttachmentDownloader().fetch_attachment(
  127. file_name=file_name, file_type=file_type, download_url=file_url)
  128. attachments[str(len(attachments) + 1)] = attachment
  129. if attachments:
  130. list_item.projectinfo = {"attachments": attachments}
  131. yield_list.append(list_item)
  132. items = request.item
  133. list_item = DataBakItem(**items)
  134. list_item.title += "_开评标信息"
  135. list_item.href += f"?t={int(time.time())}"
  136. html, file_list = get_kpbxx(hid,area)
  137. if html:
  138. list_item.contenthtml = html
  139. attachments = {}
  140. if file_list:
  141. for f1 in file_list:
  142. file_url = f1.xpath('./@href').extract_first("")
  143. file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
  144. file_type = extract_file_type(file_name=file_name, file_url=file_url,file_type_list=['html'])
  145. if file_type and file_url:
  146. attachment = AttachmentDownloader().fetch_attachment(
  147. file_name=file_name, file_type=file_type, download_url=file_url)
  148. attachments[str(len(attachments) + 1)] = attachment
  149. if attachments:
  150. list_item.projectinfo = {"attachments": attachments}
  151. yield_list.append(list_item)
  152. items = request.item
  153. list_item = DataBakItem(**items)
  154. list_item.title += "_中标结果信息"
  155. list_item.href += f"?t={int(time.time())}"
  156. html, file_list = get_zbjg(hid,area)
  157. if html:
  158. list_item.contenthtml = html
  159. attachments = {}
  160. if file_list:
  161. for f1 in file_list:
  162. file_url = f1.xpath('./@href').extract_first("")
  163. file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
  164. file_type = extract_file_type(file_name=file_name, file_url=file_url,file_type_list=['html'])
  165. if file_type and file_url:
  166. attachment = AttachmentDownloader().fetch_attachment(
  167. file_name=file_name, file_type=file_type, download_url=file_url)
  168. attachments[str(len(attachments) + 1)] = attachment
  169. if attachments:
  170. list_item.projectinfo = {"attachments": attachments}
  171. yield_list.append(list_item)
  172. for yl in yield_list:
  173. yield yl
  174. time.sleep(random.randint(3, 6))
if __name__ == "__main__":
    # Entry point: the redis_key identifies this crawl's task/state namespace.
    FirefoxDetails(redis_key="lzz:gssggzyjyw_zfcg_lzxq").start()