政府采购限额以下-详情页.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2025-04-19
  4. ---------
  5. @summary: 甘肃省公共资源交易网
  6. ---------
  7. @author: lzz
  8. """
  9. import re
  10. import time
  11. import feapder
  12. import requests
  13. from feapder.network.selector import Selector
  14. from items.spider_item import DataBakItem
  15. from untils.attachment import AttachmentDownloader
  16. from untils.tools import text_search, extract_file_type
# Shared HTTP headers for every AJAX endpoint of the Gansu public resource
# trading site; the server expects form-encoded XHR POSTs from this origin.
headers = {
    "accept": "text/html, */*; q=0.01",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "no-cache",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "origin": "https://ygjy.ggzyjy.gansu.gov.cn:3045",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "x-requested-with": "XMLHttpRequest"
}
  28. def get_cggg(annoId,projectId,proxies=False):
  29. url = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getAnnoDetail"
  30. data = {
  31. "annoId": annoId,
  32. "projectId": projectId
  33. }
  34. response = requests.post(url, headers=headers, timeout=30, proxies=proxies, data=data, verify=False)
  35. data_info = Selector(response.text).xpath('//div[@class="yAnnounceLayer"]').extract_first()
  36. return data_info
  37. def get_jjjg(projectId,proxies=False):
  38. url = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getBidResult"
  39. data = {
  40. "projectId": projectId
  41. }
  42. response = requests.post(url, headers=headers, timeout=30, proxies=proxies, data=data, verify=False)
  43. data_info = Selector(response.text).xpath('//div[@class="yThingsWrap"]').extract_first()
  44. return data_info
  45. def get_cjgs(projectId,proxies=False):
  46. url = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getPublicityDetail"
  47. data = {
  48. "projectId": projectId,
  49. "publicityId": "0"
  50. }
  51. response = requests.post(url, headers=headers, timeout=30, proxies=proxies, data=data, verify=False)
  52. data_info = Selector(response.text).xpath('//div[@class="yDealMain"]').extract_first()
  53. return data_info
  54. def get_ht(projectId,proxies=False):
  55. url = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getContractList"
  56. data = {
  57. "projectId": projectId
  58. }
  59. response = requests.post(url, headers=headers, timeout=30, proxies=proxies, data=data, verify=False)
  60. data_info = Selector(response.text).xpath('//div[@class="yThingsWrap"]').extract_first()
  61. return data_info
  62. class Spider(feapder.BiddingDetailSpider):
  63. def start_requests(self):
  64. data_list = self.get_tasks(limit=50)
  65. for item in data_list:
  66. request_params = item.get("request_params")
  67. yield feapder.Request(url=item.get("parse_url"),
  68. callback=eval(item.get("parse")),
  69. item=item,
  70. deal_detail=item.get("deal_detail"),
  71. files_info=item.get("files"),
  72. **request_params)
  73. def detail_get(self, request, response):
  74. yield_list = []
  75. base_id = re.findall('data:\{annoId:(.*?)}',response.text)[0]
  76. annoId = "".join(re.findall("'(.*?)',",base_id))
  77. projectId = "".join(re.findall("projectId:'(.*?)'",base_id))
  78. items = request.item
  79. list_item1 = DataBakItem(**items)
  80. list_item1.title += "_采购公告"
  81. list_item1.href += f"?t={int(time.time())}"
  82. html = get_cggg(annoId,projectId,request.get_proxies())
  83. list_item1.contenthtml = html
  84. s_title = Selector(html).xpath('//h4[@class="yAnnounceName"]/text()').extract_first("").strip()
  85. if s_title and s_title != list_item1.title:
  86. list_item1.s_title = s_title
  87. file_list = Selector(html).xpath('//a')
  88. attachments = {}
  89. if file_list:
  90. for f1 in file_list:
  91. file_url = f1.xpath('./@href').extract_first("")
  92. file_name = f1.xpath('./text()').extract_first("").strip() or list_item1.title
  93. file_type = extract_file_type(file_name=file_name, file_url=file_url, file_type_list=['html'])
  94. if file_type and file_url:
  95. attachment = AttachmentDownloader().fetch_attachment(
  96. file_name=file_name, file_type=file_type, download_url=file_url)
  97. attachments[str(len(attachments) + 1)] = attachment
  98. if attachments:
  99. list_item1.projectinfo = {"attachments": attachments}
  100. yield_list.append(list_item1)
  101. items = request.item
  102. list_item2 = DataBakItem(**items)
  103. list_item2.title += "_竞价结果"
  104. list_item2.href += f"?t={int(time.time())}"
  105. html = get_jjjg(projectId,request.get_proxies())
  106. if html or text_search(html).total > 10:
  107. list_item2.contenthtml = html
  108. yield_list.append(list_item2)
  109. items = request.item
  110. list_item3 = DataBakItem(**items)
  111. list_item3.title += "_成交公示"
  112. list_item3.href += f"?t={int(time.time())}"
  113. html = get_cjgs(projectId,request.get_proxies())
  114. s_title = Selector(html).xpath('//h6[@class="yDealMainTitle"]/text()').extract_first("").strip()
  115. if s_title and s_title != list_item3.title:
  116. list_item3.s_title = s_title
  117. if html and text_search(html).total > 10:
  118. list_item3.contenthtml = html
  119. yield_list.append(list_item3)
  120. items = request.item
  121. list_item4 = DataBakItem(**items)
  122. list_item4.title += "_合同"
  123. list_item4.href += f"?t={int(time.time())}"
  124. html = get_ht(projectId,request.get_proxies())
  125. if html and text_search(html).total > 10 and "qyPrintContract" not in html:
  126. list_item4.contenthtml = html
  127. yield_list.append(list_item4)
  128. for yd in yield_list:
  129. yield yd
if __name__ == "__main__":
    # Entry point: the redis key namespaces this spider's task queue.
    Spider(redis_key="lzz:gssggzyjyw_xeyxgcjsqb").start()