市县采购公告-列表页.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2025-04-22
  4. ---------
  5. @summary: 宁夏政府采购公共服务平台
  6. ---------
  7. @author: lzz
  8. """
  9. import json
  10. import re
  11. from collections import namedtuple
  12. import feapder
  13. import requests
  14. from items.spider_item import BidingListItem
  15. from untils.get_imgcode import get_code
  16. def get_ck(proxies=False):
  17. session = requests.session()
  18. session.proxies = proxies
  19. headers = {
  20. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  21. "Accept-Language": "zh-CN,zh;q=0.9",
  22. "Cache-Control": "no-cache",
  23. "Connection": "keep-alive",
  24. "Pragma": "no-cache",
  25. "Upgrade-Insecure-Requests": "1",
  26. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
  27. }
  28. url = "http://www.ccgp-ningxia.gov.cn/public/NXGPPNEW/dynamic/contents/SXCGGG/index.jsp"
  29. params = {
  30. "cid": "2010",
  31. "sid": "1"
  32. }
  33. res = session.get(url, headers=headers, params=params, timeout=30, verify=False)
  34. url1 = "http://www.ccgp-ningxia.gov.cn/TrafficStatistics.do"
  35. res1 = session.get(url1, headers=headers, timeout=30, verify=False)
  36. yzm_url = "http://www.ccgp-ningxia.gov.cn/admin/AuthCode_too.do"
  37. res_yzm = session.get(yzm_url, headers=headers, timeout=30, verify=False)
  38. code = get_code(res_yzm.content).upper()
  39. headers = {
  40. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  41. "Accept-Language": "zh-CN,zh;q=0.9",
  42. "Cache-Control": "no-cache",
  43. "Connection": "keep-alive",
  44. "Content-Type": "application/x-www-form-urlencoded",
  45. "Origin": "http://www.ccgp-ningxia.gov.cn",
  46. "Pragma": "no-cache",
  47. "Upgrade-Insecure-Requests": "1",
  48. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
  49. }
  50. url = "http://www.ccgp-ningxia.gov.cn//site/InteractionQuestion_findVNoticeNew.do"
  51. data = {
  52. "type": "ALL",
  53. "page": "0",
  54. "tab": "SX",
  55. "authCode": f"{code}",
  56. "noticeTab": "CGYX",
  57. "keyword_all": "",
  58. "departmentName_all": "",
  59. "date1_all": "",
  60. "date2_all": "",
  61. "regionId_all": "",
  62. "keyword_each": "",
  63. "departmentName_each": "",
  64. "agentName_each": "",
  65. "projectNumber_each": "",
  66. "planNumber_each": "",
  67. "date1_each": "",
  68. "date2_each": "",
  69. "title_cgyx": "",
  70. "departmentName_cgyx": "",
  71. "date1_cgyx": "",
  72. "date2_cgyx": "",
  73. "projectName_cgyxxm": "",
  74. "departmentName_cgyxxm": "",
  75. "yjcgsj_cgyxxm": "",
  76. "date1_cgyxxm": "",
  77. "date2_cgyxxm": "",
  78. "purchaseItem_cgyxxm": "",
  79. "agreCode_htgs": "",
  80. "departmentName_htgs": "",
  81. "supplierName_htgs": "",
  82. "date1_htgs": "",
  83. "date2_htgs": "",
  84. "agreCode_ysjggg": "",
  85. "reportCode_ysjggg": "",
  86. "departmentName_ysjggg": "",
  87. "supplierName_ysjggg": "",
  88. "date1_ysjggg": "",
  89. "date2_ysjggg": ""
  90. }
  91. resp = session.post(url, headers=headers, params=params, timeout=30, data=data, verify=False)
  92. cookies = session.cookies.get_dict()
  93. return cookies
  94. class Spider(feapder.BiddingListSpider):
  95. def start_callback(self):
  96. Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
  97. self.site = "宁夏政府采购公共服务平台"
  98. self.menus = [
  99. Menu('市县采购公告', 'nx_nxzfcgggfwpt_sxcggg', 2),
  100. ]
  101. self.headers = {
  102. "Accept": "*/*",
  103. "Accept-Language": "zh-CN,zh;q=0.9",
  104. "Cache-Control": "no-cache",
  105. "Connection": "keep-alive",
  106. "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
  107. "Origin": "http://www.ccgp-ningxia.gov.cn",
  108. "Pragma": "no-cache",
  109. "Referer": "http://www.ccgp-ningxia.gov.cn/public/NXGPPNEW/dynamic/contents/SXCGGG/index.jsp?cid=2010&sid=1",
  110. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
  111. "X-Requested-With": "XMLHttpRequest"
  112. }
  113. self.cookies = get_ck()
  114. def start_requests(self):
  115. url = "http://www.ccgp-ningxia.gov.cn//site/InteractionQuestion_findVNoticeNew.do"
  116. for menu in self.menus:
  117. yield feapder.Request(url, item=menu._asdict(), page=1, proxies=False)
  118. def download_midware(self, request):
  119. page = request.page
  120. data = {
  121. "type": "ALL",
  122. "page": f"{page - 1}",
  123. "tab": "SX",
  124. "authCode": "",
  125. "noticeTab": "CGYX",
  126. "keyword_all": "",
  127. "departmentName_all": "",
  128. "date1_all": "",
  129. "date2_all": "",
  130. "regionId_all": "",
  131. "keyword_each": "",
  132. "departmentName_each": "",
  133. "agentName_each": "",
  134. "projectNumber_each": "",
  135. "planNumber_each": "",
  136. "date1_each": "",
  137. "date2_each": "",
  138. "title_cgyx": "",
  139. "departmentName_cgyx": "",
  140. "date1_cgyx": "",
  141. "date2_cgyx": "",
  142. "projectName_cgyxxm": "",
  143. "departmentName_cgyxxm": "",
  144. "yjcgsj_cgyxxm": "",
  145. "date1_cgyxxm": "",
  146. "date2_cgyxxm": "",
  147. "purchaseItem_cgyxxm": "",
  148. "agreCode_htgs": "",
  149. "departmentName_htgs": "",
  150. "supplierName_htgs": "",
  151. "date1_htgs": "",
  152. "date2_htgs": "",
  153. "agreCode_ysjggg": "",
  154. "reportCode_ysjggg": "",
  155. "departmentName_ysjggg": "",
  156. "supplierName_ysjggg": "",
  157. "date1_ysjggg": "",
  158. "date2_ysjggg": ""
  159. }
  160. request.data = data
  161. request.headers = self.headers
  162. request.cookies = get_ck()
  163. def validate(self, request, response):
  164. data = response.content.decode()
  165. data_str = "[{" + "".join(re.findall('\[\{(.*?)}]', data, re.S)).strip() + "}]"
  166. info_list = json.loads(data_str.replace('\\', ''), strict=False)
  167. assert len(info_list) > 0
  168. def parse(self, request, response):
  169. menu = request.item
  170. data_str = "[{" + "".join(re.findall('\[\{(.*?)}]', response.text, re.S)).strip() + "}]"
  171. info_list = json.loads(data_str.replace('\\', ''), strict=False)
  172. for info in info_list:
  173. href = info.get('url')
  174. if 'http' not in href:
  175. href = "http://www.ccgp-ningxia.gov.cn/public/NXGPPNEW/dynamic/" + href
  176. title = info.get('title').strip()
  177. create_time = info.get('publish_time')
  178. area = "宁夏"
  179. city = ""
  180. list_item = BidingListItem() # 存储数据的管道
  181. list_item.href = href # 标书链接
  182. list_item.unique_key = ('href',)
  183. list_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
  184. list_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
  185. list_item.title = title # 标题
  186. list_item.site = self.site
  187. list_item.publishtime = create_time
  188. list_item.area = area # 城市默认:全国
  189. list_item.city = city # 城市 默认为空
  190. list_item.parse = "self.detail_get" # 详情页回调方法
  191. list_item.request_params = {'rm_list': ['//div[@class="curt-row"]',
  192. '//p[@class="sub-tt"]']}
  193. list_item.deal_detail = ['//div[@class="table1"]', '//div[@class="gw-paper"]',
  194. '//div[@class="newAgreShow"]', '//div[@class="main"]'] # 抽取正文xpath
  195. list_item.proxies = True
  196. list_item.parse_url = href
  197. list_item.files = { # 附件采集规则
  198. "list_xpath": '//div[@class="main"]//a[@href]',
  199. "url_xpath": './@href',
  200. "name_xpath": './text()',
  201. "files_type": ('zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
  202. 'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg'), # 需要下载的附件类型
  203. # "file_type":'pdf', # 默认的附件类型,用于url中未带附件类型的
  204. "url_key": 'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
  205. "host": '', # 需要拼接url的host
  206. }
  207. yield list_item
  208. # 无限翻页
  209. request = self.infinite_pages(request, response)
  210. yield request
  211. def exception_request(self, request, response):
  212. self.cookies = get_ck(request.get_proxies())
  213. yield request
  214. if __name__ == "__main__":
  215. Spider(redis_key="detail:chrome").start()