# 变更公告-列表页.py (change-announcement list page, 4.2 KB)

  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2025-04-29
  4. ---------
  5. @summary: 中国联通采购与招标网
  6. ---------
  7. @author: lzz
  8. """
  9. import json
  10. from collections import namedtuple
  11. import feapder
  12. from items.spider_item import BidingListItem
  13. from untils.WebCookiePool import WebCookiePool
  14. from untils.tools import get_proxy
  15. class Spider(feapder.BiddingListSpider):
  16. def start_callback(self):
  17. Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
  18. self.site = "中国联通采购与招标网"
  19. self.menus = [
  20. Menu('变更公告', 'a_zgltcgyzbw_bggg', '001003', 1),
  21. ]
  22. self.headers = {
  23. "Accept": "application/json",
  24. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
  25. "Authorization": "null",
  26. "Connection": "keep-alive",
  27. "Content-Type": "application/json;charset=UTF-8",
  28. "Origin": "http://www.chinaunicombidding.cn",
  29. "Referer": "http://www.chinaunicombidding.cn/bidInformation",
  30. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
  31. "roleId;": ""
  32. }
  33. self.cookie_pool = WebCookiePool(redis_key="zgydcgyzbw_ck",
  34. page_url="http://www.chinaunicombidding.cn/bidInformation",
  35. cookie_key="jqmEwVYRfTEJT")
  36. self.cookie_pool.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")
  37. def start_requests(self):
  38. url = "http://www.cupb.cn/api/v1/bizAnno/getAnnoList"
  39. for menu in self.menus:
  40. proxies = get_proxy()
  41. yield feapder.Request(url, item=menu._asdict(), page=1, proxies=proxies)
  42. def download_midware(self, request):
  43. page = request.page
  44. menu = request.item
  45. data = {
  46. "current": page,
  47. "pageSize": 10,
  48. "modeNo": "BizAnnoVoMtable",
  49. "pageNo": page,
  50. "annoType": menu.get('tid')
  51. }
  52. request.data = json.dumps(data, separators=(',', ':'))
  53. self.cookie_pool.proxies(proxy=request.get_proxy())
  54. request.cookies = self.cookie_pool.create_cookie()
  55. request.headers = self.headers
  56. def validate(self, request, response):
  57. if response.status_code != 200:
  58. raise ConnectionRefusedError
  59. return True
  60. def parse(self, request, response):
  61. menu = request.item
  62. info_list = response.json.get('data').get('records')
  63. for info in info_list:
  64. hid = info.get('id')
  65. href = f"http://www.chinaunicombidding.cn/bidInformation/detail?id={hid}"
  66. title = info.get('annoName').strip()
  67. create_time = info.get('createDate')
  68. cty = info.get('provinceName','').replace('其他','')
  69. area = cty
  70. city = ""
  71. list_item = BidingListItem() # 存储数据的管道
  72. list_item.href = href # 标书链接
  73. list_item.unique_key = ('href',)
  74. list_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
  75. list_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
  76. list_item.title = title # 标题
  77. list_item.site = self.site
  78. list_item.publishtime = create_time
  79. list_item.area = area or "全国" # 城市默认:全国
  80. list_item.city = city # 城市 默认为空
  81. list_item.unique_key = ('href',)
  82. list_item.parse = "self.detail_get" # 详情页回调方法
  83. list_item.deal_detail = [] # 抽取正文xpath
  84. list_item.proxies = False
  85. list_item.parse_url = f"http://www.chinaunicombidding.cn/api/v1/bizAnno/getAnnoDetailed/{hid}"
  86. yield list_item
  87. # 无限翻页
  88. request = self.infinite_pages(request, response)
  89. yield request
  90. def exception_request(self, request, response):
  91. request.proxies = get_proxy()
  92. yield request
# Entry point: run the spider standalone, keyed by its Redis task queue.
if __name__ == "__main__":
    Spider(redis_key="lzz:zgydcgyzbw_cgxqgs").start()