# -*- coding: utf-8 -*-
"""
Created on 2025-01-17
---------
@summary: 深圳阳光采购平台 (Shenzhen Sunshine Procurement Platform) — 交易信息-招标 list page
---------
@author: lzz
"""
from collections import namedtuple

import feapder
from feapder.utils.tools import timestamp_to_date

from items.spider_item import BidingListItem
  13. class Spider(feapder.BiddingListSpider):
  14. def start_callback(self):
  15. Menu = namedtuple('Menu', ['channel', 'code', 'com', 'ggLeiXing', 'crawl_page'])
  16. self.site = "深圳阳光采购平台"
  17. self.menus = [
  18. Menu('交易信息-招标', 'gd_szygcgpt_jyxx_zb', 'Purchase', 1, 1),
  19. Menu('交易信息-招标', 'gd_szygcgpt_jyxx_zb', 'Change', 2, 1),
  20. Menu('交易信息-招标', 'gd_szygcgpt_jyxx_zb', 'Candidate', 3, 1),
  21. Menu('交易信息-招标', 'gd_szygcgpt_jyxx_zb', 'CaliTender', 6, 1),
  22. Menu('交易信息-招标', 'gd_szygcgpt_jyxx_zb', 'Result', 4, 1),
  23. Menu('交易信息-招标', 'gd_szygcgpt_jyxx_zb', 'Invitation', 5, 1),
  24. Menu('交易信息-招标', 'gd_szygcgpt_jyxx_zb', 'Project', 7, 1),
  25. Menu('交易信息-招标', 'gd_szygcgpt_jyxx_zb', 'Contract', 8, 1),
  26. ]
  27. self.headers = {
  28. "Accept": "application/json, text/plain, */*",
  29. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
  30. "Cache-Control": "no-cache",
  31. "Content-Type": "application/json;charset=UTF-8",
  32. "Origin": "https://www.szygcgpt.com",
  33. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
  34. }
  35. def start_requests(self):
  36. url = "https://www.szygcgpt.com/app/home/pageGGList.do"
  37. for menu in self.menus:
  38. yield feapder.Request(url, item=menu._asdict(), page=1, proxies=False)
  39. def download_midware(self, request):
  40. page = request.page
  41. menu = request.item
  42. data = {
  43. "page": page,
  44. "rows": 100,
  45. "xmLeiXing": "",
  46. "caiGouType": 0,
  47. "ggLeiXing": menu.get('ggLeiXing'),
  48. "isShiShuGuoQi": "",
  49. "isZhanLueYingJiWuZi": "",
  50. "keyWords": ""
  51. }
  52. request.json = data
  53. request.headers = self.headers
  54. def parse(self, request, response):
  55. menu = request.item
  56. info_list = response.json.get('data').get('list')
  57. for info in info_list:
  58. guid = info.get('guid')
  59. ggGuid = info.get('ggGuid')
  60. bdGuid = info.get('bdGuid')
  61. ggLeiXing = menu.get('ggLeiXing')
  62. dataSource = info.get('dataSource')
  63. href = f"https://www.szygcgpt.com/ygcg/detailTop?com={menu.get('com')}&guid={guid}&ggGuid={ggGuid}&bdGuid={bdGuid}&ggLeiXing={ggLeiXing}&dataSource={dataSource}&type=purchase"
  64. title = info.get('ggName').strip()
  65. create_time = timestamp_to_date(int(str(info.get('faBuTime'))[:10]),time_format="%Y-%m-%d")
  66. area = "广东"
  67. city = "深圳市"
  68. list_item = BidingListItem() # 存储数据的管道
  69. list_item.href = href # 标书链接
  70. list_item.unique_key = ('href',)
  71. list_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
  72. list_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
  73. list_item.title = title # 标题
  74. list_item.site = self.site
  75. list_item.publishtime = create_time
  76. list_item.area = area # 城市默认:全国
  77. list_item.city = city # 城市 默认为空
  78. list_item.parse = "self.detail_get" # 详情页回调方法
  79. list_item.request_params = {
  80. "rm_list": [
  81. '//td[contains(text(),"原公告地址:")]/..',
  82. '//span[@class="tijiao"]'
  83. ]
  84. }
  85. list_item.deal_detail = ['//div[@class="contentDetail"]'] # 抽取正文xpath
  86. list_item.parse_url = href
  87. list_item.files = { # 附件采集规则
  88. "list_xpath": '//div[@class="contentDetail"]//a[@href]',
  89. "url_xpath": './@href',
  90. "name_xpath": './text()',
  91. "files_type": (
  92. 'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
  93. 'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg'
  94. ), # 需要下载的附件类型
  95. # "file_type":'pdf', # 默认的附件类型,用于url中未带附件类型的
  96. "url_key": 'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
  97. "host": '', # 需要拼接url的host
  98. }
  99. yield list_item
  100. # 无限翻页
  101. request = self.infinite_pages(request, response)
  102. yield request
  103. if __name__ == "__main__":
  104. Spider(redis_key="detail:chrome").start()