cfxggzyjyw_details.py

# -*- coding: utf-8 -*-
"""
Created on 2025-05-24
---------
@summary: Changfeng County Public Resource Trading Network (长丰县公共资源交易网)
---------
@author: lzz
"""
import feapder
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import extract_file_type
from crawl_func.jsl_5s import DTCookiePool

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
}

class Details(feapder.BiddingDetailSpider):

    ct = 0  # consecutive anti-bot (HTTP 521) failure counter
    # Cookie pool for the site's JS challenge; cookies are cached in Redis
    # under "cfxggzyjyw_dtcookie" and refreshed from the listing page below.
    cookie_pool = DTCookiePool(
        page_url='https://www.changfeng.gov.cn/content/column/30298029?pageIndex=1',
        header=headers, redis_key="cfxggzyjyw_dtcookie")

    def start_requests(self):
        # Pull up to 20 pending detail-page tasks from RabbitMQ.
        data_list = self.get_tasks_by_rabbitmq(limit=20)
        for item in data_list:
            request_params = item.get("request_params")
            # "parse" names the callback (e.g. "self.detail_get") and is
            # resolved with eval; "deal_detail" carries the body XPaths.
            yield feapder.Request(url=item.get("parse_url"), item=item,
                                  deal_detail=item.get("deal_detail"),
                                  callback=eval(item.get("parse")),
                                  **request_params, proxies=False)

    def download_midware(self, request):
        # Attach the shared headers and a pooled cookie to every request.
        request.headers = headers
        request.cookies = self.cookie_pool.get_cookie()

    def detail_get(self, request, response):
        # Give up once the anti-bot page has been hit too many times in a row.
        if self.ct > 5:
            return
        if response.status_code == 521:
            # 521 means the JS challenge rejected our cookie: discard the
            # cookie this request used and retry with a fresh one.
            self.ct += 1
            self.cookie_pool.del_cookie(request.cookies)
            yield request
        else:
            self.ct = 0
            items = request.item
            list_item = DataBakItem(**items)
            html = ''
            for xpath in request.deal_detail:
                html = response.xpath(xpath).extract_first()  # bid detail content
                if html is not None:
                    break
            list_item.contenthtml = html
            files = response.xpath('//div[@id="zoom"]//a[@href]')
            if len(files) > 0:
                attachments = {}
                for info in files:
                    file_url = info.xpath('./@href').extract_first()
                    file_name = info.xpath('./text()').extract_first("").strip()
                    file_type = extract_file_type(file_name, file_url)
                    if file_type:
                        # Use a per-request header copy so the shared
                        # module-level dict is not mutated by setting Referer.
                        file_headers = {**headers, "Referer": request.url}
                        attachment = AttachmentDownloader().fetch_attachment(
                            file_name=file_name, file_type=file_type,
                            download_url=file_url, headers=file_headers,
                            cookies=request.cookies)
                        attachments[str(len(attachments) + 1)] = attachment
                if attachments:
                    list_item.projectinfo = {"attachments": attachments}
            yield list_item

if __name__ == "__main__":
    Details(redis_key="lzz:Cfxggzyjyw").start()
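
# A minimal sketch of the task payload this spider expects from RabbitMQ.
# Field names mirror start_requests above; the values here are hypothetical.
#
# task = {
#     "parse_url": "https://www.changfeng.gov.cn/...",  # detail-page URL (elided)
#     "parse": "self.detail_get",            # callback name, resolved via eval()
#     "deal_detail": ['//div[@id="zoom"]'],  # XPaths tried in order for the body
#     "request_params": {},                  # extra kwargs for feapder.Request
#     # ...plus the fields that DataBakItem(**items) is built from
# }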