交易网-详情页.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
# -*- coding: utf-8 -*-
"""
Created on 2025-06-04
---------
@summary: Jining Public Resources Trading Network (济宁市公共资源交易网)
---------
@author: lzz
"""
  9. import feapder
  10. from items.spider_item import DataBakItem
  11. from feapder.utils.tools import log
  12. from untils.attachment import AttachmentDownloader
  13. from untils.tools import extract_file_type,remove_htmldata
  14. class Details(feapder.BiddingDetailSpider):
  15. def start_requests(self):
  16. data_list = self.get_tasks_by_rabbitmq(limit=20)
  17. for item in data_list:
  18. # log.debug(item)
  19. request_params = item.get("request_params")
  20. yield feapder.Request(url=item.get("parse_url"), item=item,
  21. deal_detail=item.get("deal_detail"),proxies=False,
  22. callback=eval(item.get("parse")), **request_params)
  23. def detail_get(self, request, response):
  24. items = request.item
  25. list_item = DataBakItem(**items)
  26. html = response.xpath('//div[@class="card-body"]').extract_first() # 标书详细内容
  27. rm_list =['//div[@class="bm-side-link"]','下载招标文件','//ul[@id="list-detail"]','//ul[@class="clearfix"]',
  28. '//div[@class="text-center margin-top-40 margin-bottom-40"]','//h4[@style="margin:10px;"]']
  29. html = remove_htmldata(rm_list, html, response)
  30. list_item.contenthtml = html
  31. file_list = response.xpath('//div[@class="card-body"]//a[@href]|//div[@id="ctn-detail"]//a[@href]')
  32. if file_list:
  33. attachments = {}
  34. for info in file_list:
  35. file_url = info.xpath('./@href').extract_first()
  36. file_name = "".join(info.xpath('.//text()').extract())
  37. if file_name:
  38. file_name = file_name.strip()
  39. file_type = extract_file_type(file_name,file_url,['zbd','zbdx'])
  40. if file_type:
  41. attachment = AttachmentDownloader().fetch_attachment(
  42. file_name=file_name, file_type=file_type, download_url=file_url,)
  43. if attachment.get('size'):
  44. attachments[str(len(attachments) + 1)] = attachment
  45. if attachments:
  46. list_item.projectinfo = {"attachments": attachments}
  47. yield list_item
  48. if __name__ == "__main__":
  49. Details(redis_key="lzz:jnsggzyjyw_lzzqgg").start()