dfxwszhcgpt_details.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
# -*- coding: utf-8 -*-
"""
Created on 2025-01-06
---------
@summary: East Hope digital procurement platform (东方希望数字化采购平台)
---------
@author: lzz
"""
  9. import feapder
  10. from items.spider_item import DataBakItem
  11. class Spider(feapder.BiddingDetailSpider):
  12. def start_requests(self):
  13. data_list = self.get_tasks_by_rabbitmq(limit=50)
  14. for item in data_list:
  15. # log.debug(item)
  16. request_params = item.get("request_params")
  17. yield feapder.Request(url=item.get("parse_url"),
  18. render=True,
  19. render_time=5,
  20. callback=eval(item.get("parse")),
  21. item=item,
  22. deal_detail=item.get("deal_detail"),
  23. **request_params)
  24. def download_midware(self, request):
  25. request.headers = {
  26. "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  27. "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
  28. "Cache-Control": "max-age=0",
  29. "Upgrade-Insecure-Requests": "1",
  30. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
  31. }
  32. def detail_get(self, request, response):
  33. items = request.item
  34. data_item = DataBakItem(**items)
  35. html = ''
  36. for xpath in request.deal_detail:
  37. html = response.xpath(xpath).extract_first() # 标书详细内容
  38. if html is not None:
  39. break
  40. html = html.replace('我要参与</button>', '').replace('我要提问</button>', '')
  41. html = html.replace('<div>招标技术附件</div>', '').replace('附件列表</a>', '')
  42. html = html.replace('<div>招标商务附件</div>', '').replace('附件列表 </a>', '')
  43. html = html.replace('<div>附件</div>', '').replace('查看</a>', '')
  44. html = html.replace('浏览详情</div>', '')
  45. data_item.contenthtml = html
  46. yield data_item
  47. if __name__ == "__main__":
  48. Spider(redis_key="lzz:DfxwszhcgptNew").start()