# -*- coding: utf-8 -*-
"""
Created on 2025-06-02
---------
@summary: Jiangsu Land Market Network
@file: 交易大厅-详情页.py (trading hall - detail page)
---------
@author: lzz
"""
import re

import feapder

from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import extract_file_type
  14. def fileDown(hid):
  15. return hid
  16. class Details(feapder.BiddingDetailSpider):
  17. def start_requests(self):
  18. data_lsit = self.get_tasks_by_rabbitmq(limit=10)
  19. for item in data_lsit:
  20. request_params = item.get("request_params")
  21. timeout = request_params.get('timeout', 10)
  22. request_params.pop('timeout', None)
  23. yield feapder.Request(url=item.get("parse_url"), item=item, proxies=False,render_time=5,render=True,
  24. deal_detail=item.get("deal_detail"), callback=eval(item.get("parse")),
  25. **request_params, timeout=timeout)
  26. def detail_get(self, request, response):
  27. items = request.item
  28. list_item = DataBakItem(**items)
  29. html1 = response.xpath('//div[@id="resultInfo"]').extract_first("")
  30. html2 = response.xpath('//div[@id="bargainInfo"]').extract_first("")
  31. html3 = response.xpath('//div[@id="formInfo"]').extract_first("")
  32. html4 = response.xpath('//div[@id="afficheInfo"]').extract_first("")
  33. s_title = "".join("".join(response.xpath('//div[@class="mainTwo-middle notice"]/ul[1]/li[1]//text()').extract()).split())
  34. if s_title:
  35. list_item.title = s_title
  36. list_item.s_title = s_title
  37. list_item.contenthtml = html1 + html2 + html3 + html4
  38. attachments = {}
  39. file_list = response.xpath('//div[@class="mainContent"]//a[contains(@href, "downLoadAttch")]')
  40. if file_list:
  41. for info in file_list:
  42. fid = "".join(re.findall("javascript:downLoadAttch\('(.*?)'",info.xpath('./@href').extract_first("")))
  43. file_name = info.xpath('./text()').extract_first("").strip()
  44. file_url = f"http://www.landjs.com/tAfficheParcel/fileDownLoad/{fid}"
  45. file_type = extract_file_type(file_name=file_name, file_url=file_url)
  46. if file_type:
  47. attachment = AttachmentDownloader().fetch_attachment(
  48. file_name=file_name, file_type=file_type, download_url=file_url, )
  49. attachments[str(len(attachments) + 1)] = attachment
  50. if attachments:
  51. list_item.projectinfo = {"attachments": attachments}
  52. yield list_item
  53. if __name__ == "__main__":
  54. Details(redis_key="lzz:jstdscw_jydt").start()