# yct_details_firefox.py
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2023-7-22
  4. ---------
  5. @summary: 云采通高校采购联盟-结果公告
  6. ---------
  7. @author: lzz
  8. """
  9. from urllib.parse import urljoin
  10. import feapder
  11. from feapder.network.selector import Selector
  12. from feapder.utils.log import log
  13. from items.spider_item import DataBakItem
  14. from untils.attachment import AttachmentDownloader
  15. from untils.tools import remove_htmldata
  16. class Spider(feapder.BiddingDetailSpider):
  17. def start_requests(self):
  18. data_list = self.get_tasks_by_rabbitmq(limit=30)
  19. for item in data_list:
  20. log.debug(item)
  21. request_params = item.get("request_params")
  22. yield feapder.Request(url=item.get("parse_url"),
  23. render=True,
  24. render_time=item.get("render_time"),
  25. callback=eval(item.get("parse")),
  26. item=item,
  27. files_info=item.get("files"),
  28. deal_detail=item.get("deal_detail"),
  29. **request_params)
  30. def detail_get(self, request, response):
  31. iframe = response.xpath('//iframe[@id="contentFrame"]').extract_first()
  32. if iframe:
  33. content_frame = response.browser.tab('#contentFrame')
  34. response = Selector(content_frame.html)
  35. items = request.item
  36. data_item = DataBakItem(**items)
  37. html = ''
  38. detail_path = [
  39. '//table[@class="MsoNormalTable"]',
  40. '//div[@class="project-details positionrl"]',
  41. '//div[@class="content"]',
  42. '//div[@class="project-war medium"]',
  43. '/html/body'
  44. ]
  45. for xpath in detail_path:
  46. html = response.xpath(xpath).extract_first() # 标书详细内容
  47. if html is not None:
  48. break
  49. rm_lsit = [
  50. '//div[@class="project-bid print-hide"]',
  51. '//div[@class="select-item ml10"]',
  52. '//div[@id="guide8"]',
  53. '打印公告'
  54. ]
  55. data_item.contenthtml = remove_htmldata(rm_lsit, html, response)
  56. attachments = {}
  57. if request.files_info:
  58. files_info = request.files_info
  59. files = response.xpath(files_info.get("list_xpath"))
  60. if len(files) > 0:
  61. for index, info in enumerate(files):
  62. file_url = info.xpath(files_info.get("url_xpath")).extract_first()
  63. file_name = info.xpath(files_info.get("name_xpath")).extract_first()
  64. file_name = (file_name or data_item.title)
  65. if 'http' not in file_url and files_info.get("host"):
  66. file_url = urljoin(files_info.get("host"), file_url)
  67. if not files_info.get("file_type"):
  68. file_type = file_url.split(".")[-1].lower()
  69. if file_type not in files_info.get("files_type"):
  70. file_type = file_name.split(".")[-1].lower()
  71. else:
  72. file_type = files_info.get("file_type")
  73. if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
  74. if file_type in file_name:
  75. file_name = file_name.replace(f'.{file_type}', '').strip()
  76. attachment = AttachmentDownloader().fetch_attachment(
  77. file_name=file_name,
  78. file_type=file_type,
  79. download_url=file_url,
  80. proxies=request.get_proxies()
  81. )
  82. attachments[str(len(attachments) + 1)] = attachment
  83. if len(attachments) > 0:
  84. data_item.projectinfo = {"attachments": attachments}
  85. yield data_item
  86. if __name__ == "__main__":
  87. Spider(redis_key="lzz:yctgxcglm_jggg").start()