# 陕西采购与招标网-详情页.py
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2025-01-03
  4. ---------
  5. @summary: 陕西采购与招标网
  6. ---------
  7. @author: lzz
  8. """
  9. import random
  10. import time
  11. import execjs
  12. import feapder
  13. import requests
  14. from items.spider_item import DataBakItem
  15. from untils.attachment import AttachmentDownloader
  16. from untils.tools import get_proxy
# Browser-like default headers for detail-page requests (Chrome 111 UA);
# key order is preserved as-is since it is what goes out on the wire.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
}
  26. class Details(feapder.BiddingDetailSpider):
  27. proxy = get_proxy()
  28. def start_requests(self):
  29. data_list = self.get_tasks_by_rabbitmq(limit=50)
  30. for item in data_list:
  31. # log.debug(item)
  32. request_params = item.get("request_params")
  33. yield feapder.Request(url=item.get("parse_url"),
  34. deal_detail=item.get("deal_detail"),
  35. callback=eval(item.get("parse")),
  36. item=item,
  37. **request_params)
  38. def detail_get(self, request, response):
  39. items = request.item
  40. list_item = DataBakItem(**items)
  41. html = response.xpath('//div[@class="mian_list"]').extract_first() # 标书详细内容
  42. list_item.contenthtml = html
  43. attachments = {}
  44. fid = response.xpath('//div[@class="mian_list_03"]/@index').extract_first()
  45. if fid:
  46. for i in range(5):
  47. list_item.contenthtml = "详情请访问原网页!"
  48. fheaders = {
  49. "Accept": "*/*",
  50. "Accept-Language": "zh-CN,zh;q=0.9",
  51. "Cache-Control": "no-cache",
  52. "Connection": "keep-alive",
  53. "Content-Length": "0",
  54. "Origin": "http://bulletin.sntba.com",
  55. "Pragma": "no-cache",
  56. "Referer": "http://bulletin.sntba.com/",
  57. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
  58. }
  59. furl = "http://39.107.102.206:8087/permission/getSecretKey"
  60. resp = requests.post(furl, headers=fheaders, timeout=120, proxies=self.proxy, verify=False)
  61. pdata = resp.text.strip()
  62. with open('./sxcgyzbw_file.js', 'r') as f:
  63. ex_js = f.read()
  64. ctx = execjs.compile(ex_js)
  65. file_url = ctx.call('get_key', pdata, fid)
  66. file_name = list_item.title
  67. file_type = "pdf"
  68. attachment = AttachmentDownloader().fetch_attachment(
  69. file_name=file_name,
  70. file_type=file_type,
  71. download_url=file_url,
  72. proxies=self.proxy,
  73. is_check=True)
  74. if attachment.get('size'):
  75. attachments[str(len(attachments) + 1)] = attachment
  76. break
  77. time.sleep(random.randint(3, 6))
  78. self.proxy = get_proxy()
  79. if i == 4:
  80. raise FileExistsError
  81. if len(attachments) > 0:
  82. list_item.projectinfo = {"attachments": attachments}
  83. yield list_item
# Script entry point: run the detail spider against its dedicated redis key.
if __name__ == "__main__":
    Details(redis_key="lzz:sxcgyzbw_zgysgg2").start()