detail_cookie.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2023-04-27
  4. ---------
  5. @summary: 生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
  6. ---------
  7. @author:
  8. """
  9. import time
  10. import json
  11. import re
  12. import copy
  13. import feapder
  14. from items.spider_item import DataBakItem
  15. from untils.cookie_pool import PageCookiePool
  class Details(feapder.BiddingDetailSpider):
      """Detail-page spider that downloads every page with a pooled cookie.

      Tasks are pulled via ``get_tasks_by_rabbitmq``. Each task carries a
      ``down_mid`` mapping with the following keys read by this class:

      * ``key`` / ``page_url`` -- passed to ``PageCookiePool`` as ``redis_key``
        and ``page_url``.
      * ``text`` -- optional marker; when present in the response text the
        cookie that was used is considered rejected.
      * ``code`` -- optional status code (or collection of status codes) that
        mark the cookie as rejected.
      """

      @staticmethod
      def _build_cookie_pool(down_mid):
          """Return the ``PageCookiePool`` described by a ``down_mid`` mapping."""
          return PageCookiePool(
              redis_key=down_mid.get("key"),
              page_url=down_mid.get("page_url"),
              selenium=False,
          )

      @staticmethod
      def _cookie_rejected(request, response):
          """Return True when the response shows the cookie was not accepted.

          The cookie counts as rejected when ``down_mid["text"]`` is set and
          occurs in ``response.text``, or when ``response.code`` is one of the
          codes in ``down_mid["code"]``.
          """
          down_mid = request.down_mid
          marker = down_mid.get("text")
          if marker and marker in response.text:
              return True

          codes = down_mid.get("code")
          if codes is None:
              # No failure codes configured: nothing to compare against.
              return False
          if isinstance(codes, (int, str)):
              # A single code was configured instead of a collection.
              codes = (codes,)
          return response.code in codes

      @classmethod
      def _discard_cookie(cls, request):
          """Delete the cookie used by ``request`` from its cookie pool."""
          cls._build_cookie_pool(request.down_mid).del_cookie(request.cookies)

      @staticmethod
      def _item_fields(request):
          """Return ``request.item`` as a mapping usable with ``**``."""
          item = request.item
          return item if isinstance(item, dict) else item.to_dict

      def start_requests(self):
          """Yield one ``feapder.Request`` per task fetched from the queue."""
          data_list = self.get_tasks_by_rabbitmq(limit=50)
          for item in data_list:
              # A task without request_params must not crash on ``**None``.
              request_params = item.get("request_params") or {}

              # NOTE(review): the locals below are not used by the Request
              # itself; they are kept under these exact names because the
              # ``ex_python`` snippet runs in this scope and may read them.
              down_mid = copy.copy(item.get("down_mid"))
              key = down_mid.get("key")
              page_url = down_mid.get("page_url")
              cookie_pool = self._build_cookie_pool(down_mid)
              down_mid["cookie_pool"] = cookie_pool

              if item.get("ex_python"):
                  # SECURITY: executes code taken from the task payload; only
                  # safe while the task queue is a trusted source.
                  exec(item.get("ex_python"))

              yield feapder.Request(
                  url=item.get("parse_url"),
                  # SECURITY: ``parse`` is evaluated as a Python expression
                  # (e.g. "self.detail_get"); same trust requirement as above.
                  callback=eval(item.get("parse")),
                  item=item,
                  # The task's own down_mid is forwarded, not the local copy
                  # that holds the cookie pool object.
                  down_mid=item.get("down_mid"),
                  deal_detail=item.get("deal_detail"),
                  **request_params
              )

      def download_midware(self, request):
          """Attach a cookie from the task's cookie pool to ``request``."""
          request.cookies = self._build_cookie_pool(request.down_mid).get_cookie()
          return request

      def detail_get(self, request, response):
          """Handle an HTML response.

          :param request: request whose ``deal_detail`` is a sequence of xpaths
          :param response: downloaded response
          :return: yields the request again when the cookie was rejected,
              otherwise yields a ``DataBakItem`` with ``contenthtml`` filled in
          """
          if self._cookie_rejected(request, response):
              # Drop the bad cookie, retry, and stop: the rejected response
              # must not be turned into a data item.
              self._discard_cookie(request)
              yield request
              return

          data_item = DataBakItem(**self._item_fields(request))

          # With ``conn_html`` set, every matching fragment is concatenated;
          # otherwise only the first xpath that matches is used.
          join_all = request.to_dict.get('conn_html', None)
          fragments = []
          for xpath in request.deal_detail or ():
              fragment = response.xpath(xpath).extract_first()
              if fragment is None:
                  continue
              fragments.append(fragment)
              if not join_all:
                  break

          data_item.contenthtml = ''.join(fragments)
          yield data_item

      def detail_json(self, request, response):
          """Handle a JSON (or other non-HTML) response.

          ``request.deal_detail`` is a Python snippet that is expected to
          assign the extracted content to ``html``. The snippet can read
          ``self``, ``request``, ``response``, ``items``, ``data_item`` and
          ``html``, plus the module globals.

          :param request: request whose ``deal_detail`` is a code snippet
          :param response: downloaded response
          :return: yields the request again when the cookie was rejected,
              otherwise yields a ``DataBakItem`` with ``contenthtml`` filled in
          """
          if self._cookie_rejected(request, response):
              # Drop the bad cookie, retry, and stop: the rejected response
              # must not be turned into a data item.
              self._discard_cookie(request)
              yield request
              return

          items = self._item_fields(request)
          data_item = DataBakItem(**items)

          # exec() cannot rebind a function's local variables in Python 3, so
          # the snippet runs in an explicit namespace and ``html`` is read back
          # from it afterwards.
          namespace = {
              "self": self,
              "request": request,
              "response": response,
              "items": items,
              "data_item": data_item,
              "html": '',
          }
          # SECURITY: executes code taken from the task payload; only safe
          # while the task queue is a trusted source.
          exec(request.deal_detail, globals(), namespace)

          data_item.contenthtml = namespace.get("html", '')
          yield data_item
  if __name__ == "__main__":
      # Script entry point: build the spider on its redis key and run it.
      spider = Details(redis_key="detail:cookie")
      spider.start()