  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2023-04-27
  4. ---------
  5. @summary: 生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
  6. ---------
  7. @author:
  8. """
  9. import time
  10. import json
  11. import re
  12. import copy
  13. from urllib.parse import urljoin
  14. import feapder
  15. from items.spider_item import DataBakItem
  16. from untils.WebCookiePool import WebCookiePool
  17. from untils.attachment import AttachmentDownloader
  18. class Details(feapder.BiddingDetailSpider):
  19. def start_requests(self):
  20. data_list = self.get_tasks_by_rabbitmq(limit=50)
  21. for item in data_list:
  22. request_params = item.get("request_params")
  23. if item.get("ex_python"):
  24. exec(item.get("ex_python"))
  25. if item.get("proxies"):
  26. yield feapder.Request(url=item.get("parse_url"),
  27. callback=eval(item.get("parse")),
  28. item=item,
  29. down_mid=item.get("down_mid"),
  30. files_info=item.get("files"),
  31. deal_detail=item.get("deal_detail"),
  32. **request_params)
  33. else:
  34. yield feapder.Request(url=item.get("parse_url"),
  35. proxies=False,
  36. callback=eval(item.get("parse")),
  37. item=item,
  38. down_mid=item.get("down_mid"),
  39. files_info=item.get("files"),
  40. deal_detail=item.get("deal_detail"),
  41. **request_params)
  42. def download_midware(self, request):
  43. down_mid = request.down_mid
  44. key = down_mid.get("key")
  45. page_url = down_mid.get("page_url")
  46. cookie_key = down_mid.get("cookie_key")
  47. cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
  48. request.cookies = cookie_pool.get_cookie()
  49. return request
  50. def detail_get(self, request, response):
  51. """
  52. 处理html格式的返回结果
  53. :param request:
  54. :param response:
  55. :return:
  56. """
  57. if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
  58. '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
  59. down_mid = copy.copy(request.down_mid)
  60. key = down_mid.get("key")
  61. page_url = down_mid.get("page_url")
  62. cookie_key = down_mid.get("cookie_key")
  63. cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
  64. cookie_pool.del_cookie(request.cookies)
  65. yield request
  66. elif response.status_code in (request.down_mid.get("code")):
  67. '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
  68. down_mid = copy.copy(request.down_mid)
  69. key = down_mid.get("key")
  70. page_url = down_mid.get("page_url")
  71. cookie_key = down_mid.get("cookie_key")
  72. cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
  73. cookie_pool.del_cookie(request.cookies)
  74. yield request
  75. else:
  76. items = request.item
  77. data_item = DataBakItem(**items)
  78. html = ''
  79. for xpath in request.deal_detail:
  80. htmls = response.xpath(xpath).extract_first() # 标书详细内容
  81. if request.to_dict.get('conn_html', None):
  82. if htmls is not None:
  83. html += htmls
  84. else:
  85. if htmls is not None:
  86. html = htmls
  87. break
  88. data_item.contenthtml = html
  89. attachments = {}
  90. if request.files_info:
  91. files_info = request.files_info
  92. files = response.xpath(files_info.get("list_xpath"))
  93. for index,info in enumerate(files):
  94. file_url = info.xpath(files_info.get("url_xpath")).extract_first()
  95. file_name = info.xpath(files_info.get("name_xpath")).extract()
  96. if not file_url or not file_name:
  97. continue
  98. if files_info.get("host"):
  99. file_url = urljoin(files_info.get("host"), file_url)
  100. file_name = "".join("".join(file_name).split()).strip()
  101. if not files_info.get("file_type"):
  102. file_type = file_url.split("?")[0].split(".")[-1].lower()
  103. if file_type not in files_info.get("files_type"):
  104. file_type = file_name.split("?")[0].split(".")[-1].lower()
  105. elif files_info.get("file_type") == "file_name":
  106. file_type = file_name.split("?")[0].split(".")[-1].lower()
  107. else:
  108. file_type = files_info.get("file_type")
  109. if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
  110. attachment = AttachmentDownloader().fetch_attachment(
  111. file_name=file_name,
  112. file_type=file_type,
  113. download_url=file_url,
  114. enable_proxy=False
  115. )
  116. attachments[str(len(attachments)+1)] = attachment
  117. if len(attachments) > 0:
  118. data_item.projectinfo = {"attachments": attachments}
  119. yield data_item
  120. def detail_json(self, request, response):
  121. """
  122. 处理json串及其他格式的返回结果
  123. :param request:
  124. :param response:
  125. :return:
  126. """
  127. if request.down_mid.get("text") and request.down_mid.get("text") in response.text:
  128. '''失败处理,当text设置不为None,且在resposne.text中时,删除当前cookie并重新生产cookie'''
  129. down_mid = copy.copy(request.down_mid)
  130. key = down_mid.get("key")
  131. cookie_key = down_mid.get("cookie_key")
  132. page_url = down_mid.get("page_url")
  133. cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
  134. cookie_pool.del_cookie(request.cookies)
  135. yield request
  136. elif response.status_code in request.down_mid.get("code"):
  137. '''失败处理,response——code不为正确的状态码时,删除当前cookie并重新生产cookie'''
  138. down_mid = copy.copy(request.down_mid)
  139. key = down_mid.get("key")
  140. page_url = down_mid.get("page_url")
  141. cookie_key = down_mid.get("cookie_key")
  142. cookie_pool = WebCookiePool(redis_key=key, page_url=page_url, cookie_key=cookie_key)
  143. cookie_pool.del_cookie(request.cookies)
  144. yield request
  145. else:
  146. items = request.item
  147. data_item = DataBakItem(**items)
  148. exec(request.deal_detail)
  149. yield data_item
  150. if __name__ == "__main__":
  151. Details(redis_key="detail:webcookie").start()