# -*- coding: utf-8 -*-
"""
Created on 2023-04-27
---------
@summary: Detail-page handler that generates cookies with a limited validity
          period via a cookie pool; by default no IP restriction is applied.
---------
@author:
"""
import time
import json
import re
import copy
from urllib.parse import urljoin

import feapder
from items.spider_item import DataBakItem
from untils.WebCookiePool import WebCookiePool
from untils.attachment import AttachmentDownloader


class Details(feapder.BiddingDetailSpider):
    """Detail spider: pulls tasks from RabbitMQ, attaches pooled cookies,
    parses HTML or JSON detail pages, and recycles cookies on failure."""

    def start_requests(self):
        """Fetch up to 50 tasks from RabbitMQ and yield one request per task.

        Each task item carries the target URL (``parse_url``), the parser
        method name (``parse``), the cookie-pool config (``down_mid``),
        optional attachment config (``files``) and extra request kwargs
        (``request_params``).
        """
        data_list = self.get_tasks_by_rabbitmq(limit=50)
        for item in data_list:
            request_params = item.get("request_params")
            if item.get("ex_python"):
                # SECURITY: executes task-supplied Python; the queue must be
                # a trusted, internal-only source.
                exec(item.get("ex_python"))
            # When the task does not request a proxy, explicitly disable the
            # proxy middleware with proxies=False; otherwise leave the default.
            proxy_kwargs = {} if item.get("proxies") else {"proxies": False}
            yield feapder.Request(
                url=item.get("parse_url"),
                # SECURITY: eval resolves a parser name such as "self.detail_get";
                # trusted queue only.
                callback=eval(item.get("parse")),
                item=item,
                down_mid=item.get("down_mid"),
                files_info=item.get("files"),
                deal_detail=item.get("deal_detail"),
                **proxy_kwargs,
                **request_params,
            )

    def download_midware(self, request):
        """Attach a cookie from the shared cookie pool before downloading.

        :param request: outgoing request; ``request.down_mid`` holds the
                        pool configuration (redis key, page url, cookie key).
        :return: the request with ``cookies`` set.
        """
        down_mid = request.down_mid
        cookie_pool = WebCookiePool(
            redis_key=down_mid.get("key"),
            page_url=down_mid.get("page_url"),
            cookie_key=down_mid.get("cookie_key"),
        )
        request.cookies = cookie_pool.get_cookie()
        return request

    def _invalidate_cookie(self, request):
        """Remove the cookie used by *request* from the pool so the pool
        regenerates a fresh one on the next ``get_cookie`` call."""
        down_mid = request.down_mid
        cookie_pool = WebCookiePool(
            redis_key=down_mid.get("key"),
            page_url=down_mid.get("page_url"),
            cookie_key=down_mid.get("cookie_key"),
        )
        cookie_pool.del_cookie(request.cookies)

    @staticmethod
    def _resolve_file_type(file_url, file_name, files_info):
        """Determine an attachment's extension per the ``files_info`` config.

        Tries the URL's extension first; falls back to the file name when the
        URL extension is not in the allowed ``files_type`` list, unless a
        fixed ``file_type`` was configured.
        """
        configured = files_info.get("file_type")
        if not configured:
            file_type = file_url.split("?")[0].split(".")[-1].lower()
            if file_type not in files_info.get("files_type"):
                file_type = file_name.split("?")[0].split(".")[-1].lower()
        elif configured == "file_name":
            file_type = file_name.split("?")[0].split(".")[-1].lower()
        else:
            file_type = configured
        return file_type

    def detail_get(self, request, response):
        """Parse an HTML detail page.

        On a failure signal (blocked-page marker text, or an unexpected
        status code) the current cookie is discarded and the request is
        re-yielded so it retries with a fresh cookie.

        :param request: the request, carrying ``item``/``down_mid``/``files_info``.
        :param response: the downloaded response.
        :yield: the retried request, or a populated :class:`DataBakItem`.
        """
        fail_text = request.down_mid.get("text")
        if fail_text and fail_text in response.text:
            # Failure path: the configured marker text appears in the body,
            # meaning the cookie was rejected — drop it and retry.
            self._invalidate_cookie(request)
            yield request
        # NOTE: `or ()` guards against an unset "code" list, which would
        # otherwise raise TypeError (`x in None`).
        elif response.status_code in (request.down_mid.get("code") or ()):
            # Failure path: unexpected status code — drop the cookie and retry.
            self._invalidate_cookie(request)
            yield request
        else:
            items = request.item
            data_item = DataBakItem(**items)
            # Assemble the bid-notice HTML content.  conn_html truthy means
            # concatenate every xpath hit; otherwise the first hit wins.
            concat_mode = request.to_dict.get('conn_html', None)
            html = ''
            for xpath in request.deal_detail:
                fragment = response.xpath(xpath).extract_first()
                if fragment is None:
                    continue
                if concat_mode:
                    html += fragment
                else:
                    html = fragment
                    break
            data_item.contenthtml = html

            attachments = {}
            if request.files_info:
                files_info = request.files_info
                for info in response.xpath(files_info.get("list_xpath")):
                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
                    file_name = info.xpath(files_info.get("name_xpath")).extract()
                    if not file_url or not file_name:
                        continue
                    if files_info.get("host"):
                        file_url = urljoin(files_info.get("host"), file_url)
                    # Collapse all internal whitespace in the attachment name.
                    file_name = "".join("".join(file_name).split()).strip()
                    file_type = self._resolve_file_type(file_url, file_name, files_info)
                    # Only download allowed types whose URL contains the
                    # configured key fragment.
                    if (file_type in files_info.get("files_type")
                            and files_info.get("url_key") in file_url):
                        attachment = AttachmentDownloader().fetch_attachment(
                            file_name=file_name,
                            file_type=file_type,
                            download_url=file_url,
                            enable_proxy=False,
                        )
                        # Keys are 1-based ordinals as strings.
                        attachments[str(len(attachments) + 1)] = attachment
            if attachments:
                data_item.projectinfo = {"attachments": attachments}
            yield data_item

    def detail_json(self, request, response):
        """Parse a JSON (or other non-HTML) detail response.

        Failure handling mirrors :meth:`detail_get`: marker text or an
        unexpected status code discards the cookie and retries.

        :param request: the request, carrying ``item``/``down_mid``/``deal_detail``.
        :param response: the downloaded response.
        :yield: the retried request, or a populated :class:`DataBakItem`.
        """
        fail_text = request.down_mid.get("text")
        if fail_text and fail_text in response.text:
            # Failure path: marker text found — drop the cookie and retry.
            self._invalidate_cookie(request)
            yield request
        elif response.status_code in (request.down_mid.get("code") or ()):
            # Failure path: unexpected status code — drop the cookie and retry.
            self._invalidate_cookie(request)
            yield request
        else:
            # `items` and `data_item` are intentionally left in local scope:
            # the task-supplied snippet below may reference them by name.
            items = request.item
            data_item = DataBakItem(**items)
            # SECURITY: executes task-supplied parsing code; trusted queue only.
            exec(request.deal_detail)
            yield data_item


if __name__ == "__main__":
    Details(redis_key="detail:webcookie").start()