123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131 |
- # -*- coding: utf-8 -*-
- """
- Created on 2023-04-27
- ---------
- @summary: 生成一定有效期cookie,并使用的detail 详情处理方案,默认不限制ip
- ---------
- @author:
- """
- import time
- import json
- import re
- import copy
- import feapder
- from items.spider_item import DataBakItem
- from untils.cookie_pool import PageCookiePool
class Details(feapder.BiddingDetailSpider):
    """Detail-page spider that attaches a pooled cookie to every request.

    Every task carries a ``down_mid`` dict. The keys read by this class are:

    * ``key`` / ``page_url`` -- passed to ``PageCookiePool`` to select the
      cookie pool used for the request.
    * ``text`` -- optional marker; when it occurs in the response body the
      current cookie is discarded and the request is retried.
    * ``code`` -- status code (or collection of status codes) for which the
      current cookie is discarded and the request is retried.
    """

    @staticmethod
    def _cookie_pool_for(down_mid):
        """Return the ``PageCookiePool`` described by ``down_mid``."""
        return PageCookiePool(
            redis_key=down_mid.get("key"),
            page_url=down_mid.get("page_url"),
            selenium=False,
        )

    @staticmethod
    def _cookie_rejected(request, response):
        """Return True when ``response`` shows the cookie must be replaced.

        :param request: request whose ``down_mid`` holds ``text`` / ``code``
        :param response: response received for ``request``
        :return: True if the failure marker is in the body or the status
            code is one of the configured failure codes
        """
        down_mid = request.down_mid

        marker = down_mid.get("text")
        if marker and marker in response.text:
            return True

        failure_codes = down_mid.get("code")
        if failure_codes is None:
            # No status codes configured: nothing to compare against.
            return False
        if isinstance(failure_codes, int):
            # Accept a single status code as well as a list/tuple of them.
            failure_codes = (failure_codes,)
        return response.code in failure_codes

    def _discard_cookie(self, request):
        """Delete the cookie used by ``request`` from its cookie pool."""
        self._cookie_pool_for(request.down_mid).del_cookie(request.cookies)

    @staticmethod
    def _extract_html(request, response):
        """Collect the detail HTML selected by ``request.deal_detail``.

        ``request.deal_detail`` is iterated as a sequence of XPath
        expressions. When the request carries a truthy ``conn_html`` value,
        every non-empty match is concatenated; otherwise the first non-empty
        match is returned on its own.

        :return: the extracted HTML, or ``''`` when nothing matched
        """
        xpaths = request.deal_detail or []
        if not xpaths:
            return ""

        join_all = bool(request.to_dict.get("conn_html", None))
        fragments = []
        for xpath in xpaths:
            fragment = response.xpath(xpath).extract_first()  # detail content
            if fragment is None:
                continue
            fragments.append(fragment)
            if not join_all:
                break
        return "".join(fragments)

    def start_requests(self):
        """Pull up to 50 tasks from RabbitMQ and yield one request per task."""
        data_list = self.get_tasks_by_rabbitmq(limit=50)
        for item in data_list:
            request_params = item.get("request_params")
            if request_params is None:
                # A task without extra request arguments must not crash the
                # ``**request_params`` expansion below.
                request_params = {}

            # NOTE(review): ``down_mid``, ``key``, ``page_url`` and
            # ``cookie_pool`` are not used directly below. They are kept as
            # locals (under these exact names) because the ``ex_python``
            # snippet is executed in this scope and may presumably refer to
            # them -- confirm before removing.
            down_mid = copy.copy(item.get("down_mid"))
            key = down_mid.get("key")
            page_url = down_mid.get("page_url")
            cookie_pool = PageCookiePool(redis_key=key, page_url=page_url, selenium=False)
            down_mid["cookie_pool"] = cookie_pool

            # SECURITY: ``ex_python`` and ``parse`` come from the task payload
            # and are executed/evaluated verbatim. Only consume tasks from a
            # trusted queue.
            if item.get("ex_python"):
                exec(item.get("ex_python"))

            # The request receives the task's original ``down_mid``; the copy
            # above (which holds the pool object) is not passed on.
            yield feapder.Request(
                url=item.get("parse_url"),
                callback=eval(item.get("parse")),
                item=item,
                down_mid=item.get("down_mid"),
                deal_detail=item.get("deal_detail"),
                **request_params
            )

    def download_midware(self, request):
        """Attach a cookie from the request's cookie pool before download.

        :param request: outgoing request carrying a ``down_mid`` dict
        :return: the same request with ``cookies`` set
        """
        request.cookies = self._cookie_pool_for(request.down_mid).get_cookie()
        return request

    def detail_get(self, request, response):
        """Handle an HTML response.

        :param request: the request that produced ``response``
        :param response: the downloaded page
        :return: generator yielding either the request again (cookie was
            rejected) or a ``DataBakItem`` with ``contenthtml`` filled in
        """
        if self._cookie_rejected(request, response):
            # Failure handling: drop the current cookie and retry. Stop here
            # so the rejected page is not stored as a result.
            self._discard_cookie(request)
            yield request
            return

        raw_item = request.item
        items = raw_item if isinstance(raw_item, dict) else raw_item.to_dict
        data_item = DataBakItem(**items)
        data_item.contenthtml = self._extract_html(request, response)
        yield data_item

    def detail_json(self, request, response):
        """Handle a JSON (or other non-HTML) response.

        ``request.deal_detail`` is a Python snippet that is executed with
        ``self``, ``request``, ``response``, ``items``, ``data_item`` and
        ``html`` available; whatever it assigns to ``html`` becomes the
        item's ``contenthtml``.

        :param request: the request that produced ``response``
        :param response: the downloaded payload
        :return: generator yielding either the request again (cookie was
            rejected) or a ``DataBakItem`` with ``contenthtml`` filled in
        """
        if self._cookie_rejected(request, response):
            # Failure handling: drop the current cookie and retry. Stop here
            # so the rejected payload is not stored as a result.
            self._discard_cookie(request)
            yield request
            return

        items = request.item
        data_item = DataBakItem(**items)

        # ``exec`` cannot rebind a local variable of the calling function, so
        # the snippet runs against an explicit namespace and ``html`` is read
        # back from it afterwards.
        # SECURITY: ``deal_detail`` comes from the task payload and is
        # executed verbatim. Only consume tasks from a trusted queue.
        namespace = {
            "self": self,
            "request": request,
            "response": response,
            "items": items,
            "data_item": data_item,
            "html": "",
        }
        exec(request.deal_detail, globals(), namespace)
        data_item.contenthtml = namespace.get("html", "")
        yield data_item
if __name__ == "__main__":
    # Script entry point: build the spider on the "detail:cookie" redis key
    # and start it.
    spider = Details(redis_key="detail:cookie")
    spider.start()
|