# -*- coding: utf-8 -*-
"""
Created on 2023-08-07
---------
@summary: rendered detail-page collection via DrissionPage (Chromium)
---------
@author:
"""
from urllib.parse import urljoin, quote

import feapder
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import remove_htmldata, extract_file_type

# time/json/re are imported at module level so task-supplied ex_python
# snippets (see start_requests) can use them
import time
import json
import re

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}

DRISSIONPAGE = dict(
    pool_size=10,             # number of browser tabs in the pool
    user_agent=None,          # user-agent string
    load_images=False,        # whether to load images
    proxy=None,               # xxx.xxx.xxx.xxx:xxxx
    headless=True,            # run the browser headless
    timeout=30,               # request timeout (seconds)
    retry=1,                  # retries after a failed connection
    interval=0.5,             # interval between retries (seconds)
    page_load=30,             # page-load timeout (seconds)
    render_time=0,            # extra wait after the page opens (seconds)
    window_size=(1024, 800),  # browser window size
    driver_type="chromium",
    load_mode="normal",       # page-load strategy: "normal", "eager" or "none"
    download_path=None,       # directory for downloaded files
    custom_argument=[
        "--no-sandbox",
        "--ignore-certificate-errors"
    ]
)


class Spider(feapder.BiddingDetailSpider):

    __custom_setting__ = dict(
        PROXY_EXTRACT_API="http://172.17.162.28:16001/sam",
        PROXY_POOL="feapder.network.proxy_pool.SpringBoardProxyPool",
        DRISSIONPAGE=DRISSIONPAGE
    )

    def start_requests(self):
        data_list = self.get_tasks_by_rabbitmq(limit=100, timeout=60)
        for item in data_list:
            request_params = item.get('request_params') or {}
            timeout = request_params.pop('timeout', 30)
            if item.get('ex_python'):
                # run task-supplied setup code (trusted input only)
                exec(item.get('ex_python'))
            yield feapder.Request(url=item.get('parse_url'),
                                  timeout=timeout,
                                  render=True,
                                  render_time=item.get('render_time', 5),
                                  callback=eval(item.get('parse')),  # e.g. "self.detail_get"
                                  item=item,
                                  files_info=item.get('files'),
                                  deal_detail=item.get('deal_detail'),
                                  **request_params)

    def detail_get(self, request, response):
        items = request.item
        data_item = DataBakItem(**items)

        html = ''
        for xpath in request.deal_detail:
            htmls = response.xpath(xpath).extract_first()  # bid notice detail content
            if request.to_dict.get('conn_html', None):
                # conn_html mode: concatenate every xpath that matches
                if htmls is not None:
                    html += htmls
            else:
                # default mode: keep the first xpath that matches
                if htmls is not None:
                    html = htmls
                    break

        if request.to_dict.get('rm_list', None) and html:
            rm_list = request.rm_list
            html = remove_htmldata(rm_list, html, response)

        data_item.contenthtml = html

        attachments = {}
        if request.files_info:
            files_info = request.files_info
            files = response.xpath(files_info.get('list_xpath'))
            for info in files:
                file_url = info.xpath(files_info.get('url_xpath')).extract_first()
                file_name = info.xpath(files_info.get('name_xpath')).extract()
                if not file_url or not file_name:
                    continue

                # join the matched text nodes and strip all whitespace
                file_name = ''.join(''.join(file_name).split())
                if files_info.get('host'):
                    file_url = urljoin(files_info.get('host'), file_url)

                # prefer an explicitly configured file type, else infer it
                file_type = files_info.get('file_type') or extract_file_type(file_name, file_url)

                fpx = request.get_proxies() or False
                cookie_json = response.cookies.get_dict() or {}

                # url_key, when set, filters which links are downloaded
                if file_type and (files_info.get('url_key') or '') in file_url:
                    # pass a per-call copy so concurrent threads do not race
                    # on the shared module-level headers dict
                    req_headers = {**headers,
                                   'Referer': quote(file_url, safe=';/?:@&=+$,', encoding='utf-8')}
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name,
                        file_type=file_type,
                        download_url=file_url,
                        headers=req_headers,
                        proxies=fpx,
                        cookies=cookie_json)
                    attachments[str(len(attachments) + 1)] = attachment

        if attachments:
            data_item.projectinfo = {'attachments': attachments}

        yield data_item


if __name__ == "__main__":
    Spider(redis_key='detail:firefox', thread_count=10).start()
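
# A minimal sketch of the RabbitMQ task payload this spider consumes. The
# field names come from start_requests/detail_get above; every concrete
# value (urls, xpaths, the "attachment" key) is an illustrative assumption,
# not a real task:
#
# task = {
#     "parse_url": "https://example.com/notice/123.html",  # detail page to render
#     "parse": "self.detail_get",                  # resolved with eval() into the callback
#     "render_time": 5,                            # seconds to wait after the page opens
#     "ex_python": None,                           # optional snippet exec'd before the request
#     "deal_detail": ['//div[@class="content"]'],  # xpaths tried for the body html
#     "request_params": {                          # extra feapder.Request kwargs
#         "timeout": 30,                           # popped off and passed separately
#         "conn_html": False,                      # True: concatenate all xpath matches
#         "rm_list": [],                           # fragments stripped by remove_htmldata
#     },
#     "files": {                                   # optional attachment rules (files_info)
#         "list_xpath": '//div[@class="file"]//a',
#         "url_xpath": "./@href",
#         "name_xpath": "./text()",
#         "host": "https://example.com",           # base for relative download urls
#         "url_key": "attachment",                 # substring a download url must contain
#         "file_type": "pdf",                      # optional; otherwise inferred
#     },
# }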