# -*- coding: utf-8 -*-
"""
Created on 2024-03-22
---------
@summary: Generic detail-page spider: consumes crawl tasks from RabbitMQ and
          extracts bidding-detail HTML and attachments according to the
          per-task configuration carried in each task item.
---------
@author:
"""
import execjs
import time
import json
import re
from urllib.parse import urljoin

import feapder
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import remove_htmldata, extract_file_type


class Details(feapder.BiddingDetailSpider):
    """Config-driven detail spider.

    Each task item describes how to fetch and parse one detail page: the
    request url/params, the callback name, xpaths for the main content,
    optional title xpaths, optional attachment (files) rules, and optional
    snippets of JS/Python executed before the request is built.
    """

    def start_requests(self):
        """Pull up to 500 tasks from RabbitMQ and turn each into a Request."""
        data_list = self.get_tasks_by_rabbitmq(limit=500, timeout=60)
        for item in data_list:
            # request_params may be absent or None in a task; normalize to a
            # dict so .pop() below cannot raise AttributeError.
            request_params = item.get("request_params") or {}
            timeout = request_params.pop('timeout', 10)
            # SECURITY: eval/exec run code shipped inside the task payload.
            # This is by design (tasks are produced internally) but these
            # hooks must never be fed untrusted input.
            if item.get("js"):
                eval(item.get("js"))
            if item.get("ex_python"):
                exec(item.get("ex_python"))
            # When the task does not request a proxy, disable it explicitly;
            # otherwise leave feapder's default proxy handling in place.
            proxy_kw = {} if item.get("proxies") else {"proxies": False}
            yield feapder.Request(url=item.get("parse_url"),
                                  timeout=timeout,
                                  callback=eval(item.get("parse")),
                                  item=item,
                                  files_info=item.get("files"),
                                  deal_detail=item.get("deal_detail"),
                                  **proxy_kw,
                                  **request_params)

    def detail_get(self, request, response):
        """Parse a detail page whose main content is located via xpaths.

        Builds a DataBakItem from the task item, extracts the content html
        (optionally concatenating every xpath hit when ``conn_html`` is set),
        optionally overrides the title, and downloads any attachments
        described by ``files_info``.
        """
        items = request.item
        data_item = DataBakItem(**items)

        html = ''
        for xpath in request.deal_detail:
            htmls = response.xpath(xpath).extract_first()  # main tender content
            if request.to_dict.get('conn_html', None):
                # conn_html mode: concatenate the result of every xpath.
                if htmls is not None:
                    html += htmls
            else:
                # Default mode: first xpath that matches wins.
                if htmls is not None:
                    html = htmls
                    break

        if request.to_dict.get('rm_list', None) and html:
            # Strip task-configured boilerplate fragments from the html.
            rm_list = request.rm_list
            html = remove_htmldata(rm_list, html, response)

        if request.to_dict.get('title_xpath', None):
            for sxpath in request.title_xpath:
                # Detail-page title: first non-empty match wins.
                title = response.xpath(sxpath).extract_first("").strip()
                if title:
                    data_item.title = title
                    break

        data_item.contenthtml = html

        attachments = {}
        if request.files_info:
            files_info = request.files_info
            files = response.xpath(files_info.get('list_xpath'))
            # Proxy config is request-invariant; resolve it once, not per file.
            fpx = request.get_proxies() or False
            for info in files:
                file_url = info.xpath(files_info.get('url_xpath')).extract_first()
                file_name = info.xpath(files_info.get('name_xpath')).extract()
                if not file_url or not file_name:
                    continue
                # Collapse all whitespace inside the extracted file name.
                file_name = ''.join(''.join(file_name).split()).strip()
                if files_info.get('host'):
                    file_url = urljoin(files_info.get('host'), file_url)
                if not files_info.get('file_type'):
                    file_type = extract_file_type(file_name, file_url)
                else:
                    file_type = files_info.get('file_type')
                # A missing url_key means "no filtering"; '' is a substring of
                # every url, and this also avoids a TypeError on None.
                url_key = files_info.get('url_key') or ''
                if file_type and url_key in file_url:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name,
                        file_type=file_type,
                        download_url=file_url,
                        headers=request.to_dict.get('headers', None),
                        proxies=fpx,
                    )
                    attachments[str(len(attachments) + 1)] = attachment

        if len(attachments) > 0:
            data_item.projectinfo = {'attachments': attachments}

        yield data_item

    def detail_json(self, request, response):
        """Parse a JSON detail response by executing the task-supplied snippet.

        The snippet in ``request.deal_detail`` runs in this local scope and is
        expected to populate ``list_item`` (it may also use request/response).
        """
        items = request.item
        list_item = DataBakItem(**items)
        # SECURITY: executes Python shipped in the task; internal use only.
        exec(request.deal_detail)
        yield list_item

    def detail_post(self, request, response):
        """Parse a POST detail response by executing the task-supplied snippet.

        Same contract as detail_json; the snippet populates ``data_item``.
        """
        items = request.item
        data_item = DataBakItem(**items)
        # SECURITY: executes Python shipped in the task; internal use only.
        exec(request.deal_detail)
        yield data_item


if __name__ == "__main__":
    Details(redis_key="detail:normal_details", thread_count=10).start()