# -*- coding: utf-8 -*-
"""
Created on 2023-07-22
---------
@summary: 云采通高校采购联盟 (Yuncaitong University Procurement Alliance) - result announcements
---------
@author: lzz
"""
from urllib.parse import urljoin

import feapder
from feapder.network.selector import Selector
from feapder.utils.log import log
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import remove_htmldata
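
# NOTE: `items.spider_item`, `untils.attachment`, and `untils.tools` are
# project-local modules (the `untils` package name is spelled this way in the
# codebase), not pip-installable dependencies.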


class Spider(feapder.BiddingDetailSpider):

    def start_requests(self):
        # Pull a batch of pending detail-page tasks from RabbitMQ.
        data_list = self.get_tasks_by_rabbitmq(limit=30)
        for item in data_list:
            log.debug(item)
            request_params = item.get("request_params") or {}  # guard against None before ** expansion
            yield feapder.Request(url=item.get("parse_url"),
                                  render=True,
                                  render_time=item.get("render_time"),
                                  callback=eval(item.get("parse")),  # the task stores the callback as a string such as "self.detail_get"
                                  item=item,
                                  files_info=item.get("files"),
                                  deal_detail=item.get("deal_detail"),
                                  **request_params)
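
    # NOTE (assumption): the iframe switch in detail_get relies on feapder's
    # rendered response exposing the driver as `response.browser`, with
    # `browser.tab('#contentFrame')` returning the iframe's document (a
    # DrissionPage-style API). If your render backend differs, substitute the
    # equivalent frame lookup.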
    def detail_get(self, request, response):
        # Some announcement pages embed the body in an iframe; if so, re-select
        # against the iframe's document instead of the outer page.
        iframe = response.xpath('//iframe[@id="contentFrame"]').extract_first()
        if iframe:
            content_frame = response.browser.tab('#contentFrame')
            response = Selector(content_frame.html)

        items = request.item
        data_item = DataBakItem(**items)

        # Try the known page layouts in order; '/html/body' is the catch-all.
        html = ''
        detail_path = [
            '//table[@class="MsoNormalTable"]',
            '//div[@class="project-details positionrl"]',
            '//div[@class="content"]',
            '//div[@class="project-war medium"]',
            '/html/body'
        ]
        for xpath in detail_path:
            html = response.xpath(xpath).extract_first()  # tender detail HTML
            if html is not None:
                break
        # Drop navigation widgets and the "打印公告" (print announcement) text
        # before storing the HTML.
        rm_list = [
            '//div[@class="project-bid print-hide"]',
            '//div[@class="select-item ml10"]',
            '//div[@id="guide8"]',
            '打印公告'
        ]
        data_item.contenthtml = remove_htmldata(rm_list, html, response)
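
        # Attachment handling (assumption about the task schema, inferred from
        # the keys used below): `files_info` carries list_xpath/url_xpath/
        # name_xpath selectors, a `files_type` whitelist of extensions, an
        # optional forced `file_type`, a `host` for resolving relative links,
        # and a `url_key` substring the download URL must contain.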
        attachments = {}
        if request.files_info:
            files_info = request.files_info
            files = response.xpath(files_info.get("list_xpath"))
            for info in files:
                file_url = info.xpath(files_info.get("url_xpath")).extract_first()
                file_name = info.xpath(files_info.get("name_xpath")).extract_first()
                if not file_url:  # node without a downloadable link
                    continue
                file_name = file_name or data_item.title
                if 'http' not in file_url and files_info.get("host"):
                    file_url = urljoin(files_info.get("host"), file_url)

                # Resolve the extension: honour a forced "file_type" if given,
                # else take the URL suffix, falling back to the file-name suffix.
                if not files_info.get("file_type"):
                    file_type = file_url.split(".")[-1].lower()
                    if file_type not in files_info.get("files_type"):
                        file_type = file_name.split(".")[-1].lower()
                else:
                    file_type = files_info.get("file_type")

                if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
                    if file_type in file_name:
                        file_name = file_name.replace(f'.{file_type}', '').strip()
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name,
                        file_type=file_type,
                        download_url=file_url,
                        proxies=request.get_proxies()
                    )
                    attachments[str(len(attachments) + 1)] = attachment

        if attachments:
            data_item.projectinfo = {"attachments": attachments}

        yield data_item
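

# Entry point. redis_key namespaces this spider's queue/state in Redis; the
# value appears to follow the project's author:site_abbreviation convention.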
if __name__ == "__main__":
    Spider(redis_key="lzz:yctgxcglm_jggg").start()