cfxggzyjyw_details.py

# -*- coding: utf-8 -*-
"""
Created on 2025-05-24
---------
@summary: Changfeng County Public Resource Trading Network (长丰县公共资源交易网)
---------
@author: lzz
"""
import feapder
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import extract_file_type
from crawl_func.jsl_5s import DTCookiePool

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
}

class Details(feapder.BiddingDetailSpider):

    ct = 0  # consecutive anti-bot (HTTP 521) failure counter
    # Cookie pool for the site's JS challenge; cookies are cached in Redis
    # under "cfxggzyjyw_dtcookie" and refreshed from the listing page below.
    cookie_pool = DTCookiePool(
        page_url='https://www.changfeng.gov.cn/content/column/30298029?pageIndex=1',
        header=headers, redis_key="cfxggzyjyw_dtcookie")

    def start_requests(self):
        # Pull up to 20 pending detail-page tasks from RabbitMQ.
        data_list = self.get_tasks_by_rabbitmq(limit=20)
        for item in data_list:
            request_params = item.get("request_params")
            # "parse" names the callback (e.g. "self.detail_get") and is
            # resolved with eval; "deal_detail" carries the body XPaths.
            yield feapder.Request(url=item.get("parse_url"), item=item,
                                  deal_detail=item.get("deal_detail"),
                                  callback=eval(item.get("parse")),
                                  **request_params, proxies=False)

    def download_midware(self, request):
        # Attach the shared headers and a pooled cookie to every request.
        request.headers = headers
        request.cookies = self.cookie_pool.get_cookie()

    def detail_get(self, request, response):
        # Give up once the anti-bot page has been hit too many times in a row.
        if self.ct > 5:
            return
        if response.status_code == 521:
            # 521 means the JS challenge rejected our cookie: discard the
            # cookie this request used and retry with a fresh one.
            self.ct += 1
            self.cookie_pool.del_cookie(request.cookies)
            yield request
        else:
            self.ct = 0
            items = request.item
            list_item = DataBakItem(**items)
            html = ''
            for xpath in request.deal_detail:
                html = response.xpath(xpath).extract_first()  # bid detail content
                if html is not None:
                    break
            list_item.contenthtml = html
            files = response.xpath('//div[@id="zoom"]//a[@href]')
            if len(files) > 0:
                attachments = {}
                for info in files:
                    file_url = info.xpath('./@href').extract_first()
                    file_name = info.xpath('./text()').extract_first("").strip()
                    file_type = extract_file_type(file_name, file_url)
                    if file_type:
                        # Use a per-request header copy so the shared
                        # module-level dict is not mutated by setting Referer.
                        file_headers = {**headers, "Referer": request.url}
                        attachment = AttachmentDownloader().fetch_attachment(
                            file_name=file_name, file_type=file_type,
                            download_url=file_url, headers=file_headers,
                            cookies=request.cookies)
                        attachments[str(len(attachments) + 1)] = attachment
                if attachments:
                    list_item.projectinfo = {"attachments": attachments}
            yield list_item

if __name__ == "__main__":
    Details(redis_key="lzz:Cfxggzyjyw").start()
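
# A minimal sketch of the task payload this spider expects from RabbitMQ.
# Field names mirror start_requests above; the values here are hypothetical.
#
# task = {
#     "parse_url": "https://www.changfeng.gov.cn/...",  # detail-page URL (elided)
#     "parse": "self.detail_get",            # callback name, resolved via eval()
#     "deal_detail": ['//div[@id="zoom"]'],  # XPaths tried in order for the body
#     "request_params": {},                  # extra kwargs for feapder.Request
#     # ...plus the fields that DataBakItem(**items) is built from
# }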