123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161 |
- # -*- coding: utf-8 -*-
- """
- Created on 2025-04-19
- ---------
- @summary: 甘肃省公共资源交易网
- ---------
- @author: lzz
- """
- import re
- import time
- import feapder
- import requests
- from feapder.network.selector import Selector
- from items.spider_item import DataBakItem
- from untils.attachment import AttachmentDownloader
- from untils.tools import text_search, extract_file_type
# Common HTTP headers for the site's AJAX POST endpoints: form-encoded
# XHR requests, impersonating a desktop Chrome browser.
headers = {
    "accept": "text/html, */*; q=0.01",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "no-cache",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "origin": "https://ygjy.ggzyjy.gansu.gov.cn:3045",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "x-requested-with": "XMLHttpRequest"
}
def get_cggg(annoId, projectId, proxies=False):
    """Fetch the procurement-announcement detail fragment for a project.

    POSTs annoId/projectId to the getAnnoDetail AJAX endpoint and returns
    the outer HTML of the ``div.yAnnounceLayer`` node, or None if absent.
    """
    endpoint = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getAnnoDetail"
    payload = {"annoId": annoId, "projectId": projectId}
    resp = requests.post(endpoint, headers=headers, data=payload,
                         timeout=30, proxies=proxies, verify=False)
    return Selector(resp.text).xpath('//div[@class="yAnnounceLayer"]').extract_first()
def get_jjjg(projectId, proxies=False):
    """Fetch the bidding-result fragment for a project.

    POSTs projectId to the getBidResult AJAX endpoint and returns the outer
    HTML of the ``div.yThingsWrap`` node, or None if absent.
    """
    endpoint = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getBidResult"
    resp = requests.post(endpoint, headers=headers, data={"projectId": projectId},
                         timeout=30, proxies=proxies, verify=False)
    return Selector(resp.text).xpath('//div[@class="yThingsWrap"]').extract_first()
def get_cjgs(projectId, proxies=False):
    """Fetch the award-publicity fragment for a project.

    POSTs projectId (with a fixed publicityId of "0") to the
    getPublicityDetail AJAX endpoint and returns the outer HTML of the
    ``div.yDealMain`` node, or None if absent.
    """
    endpoint = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getPublicityDetail"
    payload = {"projectId": projectId, "publicityId": "0"}
    resp = requests.post(endpoint, headers=headers, data=payload,
                         timeout=30, proxies=proxies, verify=False)
    return Selector(resp.text).xpath('//div[@class="yDealMain"]').extract_first()
def get_ht(projectId, proxies=False):
    """Fetch the contract-list fragment for a project.

    POSTs projectId to the getContractList AJAX endpoint and returns the
    outer HTML of the ``div.yThingsWrap`` node, or None if absent.
    """
    endpoint = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getContractList"
    resp = requests.post(endpoint, headers=headers, data={"projectId": projectId},
                         timeout=30, proxies=proxies, verify=False)
    return Selector(resp.text).xpath('//div[@class="yThingsWrap"]').extract_first()
class Spider(feapder.BiddingDetailSpider):
    """Detail spider: for each queued task it extracts annoId/projectId from
    the listing page's inline AJAX payload, then fetches up to four detail
    fragments (procurement announcement, bidding result, award publicity,
    contract) and yields one item per fragment that has real content.
    """

    def start_requests(self):
        """Pull up to 50 pending tasks and turn each into a detail request."""
        data_list = self.get_tasks(limit=50)
        for item in data_list:
            request_params = item.get("request_params")
            # NOTE(review): ``parse`` is a callback name stored with the task
            # and resolved via eval(); it must only ever come from our own
            # trusted task queue, never from external input.
            yield feapder.Request(url=item.get("parse_url"),
                                  callback=eval(item.get("parse")),
                                  item=item,
                                  deal_detail=item.get("deal_detail"),
                                  files_info=item.get("files"),
                                  **request_params)

    @staticmethod
    def _stamped_item(items, suffix):
        """Build a fresh DataBakItem with *suffix* appended to the title and a
        cache-busting ``?t=<epoch>`` query string appended to the href."""
        item = DataBakItem(**items)
        item.title += suffix
        item.href += f"?t={int(time.time())}"
        return item

    def detail_get(self, request, response):
        """Parse the detail page and yield the four per-project sections.

        The page embeds an AJAX payload of the form
        ``data:{annoId:'..','..',projectId:'..'}`` from which annoId and
        projectId are recovered by regex.
        """
        yield_list = []
        # raw strings: the originals used non-raw '\{' escapes (DeprecationWarning)
        base_id = re.findall(r"data:\{annoId:(.*?)}", response.text)[0]
        annoId = "".join(re.findall(r"'(.*?)',", base_id))
        projectId = "".join(re.findall(r"projectId:'(.*?)'", base_id))
        items = request.item
        proxies = request.get_proxies()

        # 1) procurement announcement — always yielded, with any attachments.
        list_item1 = self._stamped_item(items, "_采购公告")
        html = get_cggg(annoId, projectId, proxies)
        list_item1.contenthtml = html
        if html:  # guard: Selector cannot take None when the fragment is missing
            s_title = Selector(html).xpath('//h4[@class="yAnnounceName"]/text()').extract_first("").strip()
            if s_title and s_title != list_item1.title:
                list_item1.s_title = s_title
            attachments = {}
            for f1 in Selector(html).xpath('//a'):
                file_url = f1.xpath('./@href').extract_first("")
                file_name = f1.xpath('./text()').extract_first("").strip() or list_item1.title
                file_type = extract_file_type(file_name=file_name, file_url=file_url, file_type_list=['html'])
                if file_type and file_url:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name, file_type=file_type, download_url=file_url)
                    attachments[str(len(attachments) + 1)] = attachment
            if attachments:
                list_item1.projectinfo = {"attachments": attachments}
        yield_list.append(list_item1)

        # 2) bidding result — only when non-trivial content came back.
        list_item2 = self._stamped_item(items, "_竞价结果")
        html = get_jjjg(projectId, proxies)
        # BUGFIX: was ``if html or text_search(html).total > 10`` which both
        # crashed text_search() on html=None and accepted empty fragments;
        # use ``and`` to match the other sections.
        if html and text_search(html).total > 10:
            list_item2.contenthtml = html
            yield_list.append(list_item2)

        # 3) award publicity.
        list_item3 = self._stamped_item(items, "_成交公示")
        html = get_cjgs(projectId, proxies)
        if html and text_search(html).total > 10:
            # s_title extraction moved inside the guard: the original called
            # Selector(html) before checking html, crashing on None.
            s_title = Selector(html).xpath('//h6[@class="yDealMainTitle"]/text()').extract_first("").strip()
            if s_title and s_title != list_item3.title:
                list_item3.s_title = s_title
            list_item3.contenthtml = html
            yield_list.append(list_item3)

        # 4) contract — skip the "print contract" placeholder page.
        list_item4 = self._stamped_item(items, "_合同")
        html = get_ht(projectId, proxies)
        if html and text_search(html).total > 10 and "qyPrintContract" not in html:
            list_item4.contenthtml = html
            yield_list.append(list_item4)

        for yd in yield_list:
            yield yd
- if __name__ == "__main__":
- Spider(redis_key="lzz:gssggzyjyw_xeyxgcjsqb").start()
|