# -*- coding: utf-8 -*-
"""
Created on 2025-04-19
---------
@summary: Gansu Provincial Public Resource Trading Network (甘肃省公共资源交易网)
---------
@author: lzz
"""
import random
import re
import time

import requests

import feapder
from feapder.network.selector import Selector
from feapder.utils.tools import log
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import text_search, extract_file_type
headers = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "http://47.110.59.239:9207",
    "Pragma": "no-cache",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}


def get_bdxx(hid, ptp):
    """Fetch the bid-section (标段) information block for a tender project."""
    url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowBidpackage"
    data = {
        "tenderprojectid": hid,
        "bidpackages": "",
        "projectType": ptp,
    }
    response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
    # Default to "" so the caller can safely concatenate even when the block is missing
    data_info = Selector(response.text).xpath('//div[@class="sAblock"]').extract_first("")
    return data_info


def get_ggxx(hid, area):
    """Fetch the announcement (公告) information block and its attachment links."""
    url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
    data = {
        "bidpackages": "",
        "tenderprojectid": hid,
        "index": "1",
        "area": area,
    }
    response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
    root = Selector(response.text)
    data_info = root.xpath('//div[@class="jxGonggaoInformationDetail "]').extract_first()
    if data_info is None:
        return "", ""
    file_list = root.xpath('//div[@class="jxGonggaoInformationDetail "][1]//a')
    if text_search(data_info).total < 10:
        data_info = "详情请访问原网页!"  # placeholder: "see the original page for details"
    return data_info, file_list


def get_kpbxx(hid, area):
    """Fetch the bid opening/evaluation (开评标) information block and its attachment links."""
    url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
    data = {
        "bidpackages": "",
        "tenderprojectid": hid,
        "index": "4",
        "area": area,
    }
    response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
    root = Selector(response.text)
    data_info = root.xpath('//div[@class="xTouBiaoTable"]').extract_first()
    if data_info is None:
        return "", ""
    file_list = root.xpath('//div[@class="xTouBiaoTable"]//a')
    if text_search(data_info).total < 10:
        data_info = "详情请访问原网页!"
    return data_info, file_list


def get_zbjg(hid, area):
    """Fetch the award result (中标结果) information block and its attachment links."""
    url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
    data = {
        "bidpackages": "",
        "tenderprojectid": hid,
        "index": "5",
        "area": area,
    }
    response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
    root = Selector(response.text)
    data_info = root.xpath('//div[@class="jxTradingPublicDetail"]').extract_first()
    if data_info is None:
        return "", ""
    file_list = root.xpath('//div[@class="jxTradingPublicDetail"]//a')
    if text_search(data_info).total < 10:
        data_info = "详情请访问原网页!"
    return data_info, file_list


class FirefoxDetails(feapder.BiddingDetailSpider):

    def start_requests(self):
        data_list = self.get_tasks_by_rabbitmq(limit=10)
        for item in data_list:
            # log.debug(item)
            request_params = item.get("request_params")
            yield feapder.Request(url=item.get("parse_url"), item=item, files_info=item.get("files"),
                                  deal_detail=item.get("deal_detail"), **request_params,
                                  callback=eval(item.get("parse")), proxies=False)

    def detail_get(self, request, response):
        hid = request.data.get('tenderprojectid')
        area = request.data.get('area')

        # 项目信息: project overview plus bid-section details
        items = request.item
        list_item = DataBakItem(**items)
        list_item.title += "_项目信息"
        list_item.href += f"?t={int(time.time())}"
        html1 = response.xpath('//div[@class="jxTenderObjMain"]').extract_first("")
        ptp = "".join(re.findall('projectType: "(.*?)"', response.text))
        html2 = get_bdxx(hid, ptp)
        list_item.contenthtml = html1 + html2
        yield list_item

        # 公告信息: announcement detail and its attachments
        items = request.item
        list_item = DataBakItem(**items)
        list_item.title += "_公告信息"
        list_item.href += f"?t={int(time.time())}"
        html, file_list = get_ggxx(hid, area)
        if html:
            list_item.contenthtml = html
        attachments = {}
        if file_list:
            for f1 in file_list:
                file_url = f1.xpath('./@href').extract_first("")
                file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
                file_type = extract_file_type(file_name=file_name, file_url=file_url, file_type_list=['html'])
                if file_type and file_url:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name, file_type=file_type, download_url=file_url)
                    attachments[str(len(attachments) + 1)] = attachment
        if attachments:
            list_item.projectinfo = {"attachments": attachments}
        yield list_item

        # 开评标信息: bid opening/evaluation detail and its attachments
        items = request.item
        list_item = DataBakItem(**items)
        list_item.title += "_开评标信息"
        list_item.href += f"?t={int(time.time())}"
        html, file_list = get_kpbxx(hid, area)
        if html:
            list_item.contenthtml = html
        attachments = {}
        if file_list:
            for f1 in file_list:
                file_url = f1.xpath('./@href').extract_first("")
                file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
                file_type = extract_file_type(file_name=file_name, file_url=file_url, file_type_list=['html'])
                if file_type and file_url:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name, file_type=file_type, download_url=file_url)
                    attachments[str(len(attachments) + 1)] = attachment
        if attachments:
            list_item.projectinfo = {"attachments": attachments}
        yield list_item

        # 中标结果信息: award result detail and its attachments
        items = request.item
        list_item = DataBakItem(**items)
        list_item.title += "_中标结果信息"
        list_item.href += f"?t={int(time.time())}"
        html, file_list = get_zbjg(hid, area)
        if html:
            list_item.contenthtml = html
        attachments = {}
        if file_list:
            for f1 in file_list:
                file_url = f1.xpath('./@href').extract_first("")
                file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
                file_type = extract_file_type(file_name=file_name, file_url=file_url, file_type_list=['html'])
                if file_type and file_url:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name, file_type=file_type, download_url=file_url)
                    attachments[str(len(attachments) + 1)] = attachment
        if attachments:
            list_item.projectinfo = {"attachments": attachments}
        yield list_item

        # Throttle between detail pages
        time.sleep(random.randint(3, 6))


if __name__ == "__main__":
    FirefoxDetails(redis_key="lzz:gssggzyjyw_zfcg_lzxq").start()