123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161 |
- # -*- coding: utf-8 -*-
- """
- Created on 2025-04-19
- ---------
- @summary: 甘肃省公共资源交易网
- ---------
- @author: lzz
- """
- import re
- import time
- import feapder
- import requests
- from feapder.network.selector import Selector
- from items.spider_item import DataBakItem
- from untils.attachment import AttachmentDownloader
- from untils.tools import text_search, extract_file_type
# Common HTTP headers for the site's AJAX POST endpoints: form-encoded
# XHR requests, impersonating a desktop Chrome browser.
headers = {
    "accept": "text/html, */*; q=0.01",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "no-cache",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "origin": "https://ygjy.ggzyjy.gansu.gov.cn:3045",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "x-requested-with": "XMLHttpRequest"
}
def get_cggg(annoId, projectId, proxies=False):
    """Fetch the procurement-announcement detail fragment for a project.

    POSTs annoId/projectId to the getAnnoDetail AJAX endpoint and returns
    the outer HTML of the ``div.yAnnounceLayer`` node, or None if absent.
    """
    endpoint = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getAnnoDetail"
    payload = {"annoId": annoId, "projectId": projectId}
    resp = requests.post(endpoint, headers=headers, data=payload,
                         timeout=30, proxies=proxies, verify=False)
    return Selector(resp.text).xpath('//div[@class="yAnnounceLayer"]').extract_first()
def get_jjjg(projectId, proxies=False):
    """Fetch the bidding-result fragment for a project.

    POSTs projectId to the getBidResult AJAX endpoint and returns the outer
    HTML of the ``div.yThingsWrap`` node, or None if absent.
    """
    endpoint = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getBidResult"
    resp = requests.post(endpoint, headers=headers, data={"projectId": projectId},
                         timeout=30, proxies=proxies, verify=False)
    return Selector(resp.text).xpath('//div[@class="yThingsWrap"]').extract_first()
def get_cjgs(projectId, proxies=False):
    """Fetch the award-publicity fragment for a project.

    POSTs projectId (with a fixed publicityId of "0") to the
    getPublicityDetail AJAX endpoint and returns the outer HTML of the
    ``div.yDealMain`` node, or None if absent.
    """
    endpoint = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getPublicityDetail"
    payload = {"projectId": projectId, "publicityId": "0"}
    resp = requests.post(endpoint, headers=headers, data=payload,
                         timeout=30, proxies=proxies, verify=False)
    return Selector(resp.text).xpath('//div[@class="yDealMain"]').extract_first()
def get_ht(projectId, proxies=False):
    """Fetch the contract-list fragment for a project.

    POSTs projectId to the getContractList AJAX endpoint and returns the
    outer HTML of the ``div.yThingsWrap`` node, or None if absent.
    """
    endpoint = "https://ygjy.ggzyjy.gansu.gov.cn:3045/f/engineer/getContractList"
    resp = requests.post(endpoint, headers=headers, data={"projectId": projectId},
                         timeout=30, proxies=proxies, verify=False)
    return Selector(resp.text).xpath('//div[@class="yThingsWrap"]').extract_first()
class Spider(feapder.BiddingDetailSpider):
    """Detail spider: for each queued task it extracts annoId/projectId from
    the listing page's inline AJAX payload, then fetches up to four detail
    fragments (procurement announcement, bidding result, award publicity,
    contract) and yields one item per fragment that has real content.
    """

    def start_requests(self):
        """Pull up to 50 pending tasks and turn each into a detail request."""
        data_list = self.get_tasks(limit=50)
        for item in data_list:
            request_params = item.get("request_params")
            # NOTE(review): ``parse`` is a callback name stored with the task
            # and resolved via eval(); it must only ever come from our own
            # trusted task queue, never from external input.
            yield feapder.Request(url=item.get("parse_url"),
                                  callback=eval(item.get("parse")),
                                  item=item,
                                  deal_detail=item.get("deal_detail"),
                                  files_info=item.get("files"),
                                  **request_params)

    @staticmethod
    def _stamped_item(items, suffix):
        """Build a fresh DataBakItem with *suffix* appended to the title and a
        cache-busting ``?t=<epoch>`` query string appended to the href."""
        item = DataBakItem(**items)
        item.title += suffix
        item.href += f"?t={int(time.time())}"
        return item

    def detail_get(self, request, response):
        """Parse the detail page and yield the four per-project sections.

        The page embeds an AJAX payload of the form
        ``data:{annoId:'..','..',projectId:'..'}`` from which annoId and
        projectId are recovered by regex.
        """
        yield_list = []
        # raw strings: the originals used non-raw '\{' escapes (DeprecationWarning)
        base_id = re.findall(r"data:\{annoId:(.*?)}", response.text)[0]
        annoId = "".join(re.findall(r"'(.*?)',", base_id))
        projectId = "".join(re.findall(r"projectId:'(.*?)'", base_id))
        items = request.item
        proxies = request.get_proxies()

        # 1) procurement announcement — always yielded, with any attachments.
        list_item1 = self._stamped_item(items, "_采购公告")
        html = get_cggg(annoId, projectId, proxies)
        list_item1.contenthtml = html
        if html:  # guard: Selector cannot take None when the fragment is missing
            s_title = Selector(html).xpath('//h4[@class="yAnnounceName"]/text()').extract_first("").strip()
            if s_title and s_title != list_item1.title:
                list_item1.s_title = s_title
            attachments = {}
            for f1 in Selector(html).xpath('//a'):
                file_url = f1.xpath('./@href').extract_first("")
                file_name = f1.xpath('./text()').extract_first("").strip() or list_item1.title
                file_type = extract_file_type(file_name=file_name, file_url=file_url, file_type_list=['html'])
                if file_type and file_url:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name, file_type=file_type, download_url=file_url)
                    attachments[str(len(attachments) + 1)] = attachment
            if attachments:
                list_item1.projectinfo = {"attachments": attachments}
        yield_list.append(list_item1)

        # 2) bidding result — only when non-trivial content came back.
        list_item2 = self._stamped_item(items, "_竞价结果")
        html = get_jjjg(projectId, proxies)
        # BUGFIX: was ``if html or text_search(html).total > 10`` which both
        # crashed text_search() on html=None and accepted empty fragments;
        # use ``and`` to match the other sections.
        if html and text_search(html).total > 10:
            list_item2.contenthtml = html
            yield_list.append(list_item2)

        # 3) award publicity.
        list_item3 = self._stamped_item(items, "_成交公示")
        html = get_cjgs(projectId, proxies)
        if html and text_search(html).total > 10:
            # s_title extraction moved inside the guard: the original called
            # Selector(html) before checking html, crashing on None.
            s_title = Selector(html).xpath('//h6[@class="yDealMainTitle"]/text()').extract_first("").strip()
            if s_title and s_title != list_item3.title:
                list_item3.s_title = s_title
            list_item3.contenthtml = html
            yield_list.append(list_item3)

        # 4) contract — skip the "print contract" placeholder page.
        list_item4 = self._stamped_item(items, "_合同")
        html = get_ht(projectId, proxies)
        if html and text_search(html).total > 10 and "qyPrintContract" not in html:
            list_item4.contenthtml = html
            yield_list.append(list_item4)

        for yd in yield_list:
            yield yd
- if __name__ == "__main__":
- Spider(redis_key="lzz:gssggzyjyw_xeyxgcjsqb").start()
|