# -*- coding: utf-8 -*-
"""
Created on 2025-04-19
---------
@summary: Gansu Provincial Public Resource Trading Network (甘肃省公共资源交易网)
---------
@author: lzz
"""
import random
import re
import time

import requests

import feapder
from feapder.network.selector import Selector
from feapder.utils.tools import log
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.tools import text_search, extract_file_type
headers = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "http://47.110.59.239:9207",
    "Pragma": "no-cache",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}


def get_bdxx(hid, ptp):
    """Fetch the bid-section (标段) information block for a tender project."""
    url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowBidpackage"
    data = {
        "tenderprojectid": hid,
        "bidpackages": "",
        "projectType": ptp,
    }
    response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
    # Default to "" so the caller can safely concatenate even when the block is missing
    data_info = Selector(response.text).xpath('//div[@class="sAblock"]').extract_first("")
    return data_info


def get_ggxx(hid, area):
    """Fetch the announcement (公告) information block and its attachment links."""
    url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
    data = {
        "bidpackages": "",
        "tenderprojectid": hid,
        "index": "1",
        "area": area,
    }
    response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
    root = Selector(response.text)
    data_info = root.xpath('//div[@class="jxGonggaoInformationDetail "]').extract_first()
    if data_info is None:
        return "", ""
    file_list = root.xpath('//div[@class="jxGonggaoInformationDetail "][1]//a')
    if text_search(data_info).total < 10:
        data_info = "详情请访问原网页!"  # placeholder: "see the original page for details"
    return data_info, file_list


def get_kpbxx(hid, area):
    """Fetch the bid opening/evaluation (开评标) information block and its attachment links."""
    url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
    data = {
        "bidpackages": "",
        "tenderprojectid": hid,
        "index": "4",
        "area": area,
    }
    response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
    root = Selector(response.text)
    data_info = root.xpath('//div[@class="xTouBiaoTable"]').extract_first()
    if data_info is None:
        return "", ""
    file_list = root.xpath('//div[@class="xTouBiaoTable"]//a')
    if text_search(data_info).total < 10:
        data_info = "详情请访问原网页!"
    return data_info, file_list


def get_zbjg(hid, area):
    """Fetch the award result (中标结果) information block and its attachment links."""
    url = "http://47.110.59.239:9207/f/newprovince/tenderproject/flowpage"
    data = {
        "bidpackages": "",
        "tenderprojectid": hid,
        "index": "5",
        "area": area,
    }
    response = requests.post(url, headers=headers, timeout=30, data=data, verify=False)
    root = Selector(response.text)
    data_info = root.xpath('//div[@class="jxTradingPublicDetail"]').extract_first()
    if data_info is None:
        return "", ""
    file_list = root.xpath('//div[@class="jxTradingPublicDetail"]//a')
    if text_search(data_info).total < 10:
        data_info = "详情请访问原网页!"
    return data_info, file_list


class FirefoxDetails(feapder.BiddingDetailSpider):

    def start_requests(self):
        data_list = self.get_tasks_by_rabbitmq(limit=10)
        for item in data_list:
            # log.debug(item)
            request_params = item.get("request_params")
            yield feapder.Request(url=item.get("parse_url"), item=item, files_info=item.get("files"),
                                  deal_detail=item.get("deal_detail"), **request_params,
                                  callback=eval(item.get("parse")), proxies=False)

    def detail_get(self, request, response):
        hid = request.data.get('tenderprojectid')
        area = request.data.get('area')

        # 项目信息: project overview plus bid-section details
        items = request.item
        list_item = DataBakItem(**items)
        list_item.title += "_项目信息"
        list_item.href += f"?t={int(time.time())}"
        html1 = response.xpath('//div[@class="jxTenderObjMain"]').extract_first("")
        ptp = "".join(re.findall('projectType: "(.*?)"', response.text))
        html2 = get_bdxx(hid, ptp)
        list_item.contenthtml = html1 + html2
        yield list_item

        # 公告信息: announcement detail and its attachments
        items = request.item
        list_item = DataBakItem(**items)
        list_item.title += "_公告信息"
        list_item.href += f"?t={int(time.time())}"
        html, file_list = get_ggxx(hid, area)
        if html:
            list_item.contenthtml = html
        attachments = {}
        if file_list:
            for f1 in file_list:
                file_url = f1.xpath('./@href').extract_first("")
                file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
                file_type = extract_file_type(file_name=file_name, file_url=file_url, file_type_list=['html'])
                if file_type and file_url:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name, file_type=file_type, download_url=file_url)
                    attachments[str(len(attachments) + 1)] = attachment
        if attachments:
            list_item.projectinfo = {"attachments": attachments}
        yield list_item

        # 开评标信息: bid opening/evaluation detail and its attachments
        items = request.item
        list_item = DataBakItem(**items)
        list_item.title += "_开评标信息"
        list_item.href += f"?t={int(time.time())}"
        html, file_list = get_kpbxx(hid, area)
        if html:
            list_item.contenthtml = html
        attachments = {}
        if file_list:
            for f1 in file_list:
                file_url = f1.xpath('./@href').extract_first("")
                file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
                file_type = extract_file_type(file_name=file_name, file_url=file_url, file_type_list=['html'])
                if file_type and file_url:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name, file_type=file_type, download_url=file_url)
                    attachments[str(len(attachments) + 1)] = attachment
        if attachments:
            list_item.projectinfo = {"attachments": attachments}
        yield list_item

        # 中标结果信息: award result detail and its attachments
        items = request.item
        list_item = DataBakItem(**items)
        list_item.title += "_中标结果信息"
        list_item.href += f"?t={int(time.time())}"
        html, file_list = get_zbjg(hid, area)
        if html:
            list_item.contenthtml = html
        attachments = {}
        if file_list:
            for f1 in file_list:
                file_url = f1.xpath('./@href').extract_first("")
                file_name = f1.xpath('./text()').extract_first("").strip() or list_item.title
                file_type = extract_file_type(file_name=file_name, file_url=file_url, file_type_list=['html'])
                if file_type and file_url:
                    attachment = AttachmentDownloader().fetch_attachment(
                        file_name=file_name, file_type=file_type, download_url=file_url)
                    attachments[str(len(attachments) + 1)] = attachment
        if attachments:
            list_item.projectinfo = {"attachments": attachments}
        yield list_item

        # Throttle between detail pages
        time.sleep(random.randint(3, 6))


if __name__ == "__main__":
    FirefoxDetails(redis_key="lzz:gssggzyjyw_zfcg_lzxq").start()