|
- # -*- coding: utf-8 -*-
- """
- Created on 2025-04-22
- ---------
- @summary: 宁夏政府采购公共服务平台
- ---------
- @author: lzz
- """
- import json
- import re
- from collections import namedtuple
- import feapder
- import requests
- from items.spider_item import BidingListItem
- from untils.get_imgcode import get_code
def get_ck(proxies=False):
    """Obtain session cookies that have passed the site's captcha check.

    The function walks through the same requests a browser makes:

    1. GET the county-level announcement index page.
    2. GET ``TrafficStatistics.do``.
    3. GET the captcha image and recognise it with ``get_code``.
    4. POST the search form with the recognised (upper-cased) captcha.

    Args:
        proxies: Proxy mapping in the ``requests`` format. A falsy value
            (the default ``False``) means no explicit proxies are set.

    Returns:
        dict: Cookies accumulated by the session, as name -> value.
    """
    browser_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }
    form_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Origin": "http://www.ccgp-ningxia.gov.cn",
        "Pragma": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }
    index_url = "http://www.ccgp-ningxia.gov.cn/public/NXGPPNEW/dynamic/contents/SXCGGG/index.jsp"
    stats_url = "http://www.ccgp-ningxia.gov.cn/TrafficStatistics.do"
    captcha_url = "http://www.ccgp-ningxia.gov.cn/admin/AuthCode_too.do"
    search_url = "http://www.ccgp-ningxia.gov.cn//site/InteractionQuestion_findVNoticeNew.do"
    params = {
        "cid": "2010",
        "sid": "1"
    }
    # Search-form fields that are always submitted empty.
    blank_fields = (
        "keyword_all", "departmentName_all", "date1_all", "date2_all",
        "regionId_all",
        "keyword_each", "departmentName_each", "agentName_each",
        "projectNumber_each", "planNumber_each", "date1_each", "date2_each",
        "title_cgyx", "departmentName_cgyx", "date1_cgyx", "date2_cgyx",
        "projectName_cgyxxm", "departmentName_cgyxxm", "yjcgsj_cgyxxm",
        "date1_cgyxxm", "date2_cgyxxm", "purchaseItem_cgyxxm",
        "agreCode_htgs", "departmentName_htgs", "supplierName_htgs",
        "date1_htgs", "date2_htgs",
        "agreCode_ysjggg", "reportCode_ysjggg", "departmentName_ysjggg",
        "supplierName_ysjggg", "date1_ysjggg", "date2_ysjggg",
    )

    # The context manager closes the session's connection pool even when one
    # of the requests raises.
    with requests.session() as session:
        # ``session.proxies`` is expected to be a mapping; normalise the
        # ``False`` default to an empty dict.
        session.proxies = proxies or {}

        # Responses of the first two requests are not inspected; they are
        # made only so the session collects the cookies they set.
        session.get(index_url, headers=browser_headers, params=params, timeout=30, verify=False)
        session.get(stats_url, headers=browser_headers, timeout=30, verify=False)

        captcha_resp = session.get(captcha_url, headers=browser_headers, timeout=30, verify=False)
        code = get_code(captcha_resp.content).upper()

        data = {
            "type": "ALL",
            "page": "0",
            "tab": "SX",
            "authCode": f"{code}",
            "noticeTab": "CGYX",
            **dict.fromkeys(blank_fields, ""),
        }
        # NOTE(review): the query params of the index page are also sent with
        # this POST, exactly as the original code did.
        session.post(search_url, headers=form_headers, params=params, timeout=30, data=data, verify=False)
        return session.cookies.get_dict()
class Spider(feapder.BiddingListSpider):
    """List-page spider for the Ningxia government procurement platform.

    It posts the site's search form page by page, extracts the JSON-like
    record list embedded in the response and yields one ``BidingListItem``
    per announcement.
    """

    # Search-form fields that are always submitted empty.
    _BLANK_FORM_FIELDS = (
        "keyword_all", "departmentName_all", "date1_all", "date2_all",
        "regionId_all",
        "keyword_each", "departmentName_each", "agentName_each",
        "projectNumber_each", "planNumber_each", "date1_each", "date2_each",
        "title_cgyx", "departmentName_cgyx", "date1_cgyx", "date2_cgyx",
        "projectName_cgyxxm", "departmentName_cgyxxm", "yjcgsj_cgyxxm",
        "date1_cgyxxm", "date2_cgyxxm", "purchaseItem_cgyxxm",
        "agreCode_htgs", "departmentName_htgs", "supplierName_htgs",
        "date1_htgs", "date2_htgs",
        "agreCode_ysjggg", "reportCode_ysjggg", "departmentName_ysjggg",
        "supplierName_ysjggg", "date1_ysjggg", "date2_ysjggg",
    )

    @staticmethod
    def _extract_records(text):
        """Return the list of record dicts embedded in *text*.

        The response carries the records as a ``[{...}]`` fragment. The
        fragment is re-wrapped, all backslashes are removed and the result is
        parsed as JSON. An empty list is returned when no fragment is found,
        so that callers do not receive a single empty record.

        Raises:
            json.JSONDecodeError: If the extracted fragment is not valid JSON.
        """
        fragments = re.findall(r'\[\{(.*?)}]', text, re.S)
        if not fragments:
            return []
        payload = "[{" + "".join(fragments).strip() + "}]"
        return json.loads(payload.replace('\\', ''), strict=False)

    def start_callback(self):
        """Set up site name, channel menus, request headers and cookies."""
        Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
        self.site = "宁夏政府采购公共服务平台"
        self.menus = [
            Menu('市县采购公告', 'nx_nxzfcgggfwpt_sxcggg', 2),
        ]
        self.headers = {
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Origin": "http://www.ccgp-ningxia.gov.cn",
            "Pragma": "no-cache",
            "Referer": "http://www.ccgp-ningxia.gov.cn/public/NXGPPNEW/dynamic/contents/SXCGGG/index.jsp?cid=2010&sid=1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
        self.cookies = get_ck()

    def start_requests(self):
        """Yield the first list-page request for every configured menu."""
        url = "http://www.ccgp-ningxia.gov.cn//site/InteractionQuestion_findVNoticeNew.do"
        for menu in self.menus:
            yield feapder.Request(url, item=menu._asdict(), page=1, proxies=False)

    def download_midware(self, request):
        """Attach the search form, headers and fresh cookies to *request*."""
        page = request.page
        request.data = {
            "type": "ALL",
            # The site's page index is zero-based; request.page starts at 1.
            "page": f"{page - 1}",
            "tab": "SX",
            "authCode": "",
            "noticeTab": "CGYX",
            **dict.fromkeys(self._BLANK_FORM_FIELDS, ""),
        }
        request.headers = self.headers
        # NOTE(review): cookies are fetched anew for every request and
        # ``self.cookies`` is not read here — confirm this is intended.
        request.cookies = get_ck()

    def validate(self, request, response):
        """Reject responses that contain no announcement records.

        Raises:
            AssertionError: If the record list is empty.
        """
        info_list = self._extract_records(response.content.decode())
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped when Python runs with -O.
        if not info_list:
            raise AssertionError("empty announcement list in response")

    def parse(self, request, response):
        """Yield a ``BidingListItem`` per record, then the next-page request."""
        menu = request.item
        info_list = self._extract_records(response.text)
        for info in info_list:
            href = info.get('url')
            if 'http' not in href:
                # Relative links are resolved against the site's dynamic root.
                href = "http://www.ccgp-ningxia.gov.cn/public/NXGPPNEW/dynamic/" + href
            title = info.get('title').strip()
            create_time = info.get('publish_time')

            area = "宁夏"
            city = ""

            list_item = BidingListItem()  # item that carries the list-page data
            list_item.href = href  # announcement link
            list_item.unique_key = ('href',)
            list_item.channel = menu.get("channel")  # channel name defined in the menu
            list_item.spidercode = menu.get("code")  # spider code defined in the menu
            list_item.title = title  # title
            list_item.site = self.site
            list_item.publishtime = create_time
            list_item.area = area  # region
            list_item.city = city  # city, left empty
            list_item.parse = "self.detail_get"  # detail-page callback
            list_item.request_params = {'rm_list': ['//div[@class="curt-row"]',
                                                    '//p[@class="sub-tt"]']}
            list_item.deal_detail = ['//div[@class="table1"]', '//div[@class="gw-paper"]',
                                     '//div[@class="newAgreShow"]', '//div[@class="main"]']  # xpaths for the body text
            list_item.proxies = True
            list_item.parse_url = href
            list_item.files = {  # attachment collection rules
                "list_xpath": '//div[@class="main"]//a[@href]',
                "url_xpath": './@href',
                "name_xpath": './text()',
                "files_type": ('zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
                               'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg'),  # attachment types to download
                # "file_type":'pdf',  # default type for URLs without an extension
                "url_key": 'http',  # keyword a valid attachment URL must contain; use 'http' if none
                "host": '',  # host to prepend to attachment URLs
            }

            yield list_item

        # Pagination is delegated to infinite_pages.
        request = self.infinite_pages(request, response)
        yield request

    def exception_request(self, request, response):
        """Refresh the stored cookies through the request's proxies and retry."""
        # NOTE(review): download_midware does not read self.cookies, so this
        # refresh has no visible effect on the retried request — confirm.
        self.cookies = get_ck(request.get_proxies())
        yield request
if __name__ == "__main__":
    # Script entry point: build the spider on the "detail:chrome" redis key
    # and start it.
    spider = Spider(redis_key="detail:chrome")
    spider.start()
|