# -*- coding: utf-8 -*-
"""
Created on 2025-04-16
---------
@summary: 黑龙江政府采购服务工程超市
          (Heilongjiang government procurement service/engineering
          supermarket — bid-result announcement list spider)
---------
@author: lzz
"""
import json
from collections import namedtuple

import feapder
from feapder.utils.tools import get_month, timestamp_to_date
from items.spider_item import MgpListItem


class Hljzfcgfwgccs(feapder.BiddingListSpider):
    """List spider: crawls result announcements from hljcg.hlj.gov.cn
    and yields one MgpListItem per announcement for detail crawling."""

    def start_callback(self):
        # Menu fields: channel label, spider code, notice-type id sent to the
        # API ('tid'), category suffix appended to titles ('cid'), pages to crawl.
        Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'cid', 'crawl_page'])

        self.site = "黑龙江政府采购服务工程超市"

        self.menus = [
            Menu('服务工程超市公告-中标成交公告', 'hl_hljzfcgfwgccs_fwgccsgg_zbcjgg', '3', '结果公告', 3),
        ]

        # The site rejects requests without these token headers, even when the
        # values are the literal string "undefined" (anonymous access).
        self.headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "Origin": "https://hljcg.hlj.gov.cn",
            "Pragma": "no-cache",
            "Referer": "https://hljcg.hlj.gov.cn/fwgccs/publicity?type=3",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
            "accessToken": "undefined",
            "loginToken": "undefined",
            "x-auth-token": "undefined"
        }

    def start_requests(self):
        # One seed request per menu; the POST body is built in download_midware.
        for menu in self.menus:
            start_url = "https://hljcg.hlj.gov.cn/fwgccsapi/api/notice/queryNoticeList"
            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)

    def parse(self, request, response):
        """Parse one page of the announcement list and yield detail items.

        :param request: the list-page request; ``request.item`` is the menu dict
        :param response: JSON response from queryNoticeList
        """
        menu = request.item
        # Defensive unwrap: the payload is {"data": {"data": [...]}} but either
        # level may be missing/null on an empty page — fall back to an empty
        # list instead of raising AttributeError on None.
        info_list = (response.json.get('data') or {}).get('data') or []
        for info in info_list:
            href_id = info.get('id')
            href_type = info.get('type')
            # Purchaser name used as a title prefix; field name varies by notice type.
            t1 = info.get('unitName') or info.get('purchasingInformation') or info.get('purchaser') or ""
            href = f"https://hljcg.hlj.gov.cn/fwgccs/publicityDetails?id={href_id}&type={href_type}"
            # Title field also varies by notice type; guard against both being
            # absent so .strip() cannot crash on None.
            title = info.get('name') or info.get('purchaseDemandName') or ""
            title = t1 + title.strip() + menu.get('cid')
            # 'time' looks like a millisecond epoch; keep the first 10 digits
            # (seconds) before converting to a date string.
            create_time = timestamp_to_date(int(str(info.get('time'))[:10]))

            area = "黑龙江"  # province
            city = ""        # city (unknown from the list payload)

            list_item = MgpListItem()          # data pipeline item
            list_item.href = href              # announcement detail URL
            list_item.channel = menu.get("channel")   # channel defined in menus
            list_item.spidercode = menu.get("code")   # spider code defined in menus
            list_item.title = title            # announcement title
            list_item.publishtime = create_time  # announcement publish time
            list_item.site = self.site
            list_item.area = area              # defaults to nationwide
            list_item.city = city              # defaults to empty
            list_item.unique_key = ("href", "title")
            list_item.parse = "self.detail_get"
            list_item.proxies = False
            # Strip the "related announcements" sidebar from the detail page.
            list_item.request_params = {"rm_list": ['//div[contains(@class,"relevant")]', ]}
            list_item.deal_detail = ['//div[@class="page-content"]', '//div[@class="right-content"]']
            list_item.parse_url = href
            list_item.render_time = 5
            list_item.files = {  # attachment collection rules
                "list_xpath": '//div[@class="right-content"]//a[@href]',
                "url_xpath": './@href',
                "name_xpath": './text()',
                # attachment types worth downloading
                "files_type": ('zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
                               'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg'),
                # "file_type": 'pdf',  # fallback type for URLs without an extension
                "url_key": 'http',  # keyword marking a valid attachment URL; use 'http' if none
                "host": '',         # host to prepend for relative URLs
            }
            yield list_item

        # Infinite paging: re-issue this request for the next page until
        # menu['crawl_page'] is exhausted.
        request = self.infinite_pages(request, response)
        yield request

    def download_midware(self, request):
        """Build the JSON POST body and attach the required headers."""
        page = request.page
        menu = request.item
        data = {
            "data": {
                "businessId": "",
                "biddingType": "0",
                "purchasingInformation": "",
                "transactionSupplierName": "",
                "regionVal": [
                    "黑龙江省",
                    "",
                    ""
                ],
                "requestId": "1531362372728",
                "requirementName": "",
                "type": menu.get('tid'),
                "page": page,
                "pageSize": 20,
                "province": "黑龙江省",
                "city": "",
                "region": "",
                # Rolling three-month window ending today.
                "startTime": f"{get_month(-3)}",
                "endTime": f"{get_month()}"
            }
        }
        # Compact separators to match the browser's JSON serialization.
        request.data = json.dumps(data, separators=(',', ':'))
        request.headers = self.headers


if __name__ == "__main__":
    Hljzfcgfwgccs(redis_key="detail:chrome").start()