# -*- coding: utf-8 -*-
"""
Created on 2024-08-05
---------
@summary: Heilongjiang Government Procurement Service & Engineering
          Supermarket - inquiry purchasing - bids opened
---------
@author: lzz
"""
import json
from collections import namedtuple

import feapder
from feapder.utils.tools import timestamp_to_date
from items.spider_item import BidingListItem


class Spider(feapder.BiddingListSpider):
    """List spider for the "inquiry purchasing - bids opened" channel.

    Every list request POSTs a JSON body (built in ``download_midware``) to
    the ``queryPurchase`` API; ``parse`` converts each returned record into a
    ``BidingListItem`` that carries the parameters for the detail crawl.
    """

    def start_callback(self):
        """Set the site name, the channel menu and the list-API headers."""
        Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
        self.site = "黑龙江政府采购服务工程超市"
        self.menus = [
            Menu('询价采购-已开标', 'hlj_hljzfcgfwgccs_xjcg_ykb', 1),
        ]
        self.headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "Origin": "https://hljcg.hlj.gov.cn",
            "Pragma": "no-cache",
            "Referer": "https://hljcg.hlj.gov.cn/fwgccs/buyerHall?type=1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
            "accessToken": "undefined",
            "loginToken": "undefined",
            "x-auth-token": "undefined",
        }

    def start_requests(self):
        """Yield the page-1 list request for every configured menu."""
        url = "https://hljcg.hlj.gov.cn/fwgccsapi/api/platform/queryPurchase"
        for menu in self.menus:
            # The menu travels with the request as a plain dict so that
            # ``parse`` can read the channel name and spider code from it.
            yield feapder.Request(url, item=menu._asdict(), page=1)

    def parse(self, request, response):
        """Convert one page of the list API into list items.

        Yields one ``BidingListItem`` per record, followed by whatever
        ``self.infinite_pages`` returns for this request/response pair.
        """
        menu = request.item
        # NOTE(review): a payload without the nested ``data`` keys raises
        # here, exactly as before; presumably the framework's retry handling
        # takes over in that case - confirm before softening this.
        info_list = response.json.get('data').get('data')
        for info in info_list:
            yield self._build_list_item(menu, info)

        # Infinite paging setup.
        request = self.infinite_pages(request, response)
        yield request

    def _build_list_item(self, menu, info):
        """Build the ``BidingListItem`` for one record of the list API."""
        href_id = info.get('originOrderNo')
        # ``region`` and ``purchaseDemandName`` are read None-safely so that a
        # single incomplete record no longer aborts the whole page.
        region = (info.get('region') or '').replace('本级', '')
        # NOTE(review): the "&ykb" suffix presumably keeps this channel's
        # links distinct from the same notice in other channels - confirm.
        href = f"https://hljcg.hlj.gov.cn/fwgccs/newdetailsDemand?id={href_id}" + "&ykb"
        # First non-empty candidate wins as the title prefix.
        t1 = info.get('unitName') or info.get('purchasingInformation') or info.get('purchaser') or ""
        title = t1 + (info.get('purchaseDemandName') or '').strip() + "比价采购公告"
        # The last three characters of ``createTime`` are dropped before the
        # conversion - assumes a millisecond epoch value; TODO confirm.
        create_time = timestamp_to_date(int(str(info.get('createTime'))[:-3]))

        area = "黑龙江"  # province
        city = f"{region}"  # city

        list_item = BidingListItem()  # item that carries the data downstream
        list_item.href = href  # link to the notice
        list_item.channel = menu.get("channel")  # channel defined in the menu above
        list_item.spidercode = menu.get("code")  # spider code defined in the menu above
        list_item.title = title  # title
        list_item.publishtime = create_time  # publish time of the notice
        list_item.site = self.site
        list_item.area = area  # province (original note: defaults to nationwide)
        list_item.city = city  # city (original note: defaults to empty)
        # NOTE(review): the third element is the publish-time *value*, not a
        # field name like the first two. Left untouched because changing it
        # would alter deduplication - confirm the intent.
        list_item.unique_key = ("href", 'title', create_time)
        list_item.parse = "self.detail_get"
        list_item.deal_detail = [
            '//div[contains(@class,"detail-body")]/div[@class="panel hideInPrint"]',
            '//div[contains(@class,"detail-body")]/div[@class="text-14"]',
        ]
        list_item.proxies = True
        # Headers attached to the detail request parameters.
        d_headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
        }
        list_item.request_params = {
            "headers": d_headers,
            "conn_html": True,
            "rm_list": [
                '//span[contains(text(),"询价通知书:")]/..',
                '//span[contains(text(),"比价通知书:")]/..',
                '//span[contains(text(),"联系人:")]/..',
                '//div[@class="gc-ui-modal"]',
                '//div[@class="cursor-pointer mt-20 text-primary"]',
            ],
        }
        list_item.parse_url = href
        return list_item

    def download_midware(self, request):
        """Attach the JSON query body and the list headers to ``request``.

        The page number is read from ``request.page``. The request is
        modified in place and also returned, which is the return shape the
        feapder documentation describes for a download middleware.
        """
        page = request.page
        data = {
            "data": {
                "purchasetypeLevel1Id": None,
                "purchasetypeLevel2Id": None,
                "purchasetypeLevel3Id": None,
                "purchasetypeLevel1IdName": "",
                "purchasetypeLevel2IdName": "",
                "purchasetypeLevel3IdName": "",
                "town": "",
                "expirytTimeSort": 0,
                "priceSort": 0,
                "releaseTimeSort": 0,
                "highPrice": None,
                "lowPrice": None,
                "page": page,
                "biddingType": "1",
                "purchaseState": 6,
                "baseAppId": "04",
                "appId": "HLJGCY",
                "pageSize": 40,
                "cityCode": "",
                "province": "",
                "city": "",
                "otherTown": "",
                "defaultRegion": "",
                "townType": 1,
                "title": ""
            }
        }
        # Compact separators: the body is sent without any whitespace.
        data = json.dumps(data, separators=(',', ':'))
        request.data = data
        request.headers = self.headers
        return request


if __name__ == "__main__":
    Spider(redis_key="detail:normal_details").start()