|
@@ -1,49 +1,30 @@
|
|
# -*- coding: utf-8 -*-
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
"""
|
|
-Created on 2023-02-25
|
|
|
|
|
|
+Created on 2024-12-12
|
|
---------
|
|
---------
|
|
@summary: 中国建设银行集采平台
|
|
@summary: 中国建设银行集采平台
|
|
---------
|
|
---------
|
|
@author: lzz
|
|
@author: lzz
|
|
"""
|
|
"""
|
|
import feapder
|
|
import feapder
|
|
-from items.spider_item import DataBakItem
|
|
|
|
|
|
+from items.spider_item import BidingListItem
|
|
from collections import namedtuple
|
|
from collections import namedtuple
|
|
-import time, random
|
|
|
|
-import execjs
|
|
|
|
-import requests
|
|
|
|
-from untils.attachment import AttachmentDownloader
|
|
|
|
-from untils.tools import extract_file_type,remove_htmldata
|
|
|
|
-from feapder.network.selector import Selector
|
|
|
|
|
|
|
|
|
|
|
|
-def get_ser(text):
|
|
|
|
- with open('./zgjsyhjcpt.js', 'r') as f:
|
|
|
|
- exjs = f.read()
|
|
|
|
- ctx = execjs.compile(exjs)
|
|
|
|
- ser = ctx.call("encryptRSA", f"{text}")
|
|
|
|
- return ser
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-def transfer_url(url):
|
|
|
|
- url = url.replace("+", "%2B").replace("/", "%2F").replace("?", "%3F")
|
|
|
|
- url = url.replace("#", "%23").replace("&", "%26").replace("=", "%3D")
|
|
|
|
- return url
|
|
|
|
-
|
|
|
|
|
|
|
|
class Zgjsyhjcpt(feapder.BiddingListSpider):
|
|
class Zgjsyhjcpt(feapder.BiddingListSpider):
|
|
|
|
|
|
def start_callback(self):
|
|
def start_callback(self):
|
|
- Menu = namedtuple('Menu', ['channel', 'code', 'typeone', 'crawl_page'])
|
|
|
|
|
|
+ Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
|
|
self.site = "中国建设银行集采平台"
|
|
self.site = "中国建设银行集采平台"
|
|
|
|
|
|
self.menus = [
|
|
self.menus = [
|
|
- Menu('供应商征集', 'a_zgjsyhjcpt_gyszj', 'ccbgyszj', 3),
|
|
|
|
- Menu('采购专区-招标公告', 'a_zgjsyhjcpt_cgzq_zbgg', 'ccbbidzbgg', 1),
|
|
|
|
- Menu('采购专区-变更公告', 'a_zgjsyhjcpt_cgzq_bggg', 'ccbbidecgg', 1),
|
|
|
|
- Menu('采购专区-中标候选人公示', 'a_zgjsyhjcpt_cgzq_zbhxrgs', 'ccbbidzbgs', 1),
|
|
|
|
- Menu('采购专区-中标结果公示', 'a_zgjsyhjcpt_cgzq_zbjggs', 'ccbbidzbjggs', 1),
|
|
|
|
- Menu('采购公开信息', 'a_zgjsyhjcpt_cggkxx', 'ccbpurtzgg', 3),
|
|
|
|
|
|
+ Menu('供应商征集', 'a_zgjsyhjcpt_gyszj', '360', 1),
|
|
|
|
+ Menu('采购专区-招标公告', 'a_zgjsyhjcpt_cgzq_zbgg', '355', 1),
|
|
|
|
+ Menu('采购专区-变更公告', 'a_zgjsyhjcpt_cgzq_bggg', '357', 1),
|
|
|
|
+ Menu('采购专区-中标候选人公示', 'a_zgjsyhjcpt_cgzq_zbhxrgs', '358', 1),
|
|
|
|
+ Menu('采购专区-中标结果公示', 'a_zgjsyhjcpt_cgzq_zbjggs', '359', 1),
|
|
|
|
+ Menu('采购公开信息', 'a_zgjsyhjcpt_cggkxx', '353', 1),
|
|
]
|
|
]
|
|
|
|
|
|
self.headers = {
|
|
self.headers = {
|
|
@@ -60,27 +41,26 @@ class Zgjsyhjcpt(feapder.BiddingListSpider):
|
|
|
|
|
|
def start_requests(self):
    """Yield the first list-page request for every configured menu.

    Each request targets file ``1.json`` under the menu's ``tid`` and
    carries the menu's fields as a plain dict in ``item`` together with
    ``page=1``.
    """
    for entry in self.menus:
        # The listing is addressed by the menu's tid; "1.json" is the first page.
        listing_url = f"https://ibuy.ccb.com/json/contentFile/{entry.tid}/1.json"
        first_page = feapder.Request(
            url=listing_url,
            item=entry._asdict(),
            page=1,
        )
        yield first_page
|
|
|
|
|
|
def parse(self, request, response):
|
|
def parse(self, request, response):
|
|
menu = request.item
|
|
menu = request.item
|
|
- info_list = response.xpath('//div[@class="infolist-main single-main bidlist"]/ul/li|//div[@class="infolist-main bidlist"]/ul/li')
|
|
|
|
- for info in info_list:
|
|
|
|
- title = "".join(info.xpath('./a/span/text()').extract()).strip()
|
|
|
|
- href_org = info.xpath('./a/@hrefurl').extract_first()
|
|
|
|
- href_hid = info.xpath('./a/@hid').extract_first()
|
|
|
|
- ser_p = transfer_url(get_ser(href_hid))
|
|
|
|
- href = f"https://ibuy.ccb.com{href_org}?pageNo={href_hid}&_ser_p={ser_p}"
|
|
|
|
- create_time = "".join(info.xpath('./a/em[1]/text()').extract()).strip()
|
|
|
|
- dedup_href = href.split("&_ser_p")[0]
|
|
|
|
|
|
+ info_list = response.json
|
|
|
|
+ for info in info_list[:30]:
|
|
|
|
+ title = info.get('title').strip()
|
|
|
|
+ hid = info.get('id')
|
|
|
|
+ pid = menu.get('tid')
|
|
|
|
+ href = f"https://ibuy.ccb.com/cms/index.html#/content?pId={pid}&id={hid}"
|
|
|
|
+ create_time = info.get('releaseDate').strip()
|
|
|
|
+
|
|
|
|
+ htm = create_time.split('-')[0]
|
|
|
|
|
|
area = "全国" # 省份
|
|
area = "全国" # 省份
|
|
city = "" # 城市
|
|
city = "" # 城市
|
|
|
|
|
|
- data_item = DataBakItem() # 存储数据的管道
|
|
|
|
|
|
+ data_item = BidingListItem() # 存储数据的管道
|
|
data_item.href = href # 标书链接
|
|
data_item.href = href # 标书链接
|
|
- data_item.unique_key = (dedup_href,title,create_time)
|
|
|
|
data_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
|
|
data_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
|
|
data_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
|
|
data_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
|
|
data_item.title = title # 标题
|
|
data_item.title = title # 标题
|
|
@@ -89,32 +69,9 @@ class Zgjsyhjcpt(feapder.BiddingListSpider):
|
|
data_item.area = area # 城市默认:全国
|
|
data_item.area = area # 城市默认:全国
|
|
data_item.city = city # 城市 默认为空
|
|
data_item.city = city # 城市 默认为空
|
|
|
|
|
|
- time.sleep(random.randint(3, 5))
|
|
|
|
- res = requests.get(href, headers=self.headers, proxies=request.proxies(), verify=False, timeout=30)
|
|
|
|
- if res.status_code == 200:
|
|
|
|
- root = Selector(res.text)
|
|
|
|
- html = root.xpath('//div[@class="article-content"]').extract_first("")
|
|
|
|
-
|
|
|
|
- rm_list = ['//p[@class="collect"]','//div[@class="article-bottom"]']
|
|
|
|
- data_item.contenthtml = remove_htmldata(rm_list,html,root)
|
|
|
|
-
|
|
|
|
- attachments = {}
|
|
|
|
- files = root.xpath('//div[@class="article-content"]//a[@href]')
|
|
|
|
- for file in files:
|
|
|
|
- file_url = "https://ibuy.ccb.com" + file.xpath('./@href').extract_first("")
|
|
|
|
- file_name = file.xpath('./text()').extract_first("")
|
|
|
|
- file_type = extract_file_type(file_name, file_url)
|
|
|
|
-
|
|
|
|
- if file_type and 'download' in file_url:
|
|
|
|
- attachment = AttachmentDownloader().fetch_attachment(
|
|
|
|
- file_name=file_name, file_type=file_type, download_url=file_url,
|
|
|
|
- proxies=request.proxies())
|
|
|
|
- attachments[str(len(attachments) + 1)] = attachment
|
|
|
|
-
|
|
|
|
- if len(attachments) > 0:
|
|
|
|
- data_item.projectinfo = {"attachments": attachments}
|
|
|
|
-
|
|
|
|
-
|
|
|
|
|
|
+ data_item.unique_key = ('href',)
|
|
|
|
+ data_item.parse = "self.detail_get" # 详情页回调方法
|
|
|
|
+ data_item.parse_url = f"https://ibuy.ccb.com/json/contentFile/{pid}/{htm}/{hid}.json"
|
|
|
|
|
|
yield data_item
|
|
yield data_item
|
|
|
|
|
|
@@ -124,20 +81,6 @@ class Zgjsyhjcpt(feapder.BiddingListSpider):
|
|
|
|
|
|
def download_midware(self, request):
    """Attach the spider's shared headers to an outgoing request.

    No request body is built here; the only change made to the request
    is assigning ``self.headers`` to ``request.headers``.

    :param request: the request about to be sent; modified in place.
    :return: None (nothing is returned explicitly).
    """
    # The page number is not needed here, so ``request.page`` is not
    # read; requests without a ``page`` attribute are handled as well.
    request.headers = self.headers
|
|
|
|
|
|
|
|
|