|
@@ -0,0 +1,94 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+Created on 2024-11-22
|
|
|
+---------
|
|
|
+@summary: 江苏产权市场网
|
|
|
+---------
|
|
|
+@author: lzz
|
|
|
+"""
|
|
|
+import feapder
|
|
|
+from items.spider_item import BidingListItem
|
|
|
+from collections import namedtuple
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
class ZtbpcFeapder(feapder.BiddingListSpider):
    """List-page spider for the site named "江苏产权市场网" (www.jscq.com.cn).

    For every configured menu it requests the announcement list page with
    browser rendering enabled, turns each ``<li>`` row of the list into a
    ``BidingListItem`` and then yields whatever ``self.infinite_pages``
    returns for the current request/response pair.
    """

    # Browser settings for rendered requests: one non-headless Firefox instance.
    __custom_setting__ = dict(
        WEBDRIVER=dict(
            driver_type="FIREFOX",
            pool_size=1,
            headless=False,
        )
    )

    def start_callback(self):
        """Set up the site name, the menu table and the default request headers."""
        self.site = "江苏产权市场网"

        # One entry per channel: display name, spider code, pages to crawl.
        Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])

        self.menus = [
            Menu('全部', 'js_jscqscw_qb', 1),
        ]

        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        }

    def start_requests(self):
        """Yield one rendered, proxy-less request for the list page per menu.

        The menu is attached to the request as a plain dict (``item``) so that
        ``parse`` can read the channel and spider code back from it.
        """
        # The list URL does not depend on the menu, so build it once.
        start_url = "https://www.jscq.com.cn/jscq/cqjy/jygg/qb/index.shtml"
        for menu in self.menus:
            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, render_time=5, render=True, proxies=False)

    def download_midware(self, request):
        """Attach the default headers to ``request`` before it is downloaded.

        Each request receives its own copy of the header dict, so a change
        made to one request's headers cannot leak into the shared defaults.

        Returns:
            The same request object, with ``headers`` set.
        """
        request.headers = dict(self.headers)
        return request

    def parse(self, request, response):
        """Extract list rows from ``response`` and yield one item per row.

        After all rows have been yielded, the value returned by
        ``self.infinite_pages(request, response)`` is yielded as well
        (presumably the follow-up page request - confirm against the base class).
        """
        menu = request.item
        info_list = response.xpath('//ul[@class="pxcontent"]/li')
        for info in info_list:
            title = info.xpath('./a[@class="cairong1"]/@title').extract_first("").strip()
            href = info.xpath('./a[@class="cairong1"]/@href').extract_first("").strip()
            publish_time = info.xpath('./p[@class="qizhi"]/text()').extract_first("").strip()

            area = "江苏"
            city = ""

            list_item = BidingListItem()  # item that carries the row to storage
            list_item.href = href  # link to the announcement
            list_item.channel = menu.get("channel")  # crawl channel from the menu table
            list_item.spidercode = menu.get("code")  # spider code from the menu table
            list_item.title = title  # announcement title
            list_item.publishtime = publish_time  # announcement publish time
            list_item.site = self.site
            list_item.area = area or "全国"  # province; falls back to the nationwide value
            list_item.city = city  # city; empty by default

            list_item.unique_key = ('href',)
            list_item.parse = "self.detail_get"  # name of the detail-page callback
            list_item.deal_detail = ['//div[@class="tab_bottom tab_bottom1"]']
            list_item.parse_url = href

            list_item.files = {  # attachment collection rules
                "list_xpath": '//div[@class="tab_bottom tab_bottom1"]//a[@href]',
                "url_xpath": './@href',
                "name_xpath": './text()',
                # Optional "file_type" key (e.g. 'pdf'): default attachment type
                # for URLs that do not carry a file extension.
                "url_key": 'http',  # keyword identifying a real attachment link; required, use 'http' if none
                "host": '',  # host to prepend when attachment URLs need joining
            }

            yield list_item

        yield self.infinite_pages(request, response)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Script entry point: build the spider on its redis key, then run it.
    spider = ZtbpcFeapder(redis_key="detail:firefox")
    spider.start()
|