8 月之前 · 02fe4c5cb4
--- a/江苏/江苏产权市场网-全部-列表页.py
+++ b/江苏/江苏产权市场网-全部-列表页.py
@@ -0,0 +1,94 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-11-22
			
 
				+---------
			
 
				+@summary: 江苏产权市场网
			
 
				+---------
			
 
				+@author: lzz
			
 
				+"""
			
 
				+import feapder
			
 
				+from items.spider_item import BidingListItem
			
 
				+from collections import namedtuple
			
 
				+
			
 
				+
			
 
				+
			
 
				+class ZtbpcFeapder(feapder.BiddingListSpider):
			
 
				+    __custom_setting__ = dict(
			
 
				+        WEBDRIVER=dict(
			
 
				+            driver_type="FIREFOX",
			
 
				+            pool_size=1,
			
 
				+            headless=False,
			
 
				+        )
			
 
				+    )
			
 
				+    def start_callback(self):
			
 
				+
			
 
				+        self.site = "江苏产权市场网"
			
 
				+
			
 
				+        Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
			
 
				+
			
 
				+        self.menus = [
			
 
				+            Menu('全部', 'js_jscqscw_qb', 1),
			
 
				+        ]
			
 
				+
			
 
				+        self.headers = {
			
 
				+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
			
 
				+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
			
 
				+            "Cache-Control": "no-cache",
			
 
				+            "Connection": "keep-alive",
			
 
				+            "Pragma": "no-cache",
			
 
				+            "Upgrade-Insecure-Requests": "1",
			
 
				+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
			
 
				+        }
			
 
				+
			
 
				+    def start_requests(self):
			
 
				+        for menu in self.menus:
			
 
				+            start_url = "https://www.jscq.com.cn/jscq/cqjy/jygg/qb/index.shtml"
			
 
				+            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, render_time=5, render=True, proxies=False)
			
 
				+
			
 
				+    def download_midware(self, request):
			
 
				+        page = request.page
			
 
				+        request.headers = self.headers
			
 
				+
			
 
				+    def parse(self, request, response):
			
 
				+        menu = request.item
			
 
				+        info_list = response.xpath('//ul[@class="pxcontent"]/li')
			
 
				+        for info in info_list:
			
 
				+            title = info.xpath('./a[@class="cairong1"]/@title').extract_first("").strip()
			
 
				+            href = info.xpath('./a[@class="cairong1"]/@href').extract_first("").strip()
			
 
				+            publish_time = info.xpath('./p[@class="qizhi"]/text()').extract_first("").strip()
			
 
				+
			
 
				+            area = "江苏"
			
 
				+            city = ""
			
 
				+
			
 
				+            list_item = BidingListItem()  # 存储数据的管道
			
 
				+            list_item.href = href  # 标书链接
			
 
				+            list_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 （编辑器定的）
			
 
				+            list_item.spidercode = menu.get("code")  # 最上方定义的爬虫code（编辑器定的）
			
 
				+            list_item.title = title  # 标题
			
 
				+            list_item.publishtime = publish_time  # 标书发布时间
			
 
				+            list_item.site = self.site
			
 
				+            list_item.area = area or "全国"  # 省份 默认:全国
			
 
				+            list_item.city = city  # 城市 默认 为空
			
 
				+
			
 
				+            list_item.unique_key = ('href',)
			
 
				+            list_item.parse = "self.detail_get"  # 详情页回调方法
			
 
				+            list_item.deal_detail = ['//div[@class="tab_bottom tab_bottom1"]']
			
 
				+            list_item.parse_url = href
			
 
				+
			
 
				+            list_item.files = {  # 附件采集规则
			
 
				+                "list_xpath": '//div[@class="tab_bottom tab_bottom1"]//a[@href]',
			
 
				+                "url_xpath": './@href',
			
 
				+                "name_xpath": './text()',
			
 
				+                # "file_type":'pdf',                  # 默认的附件类型，用于url中未带附件类型的
			
 
				+                "url_key": 'http',  # 用于区别连接是否为正常附件连接的url关键词 必须携带，如无可填http
			
 
				+                "host": '',  # 需要拼接url的host
			
 
				+            }
			
 
				+
			
 
				+            yield list_item
			
 
				+
			
 
				+        request = self.infinite_pages(request, response)
			
 
				+        yield request
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    ZtbpcFeapder(redis_key="detail:firefox").start()
			
--- a/河南/郑州市中介超市服务平台-项目公告-列表页.py
+++ b/河南/郑州市中介超市服务平台-项目公告-列表页.py
@@ -0,0 +1,92 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-11-22
			
 
				+---------
			
 
				+@summary: 郑州市中介超市服务平台
			
 
				+---------
			
 
				+@author: lzz
			
 
				+"""
			
 
				+import feapder
			
 
				+from items.spider_item import BidingListItem
			
 
				+from collections import namedtuple
			
 
				+
			
 
				+
			
 
				+
			
 
				+class ZtbpcFeapder(feapder.BiddingListSpider):
			
 
				+
			
 
				+    def start_callback(self):
			
 
				+
			
 
				+        self.site = "郑州市中介超市服务平台"
			
 
				+
			
 
				+        Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
			
 
				+
			
 
				+        self.menus = [
			
 
				+            Menu('项目公告', 'hn_zzszjcsfwpt_xmgg', 1),
			
 
				+        ]
			
 
				+
			
 
				+        self.headers = {
			
 
				+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
			
 
				+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
			
 
				+            "Cache-Control": "no-cache",
			
 
				+            "Connection": "keep-alive",
			
 
				+            "Content-Type": "application/x-www-form-urlencoded",
			
 
				+            "Origin": "http://gcjs.zzdsj.zhengzhou.gov.cn:10000",
			
 
				+            "Pragma": "no-cache",
			
 
				+            "Referer": "http://gcjs.zzdsj.zhengzhou.gov.cn:10000/f/cggg",
			
 
				+            "Upgrade-Insecure-Requests": "1",
			
 
				+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
			
 
				+        }
			
 
				+
			
 
				+    def start_requests(self):
			
 
				+        for menu in self.menus:
			
 
				+            start_url = "http://gcjs.zzdsj.zhengzhou.gov.cn:10000/f/cggg"
			
 
				+            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
			
 
				+
			
 
				+    def download_midware(self, request):
			
 
				+        page = request.page
			
 
				+        data = {
			
 
				+            "pageNo": f"{page}",
			
 
				+            "pageSize": "5",
			
 
				+            "orderBy": "",
			
 
				+            "cgxmmc": "",
			
 
				+            "ggksrq": "",
			
 
				+            "ggjsrq": ""
			
 
				+        }
			
 
				+        request.data = data
			
 
				+        request.headers = self.headers
			
 
				+
			
 
				+    def parse(self, request, response):
			
 
				+        menu = request.item
			
 
				+        info_list = response.xpath('//table[@class="tables"]/tbody/tr')
			
 
				+        for info in info_list:
			
 
				+            title = info.xpath('./td[1]/a/text()').extract_first("").replace('null','').replace('【】','').strip()
			
 
				+            href = info.xpath('./td[1]/a/@href').extract_first("").strip()
			
 
				+            publish_time = info.xpath('./td[2]/text()').extract_first("").strip()
			
 
				+
			
 
				+            area = "河南"
			
 
				+            city = "郑州市"
			
 
				+
			
 
				+            list_item = BidingListItem()  # 存储数据的管道
			
 
				+            list_item.href = href  # 标书链接
			
 
				+            list_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 （编辑器定的）
			
 
				+            list_item.spidercode = menu.get("code")  # 最上方定义的爬虫code（编辑器定的）
			
 
				+            list_item.title = title  # 标题
			
 
				+            list_item.publishtime = publish_time  # 标书发布时间
			
 
				+            list_item.site = self.site
			
 
				+            list_item.area = area or "全国"  # 省份 默认:全国
			
 
				+            list_item.city = city  # 城市 默认 为空
			
 
				+
			
 
				+            list_item.unique_key = ('href',)
			
 
				+            list_item.parse = "self.detail_get"  # 详情页回调方法
			
 
				+            list_item.deal_detail = ['//div[contains(@class,"detail__content")]']
			
 
				+            list_item.request_params = {"rm_list":['null','【】']}
			
 
				+            list_item.parse_url = href
			
 
				+
			
 
				+            yield list_item
			
 
				+
			
 
				+        request = self.infinite_pages(request, response)
			
 
				+        yield request
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    ZtbpcFeapder(redis_key="detail:chrome").start()