Browse Source

代码提交

lizongze 8 months ago
parent
commit
02fe4c5cb4

+ 94 - 0
江苏/江苏产权市场网-全部-列表页.py

@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-11-22
+---------
+@summary: 江苏产权市场网
+---------
+@author: lzz
+"""
+import feapder
+from items.spider_item import BidingListItem
+from collections import namedtuple
+
+
+
+class ZtbpcFeapder(feapder.BiddingListSpider):
+    __custom_setting__ = dict(
+        WEBDRIVER=dict(
+            driver_type="FIREFOX",
+            pool_size=1,
+            headless=False,
+        )
+    )
+    def start_callback(self):
+
+        self.site = "江苏产权市场网"
+
+        Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
+
+        self.menus = [
+            Menu('全部', 'js_jscqscw_qb', 1),
+        ]
+
+        self.headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Pragma": "no-cache",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        }
+
+    def start_requests(self):
+        for menu in self.menus:
+            start_url = "https://www.jscq.com.cn/jscq/cqjy/jygg/qb/index.shtml"
+            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, render_time=5, render=True, proxies=False)
+
+    def download_midware(self, request):
+        page = request.page
+        request.headers = self.headers
+
+    def parse(self, request, response):
+        menu = request.item
+        info_list = response.xpath('//ul[@class="pxcontent"]/li')
+        for info in info_list:
+            title = info.xpath('./a[@class="cairong1"]/@title').extract_first("").strip()
+            href = info.xpath('./a[@class="cairong1"]/@href').extract_first("").strip()
+            publish_time = info.xpath('./p[@class="qizhi"]/text()').extract_first("").strip()
+
+            area = "江苏"
+            city = ""
+
+            list_item = BidingListItem()  # 存储数据的管道
+            list_item.href = href  # 标书链接
+            list_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
+            list_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
+            list_item.title = title  # 标题
+            list_item.publishtime = publish_time  # 标书发布时间
+            list_item.site = self.site
+            list_item.area = area or "全国"  # 省份 默认:全国
+            list_item.city = city  # 城市 默认 为空
+
+            list_item.unique_key = ('href',)
+            list_item.parse = "self.detail_get"  # 详情页回调方法
+            list_item.deal_detail = ['//div[@class="tab_bottom tab_bottom1"]']
+            list_item.parse_url = href
+
+            list_item.files = {  # 附件采集规则
+                "list_xpath": '//div[@class="tab_bottom tab_bottom1"]//a[@href]',
+                "url_xpath": './@href',
+                "name_xpath": './text()',
+                # "file_type":'pdf',                  # 默认的附件类型,用于url中未带附件类型的
+                "url_key": 'http',  # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
+                "host": '',  # 需要拼接url的host
+            }
+
+            yield list_item
+
+        request = self.infinite_pages(request, response)
+        yield request
+
+
+if __name__ == "__main__":
+    ZtbpcFeapder(redis_key="detail:firefox").start()

+ 92 - 0
河南/郑州市中介超市服务平台-项目公告-列表页.py

@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-11-22
+---------
+@summary: 郑州市中介超市服务平台
+---------
+@author: lzz
+"""
+import feapder
+from items.spider_item import BidingListItem
+from collections import namedtuple
+
+
+
+class ZtbpcFeapder(feapder.BiddingListSpider):
+
+    def start_callback(self):
+
+        self.site = "郑州市中介超市服务平台"
+
+        Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
+
+        self.menus = [
+            Menu('项目公告', 'hn_zzszjcsfwpt_xmgg', 1),
+        ]
+
+        self.headers = {
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "Content-Type": "application/x-www-form-urlencoded",
+            "Origin": "http://gcjs.zzdsj.zhengzhou.gov.cn:10000",
+            "Pragma": "no-cache",
+            "Referer": "http://gcjs.zzdsj.zhengzhou.gov.cn:10000/f/cggg",
+            "Upgrade-Insecure-Requests": "1",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+        }
+
+    def start_requests(self):
+        for menu in self.menus:
+            start_url = "http://gcjs.zzdsj.zhengzhou.gov.cn:10000/f/cggg"
+            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
+
+    def download_midware(self, request):
+        page = request.page
+        data = {
+            "pageNo": f"{page}",
+            "pageSize": "5",
+            "orderBy": "",
+            "cgxmmc": "",
+            "ggksrq": "",
+            "ggjsrq": ""
+        }
+        request.data = data
+        request.headers = self.headers
+
+    def parse(self, request, response):
+        menu = request.item
+        info_list = response.xpath('//table[@class="tables"]/tbody/tr')
+        for info in info_list:
+            title = info.xpath('./td[1]/a/text()').extract_first("").replace('null','').replace('【】','').strip()
+            href = info.xpath('./td[1]/a/@href').extract_first("").strip()
+            publish_time = info.xpath('./td[2]/text()').extract_first("").strip()
+
+            area = "河南"
+            city = "郑州市"
+
+            list_item = BidingListItem()  # 存储数据的管道
+            list_item.href = href  # 标书链接
+            list_item.channel = menu.get("channel")  # 最上方定义的抓取栏目 (编辑器定的)
+            list_item.spidercode = menu.get("code")  # 最上方定义的爬虫code(编辑器定的)
+            list_item.title = title  # 标题
+            list_item.publishtime = publish_time  # 标书发布时间
+            list_item.site = self.site
+            list_item.area = area or "全国"  # 省份 默认:全国
+            list_item.city = city  # 城市 默认 为空
+
+            list_item.unique_key = ('href',)
+            list_item.parse = "self.detail_get"  # 详情页回调方法
+            list_item.deal_detail = ['//div[contains(@class,"detail__content")]']
+            list_item.request_params = {"rm_list":['null','【】']}
+            list_item.parse_url = href
+
+            yield list_item
+
+        request = self.infinite_pages(request, response)
+        yield request
+
+
+if __name__ == "__main__":
+    ZtbpcFeapder(redis_key="detail:chrome").start()