lizongze пре 8 месеци
родитељ
комит
a2db7585e9

+ 118 - 0
中国邮政储蓄银行邮银易采/招标信息-列表页.py

@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-12-05
+---------
+@summary: 中国邮政储蓄银行邮银易采
+---------
+@author: lzz
+"""
+import feapder
+from items.spider_item import BidingListItem
+from collections import namedtuple
+from untils.WebCookiePool import WebCookiePool
+
+
class ZtbpcFeapder(feapder.BiddingListSpider):
    """List-page spider for PSBC e-procurement (中国邮政储蓄银行邮银易采), tender section.

    Posts JSON page queries to the CMS list API for each configured menu,
    yields one BidingListItem per announcement row, and paginates via
    infinite_pages(). A browser-driven cookie pool supplies the site's
    anti-bot cookie; on non-200 replies the cookie is discarded and the
    request retried, giving up after 5 consecutive failures.
    """

    def start_callback(self):
        self.site = "中国邮政储蓄银行邮银易采"

        # channel: list column label; code: spider code; cid: CMS categoryId;
        # crawl_page: number of list pages to fetch.
        Menu = namedtuple('Menu', ['channel', 'code', 'cid', 'crawl_page'])

        self.menus = [
            Menu('招标信息-招标公告', 'a_zgyzcxyhyyyc_zbxx_zbgg', '223', 1),
            Menu('招标信息-中标候选人公示', 'a_zgyzcxyhyyyc_zbxx_zbhxrgs', '224', 1),
            Menu('招标信息-变更公告', 'a_zgyzcxyhyyyc_zbxx_bggg', '225', 1),
            Menu('招标信息-其他公告', 'a_zgyzcxyhyyyc_zbxx_qtgg', '226', 1),
            # NOTE(review): this entry reuses the 其他公告 spider code
            # ('..._qtgg') although the channel is 中标公告 — looks like a
            # copy-paste slip; confirm the intended code.
            Menu('招标信息-中标公告', 'a_zgyzcxyhyyyc_zbxx_qtgg', '235', 1),
            # Menu('招标信息-资格预审公告', 'a_zgyzcxyhyyyc_zbxx_zgysgg', '222', 1),  # no data yet
        ]

        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/json; charset=UTF-8",
            "Origin": "https://cg.psbc.com",
            "Pragma": "no-cache",
            "Referer": "https://cg.psbc.com/cms/default/webfile/1ywgg2/index.html",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }

        # Browser-driven cookie pool: the site sets an anti-bot cookie
        # ("V3iEwBUtWULVP") that must accompany every API request.
        self.cookie_pool = WebCookiePool(
            redis_key="zgyzcxyhyyyc_zbxx",
            page_url="https://cg.psbc.com/cms/default/webfile/1ywgg2/index.html",
            cookie_key="V3iEwBUtWULVP", driver_type="FIREFOX")

        # Consecutive-failure counter; parse() stops retrying once it exceeds 5.
        self.ct = 0

    def start_requests(self):
        # One seed request per menu; the POST body is built in download_midware.
        start_url = "https://cg.psbc.com/cms/api/dynamicData/queryContentPage"
        for menu in self.menus:
            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)

    def download_midware(self, request):
        """Attach the JSON query body, a pooled cookie and headers to the request."""
        menu = request.item
        request.json = {
            "pageNo": request.page,
            "pageSize": "50",
            "dto": {
                "siteId": "725",
                "categoryId": menu.get('cid'),
                "city": "",
                "county": "",
                "purchaseMode": "",
                "secondCompanyId": ""
            }
        }
        request.cookies = self.cookie_pool.get_cookie()
        request.headers = self.headers

    def parse(self, request, response):
        """Parse one list page into BidingListItems; retry on non-200 replies."""
        if self.ct > 5:
            # Too many consecutive failures — drop the request instead of looping.
            return
        if response.status_code != 200:
            self.ct += 1
            # The pooled cookie is likely expired/blocked: discard it and retry.
            self.cookie_pool.del_cookie(self.cookie_pool.get_cookie())
            yield request
        else:
            self.ct = 0
            menu = request.item
            # Defensive: 'res'/'rows' may be absent on malformed replies.
            info_list = (response.json.get('res') or {}).get('rows') or []
            for info in info_list:
                title = (info.get('title') or '').strip()
                hid = info.get('url') or ''
                href = f"https://cg.psbc.com/cms/default/webfile{hid}"
                # publishDate is ISO-like ("...T10:00:00.000"); normalise to
                # "YYYY-MM-DD HH:MM:SS" by swapping the 'T' and dropping millis.
                publish_time = (info.get('publishDate') or '').replace('T', ' ').split('.')[0]

                # contentId for the detail API is the filename stem of the URL.
                did = href.split('/')[-1].split('.')[0]

                list_item = BidingListItem()  # data pipeline item
                list_item.href = href  # announcement link
                list_item.channel = menu.get("channel")  # column label
                list_item.spidercode = menu.get("code")  # spider code
                list_item.title = title  # announcement title
                list_item.publishtime = publish_time  # publish timestamp
                list_item.site = self.site
                list_item.area = "全国"  # province; site is nationwide
                list_item.city = ""  # city not available on the list page
                list_item.unique_key = ('href',)
                list_item.parse = "self.detail_get"  # detail-page callback
                list_item.parse_url = f"https://cg.psbc.com/cms/api/dynamicData/queryContentInfo?contentId={did}"

                yield list_item

            # Infinite pagination: queue the next page while rows keep coming.
            request = self.infinite_pages(request, response)
            yield request
+
+
# Run the tender-list spider standalone; redis_key namespaces its task queue.
if __name__ == "__main__":
    ZtbpcFeapder(redis_key="lzz:zgyzcxyhyyyc_zbxx_zbgg").start()

+ 108 - 0
中国邮政储蓄银行邮银易采/招标信息-详情页.py

@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-12-05
+---------
+@summary: 中国邮政储蓄银行邮银易采
+---------
+@author: lzz
+"""
+import feapder
+from items.spider_item import DataBakItem
+from untils.WebCookiePool import WebCookiePool
+from feapder.network.selector import Selector
+from untils.tools import extract_file_type
+from untils.attachment import AttachmentDownloader
+import requests
+
# Shared request headers for the PSBC detail API and attachment pages.
# Fixed: "Origin" previously pointed at an unrelated site (http://lxjypt.cn),
# a copy-paste leftover — requests from this module go to cg.psbc.com.
headers = {
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Origin": "https://cg.psbc.com",
    "Pragma": "no-cache",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}
+
+
def get_file(href, cookies, proxies=False):
    """Download attachments linked from a detail page.

    Fetches *href*, scans the attachment list for downloadable links and
    stores each via AttachmentDownloader. Best-effort: any failure returns
    whatever was collected so far.

    Returns a dict mapping "1", "2", ... to attachment metadata dicts.
    """
    attachments = {}
    try:
        # Use a per-call copy so the module-level headers dict is never
        # mutated (the original wrote headers["Referer"] in place, leaking
        # state between calls).
        page_headers = dict(headers, Referer=href)
        response = requests.get(href, headers=page_headers, cookies=cookies,
                                proxies=proxies, timeout=20, verify=False)
        file_list = Selector(response.content.decode()).xpath(
            '//div[@class="fileList bottomListItem"]//a')
        for info in file_list:
            file_url = info.xpath('./@href').extract_first("").strip()
            file_name = "".join(info.xpath('.//text()').extract()).strip()
            file_type = extract_file_type(file_name, file_url)

            # Only follow recognised file types served via the download endpoint.
            if file_type and "download" in file_url:
                fheaders = {
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                    "Accept-Language": "zh-CN,zh;q=0.9",
                    "Cache-Control": "no-cache",
                    # "Connection": "keep-alive",
                    "Pragma": "no-cache",
                    "Referer": file_url,
                    "Upgrade-Insecure-Requests": "1",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
                }
                attachment = AttachmentDownloader().fetch_attachment(
                    file_name=file_name, file_type=file_type, download_url=file_url,
                    cookies=cookies, headers=fheaders)
                # Keep only successfully stored files (those that got a fid).
                if "fid" in attachment:
                    attachments[str(len(attachments) + 1)] = attachment
    except Exception:
        # Deliberate best-effort: attachments are optional; never let a
        # download problem kill the detail item.
        pass
    return attachments
+
+
class Details(feapder.BiddingDetailSpider):
    """Detail-page spider: pulls announcement bodies from the CMS JSON API
    and harvests attachments from the HTML detail page via get_file()."""

    # Shared browser cookie pool (same anti-bot cookie as the list spider).
    cookie_pool = WebCookiePool(redis_key="zgyzcxyhyyyc_zbxx",
                                page_url="https://cg.psbc.com/cms/default/webfile/1ywgg2/index.html",
                                cookie_key="V3iEwBUtWULVP", driver_type="FIREFOX")

    # Consecutive-failure counter; detail_get stops retrying once it exceeds 5.
    ct = 0

    def start_requests(self):
        data_list = self.get_tasks_by_rabbitmq(limit=30)  # fixed typo: was data_lsit
        for item in data_list:
            # timeout must be a direct kwarg, so pull it out of request_params.
            request_params = item.get("request_params") or {}
            timeout = request_params.pop('timeout', 20)
            # NOTE: eval() runs a task-supplied string ("self.detail_get");
            # tasks come from our own queue — never feed it untrusted input.
            yield feapder.Request(url=item.get("parse_url"), item=item,
                                  deal_detail=item.get("deal_detail"),
                                  callback=eval(item.get("parse")),
                                  **request_params, timeout=timeout, proxies=False)

    def download_midware(self, request):
        # Attach a pooled cookie and the shared module headers to every request.
        request.cookies = self.cookie_pool.get_cookie()
        request.headers = headers

    def detail_get(self, request, response):
        """Store the announcement HTML plus any downloadable attachments."""
        if self.ct > 5:
            # Too many consecutive failures — drop the task instead of looping.
            return
        if response.status_code != 200:
            self.ct += 1
            # The pooled cookie is likely expired/blocked: discard it and retry.
            self.cookie_pool.del_cookie(self.cookie_pool.get_cookie())
            yield request
        else:
            self.ct = 0
            list_item = DataBakItem(**request.item)

            # Defensive: 'res'/'content' may be absent on malformed replies.
            list_item.contenthtml = ((response.json.get('res') or {})
                                     .get('content') or {}).get('text')

            attachments = get_file(list_item['href'], self.cookie_pool.get_cookie())
            if attachments:
                list_item.projectinfo = {"attachments": attachments}

            yield list_item
+
+
# Run the detail spider standalone against the same task queue as the list spider.
if __name__ == "__main__":
    Details(redis_key="lzz:zgyzcxyhyyyc_zbxx_zbgg").start()

+ 118 - 0
中国邮政储蓄银行邮银易采/非招标信息-列表页.py

@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-12-05
+---------
+@summary: 中国邮政储蓄银行邮银易采
+---------
+@author: lzz
+"""
+import feapder
+from items.spider_item import BidingListItem
+from collections import namedtuple
+from untils.WebCookiePool import WebCookiePool
+
+
class ZtbpcFeapder(feapder.BiddingListSpider):
    """List-page spider for PSBC e-procurement (中国邮政储蓄银行邮银易采), non-tender section.

    Posts JSON page queries to the CMS list API for each configured menu,
    yields one BidingListItem per announcement row, and paginates via
    infinite_pages(). A browser-driven cookie pool supplies the site's
    anti-bot cookie; on non-200 replies the cookie is discarded and the
    request retried, giving up after 5 consecutive failures.
    """

    def start_callback(self):
        self.site = "中国邮政储蓄银行邮银易采"

        # channel: list column label; code: spider code; cid: CMS categoryId;
        # crawl_page: number of list pages to fetch.
        Menu = namedtuple('Menu', ['channel', 'code', 'cid', 'crawl_page'])

        self.menus = [
            Menu('非招标信息-采购公告', 'a_zgyzcxyhyyyc_fzbxx_cggg', '229', 1),
            Menu('非招标信息-成交结果公示', 'a_zgyzcxyhyyyc_fzbxx_cjjggs', '230', 1),
            Menu('非招标信息-成交结果公告', 'a_zgyzcxyhyyyc_fzbxx_cjjggg', '231', 1),
            Menu('非招标信息-变更公告', 'a_zgyzcxyhyyyc_fzbxx_bggg', '232', 1),
            Menu('非招标信息-其他公告', 'a_zgyzcxyhyyyc_fzbxx_qtgg', '233', 1),
            Menu('非招标信息-单一来源采前公示', 'a_zgyzcxyhyyyc_fzbxx_dylycqgs', '236', 1),
        ]

        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/json; charset=UTF-8",
            "Origin": "https://cg.psbc.com",
            "Pragma": "no-cache",
            "Referer": "https://cg.psbc.com/cms/default/webfile/2ywgg1/index.html",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
        }

        # Browser-driven cookie pool for the site's anti-bot cookie.
        # NOTE(review): page_url points at the tender section (1ywgg2) even
        # though this spider crawls the non-tender section — presumably the
        # cookie is site-wide and the pool is shared; confirm.
        self.cookie_pool = WebCookiePool(
            redis_key="zgyzcxyhyyyc_zbxx",
            page_url="https://cg.psbc.com/cms/default/webfile/1ywgg2/index.html",
            cookie_key="V3iEwBUtWULVP", driver_type="FIREFOX")

        # Consecutive-failure counter; parse() stops retrying once it exceeds 5.
        self.ct = 0

    def start_requests(self):
        # One seed request per menu; the POST body is built in download_midware.
        start_url = "https://cg.psbc.com/cms/api/dynamicData/queryContentPage"
        for menu in self.menus:
            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)

    def download_midware(self, request):
        """Attach the JSON query body, a pooled cookie and headers to the request."""
        menu = request.item
        request.json = {
            "pageNo": request.page,
            "pageSize": "10",
            "dto": {
                "siteId": "725",
                "categoryId": menu.get('cid'),
                "city": "",
                "county": "",
                "purchaseMode": "",
                "secondCompanyId": ""
            }
        }
        request.cookies = self.cookie_pool.get_cookie()
        request.headers = self.headers

    def parse(self, request, response):
        """Parse one list page into BidingListItems; retry on non-200 replies."""
        if self.ct > 5:
            # Too many consecutive failures — drop the request instead of looping.
            return
        if response.status_code != 200:
            self.ct += 1
            # The pooled cookie is likely expired/blocked: discard it and retry.
            self.cookie_pool.del_cookie(self.cookie_pool.get_cookie())
            yield request
        else:
            self.ct = 0
            menu = request.item
            # Defensive: 'res'/'rows' may be absent on malformed replies.
            info_list = (response.json.get('res') or {}).get('rows') or []
            for info in info_list:
                title = (info.get('title') or '').strip()
                hid = info.get('url') or ''
                href = f"https://cg.psbc.com/cms/default/webfile{hid}"
                # publishDate is ISO-like ("...T10:00:00.000"); normalise to
                # "YYYY-MM-DD HH:MM:SS" by swapping the 'T' and dropping millis.
                publish_time = (info.get('publishDate') or '').replace('T', ' ').split('.')[0]

                # contentId for the detail API is the filename stem of the URL.
                did = href.split('/')[-1].split('.')[0]

                list_item = BidingListItem()  # data pipeline item
                list_item.href = href  # announcement link
                list_item.channel = menu.get("channel")  # column label
                list_item.spidercode = menu.get("code")  # spider code
                list_item.title = title  # announcement title
                list_item.publishtime = publish_time  # publish timestamp
                list_item.site = self.site
                list_item.area = "全国"  # province; site is nationwide
                list_item.city = ""  # city not available on the list page
                list_item.unique_key = ('href',)
                list_item.parse = "self.detail_get"  # detail-page callback
                list_item.parse_url = f"https://cg.psbc.com/cms/api/dynamicData/queryContentInfo?contentId={did}"

                yield list_item

            # Infinite pagination: queue the next page while rows keep coming.
            request = self.infinite_pages(request, response)
            yield request
+
+
# Run the non-tender list spider standalone.
# NOTE(review): redis_key reuses the tender spider's key ("...zbxx_zbgg") —
# confirm this queue sharing is intentional and not a copy-paste leftover.
if __name__ == "__main__":
    ZtbpcFeapder(redis_key="lzz:zgyzcxyhyyyc_zbxx_zbgg").start()

+ 16 - 12
成都市城市管理委员会/公示公告-列表页.py

@@ -1,22 +1,28 @@
 # -*- coding: utf-8 -*-
 """
-Created on 2023-4-28
+Created on 2024-12-05
 ---------
 @summary: 成都市城市管理委员会-公示公告
 ---------
 @author: lzz
 """
-
 import feapder
 from items.spider_item import MgpListItem
 from collections import namedtuple
 from feapder.utils.tools import get_today_of_day
-import time,re
 
 
 
 
 class Feapder(feapder.BiddingListSpider):
+    __custom_setting__ = dict(
+        WEBDRIVER=dict(
+            driver_type="FIREFOX",
+            pool_size=1,
+            headless=True,
+            usages_local_driver=True
+        )
+    )
 
     def start_callback(self):
         Menu = namedtuple('Menu', ['channel', 'code', 'crawl_page'])
@@ -29,34 +35,33 @@ class Feapder(feapder.BiddingListSpider):
 
         self.headers = {
             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-            "Accept-Language": "zh-CN,zh;q=0.9",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
             "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
             "Pragma": "no-cache",
+            "Origin": "https://cgw.chengdu.gov.cn/cgw/c128900/sy.shtml",
             "Upgrade-Insecure-Requests": "1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
         }
 
 
     def start_requests(self):
         for menu in self.menus:
-            start_url = "http://cgw.chengdu.gov.cn/cgw/c128855/zwgk_list.shtml"
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1,
-                                  render_time=3, render=True, proxies=False)
+            yield feapder.Request(item=menu._asdict(), page=1, render_time=5, render=True, proxies=False)
 
     def download_midware(self, request):
         page = request.page
+        url = f"https://cgw.chengdu.gov.cn/es-search/search/727b28f8b5da4e1da3192d591edfa517?_template=zhaofa/cgwgb_list&_isAgg=1&_pageSize=20&page={page}"
+        request.url = url
         request.headers = self.headers
 
     def parse(self, request, response):
         menu = request.item
-        info_list = response.xpath('//ul[@class="cityList_menu2  list_content2"]/li')
+        info_list = response.xpath('//li')
         for info in info_list:
             href = info.xpath('./a/@href').extract_first().strip()
             title = info.xpath('./a/span/text()').extract_first().strip()
             create_time = get_today_of_day()
 
-            area = "四川" # 省份
+            area = "四川"      # 省份
             city = "成都市"    # 城市
 
             list_item = MgpListItem()  # 存储数据的管道
@@ -75,7 +80,6 @@ class Feapder(feapder.BiddingListSpider):
             list_item.proxies = False
             list_item.parse_url = href  # 详情页请求地址
 
-
             yield list_item
 
         # 无限翻页

+ 17 - 19
成都市城市管理委员会/公示公告-详情页.py

@@ -1,49 +1,49 @@
 # -*- coding: utf-8 -*-
 """
-Created on 2023-4-28
+Created on 2024-12-05
 ---------
 @summary: 成都市城市管理委员会-公示公告
 ---------
 @author: lzz
 """
-
 import feapder
 from items.spider_item import DataBakItem
 from untils.attachment import AttachmentDownloader
+from untils.tools import extract_file_type
 from feapder.utils.tools import log
 
 
 headers = {
     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-    "Accept-Language": "zh-CN,zh;q=0.9",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
     "Cache-Control": "no-cache",
-    "Connection": "keep-alive",
     "Pragma": "no-cache",
+    "Origin": "https://cgw.chengdu.gov.cn/cgw/c128900/sy.shtml",
     "Upgrade-Insecure-Requests": "1",
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
 }
 
 
+
 class Details(feapder.BiddingDetailSpider):
     __custom_setting__ = dict(
         WEBDRIVER=dict(
             driver_type="FIREFOX",
+            pool_size=1,
+            headless=True,
+            usages_local_driver=True
         )
     )
 
-    ct = 0
     def start_requests(self):
-        data_list = self.get_tasks_by_rabbitmq(limit=20)
+        data_list = self.get_tasks_by_rabbitmq(limit=10)
         for item in data_list:
-            log.debug(item)
+            # log.debug(item)
             request_params = item.get("request_params")
 
             yield feapder.Request(url=item.get("parse_url"), item=item,proxies=False,
-                                          deal_detail=item.get("deal_detail"),render=True,render_time=3,
+                                          deal_detail=item.get("deal_detail"),render=True,render_time=5,
                                           callback=eval(item.get("parse")),  **request_params)
 
-
-
     def detail_get(self,request,response):
 
         items = request.item
@@ -60,17 +60,15 @@ class Details(feapder.BiddingDetailSpider):
         if file_list:
             attachments = {}
             for info in file_list:
-                file_name = info.xpath('./text()').extract_first().strip()
-                file_url = info.xpath('./@href').extract_first().strip()
-                file_type = file_url.split('.')[-1].lower()
-                file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
-                              'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps']
+                file_name = info.xpath('./text()').extract_first("").strip()
+                file_url = info.xpath('./@href').extract_first("").strip()
+                file_type = extract_file_type(file_name,file_url)
                 ck = response.cookies.get_dict()
-                headers['Cookie'] = ";".join([i + "=" + ck.get(i) for i in ck])
-                if file_type in file_types:
+                if file_type:
+                    headers['Referer'] = file_url
                     attachment = AttachmentDownloader().fetch_attachment(
                         file_name=file_name, file_type=file_type, download_url=file_url,
-                        enable_proxy=False, headers=headers)
+                        cookies=ck, headers=headers)
                     attachments[str(len(attachments) + 1)] = attachment
 
             if attachments: