
Script fix

dzr committed e21cf644f0 1 week ago

+71 -77  a_zghnjtgs_gkxjgg/中国华能集团公司-公开询价公告-列表页.py
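
The hunks below touch only the spider hooks; the self.menus list that feeds
them sits in the unchanged lines between the first two hunks. For orientation,
here is a minimal sketch of the namedtuple menu pattern these feapder list
spiders follow; every field value in it is a made-up placeholder, not the real
channel text or spidercode:

    from collections import namedtuple

    Menu = namedtuple('Menu', ['channel', 'code', 'tp', 'crawl_page'])

    # Placeholder entries; the real values live in the unchanged part of
    # the file and are not shown in this diff.
    menus = [
        Menu('example channel', 'a_example_code', '0', 1),
    ]

    for menu in menus:
        # start_requests() forwards each menu as a dict, which is why
        # parse() later reads it back with menu.get("channel").
        print(menu._asdict())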

@@ -6,19 +6,18 @@ Created on 2025-04-22
 ---------
 @author: lzz
 """
-import feapder
-from items.spider_item import MgpListItem
+import re
 from collections import namedtuple
+
+import feapder
+from items.spider_item import BidingListItem
 from untils.WebCookiePool import WebCookiePool
 from untils.tools import get_proxy
-import re
 
 
-
-class ZtbpcFeapder(feapder.BiddingListSpider):
+class Spider(feapder.BiddingListSpider):
 
     def start_callback(self):
-
         self.site = "中国华能集团公司"
 
         Menu = namedtuple('Menu', ['channel', 'code', 'tp', 'crawl_page'])
@@ -39,22 +38,20 @@ class ZtbpcFeapder(feapder.BiddingListSpider):
             "Upgrade-Insecure-Requests": "1",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
         }
-        self.proxy = get_proxy()
-        self.ct = 0
+
         self.cookie_pool = WebCookiePool(redis_key="zghnjtgs_gkxjgg_ck",
                                          page_url="http://ec.chng.com.cn/ecmall/more.do",
-                                         cookie_key="S6J51OuUjLieT",
-                                         driver_type="FIREFOX")
+                                         cookie_key="S6J51OuUjLieP")
+        self.cookie_pool.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36")
 
     def start_requests(self):
+        url = "http://ec.chng.com.cn/ecmall/more.do"
         for menu in self.menus:
-            start_url = "http://ec.chng.com.cn/ecmall/more.do"
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
+            proxies = get_proxy()
+            yield feapder.Request(url, item=menu._asdict(), page=1, proxies=proxies)
 
     def download_midware(self, request):
         page = request.page
-        self.cookie_pool.proxy = self.proxy.get('http')
-        cookies = self.cookie_pool.get_cookie()
         menu = request.item
         data = {
             "type": "107",
@@ -65,73 +62,70 @@ class ZtbpcFeapder(feapder.BiddingListSpider):
             "limit": "50"
         }
         request.data = data
-        request.cookies = cookies
+
+        self.cookie_pool.proxies(request.get_proxy())
+        request.cookies = self.cookie_pool.get_cookie()
         request.headers = self.headers
 
-    def exception_request(self, request, response):
-        self.proxy = get_proxy()
-        yield request
+    def validate(self, request, response):
+        if response.status_code != 200:
+            raise ConnectionRefusedError
+        return True
 
     def parse(self, request, response):
-        if self.ct > 5:
-            return
-        if response.status_code != 200:
-            self.ct += 1
-            self.cookie_pool.del_cookie(self.cookie_pool.get_cookie())
-            self.proxy = get_proxy()
-            yield request
-        else:
-            self.ct = 0
-            menu = request.item
-            info_list = response.xpath('//ul[@class="main_r_con"]/li')
-            for info in info_list:
-                href_org = info.xpath('./a/@href').extract_first()
-                hid = "".join(re.findall("\('(.*?)'", href_org))
-                href = f"https://ec.chng.com.cn/ecmall/announcement/announcementDetail.do?announcementId={hid}"
-                title = info.xpath('./a/@title').extract_first("").strip()
-                publish_time = info.xpath('./p/text()').extract_first("").strip()
-
-                area = "全国"  # province
-                city = ""  # city
-                district = ""  # district
-
-                list_item = MgpListItem()  # pipeline item that stores the data
-                list_item.href = href  # tender link
-                list_item.channel = menu.get("channel")  # crawl channel defined at the top (set by the editor)
-                list_item.spidercode = menu.get("code")  # spidercode defined at the top (set by the editor)
-                list_item.title = title  # title
-                list_item.publishtime = publish_time  # tender publish time
-                list_item.site = self.site
-                list_item.area = area or "全国"  # province, defaults to 全国
-                list_item.city = city  # city, empty by default
-                list_item.district = district  # district, empty by default
-
-                list_item.unique_key = ('href',publish_time)
-                list_item.parse = "self.detail_get"  # detail-page callback method
-                list_item.request_params = {"rm_list":['//div[@class="layui-layer-btnhz"]',
-                                                       '//div[@class="company"]',
-                                                       '//div[@class="main_r_t border_f4"]']}
-                list_item.deal_detail = ['//div[@class="detail_boxhz"]',
-                                         '//div[@class="detail_box qst_box"]',
-                                         '//div[@class="main_box"]']  # xpath for the announcement body
-                list_item.proxies = True
-                list_item.parse_url = href
-
-                list_item.files = {
-                    "list_xpath": '//a[@href]',
-                    "url_xpath": './@href',
-                    "name_xpath": './text()',
-                    # "file_type":'pdf',                  # default attachment type, used when the url itself carries none
-                    "url_key": 'http',  # url keyword that marks a valid attachment link; required, use 'http' if none applies
-                    "host": '',  # host to prepend when joining relative urls
-                }
-
-                yield list_item
-
-
-            request = self.infinite_pages(request, response)
-            yield request
+        menu = request.item
+        info_list = response.xpath('//ul[@class="main_r_con"]/li')
+        for info in info_list:
+            href_org = info.xpath('./a/@href').extract_first()
+            hid = "".join(re.findall(r"\('(.*?)'", href_org))
+            href = f"https://ec.chng.com.cn/ecmall/announcement/announcementDetail.do?announcementId={hid}"
+            title = info.xpath('./a/@title').extract_first("").strip()
+            publish_time = info.xpath('./p/text()').extract_first("").strip()
+
+            area = "全国"  # province
+            city = ""  # city
+            district = ""  # district
+
+            list_item = BidingListItem()  # pipeline item that stores the data
+            list_item.href = href  # tender link
+            list_item.channel = menu.get("channel")  # crawl channel defined at the top (set by the editor)
+            list_item.spidercode = menu.get("code")  # spidercode defined at the top (set by the editor)
+            list_item.title = title  # title
+            list_item.publishtime = publish_time  # tender publish time
+            list_item.site = self.site
+            list_item.area = area or "全国"  # province, defaults to 全国
+            list_item.city = city  # city, empty by default
+            list_item.district = district  # district, empty by default
+
+            list_item.unique_key = ('href', publish_time)
+            list_item.parse = "self.detail_get"  # detail-page callback method
+            list_item.request_params = {"rm_list":['//div[@class="layui-layer-btnhz"]',
+                                                   '//div[@class="company"]',
+                                                   '//div[@class="main_r_t border_f4"]']}
+            list_item.deal_detail = ['//div[@class="detail_boxhz"]',
+                                     '//div[@class="detail_box qst_box"]',
+                                     '//div[@class="main_box"]']  # xpath for the announcement body
+            list_item.proxies = True
+            list_item.parse_url = href
+
+            list_item.files = {
+                "list_xpath": '//a[@href]',
+                "url_xpath": './@href',
+                "name_xpath": './text()',
+                # "file_type":'pdf',                  # default attachment type, used when the url itself carries none
+                "url_key": 'http',  # url keyword that marks a valid attachment link; required, use 'http' if none applies
+                "host": '',  # host to prepend when joining relative urls
+            }
+            yield list_item
+
+        request = self.infinite_pages(request, response)
+        yield request
+
+    def exception_request(self, request, response):
+        self.cookie_pool.del_cookie(self.cookie_pool.get_cookie())
+        request.proxies = get_proxy()
+        yield request
 
 
 if __name__ == "__main__":
-    ZtbpcFeapder(redis_key="detail:firefox").start()
+    Spider(redis_key="detail:firefox").start()
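
The list links on this site carry the announcement id inside a javascript call
rather than a plain href, which is what the re.findall in parse() unpacks.
A standalone check of that extraction; the sample href and its showDetail name
are made up, the real values come from //ul[@class="main_r_con"]/li/a/@href:

    import re

    # Hypothetical sample; real hrefs are read from the list page <a> tags.
    href_org = "javascript:showDetail('12345abc')"

    hid = "".join(re.findall(r"\('(.*?)'", href_org))
    href = f"https://ec.chng.com.cn/ecmall/announcement/announcementDetail.do?announcementId={hid}"

    assert hid == "12345abc"
    print(href)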