
Script fix

dzr 2 weeks ago
commit 8a89f558d3
2 changed files with 57 additions and 44 deletions
  1. a_zgltcgyzbw_bggg/变更公告-列表页.py (+27 -20)
  2. a_zgltcgyzbw_bggg/招标信息-详情页.py (+30 -24)

+ 27 - 20
a_zgltcgyzbw_bggg/变更公告-列表页.py

@@ -6,15 +6,16 @@ Created on 2025-04-29
 ---------
 @author: lzz
 """
-import feapder
-from items.spider_item import MgpListItem
-from collections import namedtuple
-from untils.WebCookiePool import WebCookiePool
 import json
+from collections import namedtuple
 
+import feapder
+from items.spider_item import BidingListItem
+from untils.WebCookiePool import WebCookiePool
+from untils.tools import get_proxy
 
 
-class Feapder(feapder.BiddingListSpider):
+class Spider(feapder.BiddingListSpider):
 
     def start_callback(self):
         Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
@@ -37,15 +38,17 @@ class Feapder(feapder.BiddingListSpider):
             "roleId;": ""
         }
 
-        self.cookie_pool = WebCookiePool(redis_key="zgydcgyzbw_ck", page_url="http://www.chinaunicombidding.cn/bidInformation",
-                                         cookie_key="jqmEwVYRfTEJT", driver_type="FIREFOX",
-                                         usages_local_driver=True,headless=True)
+        self.cookie_pool = WebCookiePool(redis_key="zgydcgyzbw_ck",
+                                         page_url="http://www.chinaunicombidding.cn/bidInformation",
+                                         cookie_key="jqmEwVYRfTEJT")
 
+        self.cookie_pool.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")
 
     def start_requests(self):
+        url = "http://www.cupb.cn/api/v1/bizAnno/getAnnoList"
         for menu in self.menus:
-            start_url = "http://www.cupb.cn/api/v1/bizAnno/getAnnoList"
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
+            proxies = get_proxy()
+            yield feapder.Request(url, item=menu._asdict(), page=1, proxies=proxies)
 
     def download_midware(self, request):
         page = request.page
@@ -57,15 +60,17 @@ class Feapder(feapder.BiddingListSpider):
             "pageNo": page,
             "annoType": menu.get('tid')
         }
-        data = json.dumps(data, separators=(',', ':'))
-
-        cookies = self.cookie_pool.create_cookie()
-        request.data = data
-        request.cookies = cookies
+        request.data = json.dumps(data, separators=(',', ':'))
+        self.cookie_pool.proxies(proxy=request.get_proxy())
+        request.cookies = self.cookie_pool.create_cookie()
         request.headers = self.headers
 
-    def parse(self, request, response):
+    def validate(self, request, response):
+        if response.status_code != 200:
+            raise ConnectionRefusedError
+        return True
 
+    def parse(self, request, response):
         menu = request.item
         info_list = response.json.get('data').get('records')
         for info in info_list:
@@ -79,7 +84,7 @@ class Feapder(feapder.BiddingListSpider):
             area = cty
             city = ""
 
-            list_item = MgpListItem()         # pipeline item that stores the data
+            list_item = BidingListItem()      # pipeline item that stores the data
             list_item.href = href             # tender document URL
             list_item.unique_key = ('href',)
             list_item.channel = menu.get("channel")  # crawl channel defined at the top (named by the editor)
@@ -95,14 +100,16 @@ class Feapder(feapder.BiddingListSpider):
             list_item.deal_detail = []  # xpaths for extracting the body text
             list_item.proxies = False
             list_item.parse_url = f"http://www.chinaunicombidding.cn/api/v1/bizAnno/getAnnoDetailed/{hid}"
-
             yield list_item
 
         # infinite pagination
         request = self.infinite_pages(request, response)
         yield request
 
+    def exception_request(self, request, response):
+        request.proxies = get_proxy()
+        yield request
 
-if __name__ == "__main__":
-    Feapder(redis_key="lzz:zgydcgyzbw_cgxqgs").start()
 
+if __name__ == "__main__":
+    Spider(redis_key="lzz:zgydcgyzbw_cgxqgs").start()
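A minimal standalone sketch of the retry pattern this commit adds to the list spider: validate() treats any non-200 response as a failure, and exception_request() swaps in a fresh proxy before re-queuing the request. The requests-based loop and the fetch_anno_list wrapper below are illustrative assumptions, not feapder internals; get_proxy stands in for the project helper from untils.tools, and the real request body carries more fields than shown here.

    # Illustrative sketch only -- not the spider's actual code path.
    import json
    import requests

    LIST_URL = "http://www.cupb.cn/api/v1/bizAnno/getAnnoList"

    def get_proxy():
        # Placeholder for untils.tools.get_proxy; assumed to return a
        # requests-style proxies dict, e.g. {"http": "...", "https": "..."}.
        return None

    def fetch_anno_list(page, anno_type, max_retries=3):
        # Mirrors download_midware(): compact JSON body, fresh proxy per attempt.
        payload = json.dumps({"pageNo": page, "annoType": anno_type},
                             separators=(',', ':'))
        proxies = get_proxy()
        for _ in range(max_retries):
            resp = requests.post(LIST_URL, data=payload,
                                 proxies=proxies, timeout=10)
            if resp.status_code == 200:    # validate(): non-200 is a failure
                return resp.json()
            proxies = get_proxy()          # exception_request(): rotate proxy
        raise ConnectionRefusedError(f"list request failed after {max_retries} attempts")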

+ 30 - 24
a_zgltcgyzbw_bggg/招标信息-详情页.py

@@ -20,43 +20,49 @@ headers = {
     "roleId;": ""
 }
 
-class Details(feapder.BiddingDetailSpider):
-    ct = 0
-    cookie_pool = WebCookiePool(redis_key="zgydcgyzbw_ck", page_url="http://www.chinaunicombidding.cn/bidInformation",
-                                         cookie_key="jqmEwVYRfTEJT", driver_type="FIREFOX",
-                                         usages_local_driver=True,headless=True)
+
+class Spider(feapder.BiddingDetailSpider):
+
+    def start_callback(self):
+        self.cookie_pool = WebCookiePool(redis_key="zgydcgyzbw_ck",
+                                         page_url="http://www.chinaunicombidding.cn/bidInformation",
+                                         cookie_key="jqmEwVYRfTEJT")
+        self.cookie_pool.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")
+
     def start_requests(self):
         data_list = self.get_tasks_by_rabbitmq(limit=50)
         for item in data_list:
             request_params = item.get("request_params")
-            timeout = request_params.get('timeout', 10)
-            request_params.pop('timeout', None)
+            timeout = request_params.pop('timeout', 10)
 
-            yield feapder.Request(url=item.get("parse_url"), item=item,
-                                  deal_detail=item.get("deal_detail"), callback=eval(item.get("parse")),
-                                  **request_params, timeout=timeout, proxies=False)
+            yield feapder.Request(url=item.get("parse_url"),
+                                  timeout=timeout,
+                                  proxies=False,
+                                  callback=eval(item.get("parse")),
+                                  item=item,
+                                  deal_detail=item.get("deal_detail"),
+                                  **request_params)
 
     def download_midware(self, request):
         request.headers = headers
         request.cookies = self.cookie_pool.get_cookie()
 
-    def detail_get(self, request, response):
-        if self.ct > 5:
-            return
+    def validate(self, request, response):
         if response.status_code != 200:
-            self.ct += 1
-            self.cookie_pool.del_cookie(self.cookie_pool.get_cookie())
-            yield request
-        else:
-            self.ct = 0
-            items = request.item
-            list_item = DataBakItem(**items)
+            raise ConnectionRefusedError
+        return True
 
-            html = response.json.get('data').get('annoText')
-            list_item.contenthtml = html
+    def detail_get(self, request, response):
+        items = request.item
+        data_item = DataBakItem(**items)
+        html = response.json.get('data').get('annoText')
+        data_item.contenthtml = html
+        yield data_item
 
-            yield list_item
+    def exception_request(self, request, response):
+        self.cookie_pool.del_cookie(self.cookie_pool.get_cookie())
+        yield request
 
 
 if __name__ == "__main__":
-    Details(redis_key="lzz:zgydcgyzbw_cgxqgs").start()
+    Spider(redis_key="lzz:zgydcgyzbw_cgxqgs").start()
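The detail spider's failure handling follows the same shape but rotates cookies instead of proxies: validate() raises on a non-200 status, and exception_request() deletes the stale cookie from the shared pool so the next attempt mints a fresh one. The in-memory CookiePool below is a stand-in for the project's redis-backed WebCookiePool (which drives a browser to obtain the jqmEwVYRfTEJT cookie); only the del-cookie-then-retry flow is taken from the commit.

    # Stand-in pool -- the real WebCookiePool is redis-backed and
    # browser-driven; this only illustrates the rotation logic.
    import requests

    class CookiePool:
        def __init__(self):
            self._cookies = []

        def create_cookie(self):
            cookie = {"jqmEwVYRfTEJT": "fresh-value"}  # placeholder value
            self._cookies.append(cookie)
            return cookie

        def get_cookie(self):
            return self._cookies[-1] if self._cookies else self.create_cookie()

        def del_cookie(self, cookie):
            if cookie in self._cookies:
                self._cookies.remove(cookie)

    def fetch_detail(url, pool, max_retries=3):
        for _ in range(max_retries):
            resp = requests.get(url, cookies=pool.get_cookie(), timeout=10)
            if resp.status_code == 200:            # validate()
                return resp.json()["data"]["annoText"]
            pool.del_cookie(pool.get_cookie())     # exception_request(): drop stale cookie
        raise ConnectionRefusedError(url)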