Browse Source

风控改版,爬虫维护

dzr 3 tháng trước
mục cha
commit
ee0f1ca446
2 tập tin đã thay đổi với 55 bổ sung và 141 xóa
  1. 29 70
      a_gtcgpt_cggg/Gtcgpt.py
  2. 26 71
      a_gtcgpt_cggg/gtcgpt_details.py

+ 29 - 70
a_gtcgpt_cggg/Gtcgpt.py

@@ -6,63 +6,17 @@ Created on 2024-09-19
 ---------
 @author: lzz
 """
-import time
 from collections import namedtuple
 
 import feapder
-import requests
+from feapder.utils.tools import joint_url
 from items.spider_item import BidingListItem
-from untils.get_imgcode import get_code
 from untils.tools import get_proxy
 
 from fingerprint import get_fingerprint
 
 
-def Code(proxies):
-    s = requests.session()
-    tm = int(time.time()*1000)
-    headers = {
-        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
-        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-        "Cache-Control": "no-cache",
-        "Connection": "keep-alive",
-        "Pragma": "no-cache",
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
-    }
-
-    url = "https://cg.95306.cn/proxy/portal/enterprise/base/loadComplexValidCodeImg"
-    params = {
-        "validCodeKey": f"{tm}",
-        "timestamp": f"{tm}"
-    }
-    for _ in range(3):
-        response = s.get(url, headers=headers, params=params,timeout=20,proxies=proxies,verify=False)
-        code = get_code(response.content)
-        if len(code) == 5:
-            zheaders = {
-                "Accept": "application/json, text/javascript, */*; q=0.01",
-                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
-                "Origin": "https://cg.95306.cn",
-                "Pragma": "no-cache",
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
-                "X-Requested-With": "XMLHttpRequest",
-            }
-
-            zurl = "https://cg.95306.cn/proxy/portal/elasticSearch/checkRequestNumValidateCode"
-            zdata = {
-                "picValidCodeKey": f"{tm}",
-                "picValidCode": f"{code}"
-            }
-            s.post(zurl, headers=zheaders, data=zdata,timeout=20,proxies=proxies,verify=False)
-            return s.cookies.get_dict()
-    else:
-        return None
-
-
-class Gtcgpt(feapder.BiddingListSpider):
+class Spider(feapder.BiddingListSpider):
 
     def start_callback(self):
         Menu = namedtuple('Menu', ['channel', 'code', 'noticeType', 'tid', 'crawl_page'])
@@ -70,14 +24,11 @@ class Gtcgpt(feapder.BiddingListSpider):
 
         self.menus = [
             Menu('采购公告', 'a_gtcgpt_cggg', '000', 'queryProcurementNoticeList', 20),
-            # Menu('采购结果', 'a_gtcgpt_cgjg', '001', 'queryProcurementResultsList', 20),
         ]
         self.headers = {
             "Accept": "application/json, text/javascript, */*; q=0.01",
             "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
             "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "Pragma": "no-cache",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
             "X-Requested-With": "XMLHttpRequest",
         }
@@ -88,28 +39,37 @@ class Gtcgpt(feapder.BiddingListSpider):
 
     def start_requests(self):
         for menu in self.menus:
-            start_url = f"https://cg.95306.cn/proxy/portal/elasticSearch/{menu.tid}"
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
+            referer = "https://cg.95306.cn/baseinfor/notice/procurementNotice"
+            params = {
+                "bidType": "",
+                "noticeType": f"{menu.noticeType}",
+                "transactionType": "01",
+                "wzType": "",
+                "title": "",
+                "bidding": "",
+                "navigation": ""
+            }
+            self.headers["Referer"] = joint_url(referer, params)
+
+            url = f"https://cg.95306.cn/proxy/portal/elasticSearch/{menu.tid}"
+            yield feapder.Request(url, item=menu._asdict(), page=1, proxies=False)
 
     def download_midware(self, request):
-        if not self.cookies:
-            self.cookies = Code(self.proxy)
+        if self.cookies is None:
+            self.cookies = {
+                'AlteonPcgmh': '0a03b7f3bb36ad3f1f41',
+                'mhId': self.fp,
+            }
 
-        page = request.page
-        noticeType = request.item.get('noticeType')
-        params = {
+        data = {
             'mhId': self.fp,
-            "projBidType": "01",
-            "bidType": "",
-            "noticeType": f"{noticeType}",
-            "title": "",
-            "inforCode": "",
-            "pageNum": f"{page}",
-            "projType": "",
-            "professionalCode": "",
-            "createPeopUnit": ""
+            'projBidType': '01',
+            'bidType': '',
+            'noticeType': '000',
+            'wzType': '',
+            'title': '',
         }
-        request.params = params
+        request.data = data
         request.headers = self.headers
         request.cookies = self.cookies
         request.proxies = self.proxy
@@ -151,7 +111,6 @@ class Gtcgpt(feapder.BiddingListSpider):
             }
             list_item.request_params = {"params": params_d}
             list_item.parse_url = "https://cg.95306.cn/proxy/portal/elasticSearch/indexView"
-
             yield list_item
 
         # 无限翻页设置
@@ -166,4 +125,4 @@ class Gtcgpt(feapder.BiddingListSpider):
 
 
 if __name__ == "__main__":
-    Gtcgpt(redis_key="lzz:Gtcgpt").start()
+    Spider(redis_key="lzz:Gtcgpt").start()

+ 26 - 71
a_gtcgpt_cggg/gtcgpt_details.py

@@ -1,103 +1,58 @@
 # -*- coding: utf-8 -*-
 """
-Created on 2024-11-21
+Created on 2025-04-09
 ---------
 @summary: 国铁采购平台
 ---------
 @author: lzz
 """
 import re
-import time
 
 import feapder
-import requests
 from items.spider_item import DataBakItem
-from untils.get_imgcode import get_code
 from untils.tools import get_proxy
 
 from fingerprint import get_fingerprint
 
-headers = {
-    "Accept": "application/json, text/javascript, */*; q=0.01",
-    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-    "Cache-Control": "no-cache",
-    "Connection": "keep-alive",
-    "Pragma": "no-cache",
-    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
-    "X-Requested-With": "XMLHttpRequest",
-}
 
+class Spider(feapder.BiddingDetailSpider):
 
-def Code(proxies):
-    s = requests.session()
-    tm = int(time.time()*1000)
-    yheaders = {
-        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
-        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-        "Cache-Control": "no-cache",
-        "Connection": "keep-alive",
-        "Pragma": "no-cache",
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
-    }
-
-    url = "https://cg.95306.cn/proxy/portal/enterprise/base/loadComplexValidCodeImg"
-    params = {
-        "validCodeKey": f"{tm}",
-        "timestamp": f"{tm}"
-    }
-    for _ in range(3):
-        response = s.get(url, headers=yheaders, params=params,timeout=20,proxies=proxies,verify=False)
-        code = get_code(response.content)
-        if len(code) == 5:
-            zheaders = {
-                "Accept": "application/json, text/javascript, */*; q=0.01",
-                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-                "Cache-Control": "no-cache",
-                "Connection": "keep-alive",
-                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
-                "Origin": "https://cg.95306.cn",
-                "Pragma": "no-cache",
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
-                "X-Requested-With": "XMLHttpRequest",
-            }
-
-            zurl = "https://cg.95306.cn/proxy/portal/elasticSearch/checkRequestNumValidateCode"
-            zdata = {
-                "picValidCodeKey": f"{tm}",
-                "picValidCode": f"{code}"
-            }
-            res = s.post(zurl, headers=zheaders, data=zdata,timeout=20,proxies=proxies,verify=False)
-
-            return s.cookies.get_dict()
-    else:
-        return None
-
-
-class Details(feapder.BiddingDetailSpider):
-
-    ck = None
-    proxy = get_proxy()
+    def start_callback(self):
+        self.cookies = None
+        self.proxy = get_proxy()
 
     def start_requests(self):
         data_list = self.get_tasks_by_rabbitmq(limit=100)
         for item in data_list:
-            # log.debug(item)
             request_params = item.get("request_params")
             yield feapder.Request(url=item.get("parse_url"),
                                   proxies=False,
                                   callback=eval(item.get("parse")),
-                                  **request_params,
                                   item=item,
-                                  deal_detail=item.get("deal_detail"))
+                                  deal_detail=item.get("deal_detail"),
+                                  **request_params)
 
     def download_midware(self, request):
-        if not self.ck:
-            self.ck = Code(self.proxy)
+        headers = {
+            'Accept': '*/*',
+            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+            'Cache-Control': 'no-cache',
+            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+            'Origin': 'https://cg.95306.cn',
+            'Referer': request.item.get('href'),
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
+            'X-Requested-With': 'XMLHttpRequest',
+        }
+
+        if self.cookies is None:
+            self.cookies = {
+                'AlteonPcgmh': '0a03b7f3bb36ad3f1f41',
+                'mhId': request.params['mhId'],
+            }
 
-        headers['referer'] = request.item.get('href')
         request.headers = headers
         request.proxies = self.proxy
-        request.cookies = self.ck
+        request.cookies = self.cookies
 
     def validate(self, request, response):
         data = response.json.get('data')
@@ -114,11 +69,11 @@ class Details(feapder.BiddingDetailSpider):
         yield data_item
 
     def exception_request(self, request, response):
-        self.ck = None
+        self.cookies = None
         self.proxy = get_proxy()
         request.params['mhId'] = get_fingerprint()
         yield request
 
 
 if __name__ == "__main__":
-    Details(redis_key="lzz:Gtcgpt").start()
+    Spider(redis_key="lzz:Gtcgpt").start()