Browse Source

爬虫维护

lizongze 8 months ago
parent
commit
c4bfd3fedb

+ 8 - 10
中国交建物资采购管理信息系统/全部结果-列表页.py

@@ -1,12 +1,11 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 """
 """
-Created on 2023-7-10
+Created on 2024-12-02
 ---------
 ---------
 @summary: 中国交建物资采购管理信息系统
 @summary: 中国交建物资采购管理信息系统
 ---------
 ---------
 @author: lzz
 @author: lzz
 """
 """
-
 import feapder
 import feapder
 from items.spider_item import MgpListItem
 from items.spider_item import MgpListItem
 from collections import namedtuple
 from collections import namedtuple
@@ -22,21 +21,21 @@ class Feapder(feapder.BiddingListSpider):
         self.site = "中国交建物资采购管理信息系统"
         self.site = "中国交建物资采购管理信息系统"
 
 
         self.menus = [
         self.menus = [
-            Menu('全部结果', 'a_zgjjwzcgglxxxt_qbjg', 1),
+            Menu('全部结果', 'a_zgjjwzcgglxxxt_qbjg', 5),
         ]
         ]
 
 
         self.headers = {
         self.headers = {
-            "APP_TOKEN": "",
+            "APP_TOKEN;": "",
             "Accept": "application/json, text/plain, */*",
             "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Authorization": "",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Authorization;": "",
             "Cache-Control": "no-cache",
             "Cache-Control": "no-cache",
             "Connection": "keep-alive",
             "Connection": "keep-alive",
             "Content-Type": "application/json;charset=UTF-8",
             "Content-Type": "application/json;charset=UTF-8",
             "Origin": "https://sp.iccec.cn",
             "Origin": "https://sp.iccec.cn",
             "Pragma": "no-cache",
             "Pragma": "no-cache",
-            "Referer": "https://sp.iccec.cn/announcementsList?type=8&source=1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+            "Referer": "https://sp.iccec.cn/searchList?type=98",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
         }
         }
 
 
         self.cookies = {
         self.cookies = {
@@ -46,7 +45,7 @@ class Feapder(feapder.BiddingListSpider):
     def start_requests(self):
     def start_requests(self):
         for menu in self.menus:
         for menu in self.menus:
             start_url = "https://sp.iccec.cn/apis/sp/bidc/users/signup/searchSupNoticeNew"
             start_url = "https://sp.iccec.cn/apis/sp/bidc/users/signup/searchSupNoticeNew"
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1,  proxies=False)
+            yield feapder.Request(url=start_url, item=menu._asdict(), page=1)
 
 
     def download_midware(self, request):
     def download_midware(self, request):
         page = request.page
         page = request.page
@@ -120,7 +119,6 @@ class Feapder(feapder.BiddingListSpider):
             list_item.proxies = False
             list_item.proxies = False
             list_item.parse_url = "https://sp.iccec.cn/apis/sp/bidc/users/signup/qryNoticeDetail"  # 详情页请求地址
             list_item.parse_url = "https://sp.iccec.cn/apis/sp/bidc/users/signup/qryNoticeDetail"  # 详情页请求地址
 
 
-
             yield list_item
             yield list_item
 
 
         # 无限翻页
         # 无限翻页

+ 30 - 20
中国交建物资采购管理信息系统/全部结果-详情页.py

@@ -1,12 +1,11 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 """
 """
-Created on 2023-7-10
+Created on 2024-12-02
 ---------
 ---------
 @summary: 中国交建物资采购管理信息系统
 @summary: 中国交建物资采购管理信息系统
 ---------
 ---------
 @author: lzz
 @author: lzz
 """
 """
-
 import feapder
 import feapder
 from items.spider_item import DataBakItem
 from items.spider_item import DataBakItem
 from feapder.utils.tools import log
 from feapder.utils.tools import log
@@ -15,51 +14,62 @@ from untils.tools import extract_file_type
 import base64
 import base64
 
 
 
 
-
-
 def btoa(data):
 def btoa(data):
-    if data:
-        ss = base64.b64decode(data).decode()
-        return ss
-    else:
+    try:
+        return base64.b64decode(data).decode()
+    except:
         return ''
         return ''
 
 
-
-
+cookies = {
+    "language": "zh-cn"
+}
+headers = {
+    "APP_TOKEN;": "",
+    "Accept": "application/json, text/plain, */*",
+    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+    "Authorization;": "",
+    "Cache-Control": "no-cache",
+    "Connection": "keep-alive",
+    "Content-Type": "application/json;charset=UTF-8",
+    "Origin": "https://sp.iccec.cn",
+    "Pragma": "no-cache",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+}
 
 
 class Details(feapder.BiddingDetailSpider):
 class Details(feapder.BiddingDetailSpider):
 
 
     def start_requests(self):
     def start_requests(self):
-        data_list = self.get_tasks_by_rabbitmq(limit=20)
+        data_list = self.get_tasks_by_rabbitmq(limit=50)
         for item in data_list:
         for item in data_list:
-            log.debug(item)
+            # log.debug(item)
             request_params = item.get("request_params")
             request_params = item.get("request_params")
             yield feapder.Request(url=item.get("parse_url"), item=item,
             yield feapder.Request(url=item.get("parse_url"), item=item,
-                                  deal_detail=item.get("deal_detail"),proxies=False,
+                                  deal_detail=item.get("deal_detail"),
                                   callback=eval(item.get("parse")),  **request_params)
                                   callback=eval(item.get("parse")),  **request_params)
 
 
 
 
-
     def detail_get(self, request, response):
     def detail_get(self, request, response):
 
 
         items = request.item
         items = request.item
         list_item = DataBakItem(**items)
         list_item = DataBakItem(**items)
+        dt = response.json.get('data')
+        html = dt.get('textInfo')
+        linkInfo = dt.get('linkInfo','')
 
 
-        html = response.json.get('data').get('textInfo')
-
-        list_item.contenthtml = html
+        list_item.contenthtml = html + linkInfo
 
 
         file_list = response.json.get('data').get('fileinfoBOS')
         file_list = response.json.get('data').get('fileinfoBOS')
         if file_list:
         if file_list:
             attachments = {}
             attachments = {}
             for info in file_list:
             for info in file_list:
-                fid = info.get('createId')
-                file_url = btoa(info.get('fileUrl')) or f"https://sp.iccec.cn/apis/sp/oss/users/signup/downloadByUploadId?uploadId={fid}"
+                fid = info.get('fileUrl')
                 file_name = info.get('fileName').strip()
                 file_name = info.get('fileName').strip()
+                file_url = f"https://sp.iccec.cn/apis/sp/oss/users/signup/downloadByUploadId?fileUrl={fid}&fileName={file_name}"
                 file_type = extract_file_type(file_name,file_url)
                 file_type = extract_file_type(file_name,file_url)
                 if file_type:
                 if file_type:
                     attachment = AttachmentDownloader().fetch_attachment(
                     attachment = AttachmentDownloader().fetch_attachment(
-                        file_name=file_name, file_type=file_type, download_url=file_url)
+                        file_name=file_name, file_type=file_type, download_url=file_url,
+                    headers=headers,cookies=cookies)
                     attachments[str(len(attachments) + 1)] = attachment
                     attachments[str(len(attachments) + 1)] = attachment
             if attachments:
             if attachments:
                 list_item.projectinfo = {"attachments": attachments}
                 list_item.projectinfo = {"attachments": attachments}

+ 9 - 14
中国交建物资采购管理信息系统/流标公告-列表页.py

@@ -1,12 +1,11 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 """
 """
-Created on 2023-7-10
+Created on 2024-12-02
 ---------
 ---------
 @summary: 中国交建物资采购管理信息系统
 @summary: 中国交建物资采购管理信息系统
 ---------
 ---------
 @author: lzz
 @author: lzz
 """
 """
-
 import feapder
 import feapder
 from items.spider_item import MgpListItem
 from items.spider_item import MgpListItem
 from collections import namedtuple
 from collections import namedtuple
@@ -22,21 +21,21 @@ class Feapder(feapder.BiddingListSpider):
         self.site = "中国交建物资采购管理信息系统"
         self.site = "中国交建物资采购管理信息系统"
 
 
         self.menus = [
         self.menus = [
-            Menu('流标公告', 'a_zgjjwzcgglxxxt_lbgg', 1),
+            Menu('流标公告', 'a_zgjjwzcgglxxxt_lbgg', 2),
         ]
         ]
 
 
         self.headers = {
         self.headers = {
-            "APP_TOKEN": "",
+            "APP_TOKEN;": "",
             "Accept": "application/json, text/plain, */*",
             "Accept": "application/json, text/plain, */*",
-            "Accept-Language": "zh-CN,zh;q=0.9",
-            "Authorization": "",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            "Authorization;": "",
             "Cache-Control": "no-cache",
             "Cache-Control": "no-cache",
             "Connection": "keep-alive",
             "Connection": "keep-alive",
             "Content-Type": "application/json;charset=UTF-8",
             "Content-Type": "application/json;charset=UTF-8",
             "Origin": "https://sp.iccec.cn",
             "Origin": "https://sp.iccec.cn",
             "Pragma": "no-cache",
             "Pragma": "no-cache",
             "Referer": "https://sp.iccec.cn/announcementsList?type=8&source=1",
             "Referer": "https://sp.iccec.cn/announcementsList?type=8&source=1",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
         }
         }
 
 
         self.cookies = {
         self.cookies = {
@@ -46,7 +45,7 @@ class Feapder(feapder.BiddingListSpider):
     def start_requests(self):
     def start_requests(self):
         for menu in self.menus:
         for menu in self.menus:
             start_url = "https://sp.iccec.cn/apis/sp/bidc/users/signup/listSupBidNotice"
             start_url = "https://sp.iccec.cn/apis/sp/bidc/users/signup/listSupBidNotice"
-            yield feapder.Request(url=start_url, item=menu._asdict(), page=1,  proxies=False)
+            yield feapder.Request(url=start_url, item=menu._asdict(), page=1)
 
 
     def download_midware(self, request):
     def download_midware(self, request):
         page = request.page
         page = request.page
@@ -94,7 +93,7 @@ class Feapder(feapder.BiddingListSpider):
             list_item.city = city  # 城市 默认为空
             list_item.city = city  # 城市 默认为空
 
 
             list_item.unique_key = ('href',)
             list_item.unique_key = ('href',)
-            list_item.parse = "self.detail_json"  # 详情页回调方法
+            list_item.parse = "self.detail_get"  # 详情页回调方法
 
 
             ddata = {
             ddata = {
                 "schemeId": schemeId,
                 "schemeId": schemeId,
@@ -107,10 +106,6 @@ class Feapder(feapder.BiddingListSpider):
                                         "headers":self.headers,
                                         "headers":self.headers,
                                         "cookies":self.cookies}
                                         "cookies":self.cookies}
 
 
-            list_item.deal_detail = '''
-html = response.json.get('data').get('textInfo')
-list_item.contenthtml = html
-            '''
             list_item.proxies = False
             list_item.proxies = False
             list_item.parse_url = "https://sp.iccec.cn/apis/sp/bidc/users/signup/qryNoticeDetail"  # 详情页请求地址
             list_item.parse_url = "https://sp.iccec.cn/apis/sp/bidc/users/signup/qryNoticeDetail"  # 详情页请求地址
 
 
@@ -123,4 +118,4 @@ list_item.contenthtml = html
 
 
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    Feapder(redis_key="detail:normal_details", user="106").start()
+    Feapder(redis_key="lzz:zgjjwzcgglxxxt_qbjg").start()

+ 4 - 7
中国交建物资采购管理信息系统/通知公告-列表页.py

@@ -1,12 +1,11 @@
 # -*- coding: utf-8 -*-
 # -*- coding: utf-8 -*-
 """
 """
-Created on 2023-7-10
+Created on 2024-12-02
 ---------
 ---------
 @summary: 中国交建物资采购管理信息系统
 @summary: 中国交建物资采购管理信息系统
 ---------
 ---------
 @author: lzz
 @author: lzz
 """
 """
-
 import feapder
 import feapder
 from items.spider_item import MgpListItem
 from items.spider_item import MgpListItem
 from collections import namedtuple
 from collections import namedtuple
@@ -15,7 +14,7 @@ import re
 
 
 def viewInfoDetail(href):
 def viewInfoDetail(href):
     value = "".join(re.findall("'(.*?)'",href)[0]).replace('\\n','').replace('\\t','').replace('\\r','')
     value = "".join(re.findall("'(.*?)'",href)[0]).replace('\\n','').replace('\\t','').replace('\\r','')
-    url = "http://ec.ccccltd.cn/PMS/moredetail.shtml?id="+value
+    url = "https://ec.ccccltd.cn/PMS/moredetail.shtml?id="+value
     return url
     return url
 
 
 
 
@@ -47,7 +46,7 @@ class Feapder(feapder.BiddingListSpider):
 
 
     def start_requests(self):
     def start_requests(self):
         for menu in self.menus:
         for menu in self.menus:
-            start_url = "http://ec.ccccltd.cn/PMS/gysmore.shtml?id=SrZ/fkUbA7HVziGixMco3eq/vJlhtW8oM5Aj2hP2SXbI1E7xN8YH2nZStkmvDWjhqfyswIM+o9E="
+            start_url = "https://ec.ccccltd.cn/PMS/gysmore.shtml?id=SrZ/fkUbA7HVziGixMco3eq/vJlhtW8oM5Aj2hP2SXbI1E7xN8YH2nZStkmvDWjhqfyswIM+o9E="
             yield feapder.Request(url=start_url, item=menu._asdict(), page=1,
             yield feapder.Request(url=start_url, item=menu._asdict(), page=1,
                                   render_time=5,render=True, proxies=False)
                                   render_time=5,render=True, proxies=False)
 
 
@@ -55,9 +54,7 @@ class Feapder(feapder.BiddingListSpider):
         page = request.page
         page = request.page
         request.headers = self.headers
         request.headers = self.headers
 
 
-
     def parse(self, request, response):
     def parse(self, request, response):
-
         menu = request.item
         menu = request.item
         info_list = response.xpath('//table[@class="listCss"]/tbody/tr/td/table/tbody/tr')
         info_list = response.xpath('//table[@class="listCss"]/tbody/tr/td/table/tbody/tr')
         for info in info_list[1:]:
         for info in info_list[1:]:
@@ -94,4 +91,4 @@ class Feapder(feapder.BiddingListSpider):
 
 
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    Feapder(redis_key="detail:chrome", user="105").start()
+    Feapder(redis_key="detail:chrome").start()