|
@@ -6,15 +6,16 @@ Created on 2025-04-29
|
|
---------
|
|
---------
|
|
@author: lzz
|
|
@author: lzz
|
|
"""
|
|
"""
|
|
-import feapder
|
|
|
|
-from items.spider_item import MgpListItem
|
|
|
|
-from collections import namedtuple
|
|
|
|
-from untils.WebCookiePool import WebCookiePool
|
|
|
|
import json
|
|
import json
|
|
|
|
+from collections import namedtuple
|
|
|
|
|
|
|
|
+import feapder
|
|
|
|
+from items.spider_item import BidingListItem
|
|
|
|
+from untils.WebCookiePool import WebCookiePool
|
|
|
|
+from untils.tools import get_proxy
|
|
|
|
|
|
|
|
|
|
-class Feapder(feapder.BiddingListSpider):
|
|
|
|
|
|
+class Spider(feapder.BiddingListSpider):
|
|
|
|
|
|
def start_callback(self):
|
|
def start_callback(self):
|
|
Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
|
|
Menu = namedtuple('Menu', ['channel', 'code', 'tid', 'crawl_page'])
|
|
@@ -37,15 +38,17 @@ class Feapder(feapder.BiddingListSpider):
|
|
"roleId;": ""
|
|
"roleId;": ""
|
|
}
|
|
}
|
|
|
|
|
|
- self.cookie_pool = WebCookiePool(redis_key="zgydcgyzbw_ck", page_url="http://www.chinaunicombidding.cn/bidInformation",
|
|
|
|
- cookie_key="jqmEwVYRfTEJT", driver_type="FIREFOX",
|
|
|
|
- usages_local_driver=True,headless=True)
|
|
|
|
|
|
+ self.cookie_pool = WebCookiePool(redis_key="zgydcgyzbw_ck",
|
|
|
|
+ page_url="http://www.chinaunicombidding.cn/bidInformation",
|
|
|
|
+ cookie_key="jqmEwVYRfTEJT")
|
|
|
|
|
|
|
|
+ self.cookie_pool.user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")
|
|
|
|
|
|
def start_requests(self):
|
|
def start_requests(self):
|
|
|
|
+ url = "http://www.cupb.cn/api/v1/bizAnno/getAnnoList"
|
|
for menu in self.menus:
|
|
for menu in self.menus:
|
|
- start_url = "http://www.cupb.cn/api/v1/bizAnno/getAnnoList"
|
|
|
|
- yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
|
|
|
|
|
|
+ proxies = get_proxy()
|
|
|
|
+ yield feapder.Request(url, item=menu._asdict(), page=1, proxies=proxies)
|
|
|
|
|
|
def download_midware(self, request):
|
|
def download_midware(self, request):
|
|
page = request.page
|
|
page = request.page
|
|
@@ -57,15 +60,17 @@ class Feapder(feapder.BiddingListSpider):
|
|
"pageNo": page,
|
|
"pageNo": page,
|
|
"annoType": menu.get('tid')
|
|
"annoType": menu.get('tid')
|
|
}
|
|
}
|
|
- data = json.dumps(data, separators=(',', ':'))
|
|
|
|
-
|
|
|
|
- cookies = self.cookie_pool.create_cookie()
|
|
|
|
- request.data = data
|
|
|
|
- request.cookies = cookies
|
|
|
|
|
|
+ request.data = json.dumps(data, separators=(',', ':'))
|
|
|
|
+ self.cookie_pool.proxies(proxy=request.get_proxy())
|
|
|
|
+ request.cookies = self.cookie_pool.create_cookie()
|
|
request.headers = self.headers
|
|
request.headers = self.headers
|
|
|
|
|
|
- def parse(self, request, response):
|
|
|
|
|
|
+ def validate(self, request, response):
|
|
|
|
+ if response.status_code != 200:
|
|
|
|
+ raise ConnectionRefusedError
|
|
|
|
+ return True
|
|
|
|
|
|
|
|
+ def parse(self, request, response):
|
|
menu = request.item
|
|
menu = request.item
|
|
info_list = response.json.get('data').get('records')
|
|
info_list = response.json.get('data').get('records')
|
|
for info in info_list:
|
|
for info in info_list:
|
|
@@ -79,7 +84,7 @@ class Feapder(feapder.BiddingListSpider):
|
|
area = cty
|
|
area = cty
|
|
city = ""
|
|
city = ""
|
|
|
|
|
|
- list_item = MgpListItem() # 存储数据的管道
|
|
|
|
|
|
+ list_item = BidingListItem() # 存储数据的管道
|
|
list_item.href = href # 标书链接
|
|
list_item.href = href # 标书链接
|
|
list_item.unique_key = ('href',)
|
|
list_item.unique_key = ('href',)
|
|
list_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
|
|
list_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
|
|
@@ -95,14 +100,16 @@ class Feapder(feapder.BiddingListSpider):
|
|
list_item.deal_detail = [] # 抽取正文xpath
|
|
list_item.deal_detail = [] # 抽取正文xpath
|
|
list_item.proxies = False
|
|
list_item.proxies = False
|
|
list_item.parse_url = f"http://www.chinaunicombidding.cn/api/v1/bizAnno/getAnnoDetailed/{hid}"
|
|
list_item.parse_url = f"http://www.chinaunicombidding.cn/api/v1/bizAnno/getAnnoDetailed/{hid}"
|
|
-
|
|
|
|
yield list_item
|
|
yield list_item
|
|
|
|
|
|
# 无限翻页
|
|
# 无限翻页
|
|
request = self.infinite_pages(request, response)
|
|
request = self.infinite_pages(request, response)
|
|
yield request
|
|
yield request
|
|
|
|
|
|
|
|
+ def exception_request(self, request, response):
|
|
|
|
+ request.proxies = get_proxy()
|
|
|
|
+ yield request
|
|
|
|
|
|
-if __name__ == "__main__":
|
|
|
|
- Feapder(redis_key="lzz:zgydcgyzbw_cgxqgs").start()
|
|
|
|
|
|
|
|
|
|
+if __name__ == "__main__":
|
|
|
|
+ Spider(redis_key="lzz:zgydcgyzbw_cgxqgs").start()
|