|
@@ -6,19 +6,18 @@ Created on 2025-04-22
|
|
---------
|
|
---------
|
|
@author: lzz
|
|
@author: lzz
|
|
"""
|
|
"""
|
|
-import feapder
|
|
|
|
-from items.spider_item import MgpListItem
|
|
|
|
|
|
+import re
|
|
from collections import namedtuple
|
|
from collections import namedtuple
|
|
|
|
+
|
|
|
|
+import feapder
|
|
|
|
+from items.spider_item import BidingListItem
|
|
from untils.WebCookiePool import WebCookiePool
|
|
from untils.WebCookiePool import WebCookiePool
|
|
from untils.tools import get_proxy
|
|
from untils.tools import get_proxy
|
|
-import re
|
|
|
|
|
|
|
|
|
|
|
|
-
|
|
|
|
-class ZtbpcFeapder(feapder.BiddingListSpider):
|
|
|
|
|
|
+class Spider(feapder.BiddingListSpider):
|
|
|
|
|
|
def start_callback(self):
|
|
def start_callback(self):
|
|
-
|
|
|
|
self.site = "中国华能集团公司"
|
|
self.site = "中国华能集团公司"
|
|
|
|
|
|
Menu = namedtuple('Menu', ['channel', 'code', 'tp', 'crawl_page'])
|
|
Menu = namedtuple('Menu', ['channel', 'code', 'tp', 'crawl_page'])
|
|
@@ -39,22 +38,20 @@ class ZtbpcFeapder(feapder.BiddingListSpider):
|
|
"Upgrade-Insecure-Requests": "1",
|
|
"Upgrade-Insecure-Requests": "1",
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
|
}
|
|
}
|
|
- self.proxy = get_proxy()
|
|
|
|
- self.ct = 0
|
|
|
|
|
|
+
|
|
self.cookie_pool = WebCookiePool(redis_key="zghnjtgs_gkxjgg_ck",
|
|
self.cookie_pool = WebCookiePool(redis_key="zghnjtgs_gkxjgg_ck",
|
|
page_url="http://ec.chng.com.cn/ecmall/more.do",
|
|
page_url="http://ec.chng.com.cn/ecmall/more.do",
|
|
- cookie_key="S6J51OuUjLieT",
|
|
|
|
- driver_type="FIREFOX")
|
|
|
|
|
|
+ cookie_key="S6J51OuUjLieP")
|
|
|
|
+ self.cookie_pool.user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36")
|
|
|
|
|
|
def start_requests(self):
|
|
def start_requests(self):
|
|
|
|
+ url = "http://ec.chng.com.cn/ecmall/more.do"
|
|
for menu in self.menus:
|
|
for menu in self.menus:
|
|
- start_url = "http://ec.chng.com.cn/ecmall/more.do"
|
|
|
|
- yield feapder.Request(url=start_url, item=menu._asdict(), page=1, proxies=False)
|
|
|
|
|
|
+ proxies = get_proxy()
|
|
|
|
+ yield feapder.Request(url, item=menu._asdict(), page=1, proxies=proxies)
|
|
|
|
|
|
def download_midware(self, request):
|
|
def download_midware(self, request):
|
|
page = request.page
|
|
page = request.page
|
|
- self.cookie_pool.proxy = self.proxy.get('http')
|
|
|
|
- cookies = self.cookie_pool.get_cookie()
|
|
|
|
menu = request.item
|
|
menu = request.item
|
|
data = {
|
|
data = {
|
|
"type": "107",
|
|
"type": "107",
|
|
@@ -65,73 +62,70 @@ class ZtbpcFeapder(feapder.BiddingListSpider):
|
|
"limit": "50"
|
|
"limit": "50"
|
|
}
|
|
}
|
|
request.data = data
|
|
request.data = data
|
|
- request.cookies = cookies
|
|
|
|
|
|
+
|
|
|
|
+ self.cookie_pool.proxies(request.get_proxy())
|
|
|
|
+ request.cookies = self.cookie_pool.get_cookie()
|
|
request.headers = self.headers
|
|
request.headers = self.headers
|
|
|
|
|
|
- def exception_request(self, request, response):
|
|
|
|
- self.proxy = get_proxy()
|
|
|
|
- yield request
|
|
|
|
|
|
+ def validate(self, request, response):
|
|
|
|
+ if response.status_code != 200:
|
|
|
|
+ raise ConnectionRefusedError
|
|
|
|
+ return True
|
|
|
|
|
|
def parse(self, request, response):
|
|
def parse(self, request, response):
|
|
- if self.ct > 5:
|
|
|
|
- return
|
|
|
|
- if response.status_code != 200:
|
|
|
|
- self.ct += 1
|
|
|
|
- self.cookie_pool.del_cookie(self.cookie_pool.get_cookie())
|
|
|
|
- self.proxy = get_proxy()
|
|
|
|
- yield request
|
|
|
|
- else:
|
|
|
|
- self.ct = 0
|
|
|
|
- menu = request.item
|
|
|
|
- info_list = response.xpath('//ul[@class="main_r_con"]/li')
|
|
|
|
- for info in info_list:
|
|
|
|
- href_org = info.xpath('./a/@href').extract_first()
|
|
|
|
- hid = "".join(re.findall("\('(.*?)'", href_org))
|
|
|
|
- href = f"https://ec.chng.com.cn/ecmall/announcement/announcementDetail.do?announcementId={hid}"
|
|
|
|
- title = info.xpath('./a/@title').extract_first("").strip()
|
|
|
|
- publish_time = info.xpath('./p/text()').extract_first("").strip()
|
|
|
|
-
|
|
|
|
- area = "全国" # 省份
|
|
|
|
- city = "" # 城市
|
|
|
|
- district = "" # 区县
|
|
|
|
-
|
|
|
|
- list_item = MgpListItem() # 存储数据的管道
|
|
|
|
- list_item.href = href # 标书链接
|
|
|
|
- list_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
|
|
|
|
- list_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
|
|
|
|
- list_item.title = title # 标题
|
|
|
|
- list_item.publishtime = publish_time # 标书发布时间
|
|
|
|
- list_item.site = self.site
|
|
|
|
- list_item.area = area or "全国" # 省份 默认:全国
|
|
|
|
- list_item.city = city # 城市 默认 为空
|
|
|
|
- list_item.district = district # 区县 默认 为空
|
|
|
|
-
|
|
|
|
- list_item.unique_key = ('href',publish_time)
|
|
|
|
- list_item.parse = "self.detail_get" # 详情页回调方法
|
|
|
|
- list_item.request_params = {"rm_list":['//div[@class="layui-layer-btnhz"]',
|
|
|
|
- '//div[@class="company"]',
|
|
|
|
- '//div[@class="main_r_t border_f4"]']}
|
|
|
|
- list_item.deal_detail = ['//div[@class="detail_boxhz"]',
|
|
|
|
- '//div[@class="detail_box qst_box"]',
|
|
|
|
- '//div[@class="main_box"]'] # 抽取正文xpath
|
|
|
|
- list_item.proxies = True
|
|
|
|
- list_item.parse_url = href
|
|
|
|
-
|
|
|
|
- list_item.files = {
|
|
|
|
- "list_xpath": '//a[@href]',
|
|
|
|
- "url_xpath": './@href',
|
|
|
|
- "name_xpath": './text()',
|
|
|
|
- # "file_type":'pdf', # 默认的附件类型,用于url中未带附件类型的
|
|
|
|
- "url_key": 'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
|
|
|
|
- "host": '', # 需要拼接url的host
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- yield list_item
|
|
|
|
-
|
|
|
|
-
|
|
|
|
- request = self.infinite_pages(request, response)
|
|
|
|
- yield request
|
|
|
|
|
|
+ menu = request.item
|
|
|
|
+ info_list = response.xpath('//ul[@class="main_r_con"]/li')
|
|
|
|
+ for info in info_list:
|
|
|
|
+ href_org = info.xpath('./a/@href').extract_first()
|
|
|
|
+ hid = "".join(re.findall("\('(.*?)'", href_org))
|
|
|
|
+ href = f"https://ec.chng.com.cn/ecmall/announcement/announcementDetail.do?announcementId={hid}"
|
|
|
|
+ title = info.xpath('./a/@title').extract_first("").strip()
|
|
|
|
+ publish_time = info.xpath('./p/text()').extract_first("").strip()
|
|
|
|
+
|
|
|
|
+ area = "全国" # 省份
|
|
|
|
+ city = "" # 城市
|
|
|
|
+ district = "" # 区县
|
|
|
|
+
|
|
|
|
+ list_item = BidingListItem() # 存储数据的管道
|
|
|
|
+ list_item.href = href # 标书链接
|
|
|
|
+ list_item.channel = menu.get("channel") # 最上方定义的抓取栏目 (编辑器定的)
|
|
|
|
+ list_item.spidercode = menu.get("code") # 最上方定义的爬虫code(编辑器定的)
|
|
|
|
+ list_item.title = title # 标题
|
|
|
|
+ list_item.publishtime = publish_time # 标书发布时间
|
|
|
|
+ list_item.site = self.site
|
|
|
|
+ list_item.area = area or "全国" # 省份 默认:全国
|
|
|
|
+ list_item.city = city # 城市 默认 为空
|
|
|
|
+ list_item.district = district # 区县 默认 为空
|
|
|
|
+
|
|
|
|
+ list_item.unique_key = ('href', publish_time)
|
|
|
|
+ list_item.parse = "self.detail_get" # 详情页回调方法
|
|
|
|
+ list_item.request_params = {"rm_list":['//div[@class="layui-layer-btnhz"]',
|
|
|
|
+ '//div[@class="company"]',
|
|
|
|
+ '//div[@class="main_r_t border_f4"]']}
|
|
|
|
+ list_item.deal_detail = ['//div[@class="detail_boxhz"]',
|
|
|
|
+ '//div[@class="detail_box qst_box"]',
|
|
|
|
+ '//div[@class="main_box"]'] # 抽取正文xpath
|
|
|
|
+ list_item.proxies = True
|
|
|
|
+ list_item.parse_url = href
|
|
|
|
+
|
|
|
|
+ list_item.files = {
|
|
|
|
+ "list_xpath": '//a[@href]',
|
|
|
|
+ "url_xpath": './@href',
|
|
|
|
+ "name_xpath": './text()',
|
|
|
|
+ # "file_type":'pdf', # 默认的附件类型,用于url中未带附件类型的
|
|
|
|
+ "url_key": 'http', # 用于区别连接是否为正常附件连接的url关键词 必须携带,如无可填http
|
|
|
|
+ "host": '', # 需要拼接url的host
|
|
|
|
+ }
|
|
|
|
+ yield list_item
|
|
|
|
+
|
|
|
|
+ request = self.infinite_pages(request, response)
|
|
|
|
+ yield request
|
|
|
|
+
|
|
|
|
+ def exception_request(self, request, response):
|
|
|
|
+ self.cookie_pool.del_cookie(self.cookie_pool.get_cookie())
|
|
|
|
+ request.proxies = get_proxy()
|
|
|
|
+ yield request
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if __name__ == "__main__":
|
|
- ZtbpcFeapder(redis_key="detail:firefox").start()
|
|
|
|
|
|
+ Spider(redis_key="detail:firefox").start()
|