|
@@ -6,39 +6,19 @@ Created on 2024-01-08
|
|
|
---------
|
|
|
@author: lzz
|
|
|
"""
|
|
|
-import re
|
|
|
from collections import namedtuple
|
|
|
|
|
|
import feapder
|
|
|
-import requests
|
|
|
from items.spider_item import BidingListItem
|
|
|
-from untils.get_imgcode import jy_ocr
|
|
|
-from untils.tools import get_proxy
|
|
|
-
|
|
|
-
|
|
|
-def ocr_captcha(headers, proxies=False, max_retries=10):
|
|
|
- session = requests.session()
|
|
|
- session.proxies = proxies
|
|
|
-
|
|
|
- s = re.compile("'src', '(.*?)'", flags=re.S) # src
|
|
|
- href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
|
|
|
|
|
|
- code = ''
|
|
|
- for _ in range(max_retries):
|
|
|
- resp1 = session.get(href, headers=headers, timeout=30, verify=False)
|
|
|
- text = resp1.content.decode()
|
|
|
- img_url = "http://wssc.hubeigp.gov.cn" + "".join(s.findall(text))
|
|
|
- resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
|
|
|
- code = jy_ocr(image=resp2.content)
|
|
|
- if code and len(code) == 6:
|
|
|
- break
|
|
|
-
|
|
|
- return code, session.cookies.get_dict()
|
|
|
+from tools import ocr_captcha
|
|
|
+from untils.tools import get_proxy
|
|
|
|
|
|
|
|
|
class Spider(feapder.BiddingListSpider):
|
|
|
__custom_setting__ = dict(
|
|
|
- SPIDER_MAX_RETRY_TIMES=10
|
|
|
+ SPIDER_MAX_RETRY_TIMES=10,
|
|
|
+ SESSION_DOWNLOADER="tools.CustomSessionDownloader"
|
|
|
)
|
|
|
|
|
|
def start_callback(self):
|
|
@@ -46,9 +26,8 @@ class Spider(feapder.BiddingListSpider):
|
|
|
self.site = "湖北省政府采购网上商城"
|
|
|
|
|
|
self.menus = [
|
|
|
- Menu('定点采购-终止公告', 'hb_hbszfcgwssc_ddcg_zzgg_python', '4', 2),
|
|
|
+ Menu('定点采购-终止公告', 'hb_hbszfcgwssc_ddcg_zzgg_python', '4', 10),
|
|
|
]
|
|
|
-
|
|
|
self.proxy = get_proxy()
|
|
|
self.headers = {
|
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
|
|
@@ -63,12 +42,13 @@ class Spider(feapder.BiddingListSpider):
|
|
|
def start_requests(self):
|
|
|
for menu in self.menus:
|
|
|
url = f'http://wssc.hubeigp.gov.cn/upgrade/fixed_project_notices?type={menu.typeone}&pt=all'
|
|
|
- yield feapder.Request(url, item=menu._asdict(), use_session=True, page=1, proxies=False)
|
|
|
+ yield feapder.Request(url, item=menu._asdict(), use_session=True, page=1, proxy=False)
|
|
|
|
|
|
def download_midware(self, request):
|
|
|
page = request.page
|
|
|
menu = request.item
|
|
|
request.headers = self.headers
|
|
|
+ request.proxies = self.proxy
|
|
|
|
|
|
if page != 1:
|
|
|
captcha, cookies = ocr_captcha(self.headers, self.proxy)
|
|
@@ -86,7 +66,7 @@ class Spider(feapder.BiddingListSpider):
|
|
|
def validate(self, request, response):
|
|
|
items = response.xpath('//div[@class="jmr_noticelist"]/ul/li')
|
|
|
if not items:
|
|
|
- raise ValueError('列表数据为空!')
|
|
|
+ raise ValueError('访问被拒绝!')
|
|
|
return True
|
|
|
|
|
|
def parse(self, request, response):
|