|
@@ -16,38 +16,35 @@ from feapder.network.request import requests
|
|
|
from feapder.utils.log import log as logger
|
|
|
from feapder.utils.tools import json
|
|
|
|
|
|
+from cookie_pool import WebCookiePool
|
|
|
+
|
|
|
# Lightweight record describing one list-page crawl task:
# channel label, source code, announcement type, and API business keyword.
Menu = namedtuple('Menu', 'channel code type businessKeyWord')
|
|
|
|
|
|
|
|
|
class AliRobotsCaptchaError(Exception):
    """Raised when the JSON list endpoint answers with an HTML page.

    An HTML body (see ``validate``) means the site's Aliyun "no-touch"
    robot check rejected the current cookies; ``exception_request`` reacts
    by acquiring a fresh cookie/proxy pair via ``ali_robots_cookies``.
    """
|
class ListPageSpider(feapder.AirSpider):
|
|
|
|
|
|
+ cookie_pool = WebCookiePool(
|
|
|
+ redis_key='zgztb.list.cookies',
|
|
|
+ page_url='http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getSearch.do'
|
|
|
+ )
|
|
|
+
|
|
|
@property
def proxy(self):
    """Return a freshly obtained proxy mapping from the swordfish service."""
    fresh_proxy = swordfish_proxy()
    return fresh_proxy
|
|
- @staticmethod
|
|
|
- def extract_address(region):
|
|
|
- if region:
|
|
|
- args = region.split(' ')
|
|
|
- if len(args) == 2:
|
|
|
- area, city = args
|
|
|
- elif len(args) == 1:
|
|
|
- area, city = args[0], ''
|
|
|
- else:
|
|
|
- area, city, *argi = args
|
|
|
- else:
|
|
|
- area, city = '全国', ''
|
|
|
-
|
|
|
- area = area.strip().replace('省', '').replace('市', '')
|
|
|
- city = city.strip()
|
|
|
- return area, city
|
|
|
-
|
|
|
def start_callback(self):
    """Initialise per-run state before the first request is emitted."""
    # Mongo collection name the list items are written to.
    self._task_coll_name = 'zgzb_list'
    # Process-wide proxy and cookie pair, refreshed on failure.
    self._proxies = None
    self._cookies = None
|
|
def start_requests(self):
|
|
|
- self._proxies = self.proxy
|
|
|
+ # self._proxies = self.proxy
|
|
|
+ self.ali_robots_cookies(limit=40) # 获取全局 cookies 与 全局代理
|
|
|
+
|
|
|
task_menus = [
|
|
|
Menu('未按数据规范-招标公告', 'a_zgzbtbggfwpt_wasjgf_zbgg', '招标公告', 'tenderBulletin'),
|
|
|
Menu('未按数据规范-开标记录', 'a_zgzbtbggfwpt_wasjgf_kbjl', '开标记录', 'openBidRecord'),
|
|
@@ -89,6 +86,7 @@ class ListPageSpider(feapder.AirSpider):
|
|
|
def download_midware(self, request):
|
|
|
request.url = 'http://www.cebpubservice.com/ctpsp_iiss/searchbusinesstypebeforedooraction/getStringMethod.do'
|
|
|
request.proxies = self._proxies
|
|
|
+ request.cookies = self._cookies
|
|
|
request.method = 'POST'
|
|
|
request.timeout = 5
|
|
|
request.headers = {
|
|
@@ -105,49 +103,37 @@ class ListPageSpider(feapder.AirSpider):
|
|
|
}
|
|
|
|
|
|
def validate(self, request, response):
    """Validate a list-page response before it reaches ``parse``.

    Raises:
        AliRobotsCaptchaError: the body is HTML, i.e. the robot check
            rejected our cookies instead of returning JSON.

    Returns:
        False to drop the request when the payload is empty, True otherwise.
    """
    if response.is_html:
        # The JSON endpoint answered with an HTML page -> captcha wall.
        raise AliRobotsCaptchaError

    msg = request.meta['msg']
    menu = request.menu
    resp_json = response.json  # did the data API reply with a payload?
    if not resp_json:
        logger.info(f'{msg}-无列表数据')
        return False

    # Summarise paging info for the log line.  Guard every level with a
    # default: the original crashed with AttributeError/TypeError when
    # "object", "page" or "returnlist" was missing from the payload.
    data = resp_json.get("object") or {}
    return_list = data.get("returnlist") or []

    page = data.get("page") or {}
    total_page = page.get("totalPage", 0)
    total_count = page.get("totalCount", 0)
    page_no = page.get("pageNo", 0)
    row = page.get("row", 0)

    msg = f"{menu.channel},超出最大采集页码"
    if page_no <= total_page:
        tips = [
            menu.channel,
            f'共{total_page}页{total_count}/{len(return_list)}条',
            f'第{page_no}页{row}条',
        ]
        msg = "-".join(tips)

    logger.info(msg)
    return True
|
|
def parse(self, request, response):
|
|
|
menu = request.menu
|
|
@@ -198,9 +184,70 @@ class ListPageSpider(feapder.AirSpider):
|
|
|
yield item
|
|
|
logger.info(f"采集成功--{menu.channel}-{item['title']}-{publish_time}")
|
|
|
|
|
|
def exception_request(self, request, response, e):
    """Route request failures.

    Proxy-level errors back off and rotate the proxy; a captcha error
    refreshes the cookie pool; anything else is logged with a traceback.
    """
    msg = request.meta['msg']

    proxy_related = (
        json.decoder.JSONDecodeError,
        requests.exceptions.ConnectionError,
    )
    if isinstance(e, proxy_related):
        # Broken or banned proxy: wait, then grab a fresh one.
        interval = request.meta['interval']
        logger.warning(f"{msg}--代理失效,{interval}s后重试...")
        tools.delay_time(interval)
        self._proxies = self.proxy
        return

    if isinstance(e, AliRobotsCaptchaError):
        # HTML captcha page: current cookies are dead, re-acquire them.
        logger.info("cookies失效,重新获取...")
        self.ali_robots_cookies(request)
        return

    logger.error(f"{msg}--请求失败")
    logger.exception(f'异常原因:{e}')
|
|
def end_callback(self):
    """Hook invoked once the spider has drained its request queue."""
    logger.info("列表页采集结束")
|
|
def ali_robots_cookies(self, request=None, limit=3):
    """Pass the Aliyun "no-touch" robot slider check and cache the result.

    Retries with a fresh proxy until the cookie pool yields a plausible
    cookie set (more than 4 entries looks like a passed check), then
    stores the cookie/proxy pair on the spider — and on *request*, if
    given — for subsequent downloads.

    @param request: optional request to patch in place with the new pair
    @param limit: maximum number of acquisition attempts
    """
    retries = 0
    cookies = None
    proxy = self.proxy
    while retries < limit:
        cookies = self.cookie_pool.create_cookies(proxy=proxy.get('http'))
        if not cookies or len(cookies) <= 4:
            # Too few cookies means the slider was not passed; the proxy
            # is likely flagged, so rotate it before retrying.
            proxy = self.proxy
            retries += 1
            continue

        if request is not None:
            request.cookies = cookies
            request.proxies = proxy

        break
    else:
        # All attempts failed; the original stored the bad pair silently,
        # so at least leave a trace before downstream requests hit the wall.
        logger.warning(f"获取cookies失败,已重试{limit}次")

    self._cookies = cookies
    self._proxies = proxy
|
|
|
+ @staticmethod
|
|
|
+ def extract_address(region):
|
|
|
+ if region:
|
|
|
+ args = region.split(' ')
|
|
|
+ if len(args) == 2:
|
|
|
+ area, city = args
|
|
|
+ elif len(args) == 1:
|
|
|
+ area, city = args[0], ''
|
|
|
+ else:
|
|
|
+ area, city, *argi = args
|
|
|
+ else:
|
|
|
+ area, city = '全国', ''
|
|
|
+
|
|
|
+ area = area.strip().replace('省', '').replace('市', '')
|
|
|
+ city = city.strip()
|
|
|
+ return area, city
|
|
|
+
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the list-page spider with a single worker thread.
    spider = ListPageSpider(thread_count=1)
    spider.start()