|
@@ -10,7 +10,7 @@ from crawler.clean_html import cleaner
|
|
|
from crawler.crawl_scheduler import Scheduler
|
|
|
from crawler.login import login, load_login_cookies, login_check
|
|
|
from utils.databases import mongo_table, int2long
|
|
|
-from utils.execptions import CrawlError, YbwCrawlError
|
|
|
+from utils.execptions import YbwCrawlError
|
|
|
from utils.log import logger
|
|
|
from utils.socks5 import Proxy
|
|
|
|
|
@@ -89,7 +89,8 @@ class DetailSpider:
|
|
|
update = {'crawl': False}
|
|
|
self._update_crawl_task(task['_id'], **update)
|
|
|
|
|
|
- def crawl_request(self, url):
|
|
|
+ def crawl_request(self, item: dict):
|
|
|
+ url = item['competehref']
|
|
|
headers = {
|
|
|
'Host': 'www.chinabidding.cn',
|
|
|
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
|
|
@@ -109,8 +110,14 @@ class DetailSpider:
|
|
|
request_params.setdefault('timeout', 60)
|
|
|
|
|
|
retries = 0
|
|
|
+ retries_502, max_retries_502 = 0, 15
|
|
|
proxy, proxies = None, None
|
|
|
while retries < 3:
|
|
|
+ if retries_502 > max_retries_502:
|
|
|
+ # the website has removed this record
|
|
|
+ self._update_crawl_task(item['_id'], crawl_status='remove')
|
|
|
+ break
|
|
|
+
|
|
|
login_cookies = load_login_cookies(self.user.phone)
|
|
|
if login_cookies is None:
|
|
|
login(*self.user)
|
|
@@ -122,7 +129,7 @@ class DetailSpider:
|
|
|
|
|
|
try:
|
|
|
r = requests.get(url, **request_params)
|
|
|
- '''账号登录状态检查'''
|
|
|
+ # check account login status
|
|
|
retry_login = login_check(self.user.phone, url, False)
|
|
|
if retry_login:
|
|
|
logger.info(f"[重新登录]{self.user.phone}")
|
|
@@ -140,7 +147,9 @@ class DetailSpider:
|
|
|
element = fromstring(r.text)
|
|
|
nodes = element.xpath('//*[@id="main_dom"]/div[1]')
|
|
|
if len(nodes) != 1:
|
|
|
- raise CrawlError(code=10021, reason=f'"main_dom"属性匹配个数:{len(nodes)}')
|
|
|
+ retries_502 += 1
|
|
|
+ logger.debug(f'"main_dom"属性匹配个数:{len(nodes)}, {r.status_code} - {url}')
|
|
|
+ continue
|
|
|
else:
|
|
|
node = nodes[0]
|
|
|
logger.info(f'[采集正文] id={node.attrib.get("id")}')
|
|
@@ -204,8 +213,7 @@ class DetailSpider:
|
|
|
try:
|
|
|
# 检查请求采集任务
|
|
|
CheckTask(item)
|
|
|
- url = item['competehref']
|
|
|
- response = self.crawl_request(url)
|
|
|
+ response = self.crawl_request(item)
|
|
|
if response is not None:
|
|
|
self.crawl_response(response, item)
|
|
|
self._update_crawl_task(item["_id"], crawl_status='finished')
|
|
@@ -232,10 +240,10 @@ class DetailSpider:
|
|
|
self.user = scheduler.user
|
|
|
finished = self.crawl_spider(scheduler)
|
|
|
if finished:
|
|
|
- '''完成采集任务'''
|
|
|
+ # collection task finished
|
|
|
scheduler.finished()
|
|
|
else:
|
|
|
- '''暂无采集任务'''
|
|
|
+ # no collection task available yet
|
|
|
scheduler.wait_for_next_task()
|
|
|
|
|
|
|