@@ -41,15 +41,15 @@ class CrawlDetailPageSpider:
         self.senior_account = 'runhekeji'

     @staticmethod
-    def select_user(rows: dict, sc: Scheduler):
+    def select_user(rows: dict, username):
         """
         Select a user account and record it on the crawled item.

         :param rows: crawled item
-        :param sc: scheduler that assigns crawl accounts to tasks
+        :param username: crawl account name
         :return: user account and the account's login cookies
         """
-        account = rows.get('account', sc.user.username)
+        account = rows.get('account', username)
         rows.update({'account': account})
         return account, load_login_cookies(account)

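For context, a self-contained sketch of the new select_user call pattern; the cookie loader below is a hypothetical stub standing in for the real load_login_cookies helper:

    def load_login_cookies(account: str) -> dict:
        # Hypothetical stub: the real helper loads persisted login cookies.
        return {'PHPSESSID': f'session-for-{account}'}

    def select_user(rows: dict, username):
        account = rows.get('account', username)  # an explicit account on the item wins
        rows.update({'account': account})        # record the chosen account on the item
        return account, load_login_cookies(account)

    rows = {'competehref': 'https://www.zbytb.com/'}
    account, cookies = select_user(rows, 'runhekeji')
    assert account == 'runhekeji' and rows['account'] == 'runhekeji'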
@@ -244,7 +244,7 @@ class CrawlDetailPageSpider:
             {'$set': {'crawl': status}}
         )

-    def crawl_spider(self, rows: dict, sc: Scheduler):
+    def crawl_spider(self, rows: dict, user, account, cookies):
         headers = {
             'Host': 'www.zbytb.com',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
@@ -254,20 +254,16 @@ class CrawlDetailPageSpider:
         }
         headers.update({'Referer': rows['competehref']})
         url = self.prepare_url(rows)
-        account, cookies = self.select_user(rows, sc)
-        user = sc.query_user(account)
         success, response = self.crawl_request(user, url, headers, cookies)
         print(rows['competehref'])
         if success:
             self.crawl_success(response, rows)
-            sc.update_count(1)
         else:
             self.crawl_error(
                 spider_code=rows['spidercode'],
                 account=account,
                 response=response
             )
-            sc.update_count(0)

     def _spider(self, sc: Scheduler):
         while True:
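The signature change inverts the dependency: instead of reaching into the Scheduler from inside crawl_spider, the caller now resolves the account, cookies and user first and passes plain values in. The two call shapes, excerpted from this diff:

    # Before: crawl_spider pulled its collaborators off the scheduler itself.
    self.crawl_spider(rows, sc)
    account, cookies = self.select_user(rows, sc)   # inside crawl_spider
    user = sc.query_user(account)

    # After: the _spider loop resolves them up front.
    account, cookies = self.select_user(item, sc.user.username)
    user = sc.query_user(account)
    self.crawl_spider(item, user, account, cookies)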
@@ -279,8 +275,13 @@ class CrawlDetailPageSpider:
             sc.crawl_url = item['competehref']
             try:
                 CheckTask(item)
-                self.crawl_spider(item, sc)
+                account, cookies = self.select_user(item, sc.user.username)
+                user = sc.query_user(account)
+                if user is None:
+                    return False
+                self.crawl_spider(item, user, account, cookies)
                 self.update_crawl_status(item, False)
+                sc.crawl_counter(1)
                 sc.wait_for_next_task(10)
             except JyBasicException as e:
                 if e.code == 10105:
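When query_user cannot resolve the account, _spider now bails out with False instead of crawling, and start can react by building a fresh scheduler. A minimal runnable model of that contract; ToyScheduler and its fields are stand-ins, not the real Scheduler API:

    # Toy model of the _spider/start handshake introduced in this hunk.
    class ToyScheduler:
        def __init__(self):
            self.user = type('User', (), {'username': 'runhekeji'})()

        def query_user(self, account):
            return None  # simulate: no usable user record for this account

    def _spider(sc) -> bool:
        account = sc.user.username
        user = sc.query_user(account)
        if user is None:
            return False  # mirrors the diff: signal the caller instead of crawling
        return True

    finished = _spider(ToyScheduler())
    if not finished:
        print('restart with a fresh Scheduler')  # what start() does in its while-loop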
@@ -296,12 +297,11 @@ class CrawlDetailPageSpider:
                     {'$set': {'crawl_status': 'error'}}
                 )
                 self.update_crawl_status(item, False)
+                sc.crawl_counter(0)

     def start(self):
-        query = {'used': False, 'site': '中国招标与采购网'}
         while True:
-            with Scheduler(query) as scheduler:
-                scheduler.crawl_type = 'detail'
+            with Scheduler(site='中国招标与采购网', crawl_type='detail') as scheduler:
                 if scheduler.crawl_start:
                     finished = self._spider(scheduler)
                     if not finished:
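The start hunk assumes a Scheduler whose constructor accepts site and crawl_type directly, replacing the caller-built query dict. A hedged sketch of that constructor; the used/site fields come from the removed dict and crawl_counter from the _spider hunks, everything else is assumption:

    class Scheduler:
        # Sketch only: the real class also exposes user, crawl_start, crawl_url,
        # query_user, wait_for_next_task and context-manager behaviour.
        def __init__(self, site: str, crawl_type: str):
            self.crawl_type = crawl_type
            # Assumption: the account query the caller used to build moved in here.
            self.query = {'used': False, 'site': site}
            self.success_count = 0
            self.failure_count = 0

        def crawl_counter(self, success: int):
            # Assumption: 1 = task handled successfully, 0 = task failed.
            if success:
                self.success_count += 1
            else:
                self.failure_count += 1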