@@ -150,6 +150,20 @@ class CrawlDetailPageSpider:
             attachments[str(index + 1)] = attachment
             index += 1

+        file_url = soup.findAll('pdfpath')
+        if file_url:
+            file_url = list(file_url[0].stripped_strings)[0]
+            file_type = extract_file_type(file_url)
+            file_name = rows['title']
+            if file_type:
+                attachment2 = self.attachment_downloader.download(
+                    file_name=file_name,
+                    file_type=file_type,
+                    download_url=file_url,
+                )
+                if len(attachment2) > 0:
+                    attachments[str(len(attachments) + 1)] = attachment2
+
         if len(attachments) > 0:
             rows["projectinfo"] = {"attachments": attachments}

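For reference, a minimal sketch of how the new <pdfpath> handling behaves in isolation, assuming BeautifulSoup markup and a stand-in extract_file_type helper (the project's real helper and attachment downloader are not shown here):

    from bs4 import BeautifulSoup

    def extract_file_type(url):
        # stand-in helper: return the extension after the last dot, if it looks like a document type
        ext = url.rsplit('.', 1)[-1].lower()
        return ext if ext in {'pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'rar'} else None

    html = '<div><pdfpath> http://example.com/files/notice.pdf </pdfpath></div>'
    soup = BeautifulSoup(html, 'html.parser')
    nodes = soup.findAll('pdfpath')
    if nodes:
        file_url = list(nodes[0].stripped_strings)[0]   # 'http://example.com/files/notice.pdf'
        file_type = extract_file_type(file_url)         # 'pdf'
        # together with a file_name taken from the row title, these are the arguments
        # passed to the spider's attachment downloader in the diff above
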
@@ -254,47 +268,47 @@ class CrawlDetailPageSpider:
     def crawl_spider(self, sc: Scheduler):
         while True:
             next_task_interval = None
-            # item = sc.crawl_task
-            # if len(item) == 0:
-            #     return False
-            item_list = mongo_table('py_spider', 'zbytb_list').find({"count": 0, "crawl": False, "crawl_status": {"$exists": False}}).limit(1).sort("_id",1)
-            for item in item_list:
-                logger.info(f">>> {item['title']} - {item['competehref']}")
-                self._lock_task(item)
-                sc.spider_code = self.spider_code = item['spidercode']
-                sc.crawl_url = item['competehref']
+            item = sc.crawl_task
+            if len(item) == 0:
+                return False
+            # item_list = mongo_table('py_spider', 'zbytb_list').find({"count": 0, "crawl": False, "crawl_status": {"$exists": False}}).limit(1).sort("_id",1)
+            # for item in item_list:
+            logger.info(f">>> {item['title']} - {item['competehref']}")
+            self._lock_task(item)
+            sc.spider_code = self.spider_code = item['spidercode']
+            sc.crawl_url = item['competehref']

-                # assign an account and its login cookies
-                self.account = item.get('account', sc.user.username)
-                self.cookies = load_login_cookies(self.account)
-                user = sc.query_user(self.account)
-                if user is None:
-                    return False
+            # assign an account and its login cookies
+            self.account = item.get('account', sc.user.username)
+            self.cookies = load_login_cookies(self.account)
+            user = sc.query_user(self.account)
+            if user is None:
+                return False

-                try:
-                    # CheckTask(item)
-                    url = self.prepare_url(item)
-                    referer = item['competehref']
-                    response = self.crawl_request(url, referer, user)
-                    if response is not None:
-                        num = self.crawl_response(response, item)
-                        sc.crawl_counter(num)
-                    next_task_interval = 10
-                except (ZbYTbCrawlError, Exception) as e:
-                    if getattr(e, 'code', None) is None:
-                        err = ZbYTbCrawlError(unknown_err=e)
-                        sc.err_record(err)
-                    elif e.code == 10105:
-                        # when the exception is raised, update the ES query statistics
-                        self._update_crawl_task(item["_id"], count=item['count'])
-                    else:
-                        sc.err_record(e)
-                        self._update_crawl_task(item["_id"], crawl_status='error')
-                    sc.crawl_counter(0)
-                    next_task_interval = 0.1
-                finally:
-                    self._release_task(item)
-                    sc.wait_for_next_task(next_task_interval)
+            try:
+                CheckTask(item)
+                url = self.prepare_url(item)
+                referer = item['competehref']
+                response = self.crawl_request(url, referer, user)
+                if response is not None:
+                    num = self.crawl_response(response, item)
+                    sc.crawl_counter(num)
+                next_task_interval = 10
+            except (ZbYTbCrawlError, Exception) as e:
+                if getattr(e, 'code', None) is None:
+                    err = ZbYTbCrawlError(unknown_err=e)
+                    sc.err_record(err)
+                elif e.code == 10105:
+                    # when the exception is raised, update the ES query statistics
+                    self._update_crawl_task(item["_id"], count=item['count'])
+                else:
+                    sc.err_record(e)
+                    self._update_crawl_task(item["_id"], crawl_status='error')
+                sc.crawl_counter(0)
+                next_task_interval = 0.1
+            finally:
+                self._release_task(item)
+                sc.wait_for_next_task(next_task_interval)

     def start(self):
         while True: