@@ -4,7 +4,6 @@ import time
 from urllib.parse import urlencode, urlparse
 
 from bs4 import BeautifulSoup
-from pymongo.errors import DuplicateKeyError
 
 from crawler.check_utils import CheckText, CheckTask
 from crawler.clean_html import cleaner, clean_js
@@ -18,9 +17,10 @@ from utils.attachment import (
 )
 from utils.databases import mongo_table, int2long
 from utils.execptions import (
-    CustomAccountPrivilegeError,
-    AttachmentNullError,
-    CustomCheckError, JyBasicException
+    AccountError,
+    AttachmentError,
+    CheckError,
+    ZbYTbCrawlError
 )
 from utils.log import logger
 
@@ -75,7 +75,7 @@ class CrawlDetailPageSpider:
 
         :param tid: 采集条目ObjectId
         """
-        # 需要高级会员才能查询的招标信息,切换高级账号
+        # 需要高级会员才能查询的招标信息,指定使用高级账号
         self._update_crawl_task(tid, account=self.senior_account)
 
     def crawl_error(
@@ -110,11 +110,12 @@ class CrawlDetailPageSpider:
         self.crawl_error_tab.insert_one(items)
         logger.error(err_msg)
 
-    def get_attachment(self, content: str, rows: dict):
+    def download_attachment(self, content: str, rows: dict):
+        logger.info('>>> 下载附件')
+        index = 0
+        attachments = {}
         soup = BeautifulSoup(content, "lxml")
         all_node = soup.findAll("a") or soup.findAll("iframe")
-        attachments = {}
-        index = 0
         for node in all_node:
             file_name, file_type = (node.string or node.text), None
             file_path = node.attrs.get("href", "") or node.attrs.get("src", "")
@@ -128,29 +129,33 @@ class CrawlDetailPageSpider:
                     or extract_file_type(file_path))
 
             # 抽取文件名称
-            parser = urlparse(file_path)
-            if parser.scheme in ['https', 'http'] and file_type is not None:
-                if not file_name:
-                    name = extract_file_name_by_href(file_path, file_type)
-                    if name is not None:
-                        file_name = name
-                    else:
-                        file_name = f"{rows['title']}_{index}"
+            try:
+                parser = urlparse(file_path)
+            except ValueError:
+                pass
+            else:
+                if parser.scheme in ['https', 'http'] and file_type is not None:
+                    if not file_name:
+                        name = extract_file_name_by_href(file_path, file_type)
+                        if name is not None:
+                            file_name = name
+                        else:
+                            file_name = f"{rows['title']}_{index}"
 
-                attachment = self.attachment_downloader.download(
-                    file_name=file_name,
-                    file_type=file_type,
-                    download_url=file_path,
-                )
-                if len(attachment) > 0:
-                    attachments[str(index + 1)] = attachment
-                index += 1
+                    attachment = self.attachment_downloader.download(
+                        file_name=file_name,
+                        file_type=file_type,
+                        download_url=file_path,
+                    )
+                    if len(attachment) > 0:
+                        attachments[str(index + 1)] = attachment
+                    index += 1
 
-        if attachments:
+        if len(attachments) > 0:
             rows["projectinfo"] = {"attachments": attachments}
 
     def save_data(self, content, rows: dict):
-        self.get_attachment(content, rows)
+        logger.info('>>> 保存数据')
         rows["contenthtml"] = clean_js(content)
         special = {
             '<iframe[^<>]*>[\s\S]*?</iframe>': ''
@@ -158,7 +163,7 @@ class CrawlDetailPageSpider:
         rows["detail"] = cleaner(content, special=special)
         try:
             CheckText(rows["detail"])
-        except CustomCheckError:
+        except CheckError:
             # 页面是一个pdf阅读器, eg: https://www.zbytb.com/s-zhongbiao-10119392.html
             rows["detail"] = "<br/>详细内容请访问原网页!"
         rows["comeintime"] = int2long(int(time.time()))
@@ -170,7 +175,7 @@ class CrawlDetailPageSpider:
         logger.info("[采集成功]{}-{}".format(rows['title'], rows['publishtime']))
 
     def crawl_response(self, response, rows: dict):
-        # print(rows['competehref'])
+        logger.info('>>> 采集响应')
         source = re.findall(r'Inner(.*?);Inner', response.text)
         if len(source) > 0:
             content = source[0][13:-1]
@@ -180,10 +185,11 @@ class CrawlDetailPageSpider:
         counter = 0
         try:
             CheckText(content)
+            self.download_attachment(content, rows)
             self.save_data(content, rows)
             self._update_crawl_task(rows['_id'], crawl_status='finished')
             counter = 1
-        except (AttachmentNullError, CustomCheckError) as e:
+        except (AttachmentError, CheckError) as e:
             if e.code == 10104 and self.account != self.senior_account:
                 self.switch_senior_user(rows)
             else:
@@ -197,11 +203,12 @@ class CrawlDetailPageSpider:
                 account=self.account,
                 err_msg=err_msg
             )
-        except CustomAccountPrivilegeError:
+        except AccountError:
             self.switch_senior_user(rows)
         return counter
 
     def crawl_request(self, url: str, referer: str, user: User):
+        logger.info('>>> 采集请求')
         headers = {
             'Host': 'www.zbytb.com',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
@@ -248,12 +255,14 @@ class CrawlDetailPageSpider:
         self._lock_task(item)
         sc.spider_code = self.spider_code = item['spidercode']
         sc.crawl_url = item['competehref']
-        # 获取闲置用户账号
+
+        # 分配账号和账号cookie
         self.account = item.get('account', sc.user.username)
         self.cookies = load_login_cookies(self.account)
         user = sc.query_user(self.account)
         if user is None:
             return False
+
         try:
             CheckTask(item)
             url = self.prepare_url(item)
@@ -263,7 +272,7 @@ class CrawlDetailPageSpider:
             num = self.crawl_response(response, item)
             sc.crawl_counter(num)
             next_task_interval = 10
-        except JyBasicException as e:
+        except (ZbYTbCrawlError, Exception) as e:
             if e.code == 10105:
                 # 抛出异常时,将es查询统计结果进行更新
                 self._update_crawl_task(item["_id"], count=item['count'])
@@ -280,8 +289,5 @@ class CrawlDetailPageSpider:
         while True:
             with Scheduler(site='中国招标与采购网', crawl_type='detail') as scheduler:
                 if scheduler.crawl_start:
-                    finished = self.crawl_spider(scheduler)
-                    if not finished:
-                        scheduler.wait_for_next_task(2)
-                    else:
-                        scheduler.wait_for_next_task(60)
+                    self.crawl_spider(scheduler)
+                    scheduler.finished(10)
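
Note: below is a minimal, hypothetical sketch of the renamed exception classes this diff imports from utils.execptions (AccountError, AttachmentError, CheckError, ZbYTbCrawlError), assuming, as the handlers above imply, that each carries a numeric code attribute; the project's real definitions may differ.

# Hypothetical sketch only; inferred from this diff, not the project's actual module.
class ZbYTbCrawlError(Exception):
    """Base crawl error carrying the numeric code the handlers compare via e.code."""

    def __init__(self, code: int = 10000, reason: str = ''):
        self.code = code      # e.g. 10104 or 10105, as checked in crawl_response/crawl_spider
        self.reason = reason
        super().__init__(f'{code}: {reason}')


class AccountError(ZbYTbCrawlError):
    """Account lacks the privilege for the page; caller switches to the senior account."""


class AttachmentError(ZbYTbCrawlError):
    """Attachment download failed or returned nothing."""


class CheckError(ZbYTbCrawlError):
    """CheckText / CheckTask validation failed."""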
|