@@ -11,9 +11,9 @@ from crawler.clean_html import cleaner, clean_js
 from crawler.crawl_scheduler import Scheduler
 from crawler.defaults import http_request_get
 from crawler.login import load_login_cookies, login, User, login_status_check
-from utils.attachment import (
+from utils.attachment import AttachmentDownloader
+from utils.clean_file import (
     extract_file_type,
-    AttachmentDownloader,
     extract_file_name_by_href
 )
 from utils.databases import mongo_table, int2long
@@ -112,35 +112,32 @@ class CrawlDetailPageSpider:
             logger.error(err_msg)
 
     def download_attachment(self, content: str, rows: dict):
-        index = 0
-        attachments = {}
         soup = BeautifulSoup(content, "lxml")
-        all_node = soup.findAll("a") or soup.findAll("iframe")
-        for node in all_node:
+        attachments = {}
+        nums = 0
+        nodes = soup.findAll("a") or soup.findAll("iframe")
+        for node in nodes:
             file_name, file_type = (node.string or node.text), None
             file_path = node.attrs.get("href", "") or node.attrs.get("src", "")
             # the attachment may be contained in an iframe
             _id = node.attrs.get('id')
             if _id == 'pdfContainer':
                 file_type = 'pdf'
+
             # extract the file type
-            elif file_type is None:
-                file_type = (extract_file_type(file_name)
-                             or extract_file_type(file_path))
+            if file_type is None:
+                file_type = (extract_file_type(file_name) or extract_file_type(file_path))
 
             # extract the file name
             try:
                 parser = urlparse(file_path)
-            except ValueError:
-                pass
-            else:
                 if parser.scheme in ['https', 'http'] and file_type is not None:
                     if not file_name:
                         name = extract_file_name_by_href(file_path, file_type)
                         if name is not None:
                             file_name = name
                         else:
-                            file_name = f"{rows['title']}_{index}"
+                            file_name = f"{rows['title']}_{nums}"
 
                     attachment = self.attachment_downloader.download(
                         file_name=file_name,
@@ -148,8 +145,10 @@ class CrawlDetailPageSpider:
                         download_url=file_path,
                     )
                     if len(attachment) > 0:
-                        attachments[str(index + 1)] = attachment
-                    index += 1
+                        attachments[str(len(attachments) + 1)] = attachment
+                    nums += 1
+            except ValueError:
+                pass
 
         file_url = soup.findAll('pdfpath')
         if file_url:
@@ -157,13 +156,13 @@ class CrawlDetailPageSpider:
             file_type = extract_file_type(file_url)
             file_name = rows['title']
             if file_type:
-                attachment2 = self.attachment_downloader.download(
+                attachment = self.attachment_downloader.download(
                     file_name=file_name,
                     file_type=file_type,
-                    download_url=file_path,
+                    download_url=file_url,
                 )
-                if len(attachment2) > 0:
-                    attachments[str(len(attachments) + 1)] = attachment2
+                if len(attachment) > 0:
+                    attachments[str(len(attachments) + 1)] = attachment
 
         if len(attachments) > 0:
             rows["projectinfo"] = {"attachments": attachments}
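For reference, a minimal self-contained sketch of the numbering flow these hunks arrive at: the dict key now comes from len(attachments) + 1, so a download that returns an empty result no longer leaves a gap in the keys, while nums only feeds the fallback file name. Everything named below (collect_attachments, stub_download, the link tuples, the 'demo' title) is a stand-in invented for illustration, not part of the crawler's API.

from urllib.parse import urlparse


def collect_attachments(nodes, title, download):
    # nodes: (file_name, file_path, file_type) tuples already pulled from the page;
    # download: callable returning an attachment dict, or {} when the fetch fails.
    attachments = {}
    nums = 0
    for file_name, file_path, file_type in nodes:
        try:
            parser = urlparse(file_path)
            if parser.scheme in ['https', 'http'] and file_type is not None:
                if not file_name:
                    file_name = f"{title}_{nums}"
                attachment = download(file_name, file_type, file_path)
                if len(attachment) > 0:
                    attachments[str(len(attachments) + 1)] = attachment
                nums += 1
        except ValueError:
            # malformed href/src values are skipped, as in the patched method
            pass
    return attachments


def stub_download(name, ftype, url):
    # pretend the second link fails so its dict comes back empty
    return {} if 'bad' in url else {'filename': f'{name}.{ftype}', 'url': url}


links = [
    ('notice', 'https://example.com/a.pdf', 'pdf'),
    ('broken', 'https://example.com/bad.pdf', 'pdf'),
    ('spec', 'https://example.com/b.doc', 'doc'),
]
print(collect_attachments(links, 'demo', stub_download))  # keys '1' and '2', no gap left by the failure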
@@ -195,12 +194,14 @@ class CrawlDetailPageSpider:
         root = Selector(text=response.text)
         content = root.xpath('//div[@class="conent-box"]').extract_first()
         if content:
-            extr_html1 = root.xpath('//div[@class="conent-box"]/div[@class="xgxm"]').extract_first()
-            extr_html2 = root.xpath('//div[@class="content-user"]').extract_first()
-            if extr_html1:
-                content = content.replace(extr_html1,'')
-            if extr_html2:
-                content = content.replace(extr_html2,'')
+            clean_features = [
+                '//div[@class="conent-box"]/div[@class="xgxm"]',
+                '//div[@class="content-user"]'
+            ]
+            for feature in clean_features:
+                clean_html = root.xpath(feature).extract_first()
+                if clean_html is not None:
+                    content = content.replace(clean_html, '')
         else:
             content = ''
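As a usage note, the new clean_features loop can be exercised on its own against an inline fragment. parsel's Selector is assumed here because the hunk calls Selector(text=...).xpath(...).extract_first(); the scrapy Selector exposes the same methods, and the HTML below is invented for illustration.

from parsel import Selector

html = '''
<div class="conent-box">
  <p>notice body</p>
  <div class="xgxm">related projects block</div>
</div>
<div class="content-user">user toolbar</div>
'''

root = Selector(text=html)
content = root.xpath('//div[@class="conent-box"]').extract_first() or ''

# same XPath list as the patch: any feature that matches is stripped from content
clean_features = [
    '//div[@class="conent-box"]/div[@class="xgxm"]',
    '//div[@class="content-user"]'
]
for feature in clean_features:
    clean_html = root.xpath(feature).extract_first()
    if clean_html is not None:
        content = content.replace(clean_html, '')

print(content)  # the conent-box markup with the xgxm block removed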