@@ -16,6 +16,7 @@ from items.njpc_item import DataNjpcItem
 from untils.attachment import AttachmentDownloader as AD
 from untils.attachment_res import AttachmentDownloader as ADres
 from lxml.html import fromstring
+from untils.tools import remove_htmldata,extract_file_type
 from feapder.utils.log import log


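Note: extract_file_type is imported here but lives in untils.tools, outside this diff. Judging from the inline logic it replaces in the next hunk, it derives an extension from the link text or URL and returns a falsy value when nothing usable is found. A minimal sketch under that assumption (the whitelist, signature, and return convention are guesses, not the actual helper):

# Hypothetical stand-in for untils.tools.extract_file_type; the real helper may differ.
DEFAULT_TYPES = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
                 'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']

def extract_file_type(file_name, file_url, extra_types=None):
    # Caller-supplied extras (e.g. the explicit file_type argument) extend the default whitelist.
    allowed = set(DEFAULT_TYPES) | {t.lower() for t in (extra_types or []) if t}
    for source in (file_name or "", file_url or ""):
        candidate = source.strip().split(".")[-1].lower()
        if candidate in allowed:
            return candidate
    return None  # njpc_get_files treats a falsy result as "skip this link"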
@@ -24,39 +25,42 @@ redis_key = "njpc_details"


 # Download attachments for the proposed-construction (njpc) spiders
 def njpc_get_files(html,file_type="",s_key="http",proxies=False):
+
+    def parse_filetype(response, filetypes):
+        val = response.headers.get("content-disposition")
+        filetype = val.split('.')[-1].replace('"', '').replace("'", "")
+        filetypes.append(filetype)
+
     root = fromstring(html)
     file_info = root.xpath('//a[@href]')
     if file_info:
         attachments = {}
         for info in file_info:
             file_url = "".join(info.xpath('./@href'))
-            file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
-                          'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps']
+            file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
+                          'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
             file_name = "".join(info.xpath('./@title') or info.xpath('.//text()'))
             if file_type.lower() == "res":
-                file_type_name = "content-disposition"
-                get_file_type = '''
-file_type = file_type_txt.split('.')[-1].replace('"','').replace("'","")
-file_types.append(file_type)
-'''
                 if s_key in file_url and file_name:
                     file_name = file_name.strip()
-                    attachment = ADres().fetch_attachment(get_file_type=get_file_type,file_type_name=file_type_name,
-                        proxies=proxies,file_name=file_name,download_url=file_url,enable_proxy=False,)
+                    attachment = ADres().fetch_attachment(
+                        file_name=file_name,
+                        download_url=file_url,
+                        callback=parse_filetype,
+                        proxies=proxies,
+                    )
                     attachments[str(len(attachments) + 1)] = attachment
             else:
                 if file_type.lower() in file_types:
                     file_tp = file_type
                 else:
-                    file_tp = file_url.split(".")[-1].lower()
-                    if file_tp not in file_types and file_name:
-                        file_tp = file_name.strip().split(".")[-1].lower()
+                    file_tp = extract_file_type(file_name,file_url,[file_type])

-                if file_tp in file_types and s_key in file_url and file_name:
+                if file_tp and s_key in file_url and file_name:
                     file_name = file_name.strip()
                     attachment = AD().fetch_attachment(
                         file_name=file_name, file_type=file_tp, download_url=file_url,
-                        enable_proxy=False, proxies=proxies)
+                        proxies=proxies)
                     attachments[str(len(attachments) + 1)] = attachment
         return attachments
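In the res branch above, the old approach of shipping a code snippet (get_file_type) for the downloader to evaluate is replaced by a plain callback: parse_filetype is passed to ADres().fetch_attachment and fills in the file type from the Content-Disposition header. The downloader itself is not part of this diff, so the exact call shape is an assumption; a self-contained sketch of the contract the callback is written against:

# Illustrative only: parse_filetype is actually defined inside njpc_get_files, and the
# response object comes from untils.attachment_res, not from this stub.
def parse_filetype(response, filetypes):
    val = response.headers.get("content-disposition")
    filetype = val.split('.')[-1].replace('"', '').replace("'", "")
    filetypes.append(filetype)

class ResponseStub:
    headers = {"content-disposition": 'attachment; filename="report.pdf"'}

filetypes = []
parse_filetype(ResponseStub(), filetypes)  # assumed to be invoked like this inside fetch_attachment
print(filetypes)                           # ['pdf'] -> used as the attachment's file type

Note that the callback assumes the header is always present; a response without Content-Disposition would raise AttributeError on the split.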
@@ -64,7 +68,7 @@ file_types.append(file_type)
 class Details(feapder.PlanToBuildDetailSpider):

     def start_requests(self):
-        data_lsit = self.get_tasks_by_rabbitmq(limit=1)
+        data_lsit = self.get_tasks_by_rabbitmq(limit=100)
         for item in data_lsit:
             log.debug(item)
             request_params = item.get("request_params")
@@ -72,21 +76,17 @@ class Details(feapder.PlanToBuildDetailSpider):
             is_join_html = item.get("is_join_html") # whether to assemble the body from the xpath list
             extra_html = item.get("extra_html") # content to strip from the body
             title_xpath = item.get("title_xpath") # detail-page (third-level) title
-            render = item.get("render") or False # whether to render with a browser
-            render_time = item.get("render_time") or 3 # browser render wait time
             extra_activity = item.get("extra_activity") # extra custom actions
             file_params = item.get("file_params") # attachment download settings
             if item.get("proxies"):
                 yield feapder.Request(url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
                                       is_join_html=is_join_html, extra_html=extra_html,title_xpath=title_xpath,
-                                      callback=item.get("parser"), render=render, render_time=render_time,
-                                      file_params=file_params,
+                                      callback=item.get("parser"), file_params=file_params,
                                       extra_activity=extra_activity, timeout=timeout, **request_params)
             else:
                 yield feapder.Request(url=item.get("parser_url"), item=item,deal_detail=item.get("deal_detail"),
                                       is_join_html=is_join_html, extra_html=extra_html,title_xpath=title_xpath,
-                                      callback=item.get("parser"), render=render, render_time=render_time,
-                                      file_params=file_params,
+                                      callback=item.get("parser"), file_params=file_params,
                                       extra_activity=extra_activity, proxies=False, timeout=timeout, **request_params)

     def detail_get(self,request,response):