@@ -1,146 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Created on {DATE}
----------
-@summary: generic snapshot-page detail spider for proposed-construction (njpc) projects
----------
-@author: njpc_feapder
-"""
-import feapder
-import re
-import json
-import time
-import random  # re/json/time/random are available to the exec() snippets below
-from items.njpc_item import DataNjpcItem
-from untils.attachment import AttachmentDownloader as AD
-from untils.attachment_res import AttachmentDownloader as ADres
-from lxml.html import fromstring
-from untils.tools import remove_htmldata, extract_file_type
-from feapder.utils.log import log
-
-redis_key = "njpc_details"
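-# NOTE: this module-level key is not referenced below; __main__ passes
-# "detail:njpc_details" to the spider explicitly.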
-
-
-# Download attachments for proposed-construction (njpc) crawlers
-def njpc_get_files(html, headers, file_type="", s_key="http", proxies=False):
-    # callback: derive the file extension from the Content-Disposition header
-    def parse_filetype(response, filetypes):
-        val = response.headers.get("content-disposition")
-        if not val:  # header absent: nothing to record
-            return
-        filetype = val.split('.')[-1].replace('"', '').replace("'", "")
-        filetypes.append(filetype)
-
-    # extensions accepted as-is, without consulting extract_file_type (hoisted out of the loop)
-    file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
-                  'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
-    root = fromstring(html)
-    file_info = root.xpath('//a[@href]')
-    if file_info:
-        attachments = {}
-        for info in file_info:
-            file_url = "".join(info.xpath('./@href'))
-            file_name = "".join(info.xpath('./@title') or info.xpath('.//text()'))
-            if file_type.lower() == "res":
-                if s_key in file_url and file_name:
-                    file_name = file_name.strip()
-                    attachment = ADres().fetch_attachment(
-                        file_name=file_name,
-                        download_url=file_url,
-                        callback=parse_filetype,
-                        proxies=proxies,
-                        headers=headers,
-                    )
-                    attachments[str(len(attachments) + 1)] = attachment
-            else:
-                if file_type.lower() in file_types:
-                    file_tp = file_type
-                else:
-                    file_tp = extract_file_type(file_name, file_url, [file_type])
-
-                if file_tp and s_key in file_url and file_name:
-                    file_name = file_name.strip()
-                    attachment = AD().fetch_attachment(
-                        file_name=file_name, file_type=file_tp, download_url=file_url,
-                        proxies=proxies, headers=headers,
-                    )
-                    attachments[str(len(attachments) + 1)] = attachment
-        return attachments
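-# Usage sketch (hypothetical arguments): returns a dict keyed "1", "2", ... in
-# insertion order, or None when the page contains no <a href> links, e.g.
-#   attachments = njpc_get_files(html, headers={"User-Agent": "..."}, file_type="pdf")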
-
-
-class Details(feapder.PlanToBuildDetailSpider):
-
-    def start_requests(self):
-        data_list = self.get_tasks_by_rabbitmq(limit=100)
-        for item in data_list:
-            # log.debug(item)
-            request_params = item.get("request_params") or {}
-            timeout = request_params.pop('timeout', 10)
-            is_join_html = item.get("is_join_html")  # concatenate the body from every matching xpath?
-            extra_html = item.get("extra_html")  # invalid fragments to strip from the body
-            title_xpath = item.get("title_xpath")  # xpaths for the detail-page title
-            extra_activity = item.get("extra_activity")  # extra site-specific actions, run via exec()
-            file_params = item.get("file_params")  # attachment download config
-            request_kwargs = dict(
-                url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
-                is_join_html=is_join_html, extra_html=extra_html, title_xpath=title_xpath,
-                callback=item.get("parser"), file_params=file_params,
-                extra_activity=extra_activity, timeout=timeout, **request_params,
-            )
-            if not item.get("proxies"):
-                request_kwargs["proxies"] = False  # explicitly disable proxies for this request
-            yield feapder.Request(**request_kwargs)
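-    # Task payload sketch (hypothetical values; field names are the ones read above):
-    #   {"parser_url": "http://.../detail.html", "parser": "detail_get",
-    #    "deal_detail": ['//div[@class="content"]'], "request_params": {"timeout": 30},
-    #    "title_xpath": ['//h1/text()'], "file_params": {"file_type": "pdf", "s_key": "http"}}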
-
-    def detail_get(self, request, response):
-        items = request.item
-        data_item = DataNjpcItem(**items)
-
-        html = ''
-        for xpath in request.deal_detail:
-            htmls = response.xpath(xpath).extract_first()  # tender detail content
-            if request.is_join_html:
-                if htmls is not None:
-                    html += htmls
-            elif htmls is not None:
-                html = htmls
-                break
-
-        if request.title_xpath:
-            for sxpath in request.title_xpath:
-                title = response.xpath(sxpath).extract_first()  # detail-page title
-                if title:
-                    data_item.title = title.strip()
-                    # a truncated projectname ("..."/"…") is replaced by the full title
-                    if "..." in data_item.projectname or "…" in data_item.projectname:
-                        data_item.projectname = title.strip()
-                    break
-
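-        # extra_activity is a site-specific Python snippet carried on the task; it
-        # runs via exec() with this method's locals in scope (html, response, data_item).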
-        try:
-            if request.extra_activity:
-                from untils.tools import njpc_fields_extract, njpc_fields_extract_special
-                exec(request.extra_activity)
-        except Exception as e:
-            log.warning(f"extra_activity failed: {e}")
-
-        data_item.contenthtml = remove_htmldata(request.extra_html, html, response)
-
-        fp = request.file_params or {}
-        attachments = njpc_get_files(
-            html,
-            file_type=fp.get("file_type", ""),
-            s_key=fp.get("s_key", "http"),
-            proxies=fp.get("proxies", False),
-            headers=fp.get("headers", {}),
-        )
-        if attachments:
-            data_item.projectinfo = {"attachments": attachments}
-
-        yield data_item
-
-    def detail_json(self, request, response):
-        items = request.item
-        data_item = DataNjpcItem(**items)
-
-        # here deal_detail is a Python snippet from the task, exec'd to fill data_item
-        exec(request.deal_detail)
-
-        yield data_item
-
-
-if __name__ == '__main__':
-    Details(redis_key="detail:njpc_details").start()