|
@@ -6,10 +6,6 @@ Created on 2023-10-08
|
|
---------
|
|
---------
|
|
@author: njpc_feapder
|
|
@author: njpc_feapder
|
|
"""
|
|
"""
|
|
-import re
|
|
|
|
-import json
|
|
|
|
-import time
|
|
|
|
-import random
|
|
|
|
|
|
|
|
import feapder
|
|
import feapder
|
|
from items.njpc_item import DataNjpcItem
|
|
from items.njpc_item import DataNjpcItem
|
|
@@ -17,6 +13,35 @@ from lxml.html import fromstring
|
|
from untils.attachment import AttachmentDownloader as AD
|
|
from untils.attachment import AttachmentDownloader as AD
|
|
from untils.attachment_res import AttachmentDownloader as ADres
|
|
from untils.attachment_res import AttachmentDownloader as ADres
|
|
from untils.tools import remove_htmldata, extract_file_type
|
|
from untils.tools import remove_htmldata, extract_file_type
|
|
|
|
+try:
|
|
|
|
+ import re
|
|
|
|
+ import json
|
|
|
|
+ import time
|
|
|
|
+ import random
|
|
|
|
+except ImportError:
|
|
|
|
+ raise
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+DRISSIONPAGE = dict(
|
|
|
|
+ pool_size=1, # number of browser tabs
|
|
|
|
+ user_agent=None, # string
|
|
|
|
+ load_images=False, # whether to load images
|
|
|
|
+ proxy=None, # xxx.xxx.xxx.xxx:xxxx
|
|
|
|
+ headless=True, # whether to run the browser headless
|
|
|
|
+ timeout=30, # request timeout
|
|
|
|
+ retry=1, # number of retries on connection failure
|
|
|
|
+ interval=0.5, # retry interval on connection failure (seconds)
|
|
|
|
+ page_load=30,
|
|
|
|
+ render_time=0, # render time, i.e. the timeout spent waiting for the page to load after opening it
|
|
|
|
+ window_size=(1024, 800), # window size
|
|
|
|
+ driver_type="chromium",
|
|
|
|
+ load_mode="normal", # page load strategy, options: "normal", "eager", "none"
|
|
|
|
+ download_path=None, # path for downloaded files
|
|
|
|
+ custom_argument=[
|
|
|
|
+ "--no-sandbox",
|
|
|
|
+ "--ignore-certificate-errors"
|
|
|
|
+ ]
|
|
|
|
+)
|
|
|
|
|
|
|
|
|
|
# 拟建爬虫下载附件
|
|
# 拟建爬虫下载附件
|
|
@@ -27,19 +52,23 @@ def njpc_get_files(html, headers, file_type="", s_key="http", proxies=False):
|
|
filetype = val.split('.')[-1].replace('"', '').replace("'", "")
|
|
filetype = val.split('.')[-1].replace('"', '').replace("'", "")
|
|
filetypes.append(filetype)
|
|
filetypes.append(filetype)
|
|
|
|
|
|
|
|
+ attachments = {}
|
|
|
|
+
|
|
root = fromstring(html)
|
|
root = fromstring(html)
|
|
file_info = root.xpath('//a[@href]')
|
|
file_info = root.xpath('//a[@href]')
|
|
if file_info:
|
|
if file_info:
|
|
- attachments = {}
|
|
|
|
|
|
+ file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb',
|
|
|
|
+ 'hzzbs',
|
|
|
|
+ 'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
|
|
|
|
+ res_downloader = ADres()
|
|
|
|
+ downloader = AD()
|
|
for info in file_info:
|
|
for info in file_info:
|
|
file_url = "".join(info.xpath('./@href'))
|
|
file_url = "".join(info.xpath('./@href'))
|
|
- file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'hzzbs',
|
|
|
|
- 'jpg', 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg']
|
|
|
|
file_name = "".join(info.xpath('./@title') or info.xpath('.//text()'))
|
|
file_name = "".join(info.xpath('./@title') or info.xpath('.//text()'))
|
|
if file_type.lower() == "res":
|
|
if file_type.lower() == "res":
|
|
if s_key in file_url and file_name:
|
|
if s_key in file_url and file_name:
|
|
file_name = file_name.strip()
|
|
file_name = file_name.strip()
|
|
- attachment = ADres().fetch_attachment(
|
|
|
|
|
|
+ attachment = res_downloader.fetch_attachment(
|
|
file_name=file_name,
|
|
file_name=file_name,
|
|
download_url=file_url,
|
|
download_url=file_url,
|
|
callback=parse_filetype,
|
|
callback=parse_filetype,
|
|
@@ -55,18 +84,22 @@ def njpc_get_files(html, headers, file_type="", s_key="http", proxies=False):
|
|
|
|
|
|
if file_tp and s_key in file_url and file_name:
|
|
if file_tp and s_key in file_url and file_name:
|
|
file_name = file_name.strip()
|
|
file_name = file_name.strip()
|
|
- attachment = AD().fetch_attachment(
|
|
|
|
- file_name=file_name, file_type=file_tp, download_url=file_url,
|
|
|
|
- proxies=proxies, headers=headers)
|
|
|
|
|
|
+ attachment = downloader.fetch_attachment(
|
|
|
|
+ file_name=file_name,
|
|
|
|
+ file_type=file_tp,
|
|
|
|
+ download_url=file_url,
|
|
|
|
+ proxies=proxies,
|
|
|
|
+ headers=headers)
|
|
attachments[str(len(attachments) + 1)] = attachment
|
|
attachments[str(len(attachments) + 1)] = attachment
|
|
- return attachments
|
|
|
|
|
|
|
|
|
|
+ return attachments
|
|
|
|
|
|
-class Details(feapder.PlanToBuildDetailSpider):
|
|
|
|
|
|
+
|
|
|
|
+class Spider(feapder.PlanToBuildDetailSpider):
|
|
__custom_setting__ = dict(
|
|
__custom_setting__ = dict(
|
|
- WEBDRIVER=dict(
|
|
|
|
- driver_type="FIREFOX"
|
|
|
|
- )
|
|
|
|
|
|
+ PROXY_EXTRACT_API="http://172.17.162.28:16001/sam",
|
|
|
|
+ PROXY_POOL="feapder.network.proxy_pool.SpringBoardProxyPool",
|
|
|
|
+ DRISSIONPAGE=DRISSIONPAGE
|
|
)
|
|
)
|
|
|
|
|
|
def start_requests(self):
|
|
def start_requests(self):
|
|
@@ -80,35 +113,20 @@ class Details(feapder.PlanToBuildDetailSpider):
|
|
render_time = item.get("render_time") or 3 # 浏览器渲染时间
|
|
render_time = item.get("render_time") or 3 # 浏览器渲染时间
|
|
extra_activity = item.get("extra_activity") # 额外的需求动作
|
|
extra_activity = item.get("extra_activity") # 额外的需求动作
|
|
file_params = item.get("file_params") # 附件下载配置
|
|
file_params = item.get("file_params") # 附件下载配置
|
|
- if item.get("proxies"):
|
|
|
|
- yield feapder.Request(url=item.get("parser_url"),
|
|
|
|
- timeout=timeout,
|
|
|
|
- render=True,
|
|
|
|
- render_time=render_time,
|
|
|
|
- callback=item.get("parser"),
|
|
|
|
- item=item,
|
|
|
|
- deal_detail=item.get("deal_detail"),
|
|
|
|
- is_join_html=is_join_html,
|
|
|
|
- extra_html=extra_html,
|
|
|
|
- title_xpath=title_xpath,
|
|
|
|
- file_params=file_params,
|
|
|
|
- extra_activity=extra_activity,
|
|
|
|
- **request_params)
|
|
|
|
- else:
|
|
|
|
- yield feapder.Request(url=item.get("parser_url"),
|
|
|
|
- proxies=False,
|
|
|
|
- timeout=timeout,
|
|
|
|
- render=True,
|
|
|
|
- render_time=render_time,
|
|
|
|
- callback=item.get("parser"),
|
|
|
|
- item=item,
|
|
|
|
- deal_detail=item.get("deal_detail"),
|
|
|
|
- is_join_html=is_join_html,
|
|
|
|
- extra_html=extra_html,
|
|
|
|
- title_xpath=title_xpath,
|
|
|
|
- file_params=file_params,
|
|
|
|
- extra_activity=extra_activity,
|
|
|
|
- **request_params)
|
|
|
|
|
|
+
|
|
|
|
+ yield feapder.Request(url=item.get("parser_url"),
|
|
|
|
+ timeout=timeout,
|
|
|
|
+ render=True,
|
|
|
|
+ render_time=render_time,
|
|
|
|
+ callback=item.get("parser"),
|
|
|
|
+ item=item,
|
|
|
|
+ deal_detail=item.get("deal_detail"),
|
|
|
|
+ is_join_html=is_join_html,
|
|
|
|
+ extra_html=extra_html,
|
|
|
|
+ title_xpath=title_xpath,
|
|
|
|
+ file_params=file_params,
|
|
|
|
+ extra_activity=extra_activity,
|
|
|
|
+ **request_params)
|
|
|
|
|
|
def detail_get(self, request, response):
|
|
def detail_get(self, request, response):
|
|
items = request.item
|
|
items = request.item
|
|
@@ -127,7 +145,7 @@ class Details(feapder.PlanToBuildDetailSpider):
|
|
|
|
|
|
if request.title_xpath:
|
|
if request.title_xpath:
|
|
for sxpath in request.title_xpath:
|
|
for sxpath in request.title_xpath:
|
|
- title = response.xpath(sxpath).extract_first() # 三级页标题
|
|
|
|
|
|
+ title = response.xpath(sxpath).extract_first() # 三级页标题
|
|
if title:
|
|
if title:
|
|
data_item.title = title.strip()
|
|
data_item.title = title.strip()
|
|
if "..." in data_item.projectname or "…" in data_item.projectname:
|
|
if "..." in data_item.projectname or "…" in data_item.projectname:
|
|
@@ -136,7 +154,7 @@ class Details(feapder.PlanToBuildDetailSpider):
|
|
|
|
|
|
try:
|
|
try:
|
|
if request.extra_activity:
|
|
if request.extra_activity:
|
|
- from untils.tools import njpc_fields_extract,njpc_fields_extract_special
|
|
|
|
|
|
+ from untils.tools import njpc_fields_extract, njpc_fields_extract_special
|
|
exec(request.extra_activity)
|
|
exec(request.extra_activity)
|
|
except:
|
|
except:
|
|
pass
|
|
pass
|
|
@@ -147,8 +165,8 @@ class Details(feapder.PlanToBuildDetailSpider):
|
|
|
|
|
|
data_item.contenthtml = html
|
|
data_item.contenthtml = html
|
|
|
|
|
|
- if request.proxies:
|
|
|
|
- fpx = request.proxies()
|
|
|
|
|
|
+ if request.get_proxies():
|
|
|
|
+ fpx = request.get_proxies()
|
|
else:
|
|
else:
|
|
fpx = False
|
|
fpx = False
|
|
|
|
|
|
@@ -172,4 +190,4 @@ class Details(feapder.PlanToBuildDetailSpider):
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
- Details(redis_key="detail:njpc_firefox_details").start()
|
|
|
|
|
|
+ Spider(redis_key="detail:njpc_firefox_details").start()
|