|
@@ -6,26 +6,22 @@ Created on {DATE}
|
|
---------
|
|
---------
|
|
@author: njpc_feapder
|
|
@author: njpc_feapder
|
|
"""
|
|
"""
|
|
-import sys
|
|
|
|
-sys.path.append('/app/spiders/sword_feapder/FworkSpider')
|
|
|
|
import feapder
|
|
import feapder
|
|
import re
|
|
import re
|
|
import json
|
|
import json
|
|
-import time,random
|
|
|
|
|
|
+import time, random
|
|
from items.njpc_item import DataNjpcItem
|
|
from items.njpc_item import DataNjpcItem
|
|
from untils.attachment import AttachmentDownloader as AD
|
|
from untils.attachment import AttachmentDownloader as AD
|
|
from untils.attachment_res import AttachmentDownloader as ADres
|
|
from untils.attachment_res import AttachmentDownloader as ADres
|
|
from lxml.html import fromstring
|
|
from lxml.html import fromstring
|
|
-from untils.tools import remove_htmldata,extract_file_type
|
|
|
|
|
|
+from untils.tools import remove_htmldata, extract_file_type
|
|
from feapder.utils.log import log
|
|
from feapder.utils.log import log
|
|
|
|
|
|
-
|
|
|
|
redis_key = "njpc_details"
|
|
redis_key = "njpc_details"
|
|
|
|
|
|
|
|
|
|
# 拟建爬虫下载附件
|
|
# 拟建爬虫下载附件
|
|
-def njpc_get_files(html,file_type="",s_key="http",proxies=False):
|
|
|
|
-
|
|
|
|
|
|
+def njpc_get_files(html, file_type="", s_key="http", proxies=False):
|
|
def parse_filetype(response, filetypes):
|
|
def parse_filetype(response, filetypes):
|
|
val = response.headers.get("content-disposition")
|
|
val = response.headers.get("content-disposition")
|
|
filetype = val.split('.')[-1].replace('"', '').replace("'", "")
|
|
filetype = val.split('.')[-1].replace('"', '').replace("'", "")
|
|
@@ -54,7 +50,7 @@ def njpc_get_files(html,file_type="",s_key="http",proxies=False):
|
|
if file_type.lower() in file_types:
|
|
if file_type.lower() in file_types:
|
|
file_tp = file_type
|
|
file_tp = file_type
|
|
else:
|
|
else:
|
|
- file_tp = extract_file_type(file_name,file_url,[file_type])
|
|
|
|
|
|
+ file_tp = extract_file_type(file_name, file_url, [file_type])
|
|
|
|
|
|
if file_tp and s_key in file_url and file_name:
|
|
if file_tp and s_key in file_url and file_name:
|
|
file_name = file_name.strip()
|
|
file_name = file_name.strip()
|
|
@@ -70,26 +66,27 @@ class Details(feapder.PlanToBuildDetailSpider):
|
|
def start_requests(self):
|
|
def start_requests(self):
|
|
data_lsit = self.get_tasks_by_rabbitmq(limit=100)
|
|
data_lsit = self.get_tasks_by_rabbitmq(limit=100)
|
|
for item in data_lsit:
|
|
for item in data_lsit:
|
|
- log.debug(item)
|
|
|
|
|
|
+ # log.debug(item)
|
|
request_params = item.get("request_params")
|
|
request_params = item.get("request_params")
|
|
- timeout = request_params.pop('timeout',10)
|
|
|
|
- is_join_html = item.get("is_join_html") # 正文是否根据xpath拼接
|
|
|
|
- extra_html = item.get("extra_html") # 过滤无效内容
|
|
|
|
- title_xpath = item.get("title_xpath") # 三级页标题
|
|
|
|
|
|
+ timeout = request_params.get('timeout', 10)
|
|
|
|
+ request_params.pop('timeout', None)
|
|
|
|
+ is_join_html = item.get("is_join_html") # 正文是否根据xpath拼接
|
|
|
|
+ extra_html = item.get("extra_html") # 过滤无效内容
|
|
|
|
+ title_xpath = item.get("title_xpath") # 三级页标题
|
|
extra_activity = item.get("extra_activity") # 额外的需求动作
|
|
extra_activity = item.get("extra_activity") # 额外的需求动作
|
|
- file_params = item.get("file_params") # 附件下载配置
|
|
|
|
|
|
+ file_params = item.get("file_params") # 附件下载配置
|
|
if item.get("proxies"):
|
|
if item.get("proxies"):
|
|
yield feapder.Request(url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
|
|
yield feapder.Request(url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
|
|
- is_join_html=is_join_html, extra_html=extra_html,title_xpath=title_xpath,
|
|
|
|
|
|
+ is_join_html=is_join_html, extra_html=extra_html, title_xpath=title_xpath,
|
|
callback=item.get("parser"), file_params=file_params,
|
|
callback=item.get("parser"), file_params=file_params,
|
|
extra_activity=extra_activity, timeout=timeout, **request_params)
|
|
extra_activity=extra_activity, timeout=timeout, **request_params)
|
|
else:
|
|
else:
|
|
- yield feapder.Request(url=item.get("parser_url"), item=item,deal_detail=item.get("deal_detail"),
|
|
|
|
- is_join_html=is_join_html, extra_html=extra_html,title_xpath=title_xpath,
|
|
|
|
|
|
+ yield feapder.Request(url=item.get("parser_url"), item=item, deal_detail=item.get("deal_detail"),
|
|
|
|
+ is_join_html=is_join_html, extra_html=extra_html, title_xpath=title_xpath,
|
|
callback=item.get("parser"), file_params=file_params,
|
|
callback=item.get("parser"), file_params=file_params,
|
|
extra_activity=extra_activity, proxies=False, timeout=timeout, **request_params)
|
|
extra_activity=extra_activity, proxies=False, timeout=timeout, **request_params)
|
|
|
|
|
|
- def detail_get(self,request,response):
|
|
|
|
|
|
+ def detail_get(self, request, response):
|
|
items = request.item
|
|
items = request.item
|
|
data_item = DataNjpcItem(**items)
|
|
data_item = DataNjpcItem(**items)
|
|
|
|
|
|
@@ -106,7 +103,7 @@ class Details(feapder.PlanToBuildDetailSpider):
|
|
|
|
|
|
if request.title_xpath:
|
|
if request.title_xpath:
|
|
for sxpath in request.title_xpath:
|
|
for sxpath in request.title_xpath:
|
|
- title = response.xpath(sxpath).extract_first() # 三级页标题
|
|
|
|
|
|
+ title = response.xpath(sxpath).extract_first() # 三级页标题
|
|
if title:
|
|
if title:
|
|
data_item.title = title.strip()
|
|
data_item.title = title.strip()
|
|
if "..." in data_item.projectname or "…" in data_item.projectname:
|
|
if "..." in data_item.projectname or "…" in data_item.projectname:
|
|
@@ -115,12 +112,12 @@ class Details(feapder.PlanToBuildDetailSpider):
|
|
|
|
|
|
try:
|
|
try:
|
|
if request.extra_activity:
|
|
if request.extra_activity:
|
|
- from untils.tools import njpc_fields_extract,njpc_fields_extract_special
|
|
|
|
|
|
+ from untils.tools import njpc_fields_extract, njpc_fields_extract_special
|
|
exec(request.extra_activity)
|
|
exec(request.extra_activity)
|
|
except:
|
|
except:
|
|
pass
|
|
pass
|
|
|
|
|
|
- data_item.contenthtml = remove_htmldata(request.extra_html,html,response)
|
|
|
|
|
|
+ data_item.contenthtml = remove_htmldata(request.extra_html, html, response)
|
|
|
|
|
|
fp = request.file_params or {}
|
|
fp = request.file_params or {}
|
|
attachments = njpc_get_files(
|
|
attachments = njpc_get_files(
|
|
@@ -134,8 +131,7 @@ class Details(feapder.PlanToBuildDetailSpider):
|
|
|
|
|
|
yield data_item
|
|
yield data_item
|
|
|
|
|
|
-
|
|
|
|
- def detail_json(self,request,response):
|
|
|
|
|
|
+ def detail_json(self, request, response):
|
|
items = request.item
|
|
items = request.item
|
|
data_item = DataNjpcItem(**items)
|
|
data_item = DataNjpcItem(**items)
|
|
|
|
|
|
@@ -146,4 +142,3 @@ class Details(feapder.PlanToBuildDetailSpider):
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
Details(redis_key="detail:njpc_details").start()
|
|
Details(redis_key="detail:njpc_details").start()
|
|
-
|
|
|