@@ -6,14 +6,18 @@ Created on {DATE}
---------
@author: {USER}
"""
-import re
import sys
sys.path.append('/app/spiders/sword_feapder/FworkSpider')

from urllib.parse import urljoin
import feapder
-from untils.attachment import AttachmentDownloader
from items.spider_item import DataBakItem
+from untils.attachment import AttachmentDownloader
+from untils.tools import remove_htmldata
from feapder.utils.log import log
+import time
+import json
+import re
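+# NOTE: time/json/re are not referenced in this template itself; presumably they
+# are imported so the task-defined 'js'/'ex_python' snippets eval/exec'd in
+# start_requests below can use them.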
+


@@ -21,57 +25,56 @@ class Details(feapder.BiddingDetailSpider):

    def start_requests(self):
        while True:
-            data_lsit = self.to_db.find(self.db_name,{"parser_name":"${spider_name}"},limit=50)
+            data_lsit = self.to_db.find(self.db_name, {"parser_name": "${spider_name}"}, sort={"item.publishtime": -1}, limit=50)
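+            # newest first: the query above sorts on item.publishtime descending,
+            # so fresh detail tasks are claimed before the backlog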
            for item in data_lsit:
-                log.debug(item.get("item"))
+                log.debug(item)
                request_params = item.get("request_params")
-                is_join_html = item.get("is_join_html")  # whether to join the body from multiple xpath matches
-                extra_html = item.get("extra_html")  # content fragments to filter out
+                timeout = request_params.pop('timeout', 10)
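+                # 'timeout' is popped (not read) so it is not passed twice once
+                # **request_params is expanded into feapder.Request below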
+                if item.get("js"):
+                    eval(item.get("js"))
+                if item.get("ex_python"):
+                    exec(item.get("ex_python"))
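+                # 'js' and 'ex_python' are optional Python snippets stored on the
+                # task record and executed verbatim; only trusted, crawler-maintained
+                # tasks should ever reach this collection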
                if item.get("proxies"):
-                    yield feapder.Request(url=item.get("parse_url"),item=item.get("item"),files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),is_join_html=is_join_html,extra_html=extra_html,
-                                          callback=eval(item.get("parse")),base_info=item,**request_params)
+                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"),
+                                          files_info=item.get("files"),
+                                          deal_detail=item.get("deal_detail"),
+                                          callback=eval(item.get("parse")), base_info=item,
+                                          timeout=timeout, **request_params)
                else:
-                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"), files_info=item.get("files"),
-                                          deal_detail=item.get("deal_detail"),is_join_html=is_join_html,extra_html=extra_html,
-                                          callback=eval(item.get("parse")), base_info=item,proxies=False,**request_params)
+                    yield feapder.Request(url=item.get("parse_url"), item=item.get("item"),
+                                          files_info=item.get("files"),
+                                          deal_detail=item.get("deal_detail"), timeout=timeout,
+                                          callback=eval(item.get("parse")), base_info=item,
+                                          proxies=False, **request_params)
+
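+                # the claimed task is deleted below as soon as its request has been
+                # yielded; a failed detail fetch is presumably re-queued upstream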
                self.to_db.delete(self.db_name, {"_id": item.get("_id")})
            break

-    def detail_get(self,request,response):
+    def detail_get(self, request, response):

        items = request.item
        list_item = DataBakItem()
        for key in items:
-            list_item.__setitem__(key,items[key])
+            list_item.__setitem__(key, items[key])

        html = ''
        for xpath in request.deal_detail:
            html = response.xpath(xpath).extract_first()  # tender document body
-            if request.is_join_html:
-                if html is not None:
-                    html += html
-            else:
-                if html is not None:
-                    break
-
-        extra_html_info = request.extra_html
-        if html and extra_html_info:
-            for extra_item in extra_html_info:
-                if re.search('^//.*', extra_item):
-                    extra_html = response.xpath(extra_item).extract_first()
-                else:
-                    extra_html = extra_item
-                html = html.replace(extra_html,'')
+            if html is not None:
+                break
+
+        if request.to_dict.get('rm_list', None) and html:
+            rm_list = request.rm_list
+            html = remove_htmldata(rm_list, html, response)
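+            # rm_list replaces the removed extra_html handling: each entry is an
+            # xpath or a literal fragment that remove_htmldata strips from the body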

        list_item.contenthtml = html

-        if request.files_info:  # attachment download
+        if request.files_info:
            files_info = request.files_info
            files = response.xpath(files_info.get("list_xpath"))
-            if len(files)>0:
+            if len(files) > 0:
                attachments = {}
-                for info in files:
+                for index, info in enumerate(files):
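+                    # 'index' is presumably used in the elided 'if not file_name'
+                    # branch to synthesize a name for unnamed attachments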
                    file_url = info.xpath(files_info.get("url_xpath")).extract_first()
                    file_name = info.xpath(files_info.get("name_xpath")).extract_first()
                    if not file_name:
@@ -84,33 +87,47 @@
                        file_type = file_url.split("?")[0].split(".")[-1].lower()
                        if file_type not in files_info.get("files_type"):
                            file_type = file_name.split("?")[0].split(".")[-1].lower()
+                    elif files_info.get("file_type") == "file_name":
+                        file_type = file_name.split("?")[0].split(".")[-1].lower()
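+                        # "file_name" acts as a sentinel: take the extension from
+                        # the attachment's display name rather than from its url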
                    else:
                        file_type = files_info.get("file_type")

+                    if request.proxies:
+                        fpx = request.proxies()
+                    else:
+                        fpx = False
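+                    # request.proxies appears to be a callable returning a proxies
+                    # dict; reuse it for the attachment download, else go direct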
+
                    if file_type in files_info.get("files_type") and files_info.get("url_key") in file_url:
                        attachment = AttachmentDownloader().fetch_attachment(
-                            file_name=file_name,file_type=file_type,download_url=file_url,
-                            enable_proxy=False)
-                        attachments[str(len(attachments)+1)] = attachment
-                if len(attachments)==0:
+                            file_name=file_name, file_type=file_type, download_url=file_url,
+                            enable_proxy=False, proxies=fpx)
+                        attachments[str(len(attachments) + 1)] = attachment
+                if len(attachments) == 0:
                    pass
                else:
-                    list_item.projectinfo={"attachments":attachments}
+                    list_item.projectinfo = {"attachments": attachments}

        yield list_item

+    def detail_json(self, request, response):
+        items = request.item
+        list_item = DataBakItem()
+        for key in items:
+            list_item.__setitem__(key, items[key])
+        exec(request.deal_detail)
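+        # unlike detail_get, deal_detail here is a Python snippet rather than an
+        # xpath list; it is exec'd with 'response' and 'list_item' in scope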

-    def detail_json(self,request,response):
+        yield list_item
+
+    def detail_post(self, request, response):
        items = request.item
        list_item = DataBakItem()
        for key in items:
-            list_item.__setitem__(key,items[key])
-
+            list_item.__setitem__(key, items[key])
        exec(request.deal_detail)

        yield list_item


+
if __name__ == "__main__":
    Details(redis_key="{USER}:${spider_name}").start()