|
@@ -1,49 +1,49 @@
|
|
# -*- coding: utf-8 -*-
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
"""
|
|
-Created on 2023-4-28
|
|
|
|
|
|
+Created on 2024-12-05
|
|
---------
|
|
---------
|
|
@summary: 成都市城市管理委员会-公示公告
|
|
@summary: 成都市城市管理委员会-公示公告
|
|
---------
|
|
---------
|
|
@author: lzz
|
|
@author: lzz
|
|
"""
|
|
"""
|
|
-
|
|
|
|
import feapder
|
|
import feapder
|
|
from items.spider_item import DataBakItem
|
|
from items.spider_item import DataBakItem
|
|
from untils.attachment import AttachmentDownloader
|
|
from untils.attachment import AttachmentDownloader
|
|
|
|
+from untils.tools import extract_file_type
|
|
from feapder.utils.tools import log
|
|
from feapder.utils.tools import log
|
|
|
|
|
|
|
|
|
|
headers = {
|
|
headers = {
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
|
- "Accept-Language": "zh-CN,zh;q=0.9",
|
|
|
|
|
|
+ "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
|
|
"Cache-Control": "no-cache",
|
|
"Cache-Control": "no-cache",
|
|
- "Connection": "keep-alive",
|
|
|
|
"Pragma": "no-cache",
|
|
"Pragma": "no-cache",
|
|
|
|
+ "Origin": "https://cgw.chengdu.gov.cn/cgw/c128900/sy.shtml",
|
|
"Upgrade-Insecure-Requests": "1",
|
|
"Upgrade-Insecure-Requests": "1",
|
|
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
|
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
+
|
|
class Details(feapder.BiddingDetailSpider):
|
|
class Details(feapder.BiddingDetailSpider):
|
|
__custom_setting__ = dict(
|
|
__custom_setting__ = dict(
|
|
WEBDRIVER=dict(
|
|
WEBDRIVER=dict(
|
|
driver_type="FIREFOX",
|
|
driver_type="FIREFOX",
|
|
|
|
+ pool_size=1,
|
|
|
|
+ headless=True,
|
|
|
|
+ usages_local_driver=True
|
|
)
|
|
)
|
|
)
|
|
)
|
|
|
|
|
|
- ct = 0
|
|
|
|
def start_requests(self):
|
|
def start_requests(self):
|
|
- data_list = self.get_tasks_by_rabbitmq(limit=20)
|
|
|
|
|
|
+ data_list = self.get_tasks_by_rabbitmq(limit=10)
|
|
for item in data_list:
|
|
for item in data_list:
|
|
- log.debug(item)
|
|
|
|
|
|
+ # log.debug(item)
|
|
request_params = item.get("request_params")
|
|
request_params = item.get("request_params")
|
|
|
|
|
|
yield feapder.Request(url=item.get("parse_url"), item=item,proxies=False,
|
|
yield feapder.Request(url=item.get("parse_url"), item=item,proxies=False,
|
|
- deal_detail=item.get("deal_detail"),render=True,render_time=3,
|
|
|
|
|
|
+ deal_detail=item.get("deal_detail"),render=True,render_time=5,
|
|
callback=eval(item.get("parse")), **request_params)
|
|
callback=eval(item.get("parse")), **request_params)
|
|
|
|
|
|
-
|
|
|
|
-
|
|
|
|
def detail_get(self,request,response):
|
|
def detail_get(self,request,response):
|
|
|
|
|
|
items = request.item
|
|
items = request.item
|
|
@@ -60,17 +60,15 @@ class Details(feapder.BiddingDetailSpider):
|
|
if file_list:
|
|
if file_list:
|
|
attachments = {}
|
|
attachments = {}
|
|
for info in file_list:
|
|
for info in file_list:
|
|
- file_name = info.xpath('./text()').extract_first().strip()
|
|
|
|
- file_url = info.xpath('./@href').extract_first().strip()
|
|
|
|
- file_type = file_url.split('.')[-1].lower()
|
|
|
|
- file_types = ['zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
|
|
|
|
- 'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps']
|
|
|
|
|
|
+ file_name = info.xpath('./text()').extract_first("").strip()
|
|
|
|
+ file_url = info.xpath('./@href').extract_first("").strip()
|
|
|
|
+ file_type = extract_file_type(file_name,file_url)
|
|
ck = response.cookies.get_dict()
|
|
ck = response.cookies.get_dict()
|
|
- headers['Cookie'] = ";".join([i + "=" + ck.get(i) for i in ck])
|
|
|
|
- if file_type in file_types:
|
|
|
|
|
|
+ if file_type:
|
|
|
|
+ headers['Referer'] = file_url
|
|
attachment = AttachmentDownloader().fetch_attachment(
|
|
attachment = AttachmentDownloader().fetch_attachment(
|
|
file_name=file_name, file_type=file_type, download_url=file_url,
|
|
file_name=file_name, file_type=file_type, download_url=file_url,
|
|
- enable_proxy=False, headers=headers)
|
|
|
|
|
|
+ cookies=ck, headers=headers)
|
|
attachments[str(len(attachments) + 1)] = attachment
|
|
attachments[str(len(attachments) + 1)] = attachment
|
|
|
|
|
|
if attachments:
|
|
if attachments:
|