123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869 |
# -*- coding: utf-8 -*-
"""
Created on 2025-06-02
---------
@summary: 江苏土地市场网 (Jiangsu Land Market Network)
---------
@author: lzz
"""
- import feapder
- from items.spider_item import DataBakItem
- from untils.tools import extract_file_type
- from untils.attachment import AttachmentDownloader
- import re
def fileDown(hid):
    """Return ``hid`` unchanged.

    Identity pass-through. Nothing in the visible source calls it directly.
    NOTE(review): it may be looked up by name from task-supplied expressions
    (the spider below ``eval``s strings from the task), so keep the name and
    signature stable until that is confirmed.
    """
    return hid
class Details(feapder.BiddingDetailSpider):
    """Detail-page spider for notices served from www.landjs.com.

    ``start_requests`` turns queued tasks into rendered requests, and
    ``detail_get`` fills a ``DataBakItem`` with the page HTML, the title and
    any attachments linked from the page.
    """

    # Captures the id from hrefs shaped like ``javascript:downLoadAttch('<id>'``.
    _ATTACH_ID_RE = re.compile(r"javascript:downLoadAttch\('(.*?)'")
    # The captured id is appended to this endpoint to form the download URL.
    _ATTACH_URL_PREFIX = "http://www.landjs.com/tAfficheParcel/fileDownLoad/"
    # Page sections whose HTML is concatenated, in this order, into contenthtml.
    _CONTENT_DIV_IDS = ("resultInfo", "bargainInfo", "formInfo", "afficheInfo")

    def start_requests(self):
        """Yield one rendered ``feapder.Request`` per task pulled from the queue.

        Each task is read as a dict with the keys ``parse_url``, ``parse``,
        ``deal_detail`` and (optionally) ``request_params``.
        """
        tasks = self.get_tasks_by_rabbitmq(limit=10) or []
        for item in tasks:
            # A task without "request_params" would otherwise fail on None.get().
            request_params = item.get("request_params") or {}
            # Remove the timeout from the dict (default 10) so it is not passed
            # twice once the remaining params are unpacked below.
            timeout = request_params.pop("timeout", 10)
            yield feapder.Request(
                url=item.get("parse_url"),
                item=item,
                proxies=False,
                render_time=5,
                render=True,
                deal_detail=item.get("deal_detail"),
                # SECURITY(review): eval() runs a string taken from the task
                # payload. This is only safe while the queue is fully trusted;
                # consider resolving the callback by name with getattr().
                callback=eval(item.get("parse")),
                **request_params,
                timeout=timeout,
            )

    def detail_get(self, request, response):
        """Build a ``DataBakItem`` from the detail page and yield it.

        Args:
            request: the request whose ``item`` dict seeds the output item.
            response: the page response, queried with XPath.

        Yields:
            The populated ``DataBakItem``.
        """
        list_item = DataBakItem(**request.item)

        # Title: every text node of the first <li>, with all whitespace removed.
        raw_title = "".join(
            response.xpath(
                '//div[@class="mainTwo-middle notice"]/ul[1]/li[1]//text()'
            ).extract()
        )
        s_title = "".join(raw_title.split())
        if s_title:
            # NOTE(review): the original indentation was lost; both assignments
            # are assumed to sit under this guard so an empty page title never
            # overwrites the values carried in from the task. Confirm.
            list_item.title = s_title
            list_item.s_title = s_title

        # Missing sections contribute an empty string.
        list_item.contenthtml = "".join(
            response.xpath(f'//div[@id="{div_id}"]').extract_first("")
            for div_id in self._CONTENT_DIV_IDS
        )

        attachments = {}
        links = response.xpath(
            '//div[@class="mainContent"]//a[contains(@href, "downLoadAttch")]'
        )
        for link in links:
            match = self._ATTACH_ID_RE.search(link.xpath("./@href").extract_first(""))
            fid = match.group(1) if match else ""
            if not fid:
                # Without an id the URL would be the bare endpoint; skip the
                # link instead of attempting a download that cannot succeed.
                continue
            file_name = link.xpath("./text()").extract_first("").strip()
            file_url = f"{self._ATTACH_URL_PREFIX}{fid}"
            file_type = extract_file_type(file_name=file_name, file_url=file_url)
            if not file_type:
                continue
            attachment = AttachmentDownloader().fetch_attachment(
                file_name=file_name,
                file_type=file_type,
                download_url=file_url,
            )
            # Keys are 1-based positions, stored as strings.
            attachments[str(len(attachments) + 1)] = attachment

        if attachments:
            list_item.projectinfo = {"attachments": attachments}
        yield list_item
if __name__ == "__main__":
    # Start the spider only when this file is executed as a script.
    Details(redis_key="lzz:jstdscw_jydt").start()
|