# -*- coding: utf-8 -*-
"""
Created on 2024-01-08
---------
@summary: Hubei Provincial Government Procurement Online Mall - detail pages
---------
@author: lzz
"""
import re

import requests

import feapder
from feapder.utils.log import log
from items.spider_item import DataBakItem
from untils.attachment import AttachmentDownloader
from untils.get_imgcode import jy_ocr
from untils.tools import get_proxy
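

# Fetch the site's captcha page, OCR the embedded image with the project's
# jy_ocr helper, and return the recognized code together with the session
# cookies that must accompany the subsequent detail request.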
def ocr_captcha(headers, proxies=None, max_retries=10):
    session = requests.Session()
    if proxies:
        session.proxies = proxies
    src_pattern = re.compile(r"'src', '(.*?)'", flags=re.S)  # captcha image src
    href = 'http://wssc.hubeigp.gov.cn/simple_captcha'
    code = ''
    for _ in range(max_retries):
        resp1 = session.get(href, headers=headers, timeout=30, verify=False)
        text = resp1.content.decode()
        img_url = "http://wssc.hubeigp.gov.cn" + "".join(src_pattern.findall(text))
        resp2 = session.get(img_url, headers=headers, timeout=30, verify=False)
        code = jy_ocr(image=resp2.content)
        if code and len(code) == 6:  # a valid code is 6 characters; retry otherwise
            break
    return code, session.cookies.get_dict()


class Details(feapder.BiddingDetailSpider):

    __custom_setting__ = dict(
        SPIDER_MAX_RETRY_TIMES=10
    )
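
    # Runs once at spider start: set up the shared proxy, the request
    # headers, and the whitelist of attachment extensions worth downloading.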
    def start_callback(self):
        self.proxy = get_proxy()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Pragma": "no-cache",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
        }
        self.file_types = [
            'zip', 'docx', 'ftp', 'pdf', 'doc', 'rar', 'gzzb', 'jpg',
            'png', 'zbid', 'xls', 'xlsx', 'swp', 'dwg', 'wps', 'ofd'
        ]
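
    # Detail tasks arrive in batches from RabbitMQ; each task record names
    # its parse callback and carries its own request parameters.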
    def start_requests(self):
        data_list = self.get_tasks_by_rabbitmq(limit=50)
        for item in data_list:
            log.debug(item)
            request_params = item.get("request_params") or {}
            yield feapder.Request(url=item.get("parse_url"),
                                  callback=eval(item.get("parse")),  # resolve the callback by name, e.g. "detail_get"
                                  deal_detail=item.get("deal_detail"),
                                  verify=False,
                                  item=item,
                                  **request_params)
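
    # Every detail request must carry a freshly solved captcha and the
    # cookies of the session that fetched it.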
    def download_midware(self, request):
        captcha, cookies = ocr_captcha(self.headers, self.proxy)
        request.params = {"captcha": captcha}
        request.cookies = cookies
        request.headers = self.headers
        request.proxies = self.proxy
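
    # feapder calls validate() before the parse callback; raising here marks
    # the response as bad, so the request is retried (up to SPIDER_MAX_RETRY_TIMES).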
    def validate(self, request, response):
        html = response.xpath('//div[@class="jggs_main clearfix"]').extract_first()
        if html is None:
            raise ValueError('Detail page content is empty!')
        return True
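
    # Parse the detail page: store the announcement HTML and download any
    # linked attachments whose extension is on the whitelist.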
    def detail_get(self, request, response):
        items = request.item
        data_item = DataBakItem(**items)

        html = response.xpath('//div[@class="jggs_main clearfix"]').extract_first()
        data_item.contenthtml = html

        attachments = {}
        file_list = response.xpath('//div[@class="jm_textcon"]//a[@href]')
        for info in file_list:
            file_url = info.xpath('./@href').extract_first()
            file_name = info.xpath('./text()').extract_first()
            if not file_name or not file_url:
                continue
            file_name = file_name.strip()
            # Prefer the extension from the link text; fall back to the URL.
            file_type = file_name.split('.')[-1].lower()
            if file_type not in self.file_types:
                file_type = file_url.split('.')[-1].lower()
            if file_type in self.file_types and "file" in file_url:
                attachment = AttachmentDownloader().fetch_attachment(
                    file_name=file_name,
                    file_type=file_type,
                    download_url=file_url
                )
                attachments[str(len(attachments) + 1)] = attachment
        if attachments:
            data_item.projectinfo = {"attachments": attachments}

        yield data_item
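
    # On any request failure, rotate the proxy and requeue the request.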
    def exception_request(self, request, response):
        self.proxy = get_proxy()
        yield request


if __name__ == "__main__":
    Details(redis_key="lzz:Hbszfcgwssc").start()