# -*- coding: utf-8 -*-
"""
Created on 2024-10-10
---------
@summary: Dedicated downloader for Qianlima detail pages
---------
@author: Dzr
"""
import copy
import functools
import json

import execjs  # noqa: F401  -- no longer used in this module; import retained so the file's import surface is unchanged
import requests
from loguru import logger

# Source endpoint: rest/detail/alltypesdetail/detail
# Module-level request state shared by every download call; configured
# through set_cookies / set_headers / set_proxies.
_cookies = {}
_headers = {}
_proxies = None

# Category ids that are served by projectDetail.html; every other value
# (including a missing one) is served by tenderDetail.html.
_PROJECT_CATEGORY_IDS = (101, 202)


def router(func):
    """Decorate a download entry point so known failures become return values.

    Behavior of the wrapper:

    * ``AssertionError`` -> logged with traceback, returns ``(False, {})``.
    * ``requests.exceptions.RequestException`` -> logged with traceback,
      returns ``(False, {})``.
    * ``KeyboardInterrupt`` -> swallowed; the wrapper falls through and
      returns ``None``.
    * Any other exception propagates to the caller.

    NOTE(review): the ``(False, {})`` tuple differs in shape from what the
    wrapped functions in this module return on their own (a scalar/dict), and
    a non-empty tuple is truthy. Kept as-is because callers may depend on it.
    NOTE(review): swallowing ``KeyboardInterrupt`` looks deliberate but makes
    a calling loop hard to interrupt -- confirm this is intended.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except AssertionError:
            logger.exception('账号异常')
            return False, {}
        except KeyboardInterrupt:
            pass
        except requests.exceptions.RequestException as e:
            logger.exception(f'网络请求错误, 原因:{e}')
            return False, {}

    return wrapper


def set_cookies(ck):
    """Replace the module-level cookies sent with every request."""
    global _cookies
    _cookies = ck


def set_headers(h):
    """Replace the module-level headers used for detail requests."""
    global _headers
    _headers = h


def set_proxies(p):
    """Replace the module-level proxies passed to ``requests``."""
    global _proxies
    _proxies = p


def _extract_cid(href):
    """Extract the content id (cid) from a detail page URL.

    Two URL shapes are recognised, tried in this order:

    1. URLs containing ``/zb/detail``: the segment after the first ``_`` is
       taken, its first ``.html`` is removed, and if a ``-`` remains the part
       between the first and second ``-`` is used.
    2. Any URL containing ``-``: the text after the last ``-`` up to the
       first ``.html`` is used.

    :param href: detail page URL (must be a ``str``)
    :return: the cid string (possibly empty), or ``None`` when neither shape
        matches
    """
    if '/zb/detail' in href:
        parts = href.split('_')
        if len(parts) > 1:
            # Only the first ".html" is dropped, matching a plain-string
            # replace rather than a global one.
            cid = parts[1].replace('.html', '', 1)
            if '-' in cid:
                cid = cid.split('-')[1]
            return cid

    if '-' in href:
        tail = href.rsplit('-', 1)[1]
        return tail.split('.html')[0]

    return None


def _extract_referer(href, cid):
    """Build the Referer URL of the detail page that belongs to ``cid``.

    Queries the ``getcatid`` endpoint for the category id of ``cid`` and maps
    it to the matching page on ``detail.vip.qianlima.com``.

    :param href: original detail page URL; sent as the Referer of the lookup
        request with ``http:`` rewritten to ``https:``
    :param cid: content id as a string
    :return: ``https://detail.vip.qianlima.com/<page>?id=<cid>``
    :raises AssertionError: the lookup did not answer with HTTP 200
        (``router`` reports this as an account problem)
    :raises requests.exceptions.RequestException: network failure or timeout
    :raises ValueError: the response body is not valid JSON
    """
    href = str(href).replace('http:', 'https:')
    url = 'https://www.qianlima.com/website-seo/v2/cm/getcatid/' + cid
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': href,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    response = requests.get(
        url,
        timeout=10,
        headers=headers,
        cookies=_cookies,
        proxies=_proxies,
    )
    # Raised explicitly instead of using an ``assert`` statement so the check
    # is not stripped when Python runs with -O.
    if response.status_code != 200:
        raise AssertionError(
            f'unexpected status code {response.status_code} from {url}'
        )

    payload = json.loads(response.content.decode())
    cat_id = payload.get('data') if isinstance(payload, dict) else None
    if cat_id in _PROJECT_CATEGORY_IDS:
        page_name = 'projectDetail.html'
    else:
        page_name = 'tenderDetail.html'

    return f'https://detail.vip.qianlima.com/{page_name}?id={cid}'


def _download_detail(href, referer=False):
    """Fetch the detail payload for ``href`` from the detail REST endpoint.

    :param href: detail page URL the cid is extracted from
    :param referer: when truthy, resolve the real detail page URL first and
        send it as the ``Referer`` header
    :return: a ``(ok, payload)`` tuple:

        * ``(True, data)``  -- success, ``data`` is the response's ``data``
        * ``(False, None)`` -- the request timed out
        * ``(False, status_code)`` -- the server answered with a non-200 code
        * ``(False, data)`` -- 200 response whose ``data`` is empty/missing
          (per the original author's note, ``data`` is ``None`` when the
          account quota is exhausted)
    :raises ValueError: no cid could be extracted from ``href``
    """
    # Deep copy so adding a Referer never leaks into the shared headers.
    headers = copy.deepcopy(_headers)

    cid = _extract_cid(href)
    if not cid:
        raise ValueError('cid is not exist')

    url = 'https://detail.vip.qianlima.com/rest/detail/alltypesdetail/detail/' + cid
    if referer:
        headers['Referer'] = _extract_referer(href, cid)

    try:
        response = requests.post(
            url,
            timeout=10,
            headers=headers,
            cookies=_cookies,
            proxies=_proxies,
        )
    except requests.exceptions.Timeout:
        logger.error(f'采集失败|访问超时|{href}')
        return False, None

    status_code = response.status_code
    if status_code != 200:
        # The username is only needed for this log line; look it up leniently
        # so missing cookies cannot turn a logged failure into a KeyError.
        username = (_cookies or {}).get('qlm_username')
        # errors='replace': the body is only logged, so a non-UTF-8 error
        # page must not raise here.
        result = response.content.decode(errors='replace')
        logger.error(f'采集失败|{username}|状态码|{status_code}|请求响应|{result}')
        return False, status_code

    result = response.json()
    data = result.get('data') if isinstance(result, dict) else None
    if not data:
        logger.warning(f'数据异常|{result}')
        return False, data  # None when the account quota is exhausted

    logger.info(f'采集成功[{href}]')
    return True, data


@router
def download_json(href, **kwargs):
    """Download the detail payload for ``href``.

    :param href: detail page URL
    :param kwargs: forwarded to ``_download_detail`` (e.g. ``referer=True``)
    :return: the detail ``data`` on success; the integer HTTP status code
        when the server answered with a non-200 code; ``False`` on timeout or
        empty data. See ``router`` for the values returned on handled
        exceptions.
    """
    _, result = _download_detail(href, **kwargs)
    if not result:
        return False
    return result


@router
def download_html(href, **kwargs):
    """Download the detail for ``href`` and return its ``content`` field.

    :param href: detail page URL
    :param kwargs: forwarded to ``_download_detail`` (e.g. ``referer=True``)
    :return: ``data['content']`` on success; ``False`` on timeout, empty
        data, or a non-200 response. See ``router`` for the values returned
        on handled exceptions.
    """
    _, result = _download_detail(href, **kwargs)
    # A non-200 response yields the integer status code here; indexing it
    # would raise TypeError, so anything that is not a dict is a failure.
    if not result or not isinstance(result, dict):
        return False
    return result['content']