# -*- coding: utf-8 -*-
"""
Created on 2024-10-10
---------
@summary: Dedicated downloader for Qianlima (千里马) detail pages
---------
@author: Dzr
"""
- import copy
- import functools
- import json
- import execjs
- import requests
- from loguru import logger
# Source endpoint: rest/detail/alltypesdetail/detail
# Module-level request state, passed to every `requests` call made by
# `_extract_referer` and `_download_detail`. Replace the values through
# `set_cookies`, `set_headers` and `set_proxies`.
_cookies = {}  # passed as `cookies=`; `_download_detail` also reads 'qlm_username' from it
_headers = {}  # base headers, deep-copied per detail request before 'Referer' is added
_proxies = None  # passed as `proxies=` to requests
def router(func):
    """Wrap a download entry point so known failures are logged, not raised.

    The returned wrapper forwards all arguments to *func* and hands back its
    result unchanged when no exception is raised. Otherwise:

    * ``AssertionError`` is logged as an account problem and the wrapper
      returns ``(False, {})``.
    * ``requests.exceptions.RequestException`` is logged as a network error
      and the wrapper returns ``(False, {})``.
    * ``KeyboardInterrupt`` is swallowed and the wrapper returns ``None``.

    Any other exception propagates to the caller.

    NOTE(review): the error paths return a 2-tuple while the functions
    decorated in this module return a single value; ``(False, {})`` is a
    truthy tuple, so callers must not rely on a plain truthiness check.
    Confirm against callers before changing.
    """

    @functools.wraps(func)
    def guarded(*args, **kwargs):
        try:
            outcome = func(*args, **kwargs)
        except AssertionError:
            logger.exception('账号异常')
            return False, {}
        except KeyboardInterrupt:
            # Ctrl-C is absorbed here; the wrapper yields None.
            return None
        except requests.exceptions.RequestException as e:
            logger.exception(f'网络请求错误, 原因:{e}')
            return False, {}
        else:
            return outcome

    return guarded
def set_cookies(ck):
    """Replace the module-wide cookies sent with every request.

    The object is stored as-is (not copied) in the module global
    ``_cookies``.
    """
    global _cookies
    _cookies = ck
def set_headers(h):
    """Replace the module-wide base headers used for detail requests.

    The object is stored as-is (not copied) in the module global
    ``_headers``.
    """
    global _headers
    _headers = h
def set_proxies(p):
    """Replace the module-wide proxy configuration passed to ``requests``.

    The object is stored as-is (not copied) in the module global
    ``_proxies``.
    """
    global _proxies
    _proxies = p
def _extract_cid(href):
    """Extract the content id (cid) from a detail-page URL.

    Pure-Python port of the page's ``extractCid`` JavaScript helper, so no
    JavaScript runtime has to be compiled and invoked for each URL.

    Rules, applied in order:

    1. If the URL contains ``/zb/detail`` and at least one ``_``: take the
       segment after the first ``_``, drop the first ``.html``, and if what
       remains contains ``-`` keep the part between the first and second
       ``-``.
    2. Otherwise, if the URL contains ``-``: take everything after the last
       ``-`` up to the first ``.html``.

    Args:
        href: Detail-page URL.

    Returns:
        The cid as a string (possibly empty), or ``None`` when *href* is
        empty or matches neither rule.
    """
    if not href:
        return None
    href = str(href)

    if '/zb/detail' in href:
        segments = href.split('_')
        if len(segments) > 1:
            # JS String.replace with a string pattern replaces only the
            # first occurrence, hence count=1.
            cid = segments[1].replace('.html', '', 1)
            if '-' in cid:
                cid = cid.split('-')[1]
            return cid

    if '-' in href:
        tail = href.rsplit('-', 1)[1]
        return tail.split('.html')[0]

    return None
def _extract_referer(href, cid):
    """Build the detail-page URL to use as the ``Referer`` header for *cid*.

    Queries the ``getcatid`` endpoint for the category id of *cid* and maps
    it to the matching page on ``detail.vip.qianlima.com``: category ids 101
    and 202 map to ``projectDetail.html``; every other value (including
    301, 601, 201 and a missing value) maps to ``tenderDetail.html``.

    Args:
        href: Original page URL; sent as ``Referer`` of the category lookup
            with ``http:`` upgraded to ``https:``.
        cid: Content id string, as returned by ``_extract_cid``.

    Returns:
        ``https://detail.vip.qianlima.com/<page>?id=<cid>``.

    Raises:
        AssertionError: The lookup did not answer with HTTP 200. ``router``
            catches this type and reports it as an account problem.
        requests.exceptions.RequestException: Network failure or timeout.
        ValueError: The response body is not valid JSON.
    """
    href = str(href).replace('http:', 'https:')
    url = 'https://www.qianlima.com/website-seo/v2/cm/getcatid/' + cid
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': href,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    response = requests.get(
        url,
        headers=headers,
        cookies=_cookies,
        proxies=_proxies,
        timeout=10,
    )
    if response.status_code != 200:
        # Raised explicitly rather than via `assert`: assert statements are
        # removed under `python -O`, which would silently skip this check.
        raise AssertionError(
            f'unexpected status code {response.status_code} from {url}'
        )

    payload = json.loads(response.content.decode())
    cat_id = payload.get('data') if isinstance(payload, dict) else None
    if cat_id in (101, 202):
        page_name = 'projectDetail.html'
    else:
        page_name = 'tenderDetail.html'
    return 'https://detail.vip.qianlima.com/' + page_name + '?id=' + cid
def _download_detail(href, referer=False):
    """Fetch the detail payload for the page at *href*.

    Args:
        href: Detail-page URL; its cid is extracted with ``_extract_cid``.
        referer: When truthy, resolve the detail-page URL through
            ``_extract_referer`` and send it as the ``Referer`` header.

    Returns:
        A ``(ok, payload)`` tuple:

        * ``(True, data)`` - the ``data`` member of the JSON response.
        * ``(False, None)`` - the request timed out.
        * ``(False, status_code)`` - the server answered with a non-200
          status; ``status_code`` is an int.
        * ``(False, data)`` - the response carried a falsy ``data`` member
          (``None`` when the account quota is exhausted).

    Raises:
        ValueError: No cid could be extracted from *href*.
        KeyError: The JSON response has no ``data`` member.
        requests.exceptions.RequestException: Non-timeout network failure.
    """
    cid = _extract_cid(href)
    if not cid:
        raise ValueError('cid is not exist')

    # Work on a copy so adding the Referer never leaks into the shared headers.
    headers = copy.deepcopy(_headers)
    if referer:
        headers['Referer'] = _extract_referer(href, cid)

    url = 'https://detail.vip.qianlima.com/rest/detail/alltypesdetail/detail/' + cid
    try:
        response = requests.post(
            url,
            headers=headers,
            cookies=_cookies,
            proxies=_proxies,
            timeout=10,
        )
    except requests.exceptions.Timeout:
        logger.error(f'采集失败|访问超时|{href}')
        return False, None  # timed out: nothing to report

    status_code = response.status_code
    if status_code != 200:
        # The username is only needed for this log line; look it up
        # tolerantly so a missing cookie cannot raise KeyError.
        username = (_cookies or {}).get('qlm_username')
        # Error pages are not guaranteed to be UTF-8; never fail while logging.
        body = response.content.decode(errors='replace')
        logger.error(f'采集失败|{username}|状态码|{status_code}|请求响应|{body}')
        return False, status_code

    result = response.json()
    data = result['data']
    if not data:
        logger.warning(f'数据异常|{result}')
        return False, data  # None when the account quota is exhausted

    logger.info(f'采集成功[{href}]')
    return True, data
@router
def download_json(href, **kwargs):
    """Download the detail payload for *href*.

    Keyword arguments are forwarded to ``_download_detail``.

    Returns:
        The payload from ``_download_detail`` when it is truthy; this
        includes the integer HTTP status code of a non-200 response.
        ``False`` when the payload is falsy (timeout, empty data).
        Failures handled by ``router`` produce that decorator's return
        value instead.
    """
    _ok, payload = _download_detail(href, **kwargs)
    # A non-200 status code is a truthy int and is passed through unchanged.
    return payload or False
@router
def download_html(href, **kwargs):
    """Download the detail page for *href* and return its ``content`` member.

    Keyword arguments are forwarded to ``_download_detail``.

    Returns:
        ``data['content']`` on success, ``False`` when the download failed
        (timeout, non-200 status, empty data). Failures handled by
        ``router`` produce that decorator's return value instead.

    Raises:
        KeyError: The payload has no ``content`` member.
    """
    ok, result = _download_detail(href, **kwargs)
    # On a non-200 response the payload is the integer status code, which is
    # truthy; subscripting it would raise TypeError, so reject non-dicts.
    if not ok or not isinstance(result, dict):
        return False
    return result['content']
|