# -*- coding: utf-8 -*-
"""
Created on 2024-10-10
---------
@summary: Dedicated downloader for Qianlima (千里马) detail pages.
---------
"""
import copy
import functools
import json
from datetime import datetime

import execjs  # NOTE(review): no longer used (JS helpers ported to Python below); retained deliberately
import requests

from rgg.log import logger

# Module-wide request state, configured via the set_* helpers below.
_cookies = {}
_headers = {}
_proxies = None


def _account_supervision(func):
    """Retry decorator: run *func* up to 3 times on network errors.

    An AssertionError (raised on a non-200 response, see _check_status) is
    treated as an account problem: log it, push a WeChat warning and return
    None. When every attempt fails with a RequestException, the last one is
    re-raised to the caller.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        err = None
        for _ in range(3):
            try:
                return func(*args, **kwargs)
            except requests.exceptions.RequestException as e:
                err = e
            except AssertionError:
                logger.error('账号异常')
                send_wechat_warning('浙移集成|访问失败|账号异常')
                return
        if err is not None:
            raise err

    return wrapper


def set_cookies(ck):
    """Install the cookies dict used by every request in this module."""
    global _cookies
    _cookies = ck


def set_headers(h):
    """Install the headers dict used by every request in this module."""
    global _headers
    _headers = h


def set_proxies(p):
    """Install the requests-style proxies mapping used by this module."""
    global _proxies
    _proxies = p


def get_proxies(scheme=None):
    """Return the configured proxies.

    Without *scheme*, return the whole proxies mapping (or None when none is
    configured). With *scheme*, return that scheme's proxy address with any
    'socks5://' prefix stripped, or None when no proxy is configured for the
    scheme (the previous version raised AttributeError in that case).
    """
    if _proxies is None:
        return None
    if scheme is None:
        return _proxies
    proxy = _proxies.get(scheme)
    # Bug fix: guard the lookup — .replace() on a missing (None) entry crashed.
    return None if proxy is None else proxy.replace('socks5://', '')


def _check_status(response):
    """Raise AssertionError when *response* is not HTTP 200.

    An explicit raise (not a bare `assert`, which is stripped under
    `python -O`) so _account_supervision reliably sees account problems.
    """
    if response.status_code != 200:
        raise AssertionError(f'unexpected status code: {response.status_code}')


def _extract_cid(href):
    """Extract the article cid from a Qianlima detail URL.

    Faithful pure-Python port of the previously embedded JavaScript
    (removes an execjs round-trip). Returns None when no cid can be found
    (where the JS returned `undefined`).
    """
    url = str(href)
    if '/zb/detail' in url:
        parts = url.split('_')
        if len(parts) > 1:
            # JS String.replace only replaces the first occurrence; '.html'
            # appears at most once in a cid, so str.replace is equivalent.
            cid = parts[1].replace('.html', '')
            if '-' in cid:
                cid = cid.split('-')[1]
            return cid
    if '-' in url:
        # Everything after the last '-' up to the '.html' suffix.
        return url.rsplit('-', 1)[1].split('.html')[0]
    return None


def _extract_referer(href, cid):
    """Resolve the vip detail-page URL to use as a Referer for *cid*.

    Asks the SEO endpoint for the category id of *cid* and maps it to a page
    name (202/101 -> projectDetail.html, anything else -> tenderDetail.html);
    pure-Python port of the previously embedded JavaScript switch.

    :raises AssertionError: on a non-200 response (account supervision)
    """
    href = str(href).replace('http:', 'https:')
    url = 'https://www.qianlima.com/website-seo/v2/cm/getcatid/' + cid
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': href,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    response = requests.get(url, timeout=10, headers=headers,
                            cookies=_cookies, proxies=_proxies)
    _check_status(response)
    data = json.loads(response.content.decode())
    cat_id = data.get('data')
    # 202 / 101 are project pages; 301, 601, 201 and anything else are tenders.
    page_name = 'projectDetail.html' if cat_id in (202, 101) else 'tenderDetail.html'
    return 'https://detail.vip.qianlima.com/' + page_name + '?id=' + cid


def _download_detail(href, referer=False, timeout=10):
    """POST the vip detail API for *href* and return its 'data' payload.

    :param href: public detail-page url; the cid is parsed out of it
    :param referer: when truthy, resolve and send a Referer header first
    :param timeout: request timeout in seconds
    :raises ValueError: when no cid can be extracted from *href*
    :raises AssertionError: on a non-200 response (account supervision)
    """
    headers = copy.deepcopy(_headers)
    cid = _extract_cid(href)
    if not cid:
        raise ValueError('cid is not exist')
    url = 'https://detail.vip.qianlima.com/rest/detail/alltypesdetail/detail/' + cid
    if referer:
        headers['Referer'] = _extract_referer(href, cid)
    response = requests.post(url, timeout=timeout, headers=headers,
                             cookies=_cookies, proxies=_proxies)
    _check_status(response)
    result = response.json()
    data = result['data']
    if not data:
        logger.warning(f'下载异常|{result}')
        return data
    logger.info(f'下载成功|{href}')
    return data


@_account_supervision
def download_html(href, **kwargs):
    """Return the html 'content' of a detail page, or None when empty."""
    result = _download_detail(href, **kwargs)
    if not result:
        return
    return result['content']


@_account_supervision
def download_json(href, **kwargs):
    """Return the raw detail payload.

    Returns False when the download yielded nothing at all, and None when
    the article was withdrawn, i.e. the API answered
    {"code":700053,"msg":"该条信息已被撤销,请重新检索","data":null}.
    """
    result = _download_detail(href, timeout=30, **kwargs)
    if result is None:
        return False
    if 'code' in result and result['code'] == 700053:
        logger.warning(f'检索失败|{result}')
        return
    return result


@_account_supervision
def download_list(keywords, page, page_size, **kwargs):
    """Search the vip list API and return the rows of one result page.

    :param keywords: title search keywords
    :param page: 1-based page number
    :param page_size: maximum number of items per page
    :param kwargs: optional begin_time / end_time ('YYYY-MM-DD'; default today)
    :return: list of row dicts, or None on failure
    :raises AssertionError: on a non-200 response (account supervision)
    """
    today = datetime.now().strftime('%Y-%m-%d')
    begin_time = kwargs.get('begin_time') or today
    end_time = kwargs.get('end_time') or today
    url = 'https://search.vip.qianlima.com/rest/service/website/search/solr'
    data = {
        "keywords": keywords,  # title keywords
        "timeType": 4,  # custom time-range type
        "beginTime": begin_time,
        "endTime": end_time,
        "filtermode": 2,
        "searchMode": 0,
        "currentPage": page,  # page number
        "numPerPage": page_size,  # max items per page
        "sortType": 6,
        "allType": -1,
        "noticeSegmentTypeStr": "",
        "beginAmount": "",
        "endAmount": "",
        "purchasingUnitIdList": "",
        "threeClassifyTagStr": "",
        "fourLevelCategoryIdListStr": "",
        "threeLevelCategoryIdListStr": "",
        "levelId": "",
        "tab": 0,
        "searchDataType": 0,
        "types": "-1",
        "showContent": 1,
        "hasTenderTransferProject": 1,
        "newAreas": "",
        "hasChooseSortType": 1,
        "summaryType": 0
    }
    response = requests.post(
        url,
        cookies=_cookies,
        headers=_headers,
        json=data,
        timeout=60,
        proxies=_proxies
    )
    _check_status(response)
    result = response.json()
    try:
        # Probe that result['data'] is subscriptable — it is None on failure.
        result['data']['rowCount']
    except TypeError:
        logger.error(f'下载失败|{keywords}|第{page}页|{result}')
        return
    lst = result['data']
    if not lst:
        logger.warning(f'数据异常|{keywords}|第{page}页|{result}')
        return
    logger.debug(f'下载成功|{keywords}|第{page}页')
    return result['data']['data']


def send_wechat_warning(msg, send=True):
    """Push a markdown warning to the ops WeChat-Work group webhook.

    :param msg: detail text embedded into the message body
    :param send: when False, only log the message locally instead of sending
    """
    markdown = '千里马会员账号采集异常,请相关同事注意。'
    markdown += f'\n>异常详情:**{msg}**'
    if not send:
        logger.info(markdown)
        return
    # NOTE(review): webhook key is hard-coded; consider moving it to config.
    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=bf53d666-bfa7-4176-b3e2-2d4b9d8a3bea'
    headers_ = {'Content-Type': 'application/json'}
    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown}}
    request_params = dict(headers=headers_, json=json_data, timeout=10)
    response = requests.post(url, **request_params)
    logger.info(response.json())