|
@@ -0,0 +1,270 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+Created on 2024-10-10
|
|
|
+---------
|
|
|
+@summary: 千里马详情页专用下载器
|
|
|
+---------
|
|
|
+
|
|
|
+"""
|
|
|
+import copy
|
|
|
+import functools
|
|
|
+from datetime import datetime
|
|
|
+
|
|
|
+import execjs
|
|
|
+import requests
|
|
|
+
|
|
|
+from rgg.log import logger
|
|
|
+
|
|
|
# Cookies sent with every request issued by this module; replaced wholesale
# through set_cookies().
_cookies = {}
# Base HTTP headers for detail/list requests; replaced wholesale through
# set_headers().
_headers = {}
# Proxy mapping handed to ``requests`` (None disables proxying); replaced
# through set_proxies().
_proxies = None
|
|
|
+
|
|
|
+
|
|
|
def _account_supervision(func):
    """Decorate a download function with retry and account-failure alerting.

    The wrapped function is called up to three times:

    * ``requests.exceptions.RequestException`` triggers another attempt; the
      last such error is re-raised once all attempts are used up.
    * ``AssertionError`` (raised by the download helpers on a non-200
      response) is treated as an account problem: it is logged, a WeChat
      warning is sent, and the wrapper returns ``None`` without retrying.

    :param func: the download function to supervise.
    :return: the supervising wrapper, carrying ``func``'s metadata.
    """
    max_attempts = 3

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        last_error = None
        for _ in range(max_attempts):
            try:
                return func(*args, **kwargs)
            except requests.exceptions.RequestException as exc:
                last_error = exc
            except AssertionError:
                logger.error('账号异常')
                try:
                    send_wechat_warning('浙移集成|访问失败|账号异常')
                except Exception:
                    # The alert is a side channel: if it fails, the caller
                    # must still get the documented ``None`` result.
                    logger.exception('告警发送失败')
                return None

        # Reaching this point means every attempt raised RequestException.
        raise last_error

    return wrapper
|
|
|
+
|
|
|
+
|
|
|
def set_cookies(ck):
    """Replace the module-wide cookies used by the download functions.

    :param ck: cookies object passed unchanged as ``cookies=`` to ``requests``.
    """
    global _cookies
    _cookies = ck
|
|
|
+
|
|
|
+
|
|
|
def set_headers(h):
    """Replace the module-wide base headers used by the download functions.

    :param h: headers mapping passed as ``headers=`` to ``requests``.
    """
    global _headers
    _headers = h
|
|
|
+
|
|
|
+
|
|
|
def set_proxies(p):
    """Replace the module-wide proxy configuration.

    :param p: proxies mapping passed as ``proxies=`` to ``requests``, or
        ``None`` to send requests without a proxy.
    """
    global _proxies
    _proxies = p
|
|
|
+
|
|
|
+
|
|
|
def get_proxies(scheme=None):
    """Return the configured proxies, or the proxy address for one scheme.

    :param scheme: key of the proxies mapping (e.g. ``'http'``); when
        ``None`` the whole configured object is returned.
    :return: ``None`` when no proxies are configured or *scheme* has no
        entry; the full proxies object when *scheme* is ``None``; otherwise
        the entry for *scheme* with any ``'socks5://'`` text removed.
    """
    if _proxies is None:
        return None

    if scheme is None:
        return _proxies

    address = (_proxies or {}).get(scheme)
    if address is None:
        # No entry for this scheme: report "no proxy" instead of failing on
        # ``None.replace``.
        return None

    return address.replace('socks5://', '')
|
|
|
+
|
|
|
+
|
|
|
def _extract_cid(href):
    """Extract the content id (cid) from a detail-page URL.

    Two URL shapes are recognised:

    * URLs containing ``/zb/detail``: the text between the first and second
      ``_`` is taken, the first ``.html`` is removed, and if a ``-`` remains
      the segment after the first ``-`` is used.
    * Any other URL containing ``-``: the text after the last ``-`` up to
      the first ``.html``.

    Implemented with plain ``str`` methods (equivalent to the JavaScript
    ``extractCid`` logic), so no JavaScript runtime is needed.

    :param href: detail-page URL.
    :return: the cid string (possibly empty), or ``None`` when *href* is not
        a string or matches neither shape.
    """
    if not isinstance(href, str):
        return None

    if '/zb/detail' in href:
        pieces = href.split('_')
        if len(pieces) > 1:
            # Only the first '.html' is dropped, like JS String.replace with
            # a string pattern.
            cid = pieces[1].replace('.html', '', 1)
            if '-' in cid:
                cid = cid.split('-')[1]
            return cid

    if '-' in href:
        tail = href.rsplit('-', 1)[1]
        return tail.split('.html')[0]

    return None
|
|
|
+
|
|
|
+
|
|
|
def _extract_referer(href, cid):
    """Build the VIP detail-page URL used as ``Referer`` for *cid*.

    Requests the ``getcatid`` endpoint for *cid*, reads the category id from
    the ``data`` member of the JSON reply and picks the page name from it:
    ``202`` and ``101`` map to ``projectDetail.html``; every other value
    (including ``301``, ``601``, ``201``) maps to ``tenderDetail.html``.

    :param href: original page URL; sent as the ``Referer`` of the lookup
        request after replacing ``http:`` with ``https:``.
    :param cid: content id string.
    :return: ``'https://detail.vip.qianlima.com/<page>?id=<cid>'``.
    :raises AssertionError: when the lookup does not answer with HTTP 200.
    :raises requests.exceptions.RequestException: on network failure.
    """
    href = str(href).replace('http:', 'https:')

    url = 'https://www.qianlima.com/website-seo/v2/cm/getcatid/' + cid
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': href,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    response = requests.get(
        url,
        timeout=10,
        headers=headers,
        cookies=_cookies,
        proxies=_proxies,
    )
    if response.status_code != 200:
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O``; the exception type is kept as AssertionError.
        raise AssertionError(f'unexpected status code: {response.status_code}')

    payload = response.json()
    cat_id = payload.get('data') if isinstance(payload, dict) else None

    if cat_id in (202, 101):
        page_name = 'projectDetail.html'
    else:
        page_name = 'tenderDetail.html'

    return 'https://detail.vip.qianlima.com/' + page_name + '?id=' + cid
|
|
|
+
|
|
|
+
|
|
|
def _download_detail(href, referer=False, timeout=10):
    """Download the detail record behind a detail-page URL.

    :param href: detail-page URL; its cid is obtained via ``_extract_cid``.
    :param referer: when truthy, a ``Referer`` header built by
        ``_extract_referer`` is added to the request.
    :param timeout: timeout in seconds passed to ``requests.post``.
    :return: the ``data`` member of the JSON reply; a falsy value is
        returned as-is after logging a warning.
    :raises ValueError: when no cid can be extracted from *href*.
    :raises AssertionError: when the server does not answer with HTTP 200.
    :raises requests.exceptions.RequestException: on network failure.
    """
    cid = _extract_cid(href)
    if not cid:
        raise ValueError('cid is not exist')

    # Work on a copy so adding the Referer never leaks into the shared
    # module-level headers.
    headers = copy.deepcopy(_headers)
    if referer:
        headers['Referer'] = _extract_referer(href, cid)

    url = 'https://detail.vip.qianlima.com/rest/detail/alltypesdetail/detail/' + cid
    response = requests.post(
        url,
        timeout=timeout,
        headers=headers,
        cookies=_cookies,
        proxies=_proxies,
    )
    if response.status_code != 200:
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O``; the exception type is kept as AssertionError.
        raise AssertionError(f'unexpected status code: {response.status_code}')

    result = response.json()
    data = result['data']
    if not data:
        logger.warning(f'下载异常|{result}')
        return data

    logger.info(f'下载成功|{href}')
    return data
|
|
|
+
|
|
|
+
|
|
|
@_account_supervision
def download_html(href, **kwargs):
    """Download a detail record and return its ``content`` field.

    :param href: detail-page URL.
    :param kwargs: forwarded to ``_download_detail`` (``referer``,
        ``timeout``).
    :return: the ``content`` value of the detail record, or ``None`` when
        the record is empty.
    """
    detail = _download_detail(href, **kwargs)
    if not detail:
        return None

    return detail['content']
|
|
|
+
|
|
|
+
|
|
|
@_account_supervision
def download_json(href, **kwargs):
    """Download a detail record and return it as parsed JSON data.

    :param href: detail-page URL.
    :param kwargs: forwarded to ``_download_detail``; ``timeout`` defaults
        to 30 seconds here but may be overridden by the caller.
    :return: the detail record; ``False`` when the download yields ``None``;
        ``None`` when the record carries code ``700053``.
    """
    # setdefault instead of a hard-coded keyword avoids "multiple values for
    # keyword argument 'timeout'" when the caller passes its own timeout.
    kwargs.setdefault('timeout', 30)
    result = _download_detail(href, **kwargs)
    if result is None:
        return False

    # Example of a revoked entry:
    # {"code":700053,"msg":"该条信息已被撤销,请重新检索","data":null}
    # NOTE(review): _download_detail returns only the reply's "data" member,
    # so this check fires only if that member itself has a "code" key —
    # confirm whether the full reply was meant to be inspected.
    if 'code' in result and result['code'] == 700053:
        logger.warning(f'检索失败|{result}')
        return None

    return result
|
|
|
+
|
|
|
+
|
|
|
@_account_supervision
def download_list(keywords, page, page_size, **kwargs):
    """Query the search endpoint and return one page of result rows.

    :param keywords: search keywords.
    :param page: page number to fetch.
    :param page_size: maximum number of rows per page.
    :param kwargs: optional ``begin_time`` / ``end_time``; each defaults to
        today's date formatted as ``YYYY-MM-DD``.
    :return: the ``data`` list nested inside the reply's ``data`` member, or
        ``None`` when the reply has no usable ``data`` member.
    :raises AssertionError: when the server does not answer with HTTP 200.
    :raises requests.exceptions.RequestException: on network failure.
    """
    today = datetime.now().strftime('%Y-%m-%d')
    begin_time = kwargs.get('begin_time') or today
    end_time = kwargs.get('end_time') or today

    url = 'https://search.vip.qianlima.com/rest/service/website/search/solr'
    data = {
        "keywords": keywords,  # search keywords
        "timeType": 4,  # custom time range
        "beginTime": begin_time,
        "endTime": end_time,
        "filtermode": 2,
        "searchMode": 0,
        "currentPage": page,  # page number
        "numPerPage": page_size,  # maximum rows per page
        "sortType": 6,
        "allType": -1,
        "noticeSegmentTypeStr": "",
        "beginAmount": "",
        "endAmount": "",
        "purchasingUnitIdList": "",
        "threeClassifyTagStr": "",
        "fourLevelCategoryIdListStr": "",
        "threeLevelCategoryIdListStr": "",
        "levelId": "",
        "tab": 0,
        "searchDataType": 0,
        "types": "-1",
        "showContent": 1,
        "hasTenderTransferProject": 1,
        "newAreas": "",
        "hasChooseSortType": 1,
        "summaryType": 0,
    }
    response = requests.post(
        url,
        cookies=_cookies,
        headers=_headers,
        json=data,
        timeout=60,
        proxies=_proxies,
    )
    if response.status_code != 200:
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O``; the exception type is kept as AssertionError.
        raise AssertionError(f'unexpected status code: {response.status_code}')

    result = response.json()

    try:
        payload = result['data']
        # Probe for the row counter: a missing or non-mapping "data" member
        # means the search did not succeed.
        payload['rowCount']
    except (TypeError, KeyError):
        logger.error(f'下载失败|{keywords}|第{page}页|{result}')
        return None

    logger.debug(f'下载成功|{keywords}|第{page}页')
    return payload['data']
|
|
|
+
|
|
|
+
|
|
|
def send_wechat_warning(msg, send=True):
    """Send (or just log) a markdown warning about a collection failure.

    :param msg: detail text embedded in the warning.
    :param send: when falsy, the markdown is only written to the log and
        nothing is posted.
    :return: ``None``. Failures while posting or decoding the reply are
        logged rather than raised, so alerting never masks the original
        problem.
    """
    markdown = '千里马会员账号采集异常,请相关同事注意。'
    markdown += f'\n>异常详情:<font color="warning">**{msg}**</font>'

    if not send:
        logger.info(markdown)
        return None

    # NOTE(review): the webhook key is embedded in source; consider loading
    # it from configuration instead.
    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=bf53d666-bfa7-4176-b3e2-2d4b9d8a3bea'
    payload = {'msgtype': 'markdown', 'markdown': {'content': markdown}}
    try:
        response = requests.post(
            url,
            headers={'Content-Type': 'application/json'},
            json=payload,
            timeout=10,
        )
        logger.info(response.json())
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers a non-JSON reply on requests versions whose
        # JSON decode error is not a RequestException.
        logger.exception('告警发送失败')
    return None
|