# -*- coding: utf-8 -*-
"""
Created on 2024-10-10
---------
@summary: 千里马详情页专用下载器
---------
"""
import copy
import functools
from datetime import datetime

import execjs
import requests

from rgg.log import logger

# Module-wide request state; configure through the set_* helpers below
# before calling any download_* function.
_cookies = {}
_headers = {}
_proxies = None
- def _account_supervision(func):
- @functools.wraps(func)
- def wrapper(*args, **kwargs):
- err = None
- for _ in range(3):
- try:
- return func(*args, **kwargs)
- except requests.exceptions.RequestException as e:
- err = e
- except AssertionError:
- logger.error('账号异常')
- send_wechat_warning('浙移集成|访问失败|账号异常')
- return
- if err is not None:
- # logger.exception(f'账号异常,原因:{err}')
- raise err
- return wrapper
def set_cookies(ck):
    """Install the cookie jar used by every request in this module."""
    global _cookies
    _cookies = ck
def set_headers(h):
    """Install the base headers used by the detail/list downloads."""
    global _headers
    _headers = h
def set_proxies(p):
    """Install the proxy mapping passed to every request (or None)."""
    global _proxies
    _proxies = p
def get_proxies(scheme=None):
    """Return the configured proxies, or a single scheme's address.

    :param scheme: optional mapping key (e.g. ``'http'`` / ``'https'``);
        when given, only that entry is returned, with any leading
        ``'socks5://'`` prefix stripped.
    :return: ``None`` when no proxies are configured or the scheme is
        missing; the whole mapping when ``scheme`` is None; otherwise the
        cleaned proxy address string.
    """
    if _proxies is None:
        return None
    if scheme is None:
        return _proxies
    address = _proxies.get(scheme)
    if address is None:
        # Fix: previously `.get(scheme).replace(...)` raised
        # AttributeError when the scheme was not configured.
        return None
    return address.replace('socks5://', '')
- def _extract_cid(href):
- script = '''
- function extractCid(url) {
- if(url.indexOf('/zb/detail') != -1){
- var cidArr = url.split('_');
- if (cidArr.length > 1) {
- var cid = cidArr[1].replace('.html', '');
- if (cid.indexOf('-') != -1) {
- cid = cid.split("-")[1];
- }
- return cid
- }
- }
-
- if (url.indexOf('-') != -1) {
- t = url.lastIndexOf("-")
- n = url.substring(t + 1)
- cid = n.split(".html")[0]
- return cid
- }
-
- }
- '''
- ctx = execjs.compile(script)
- result = ctx.call('extractCid', href)
- return result
def _extract_referer(href, cid):
    """Resolve the vip detail-page URL used as the Referer for *cid*.

    Queries the SEO endpoint for the article's category id, then maps the
    category onto the matching vip detail page.  The category→page mapping
    is a pure-Python port of the JavaScript switch the original evaluated
    through ``execjs`` (and ``JSON.parse`` is replaced by
    ``response.json()``).

    :param href: public article URL (sent as Referer to the SEO endpoint).
    :param cid: article cid extracted from ``href``.
    :return: the detail.vip.qianlima.com URL for this article.
    :raises AssertionError: when the SEO endpoint does not answer 200.
    """
    href = str(href).replace('http:', 'https:')
    url = 'https://www.qianlima.com/website-seo/v2/cm/getcatid/' + cid
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': href,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    requests_params = dict(headers=headers, cookies=_cookies, proxies=_proxies)
    response = requests.get(url, timeout=10, **requests_params)
    assert response.status_code == 200
    cat_id = response.json().get('data')
    # Categories 202/101 map to the project page; everything else
    # (301/601/201 and unknown ids) falls back to the tender page,
    # exactly as the old JS switch did.
    page_name = 'projectDetail.html' if cat_id in (202, 101) else 'tenderDetail.html'
    return 'https://detail.vip.qianlima.com/' + page_name + '?id=' + cid
def _download_detail(href, referer=False, timeout=10):
    """Fetch the raw detail payload for a qianlima article.

    :param href: public article URL; the cid is parsed out of it.
    :param referer: when truthy, resolve and send a vip detail Referer.
    :param timeout: request timeout in seconds.
    :return: the ``data`` field of the JSON response (may be falsy).
    :raises ValueError: when no cid can be extracted from ``href``.
    :raises AssertionError: when the endpoint does not answer 200.
    """
    cid = _extract_cid(href)
    if not cid:
        raise ValueError('cid is not exist')
    headers = copy.deepcopy(_headers)
    if referer:
        headers['Referer'] = _extract_referer(href, cid)
    url = 'https://detail.vip.qianlima.com/rest/detail/alltypesdetail/detail/' + cid
    response = requests.post(
        url,
        timeout=timeout,
        headers=headers,
        cookies=_cookies,
        proxies=_proxies,
    )
    assert response.status_code == 200
    result = response.json()
    data = result['data']
    if not data:
        logger.warning(f'下载异常|{result}')
    else:
        logger.info(f'下载成功|{href}')
    return data
@_account_supervision
def download_html(href, **kwargs):
    """Download an article and return its HTML content, or None."""
    data = _download_detail(href, **kwargs)
    return data['content'] if data else None
@_account_supervision
def download_json(href, **kwargs):
    """Download an article's raw JSON payload.

    :return: ``False`` when the download yielded no data at all, ``None``
        when the article was withdrawn (code 700053), otherwise the payload.
    """
    result = _download_detail(href, timeout=30, **kwargs)
    if result is None:
        return False
    # Withdrawn articles answer with:
    # {"code":700053,"msg":"该条信息已被撤销,请重新检索","data":null}
    if 'code' in result and result['code'] == 700053:
        logger.warning(f'检索失败|{result}')
        return
    return result
@_account_supervision
def download_list(keywords, page, page_size, **kwargs):
    """Search the vip list endpoint and return one page of results.

    :param keywords: title search terms.
    :param page: 1-based page number.
    :param page_size: maximum items per page.
    :param kwargs: optional ``begin_time`` / ``end_time`` strings
        ('%Y-%m-%d'); both default to today.
    :return: the list of result rows, or ``None`` on failure / empty data.
    :raises AssertionError: when the endpoint does not answer 200.
    """
    today = datetime.now().strftime('%Y-%m-%d')
    begin_time = kwargs.get('begin_time') or today
    end_time = kwargs.get('end_time') or today
    url = 'https://search.vip.qianlima.com/rest/service/website/search/solr'
    data = {
        "keywords": keywords,  # 检索标题
        "timeType": 4,  # 自定义时间类型
        "beginTime": begin_time,
        "endTime": end_time,
        "filtermode": 2,
        "searchMode": 0,
        "currentPage": page,  # 页码
        "numPerPage": page_size,  # 每页最大条目数
        "sortType": 6,
        "allType": -1,
        "noticeSegmentTypeStr": "",
        "beginAmount": "",
        "endAmount": "",
        "purchasingUnitIdList": "",
        "threeClassifyTagStr": "",
        "fourLevelCategoryIdListStr": "",
        "threeLevelCategoryIdListStr": "",
        "levelId": "",
        "tab": 0,
        "searchDataType": 0,
        "types": "-1",
        "showContent": 1,
        "hasTenderTransferProject": 1,
        "newAreas": "",
        "hasChooseSortType": 1,
        "summaryType": 0
    }
    response = requests.post(
        url,
        cookies=_cookies,
        headers=_headers,
        json=data,
        timeout=60,
        proxies=_proxies
    )
    assert response.status_code == 200
    result = response.json()
    try:
        # Probe the expected shape: 'data' may be null (TypeError) or the
        # keys may be absent entirely (KeyError — previously uncaught and
        # leaked out of the retry decorator).
        result['data']['rowCount']
    except (TypeError, KeyError):
        logger.error(f'下载失败|{keywords}|第{page}页|{result}')
        return
    if not result['data']:
        logger.warning(f'数据异常|{keywords}|第{page}页|{result}')
        return
    logger.debug(f'下载成功|{keywords}|第{page}页')
    return result['data']['data']
def send_wechat_warning(msg, send=True):
    """Push an account-anomaly alert to the WeChat-Work webhook.

    :param msg: anomaly detail, embedded in the markdown body.
    :param send: when False, only log the composed message locally.
    """
    # No placeholders here, so no f-string needed (the original had a
    # pointless f prefix).
    markdown = '千里马会员账号采集异常,请相关同事注意。'
    markdown += f'\n>异常详情:<font color=\"warning\">**{msg}**</font>'
    if not send:
        logger.info(markdown)
        return
    # NOTE(review): webhook key is hard-coded in source; consider moving
    # it to configuration/secrets management.
    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=bf53d666-bfa7-4176-b3e2-2d4b9d8a3bea'
    headers_ = {'Content-Type': 'application/json'}
    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown}}
    request_params = dict(headers=headers_, json=json_data, timeout=10)
    response = requests.post(url, **request_params)
    logger.info(response.json())
|