net.py

# -*- coding: utf-8 -*-
"""
Created on 2024-10-10
---------
@summary: Dedicated downloader for Qianlima (千里马) detail pages
---------
"""
import copy
import functools
from datetime import datetime

import execjs
import requests

from rgg.log import logger

# Shared request state, primed through the set_* helpers below.
_cookies = {}
_headers = {}
_proxies = None


def _account_supervision(func):
    """Retry the wrapped request up to 3 times; alert and abort on account errors."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        err = None
        for _ in range(3):
            try:
                return func(*args, **kwargs)
            except requests.exceptions.RequestException as e:
                err = e
            except AssertionError:
                # A failed status assertion means the account is blocked or
                # logged out: alert once and stop retrying.
                logger.error('账号异常')
                send_wechat_warning('浙移集成|访问失败|账号异常')
                return
        if err is not None:
            # logger.exception(f'账号异常,原因:{err}')
            raise err
    return wrapper
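
# Behavior sketch for the decorator above: a decorated helper is retried up
# to 3 times on requests exceptions, while a failed status assertion aborts
# with a WeChat alert. Hypothetical example (not part of this module):
#
#   @_account_supervision
#   def _fetch(url):
#       response = requests.get(url, timeout=10, cookies=_cookies, proxies=_proxies)
#       assert response.status_code == 200
#       return response.text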


def set_cookies(ck):
    global _cookies
    _cookies = ck


def set_headers(h):
    global _headers
    _headers = h


def set_proxies(p):
    global _proxies
    _proxies = p


def get_proxies(scheme=None):
    if _proxies is None:
        return None
    if scheme is None:
        return _proxies
    # Fall back to an empty string so an unknown scheme cannot raise
    # AttributeError when stripping the socks5:// prefix.
    return (_proxies.get(scheme) or '').replace('socks5://', '')
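
# Configuration sketch: callers are expected to prime the module state via
# the setters above before any download_* call (placeholder values only):
#
#   set_cookies({'qlm_token': '...'})
#   set_headers({'User-Agent': 'Mozilla/5.0 ...'})
#   set_proxies({'https': 'socks5://127.0.0.1:1080'})  # optional
#   get_proxies('https')  # -> '127.0.0.1:1080'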


def _extract_cid(href):
    """Extract the content id (cid) from a detail-page URL, mirroring the site's JS."""
    script = '''
    function extractCid(url) {
        // "/zb/detail" URLs keep the cid after the last underscore.
        if (url.indexOf('/zb/detail') != -1) {
            var cidArr = url.split('_');
            if (cidArr.length > 1) {
                var cid = cidArr[1].replace('.html', '');
                if (cid.indexOf('-') != -1) {
                    cid = cid.split('-')[1];
                }
                return cid;
            }
        }
        // Other URLs carry the cid between the last "-" and ".html".
        if (url.indexOf('-') != -1) {
            var t = url.lastIndexOf('-');
            var n = url.substring(t + 1);
            return n.split('.html')[0];
        }
    }
    '''
    ctx = execjs.compile(script)
    return ctx.call('extractCid', href)
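
# Worked examples for _extract_cid (hypothetical URLs, for illustration):
#   'https://www.qianlima.com/zb/detail/20241010_123-456.html' -> '456'
#   'https://www.qianlima.com/xx/d-20241010-987654.html'       -> '987654'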


def _extract_referer(href, cid):
    """Resolve the vip detail-page URL (used as the Referer header) for a cid."""
    href = str(href).replace('http:', 'https:')
    url = 'https://www.qianlima.com/website-seo/v2/cm/getcatid/' + cid
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': href,
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    requests_params = dict(headers=headers, cookies=_cookies, proxies=_proxies)
    response = requests.get(url, timeout=10, **requests_params)
    assert response.status_code == 200
    text = response.content.decode()
    script = '''
    function extractDetailUrl(cid, dataStr) {
        var data = JSON.parse(dataStr);
        var catId = data.data;
        var pageName;
        switch (catId) {
            case 301:
            case 601:
            case 201:
                pageName = "tenderDetail.html";
                break;
            case 202:
            case 101:
                pageName = "projectDetail.html";
                break;
            default:
                pageName = "tenderDetail.html";
                break;
        }
        return 'https://detail.vip.qianlima.com/' + pageName + '?id=' + cid;
    }
    '''
    ctx = execjs.compile(script)
    return ctx.call('extractDetailUrl', cid, text)
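
# For instance, a cid whose catId resolves to 201 maps to
# 'https://detail.vip.qianlima.com/tenderDetail.html?id=<cid>', which
# _download_detail sends as the Referer header when referer=True.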


def _download_detail(href, referer=False, timeout=10):
    headers = copy.deepcopy(_headers)
    cid = _extract_cid(href)
    if not cid:
        raise ValueError('cid not found in href')
    url = 'https://detail.vip.qianlima.com/rest/detail/alltypesdetail/detail/' + cid
    if referer:
        # Swap the boolean flag for the resolved vip detail-page URL.
        headers['Referer'] = _extract_referer(href, cid)
    requests_params = dict(headers=headers, cookies=_cookies, proxies=_proxies)
    response = requests.post(url, timeout=timeout, **requests_params)
    assert response.status_code == 200
    result = response.json()
    data = result['data']
    if not data:
        logger.warning(f'下载异常|{result}')
    else:
        logger.info(f'下载成功|{href}')
    return data


@_account_supervision
def download_html(href, **kwargs):
    result = _download_detail(href, **kwargs)
    if not result:
        return None
    # The detail payload carries the page HTML under the 'content' key.
    return result['content']


@_account_supervision
def download_json(href, **kwargs):
    # Default to a longer timeout while still honouring a caller-supplied one.
    kwargs.setdefault('timeout', 30)
    result = _download_detail(href, **kwargs)
    if result is None:
        return False
    # Revoked notices come back as:
    # {"code":700053,"msg":"该条信息已被撤销,请重新检索","data":null}
    if result.get('code') == 700053:
        logger.warning(f'检索失败|{result}')
        return None
    return result
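
# Usage sketch for download_json (hypothetical URL; note the return contract:
# a dict payload on success, False when no payload came back, None when the
# notice was revoked or the account check aborted the call):
#
#   detail = download_json('https://www.qianlima.com/zb/detail/20241010_123456.html')
#   if detail:
#       print(detail.get('title'))  # assumes the payload carries a 'title' field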


@_account_supervision
def download_list(keywords, page, page_size, **kwargs):
    today = datetime.now().strftime('%Y-%m-%d')
    begin_time = kwargs.get('begin_time') or today
    end_time = kwargs.get('end_time') or today
    url = 'https://search.vip.qianlima.com/rest/service/website/search/solr'
    data = {
        "keywords": keywords,  # search terms matched against notice titles
        "timeType": 4,  # 4 = custom time range (beginTime/endTime)
        "beginTime": begin_time,
        "endTime": end_time,
        "filtermode": 2,
        "searchMode": 0,
        "currentPage": page,  # page number
        "numPerPage": page_size,  # maximum items per page
        "sortType": 6,
        "allType": -1,
        "noticeSegmentTypeStr": "",
        "beginAmount": "",
        "endAmount": "",
        "purchasingUnitIdList": "",
        "threeClassifyTagStr": "",
        "fourLevelCategoryIdListStr": "",
        "threeLevelCategoryIdListStr": "",
        "levelId": "",
        "tab": 0,
        "searchDataType": 0,
        "types": "-1",
        "showContent": 1,
        "hasTenderTransferProject": 1,
        "newAreas": "",
        "hasChooseSortType": 1,
        "summaryType": 0
    }
    response = requests.post(
        url,
        cookies=_cookies,
        headers=_headers,
        json=data,
        timeout=60,
        proxies=_proxies,
    )
    assert response.status_code == 200
    result = response.json()
    try:
        result['data']['rowCount']
    except (TypeError, KeyError):
        logger.error(f'下载失败|{keywords}|第{page}页|{result}')
        return None
    if not result['data']:
        logger.warning(f'数据异常|{keywords}|第{page}页|{result}')
        return None
    logger.debug(f'下载成功|{keywords}|第{page}页')
    return result['data']['data']
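
# Pagination sketch (assumed workflow; '信息化' is just an example keyword):
#
#   page = 1
#   while True:
#       rows = download_list('信息化', page, 100, begin_time='2024-10-01',
#                            end_time='2024-10-10')
#       if not rows:
#           break
#       for row in rows:
#           ...  # each row is one search hit from result['data']['data']
#       page += 1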


def send_wechat_warning(msg, send=True):
    """Push an alert to the WeChat Work webhook (log only when send=False)."""
    markdown = '千里马会员账号采集异常,请相关同事注意。'
    markdown += f'\n>异常详情:<font color="warning">**{msg}**</font>'
    if not send:
        logger.info(markdown)
        return
    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=bf53d666-bfa7-4176-b3e2-2d4b9d8a3bea'
    headers_ = {'Content-Type': 'application/json'}
    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown}}
    request_params = dict(headers=headers_, json=json_data, timeout=10)
    response = requests.post(url, **request_params)
    logger.info(response.json())
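

# Minimal end-to-end sketch (assumed workflow; the cookie and header values
# and the per-row 'url'/'title' fields are placeholders/assumptions, not
# guaranteed by this module):
if __name__ == '__main__':
    set_cookies({'qlm_token': 'PLACEHOLDER'})
    set_headers({'User-Agent': 'Mozilla/5.0'})
    rows = download_list('信息化', 1, 20) or []
    for row in rows:
        html = download_html(row['url'], referer=True)  # row schema assumed
        print(row.get('title'), html is not None)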