# -*- coding: utf-8 -*-
"""
Created on 2024-10-10
---------
@summary: Dedicated downloader for qianlima.com detail pages
---------
@author: Dzr
"""
import copy
import functools
import json
import execjs
import requests
from loguru import logger
# API source: rest/detail/alltypesdetail/detail
_cookies = {}  # cookie jar shared by every request; install via set_cookies()
_headers = {}  # base headers for the detail POST; install via set_headers()
_proxies = None  # requests-style proxies mapping; install via set_proxies()
  19. def router(func):
  20. @functools.wraps(func)
  21. def wrapper(*args, **kwargs):
  22. try:
  23. return func(*args, **kwargs)
  24. except AssertionError:
  25. logger.exception('账号异常')
  26. return False, {}
  27. except KeyboardInterrupt:
  28. pass
  29. except requests.exceptions.RequestException as e:
  30. logger.exception(f'网络请求错误, 原因:{e}')
  31. return False, {}
  32. return wrapper
  33. def set_cookies(ck):
  34. global _cookies
  35. _cookies = ck
  36. def set_headers(h):
  37. global _headers
  38. _headers = h
  39. def set_proxies(p):
  40. global _proxies
  41. _proxies = p
  42. def _extract_cid(href):
  43. script = '''
  44. function extractCid(url) {
  45. if(url.indexOf('/zb/detail') != -1){
  46. var cidArr = url.split('_');
  47. if (cidArr.length > 1) {
  48. var cid = cidArr[1].replace('.html', '');
  49. if (cid.indexOf('-') != -1) {
  50. cid = cid.split("-")[1];
  51. }
  52. return cid
  53. }
  54. }
  55. if (url.indexOf('-') != -1) {
  56. t = url.lastIndexOf("-")
  57. n = url.substring(t + 1)
  58. cid = n.split(".html")[0]
  59. return cid
  60. }
  61. }
  62. '''
  63. ctx = execjs.compile(script)
  64. result = ctx.call('extractCid', href)
  65. return result
  66. def _extract_referer(href, cid):
  67. global _cookies, _proxies
  68. href = str(href).replace('http:', 'https:')
  69. url = 'https://www.qianlima.com/website-seo/v2/cm/getcatid/' + cid
  70. headers = {
  71. 'Accept': '*/*',
  72. 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,sq;q=0.7',
  73. 'Cache-Control': 'no-cache',
  74. 'Connection': 'keep-alive',
  75. 'Pragma': 'no-cache',
  76. 'Referer': href,
  77. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
  78. 'X-Requested-With': 'XMLHttpRequest',
  79. }
  80. requests_params = dict(headers=headers, cookies=_cookies, proxies=_proxies)
  81. response = requests.get(url, timeout=10, **requests_params)
  82. assert response.status_code == 200
  83. text = response.content.decode()
  84. script = '''
  85. function extractDetailUrl(cid, dataStr) {
  86. var data = JSON.parse(dataStr)
  87. var catId = data.data;
  88. var pageName;
  89. switch (catId) {
  90. case 301:
  91. case 601:
  92. pageName = "tenderDetail.html";
  93. break;
  94. case 202:
  95. pageName = "projectDetail.html";
  96. break;
  97. case 201:
  98. pageName = "tenderDetail.html";
  99. break;
  100. case 101:
  101. pageName = "projectDetail.html";
  102. break;
  103. default:
  104. pageName = "tenderDetail.html";
  105. break;
  106. }
  107. return 'https://detail.vip.qianlima.com/' + pageName + '?id=' + cid;
  108. }
  109. '''
  110. ctx = execjs.compile(script)
  111. result = ctx.call('extractDetailUrl', cid, text)
  112. return result
  113. def _download_detail(href, referer=False):
  114. global _cookies, _headers, _proxies
  115. headers = copy.deepcopy(_headers)
  116. cid = _extract_cid(href)
  117. if not cid:
  118. raise ValueError('cid is not exist')
  119. url = 'https://detail.vip.qianlima.com/rest/detail/alltypesdetail/detail/' + cid
  120. if referer:
  121. referer = _extract_referer(href, cid)
  122. headers['Referer'] = referer
  123. requests_params = dict(headers=headers, cookies=_cookies, proxies=_proxies)
  124. try:
  125. response = requests.post(url, timeout=10, **requests_params)
  126. except requests.exceptions.Timeout:
  127. logger.error(f'采集失败|访问超时|{href}')
  128. return False, None # 账号额度不足时,返回:None
  129. username = _cookies['qlm_username']
  130. status_code = response.status_code
  131. if status_code != 200:
  132. result = response.content.decode()
  133. logger.error(f'采集失败|{username}|状态码|{status_code}|请求响应|{result}')
  134. return False, status_code
  135. result = response.json()
  136. data = result['data']
  137. if not data:
  138. logger.warning(f'数据异常|{result}')
  139. return False, data # 账号额度不足时,返回:None
  140. logger.info(f'采集成功[{href}]')
  141. return True, data
  142. @router
  143. def download_json(href, **kwargs):
  144. _, result = _download_detail(href, **kwargs)
  145. if not result:
  146. return False
  147. if isinstance(result, int):
  148. return result
  149. return result
  150. @router
  151. def download_html(href, **kwargs):
  152. _, result = _download_detail(href, **kwargs)
  153. if not result:
  154. return False
  155. return result['content']