# -*- coding: utf-8 -*-
"""
Created on 2024-11-06
---------
@summary:
---------
@author: Dzr
"""
import requests

from clean_html import cleaner
from font_tool import parse_font, download_font
from jy_ocr import dg_ocr_image_extract
from log import logger
from utils import extract_list, extract_detail_html

  15. def get_proxy(scheme=None, socks5h=False, retry=5, default=None, missing_ok=True):
  16. url = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
  17. headers = {'Authorization': 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'}
  18. def _fetch_proxy():
  19. for _ in range(retry):
  20. try:
  21. return requests.get(url, headers=headers, timeout=15).json()
  22. except requests.RequestException as e:
  23. if not missing_ok:
  24. raise e
  25. return {}
  26. proxies = _fetch_proxy().get('data')
  27. if not proxies:
  28. return default
  29. if socks5h:
  30. proxy_item = proxies.get('http')
  31. proxies = {
  32. 'http': proxy_item.replace('socks5', 'socks5h'),
  33. 'https': proxy_item.replace('socks5', 'socks5h')
  34. }
  35. return proxies if not scheme else proxies.get(scheme, default)
  36. def fetch(url, headers, **kwargs):
  37. response = requests.get(url, headers=headers, **kwargs)
  38. # print(response)
  39. response.encoding = 'gb2312'
  40. return response
  41. def download(url, headers, proxies=None, **kwargs):
  42. try:
  43. response = fetch(url, headers, timeout=60, proxies=proxies, **kwargs)
  44. except requests.RequestException as e:
  45. # logger.error(f'网络访问|请求失败|{url}')
  46. # logger.exception(e)
  47. logger.error(e)
  48. return False
  49. html = response.content.decode('gbk')
  50. try:
  51. font_file = download_font(html=html, to_local=True) # 下载动态字体
  52. except ValueError:
  53. # logger.error(f'网络访问|请求失败|字体文件|{url}')
  54. return False
  55. ft = parse_font(font_file, ocr_extract=dg_ocr_image_extract)
  56. return html, ft
  57. def download_list(url, proxies=None, **kwargs):
  58. headers = {
  59. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  60. 'accept-language': 'zh-CN,zh;q=0.9',
  61. 'cache-control': 'no-cache',
  62. 'pragma': 'no-cache',
  63. 'priority': 'u=0, i',
  64. 'upgrade-insecure-requests': '1',
  65. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
  66. }
  67. rets = download(url, headers, proxies=proxies, **kwargs)
  68. if rets is False:
  69. return
  70. html, ft = rets
  71. return extract_list(html, ft.font_maps)
  72. def download_detail(url, proxies=None, missing_ok=True, **kwargs):
  73. headers = {
  74. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  75. 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
  76. 'cache-control': 'no-cache',
  77. 'pragma': 'no-cache',
  78. 'priority': 'u=0, i',
  79. 'upgrade-insecure-requests': '1',
  80. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
  81. }
  82. rets = download(url, headers, proxies=proxies, **kwargs)
  83. if not rets:
  84. return
  85. html, ft = rets
  86. try:
  87. contenthtml = extract_detail_html(html, ft.font_maps)
  88. return {
  89. 'contenthtml': contenthtml,
  90. 'detail': cleaner(contenthtml),
  91. }
  92. except ValueError as e:
  93. if not missing_ok:
  94. raise e
  95. def send_wechat_warning(msg, send=True):
  96. markdown = f'采集异常,请相关同事注意。'
  97. markdown += f'\n>异常详情:<font color=\"warning\">**{msg}**</font>'
  98. if not send:
  99. logger.info(markdown)
  100. return
  101. url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=860e3acc-4e5b-4b52-ac19-a49d6a1b5a69'
  102. headers = {'Content-Type': 'application/json'}
  103. json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown}}
  104. request_params = dict(headers=headers, json=json_data, timeout=10)
  105. response = requests.post(url, **request_params)
  106. logger.info(response.json())