# -*- coding: utf-8 -*-
"""
Created on 2024-11-06
---------
@summary:
---------
@author: Dzr
"""
import requests

from clean_html import cleaner
from font_tool import parse_font, download_font
from jy_ocr import dg_ocr_image_extract
from log import logger
from utils import extract_list, extract_detail_html


def get_proxy(scheme=None, socks5h=False, retry=5, default=None, missing_ok=True):
    """Fetch a proxy mapping from the proxy service.

    Args:
        scheme: When given, return only the proxy registered under this key
            (for example ``'http'``) instead of the whole mapping.
        socks5h: When true, rewrite the ``'http'`` proxy URL from the
            ``socks5://`` scheme to ``socks5h://`` and use it for both
            ``'http'`` and ``'https'``.
        retry: Maximum number of attempts made against the proxy service.
        default: Value returned when no proxy is available.
        missing_ok: When true, failed attempts are retried and finally
            ignored; when false, the first failure is re-raised immediately.

    Returns:
        The proxy mapping, the single entry selected by ``scheme``, or
        ``default`` when nothing usable was returned.

    Raises:
        requests.RequestException, ValueError: On the first failed attempt
            when ``missing_ok`` is false.
    """
    url = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
    # NOTE(review): hard-coded credential; consider loading it from configuration.
    headers = {'Authorization': 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'}

    def _fetch_proxy():
        """Return the decoded JSON payload, or ``{}`` once all attempts failed."""
        for _ in range(retry):
            try:
                return requests.get(url, headers=headers, timeout=15).json()
            except (requests.RequestException, ValueError):
                # ValueError covers a non-JSON body on requests versions whose
                # JSON decode error is not a RequestException subclass.
                if not missing_ok:
                    raise
        return {}

    proxies = _fetch_proxy().get('data')
    if not proxies:
        return default

    if socks5h:
        proxy_item = proxies.get('http')
        if not proxy_item:
            # Nothing to rewrite; treat it the same as "no proxy available".
            return default
        # Rewrite only the scheme prefix so an address that already uses
        # socks5h:// is left untouched instead of becoming socks5hh://.
        proxy_item = proxy_item.replace('socks5://', 'socks5h://', 1)
        proxies = {'http': proxy_item, 'https': proxy_item}

    if not scheme:
        return proxies
    return proxies.get(scheme, default)


def fetch(url, headers, **kwargs):
    """Send a GET request and mark the response text encoding as gb2312.

    Args:
        url: Address to request.
        headers: Request headers.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``.

    Returns:
        The ``requests`` response object.
    """
    response = requests.get(url, headers=headers, **kwargs)
    response.encoding = 'gb2312'
    return response


def download(url, headers, proxies=None, **kwargs):
    """Download a page together with its parsed dynamic font.

    Args:
        url: Address of the page.
        headers: Request headers.
        proxies: Optional proxy mapping handed to ``requests``.
        **kwargs: Extra keyword arguments forwarded to :func:`fetch`.

    Returns:
        A ``(html, font)`` tuple on success, or ``False`` when the request
        fails, the body cannot be decoded as gbk, or the font download
        raises ``ValueError``.
    """
    try:
        response = fetch(url, headers, timeout=60, proxies=proxies, **kwargs)
    except requests.RequestException as e:
        logger.error(e)
        return False

    try:
        html = response.content.decode('gbk')
    except UnicodeDecodeError as e:
        # A malformed body is reported like any other failed download rather
        # than propagating an unexpected exception type to the callers.
        logger.error(e)
        return False

    try:
        font_file = download_font(html=html, to_local=True)  # download the dynamic font
    except ValueError:
        return False

    ft = parse_font(font_file, ocr_extract=dg_ocr_image_extract)
    return html, ft


def _browser_headers(accept_language):
    """Build the browser-like request headers shared by the page downloaders."""
    return {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': accept_language,
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'priority': 'u=0, i',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    }


def download_list(url, proxies=None, **kwargs):
    """Download a list page and extract its entries.

    Args:
        url: Address of the list page.
        proxies: Optional proxy mapping handed to ``requests``.
        **kwargs: Extra keyword arguments forwarded to :func:`download`.

    Returns:
        The result of ``extract_list`` for the page, or ``None`` when the
        download failed.
    """
    headers = _browser_headers('zh-CN,zh;q=0.9')
    rets = download(url, headers, proxies=proxies, **kwargs)
    if not rets:
        return None

    html, ft = rets
    return extract_list(html, ft.font_maps)


def download_detail(url, proxies=None, missing_ok=True, **kwargs):
    """Download a detail page and extract its content.

    Args:
        url: Address of the detail page.
        proxies: Optional proxy mapping handed to ``requests``.
        missing_ok: When true, a ``ValueError`` raised during extraction or
            cleaning is swallowed and ``None`` is returned; when false it is
            re-raised.
        **kwargs: Extra keyword arguments forwarded to :func:`download`.

    Returns:
        A dict with ``'contenthtml'`` (the extracted HTML) and ``'detail'``
        (the same HTML passed through ``cleaner``), or ``None`` when the
        download failed or extraction failed with ``missing_ok`` set.

    Raises:
        ValueError: From extraction or cleaning when ``missing_ok`` is false.
    """
    headers = _browser_headers('zh-CN,zh;q=0.9,en;q=0.8')
    rets = download(url, headers, proxies=proxies, **kwargs)
    if not rets:
        return None

    html, ft = rets
    try:
        contenthtml = extract_detail_html(html, ft.font_maps)
        return {
            'contenthtml': contenthtml,
            'detail': cleaner(contenthtml),
        }
    except ValueError:
        if not missing_ok:
            raise
    return None


def send_wechat_warning(msg, send=True):
    """Report a crawl failure through the WeCom (WeChat Work) webhook.

    Args:
        msg: Failure detail embedded in the markdown message.
        send: When false, the message is only written to the log and no
            request is made.
    """
    markdown = '采集异常,请相关同事注意。'
    markdown += f'\n>异常详情:**{msg}**'
    if not send:
        logger.info(markdown)
        return

    # NOTE(review): hard-coded webhook key; consider loading it from configuration.
    url = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=860e3acc-4e5b-4b52-ac19-a49d6a1b5a69'
    headers = {'Content-Type': 'application/json'}
    json_data = {'msgtype': 'markdown', 'markdown': {'content': markdown}}
    request_params = dict(headers=headers, json=json_data, timeout=10)
    response = requests.post(url, **request_params)
    logger.info(response.json())