123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130 |
- # -*- coding: utf-8 -*-
- """
- Created on 2024-11-06
- ---------
- @summary:
- ---------
- @author: Dzr
- """
- import requests
- from clean_html import cleaner
- from font_tool import parse_font, download_font
- from jy_ocr import dg_ocr_image_extract
- from log import logger
- from utils import extract_list, extract_detail_html
def get_proxy(scheme=None, socks5h=False, retry=5, default=None, missing_ok=True):
    """Fetch a proxy mapping from the proxy-allocation endpoint.

    The endpoint's JSON reply is expected to carry the proxy mapping under
    the ``data`` key (presumably ``{'http': ..., 'https': ...}`` -- confirm
    against the service).

    Args:
        scheme: When truthy (e.g. ``'http'``), return only that entry of the
            mapping instead of the whole mapping.
        socks5h: When true, build the mapping from the ``http`` entry with
            its ``socks5://`` scheme rewritten to ``socks5h://``.
        retry: Maximum number of request attempts.
        default: Value returned when no proxy is available, or when
            ``scheme`` is not present in the mapping.
        missing_ok: When false, a failed request is re-raised immediately
            instead of being retried.

    Returns:
        The proxy mapping, the single entry selected by ``scheme``, or
        ``default``.

    Raises:
        requests.RequestException: If a request fails and ``missing_ok`` is
            false.
    """
    url = 'http://cc.spdata.jianyu360.com/crawl/proxy/socks5/fetch'
    headers = {'Authorization': 'Basic amlhbnl1MDAxOjEyM3F3ZSFB'}

    def _fetch_proxy():
        # Try up to `retry` times; fall back to an empty reply when every
        # attempt failed and failures are tolerated.
        for _ in range(retry):
            try:
                return requests.get(url, headers=headers, timeout=15).json()
            except requests.RequestException:
                if not missing_ok:
                    raise
        return {}

    proxies = _fetch_proxy().get('data')
    if not proxies:
        return default

    if socks5h:
        proxy_item = proxies.get('http')
        if not proxy_item:
            # No 'http' entry to derive the socks5h mapping from.
            return default
        # Rewrite only the leading scheme. A blanket str.replace would turn
        # an existing 'socks5h://' into 'socks5hh://' and would also touch
        # 'socks5' appearing inside credentials or the host name.
        plain_scheme = 'socks5://'
        if proxy_item.startswith(plain_scheme):
            proxy_item = 'socks5h://' + proxy_item[len(plain_scheme):]
        proxies = {'http': proxy_item, 'https': proxy_item}

    return proxies if not scheme else proxies.get(scheme, default)
def fetch(url, headers, **kwargs):
    """Send a GET request and mark the response text encoding as gb2312.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request.
        **kwargs: Passed through unchanged to ``requests.get``.

    Returns:
        The ``requests`` response object with ``encoding`` set to
        ``'gb2312'``.
    """
    resp = requests.get(url, headers=headers, **kwargs)
    # Only affects how `resp.text` is decoded; `resp.content` stays raw bytes.
    resp.encoding = 'gb2312'
    return resp
def download(url, headers, proxies=None, **kwargs):
    """Download a page and parse the dynamic font it references.

    Args:
        url: Page address.
        headers: HTTP headers sent with the request.
        proxies: Optional proxy mapping forwarded to the request.
        **kwargs: Extra request options forwarded to ``fetch``. ``timeout``
            defaults to 60 seconds and may be overridden by the caller.

    Returns:
        ``(html, ft)`` -- the page decoded as GBK and the object returned by
        ``parse_font`` -- or ``False`` when the request fails or
        ``download_font`` raises ``ValueError``.

    Raises:
        UnicodeDecodeError: If the response body is not valid GBK (the
            decode is strict and is not guarded here).
    """
    # Apply the default timeout without clashing with a caller-supplied one;
    # passing timeout= both explicitly and via **kwargs raises TypeError.
    kwargs.setdefault('timeout', 60)
    try:
        response = fetch(url, headers, proxies=proxies, **kwargs)
    except requests.RequestException as e:
        logger.error(e)
        return False

    html = response.content.decode('gbk')
    try:
        # Download the dynamic font referenced by the page.
        font_file = download_font(html=html, to_local=True)
    except ValueError:
        return False

    ft = parse_font(font_file, ocr_extract=dg_ocr_image_extract)
    return html, ft
def download_list(url, proxies=None, **kwargs):
    """Download a list page and extract its entries.

    Args:
        url: List page address.
        proxies: Optional proxy mapping forwarded to ``download``.
        **kwargs: Extra request options forwarded to ``download``.

    Returns:
        Whatever ``extract_list`` returns for the page, or ``None`` when
        ``download`` reports failure by returning ``False``.
    """
    request_headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'priority': 'u=0, i',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    }
    outcome = download(url, request_headers, proxies=proxies, **kwargs)
    if outcome is not False:
        page_html, font = outcome
        # The font's glyph mapping is handed to the extractor with the HTML.
        return extract_list(page_html, font.font_maps)
    return None
def download_detail(url, proxies=None, missing_ok=True, **kwargs):
    """Download a detail page and extract its content.

    Args:
        url: Detail page address.
        proxies: Optional proxy mapping forwarded to ``download``.
        missing_ok: When true, a ``ValueError`` raised during extraction or
            cleaning is swallowed and ``None`` is returned; when false it is
            re-raised.
        **kwargs: Extra request options forwarded to ``download``.

    Returns:
        A dict with ``contenthtml`` (result of ``extract_detail_html``) and
        ``detail`` (that result passed through ``cleaner``), or ``None`` when
        the download fails or extraction fails with ``missing_ok`` set.

    Raises:
        ValueError: From extraction or cleaning when ``missing_ok`` is false.
    """
    request_headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'priority': 'u=0, i',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    }
    outcome = download(url, request_headers, proxies=proxies, **kwargs)
    if not outcome:
        return None

    page_html, font = outcome
    try:
        content = extract_detail_html(page_html, font.font_maps)
        cleaned = cleaner(content)
    except ValueError as exc:
        if missing_ok:
            return None
        raise exc

    return {
        'contenthtml': content,
        'detail': cleaned,
    }
def send_wechat_warning(msg, send=True):
    """Report a crawl failure as a markdown message.

    Args:
        msg: Failure detail embedded in the message body.
        send: When true, POST the message to the ``qyapi.weixin.qq.com``
            webhook and log the JSON reply; when false, only log the
            message text.
    """
    markdown = ''.join((
        '采集异常,请相关同事注意。',
        f'\n>异常详情:<font color=\"warning\">**{msg}**</font>',
    ))

    if send:
        endpoint = 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=860e3acc-4e5b-4b52-ac19-a49d6a1b5a69'
        payload = {'msgtype': 'markdown', 'markdown': {'content': markdown}}
        response = requests.post(
            endpoint,
            headers={'Content-Type': 'application/json'},
            json=payload,
            timeout=10,
        )
        logger.info(response.json())
    else:
        # Dry run: record what would have been sent.
        logger.info(markdown)
|