|
@@ -0,0 +1,260 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+Created on 2024-10-31
|
|
|
+---------
|
|
|
+@summary: 解析图片文本
|
|
|
+---------
|
|
|
+@author: Dzr
|
|
|
+"""
|
|
|
+import io
|
|
|
+import pathlib
|
|
|
+import random
|
|
|
+import re
|
|
|
+import string
|
|
|
+from pathlib import Path
|
|
|
+from urllib.request import urlretrieve
|
|
|
+
|
|
|
+import numpy as np
|
|
|
+from PIL import Image, ImageOps
|
|
|
+from ddddocr import DdddOcr
|
|
|
+from fontTools.misc.transform import Offset
|
|
|
+from fontTools.pens.freetypePen import FreeTypePen # pip install freetype-py
|
|
|
+from fontTools.ttLib import TTFont
|
|
|
+
|
|
|
# Cache layout, created next to this module at import time:
#   cache/        - root of everything this module writes
#   cache/font/   - font files (see create_file / download_font)
#   cache/image/  - rendered glyph images
_root = Path(__file__).parent
_cache_dir = _root / 'cache'
_font_dir = _cache_dir / 'font'
_image_dir = _cache_dir / 'image'

# Parent first: the sub-directories are created without parents=True.
_cache_dir.mkdir(exist_ok=True)
_font_dir.mkdir(exist_ok=True)
_image_dir.mkdir(exist_ok=True)
|
|
|
+
|
|
|
+
|
|
|
def get_random(length=4):
    """Return a random string of ASCII letters and digits.

    Characters are drawn with replacement, so any non-negative ``length`` is
    accepted (drawing without replacement would fail once ``length`` exceeds
    the 62 available characters).

    @param length: number of characters to generate
    @return: the generated string
    """
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choices(alphabet, k=length))
|
|
|
+
|
|
|
+
|
|
|
def parse_font_url(html):
    """Extract the url of the 'icomoon' font from a page's source.

    @param html: page source containing a declaration of the form
                 ``'icomoon';src:url('<font url>')``
    @return: the url found inside ``url('...')``
    @raise ValueError: when no such declaration is present in ``html``
    """
    match = re.search(r"'icomoon';src:url\('(.*?)'\)", html, re.S)
    if match is None:
        # The match object is always None here, so interpolating it into the
        # message (as before) only ever produced the text "None".
        raise ValueError('未能从 html 中解析出字体库 url')

    return match.group(1)
|
|
|
+
|
|
|
+
|
|
|
def create_file(filename):
    """Make sure a file called ``filename`` exists in the font cache directory.

    An existing file is left as it is; otherwise an empty one is created.

    @param filename: name of the file inside the font cache directory
    @return: path of the file
    """
    target = _font_dir / filename
    target.touch(exist_ok=True)
    return target
|
|
|
+
|
|
|
+
|
|
|
def download_font(html, font_type='ttf', to_local=False):
    """Download the font referenced by ``html``.

    @param html: page source that declares the font url (see parse_font_url)
    @param font_type: file extension used for the downloaded file
    @param to_local: keep the downloaded file on disk and return its path;
                     otherwise return the file content and remove the file
    @return: path of the downloaded file when ``to_local`` is true, else the
             font content as bytes
    @raise ValueError: when no font url can be found in ``html``
    """
    # Resolve the url before touching the disk, so a page without a font
    # declaration does not leave an empty placeholder file behind.
    url = parse_font_url(html)
    target = create_file(f'{get_random(6)}.{font_type}')

    try:
        urlretrieve(url, filename=target)
    except Exception:
        # A failed download must not leave a stray (possibly partial) file.
        target.unlink(missing_ok=True)
        raise

    if to_local:
        return target

    try:
        return target.read_bytes()
    finally:
        # The file was only a staging area for the bytes being returned.
        target.unlink(missing_ok=True)
|
|
|
+
|
|
|
+
|
|
|
def image_to_bytes(image, filetype='JPEG'):
    """Encode ``image`` in memory and return the encoded bytes.

    @param image: object exposing ``save(stream, format=...)``
                  (presumably a PIL image - confirm against callers)
    @param filetype: format name handed to ``image.save``
    @return: everything ``image.save`` wrote to the in-memory stream
    """
    with io.BytesIO() as buffer:
        image.save(buffer, format=filetype)
        return buffer.getvalue()
|
|
|
+
|
|
|
+
|
|
|
class ImageToText:
    """Build a mapping from a font's code points to glyph names and text.

    ``parse_font()`` fills ``font_maps`` with one entry per code point of the
    font's cmap, keyed by an HTML-entity style string (``'&#x100c4'``). When
    OCR is enabled every glyph is additionally rendered to an image and the
    recognised text is stored under the entry's ``'zh'`` key.

    Instances can be used as context managers; leaving the ``with`` block
    runs the same cleanup as garbage collection does.
    """

    def __init__(self, file, cache=False, ocr=False, callback=None, image_scale=5, auto_delete=True):
        """
        @param file: font content as bytes, or a path (str / pathlib path) to a font file
        @param cache: write the rendered glyph images to the local image
                      directory instead of keeping them in memory
        @param ocr: recognise the rendered glyph images during parse_font()
        @param image_scale: divisor applied to the size of each rendered glyph image
        @param callback: callable that turns a glyph image into text. It
                         receives ``(image_name, image_bytes, 'jpg')`` for
                         in-memory images and the image path for cached ones.
                         Used only when ``ocr`` is enabled; when it is missing
                         or not callable a DdddOcr based recogniser is used.
        @param auto_delete: remove cached files on cleanup (forced off when
                            ``cache`` is enabled)
        @raise TypeError: when ``file`` is neither bytes, str nor a pathlib path
        """
        if not isinstance(file, (bytes, str, pathlib.PurePath)):
            raise TypeError("未知文件类型")

        # Raw bytes are wrapped in a stream; str and pathlib paths are passed through.
        self._font = TTFont(io.BytesIO(file) if isinstance(file, bytes) else file)

        # Entity code -> {'name': glyph name, 'code': hex code point, 'zh': text}
        self._font_maps = {}
        self._image_scale = image_scale

        # Cache settings. Flags are normalised to bool so truthy values such
        # as 1 behave exactly like True.
        self._cache_images = {}
        self._to_local = bool(cache)
        self._auto_delete = False if self._to_local else auto_delete

        # OCR settings
        self._enable_ocr = bool(ocr)
        self._callback = None
        if self._enable_ocr:
            if callable(callback):
                self._callback = callback
            else:
                self._callback = self._default_ocr_callback()

    @staticmethod
    def _default_ocr_callback():
        """Return a recogniser backed by a freshly created DdddOcr instance."""
        ddddocr = DdddOcr(beta=False, old=True, show_ad=False)

        def _classification(files):
            # In-memory images arrive as (name, bytes, type); anything else
            # is handed to DdddOcr unchanged.
            img = files[1] if isinstance(files, tuple) else files
            return ddddocr.classification(img)

        return _classification

    def to_xml(self):
        """Dump the font as XML next to the font file (same name, ``.xml`` suffix).

        NOTE(review): relies on the font reader's file object exposing a
        ``name``; a font loaded from raw bytes presumably has none - confirm.
        """
        filename = self._font.reader.file.name
        font_f = Path(filename).with_suffix('.xml')
        self._font.saveXML(font_f)

    @property
    def font_maps(self):
        """Mapping of entity code to glyph information (see parse_font)."""
        return self._font_maps

    def parse_font(self):
        """Populate ``font_maps``; with OCR enabled also render and recognise every glyph."""
        self._font_encode()

        if self._enable_ocr:
            self._font_draw()
            self._font_ocr()

    def _font_encode(self):
        """Create one ``font_maps`` entry per code point of the font's best cmap."""
        # NOTE(review): getBestCmap() presumably returns None when the font
        # has no usable cmap; treat that as "no glyphs" - confirm.
        cmap = self._font.getBestCmap() or {}
        for codepoint, name in cmap.items():
            hex_code = hex(codepoint)
            # Drop the leading '0' of '0x...': 0x100c4 => '&#x100c4'
            self._font_maps[f'&#{hex_code[1:]}'] = {'name': name, 'code': hex_code, 'zh': ''}

    def _font_draw(self):
        """Render every mapped glyph to a JPEG, kept in memory or saved to the image directory."""
        if not self._font_maps:
            return

        glyph_set = self._font.getGlyphSet()

        # The canvas height comes from the OS/2 table and is identical for
        # every glyph, so it is read once instead of once per glyph.
        os2_table = self._font['OS/2']
        ascender = os2_table.usWinAscent
        descender = -os2_table.usWinDescent
        height = ascender - descender

        for code, glyph_dict in self._font_maps.items():
            glyph = glyph_set[glyph_dict['name']]

            # Record the outline with a FreeType pen, then rasterise it.
            pen = FreeTypePen(None)
            glyph.draw(pen)
            bitmap = pen.array(
                width=glyph.width,
                height=height,
                # Shift the outline up so the part below the baseline fits on the canvas.
                transform=Offset(0, -descender),
                contain=False,
                evenOdd=False,
            )

            # Scale to 0..255 and invert the colours
            # (assumes the pen yields values in 0..1 - TODO confirm).
            pixels = 255 - np.array(bitmap) * 255

            image = Image.fromarray(pixels).convert("L")
            # White margin around the glyph.
            image = ImageOps.expand(image, border=6, fill=255)

            # Shrink the image by the configured factor.
            scaled_size = (
                image.width // self._image_scale,
                image.height // self._image_scale,
            )
            image = image.resize(scaled_size, resample=Image.Resampling.LANCZOS)

            image_name = f'{glyph_dict["code"]}.jpg'
            if self._to_local:
                image.save(_image_dir.joinpath(image_name))
            else:
                self._cache_images[code] = (image_name, image_to_bytes(image), 'jpg')

    def _font_ocr(self):
        """Run the recognition callback on every glyph image and store the text under ``'zh'``."""
        for code, glyph_dict in self._font_maps.items():
            if self._to_local:
                files = _image_dir.joinpath(f'{glyph_dict["code"]}.jpg')
            else:
                files = self._cache_images[code]

            glyph_dict['zh'] = self._callback(files)

    def __contains__(self, key):
        return key in self._font_maps

    def __getitem__(self, key):
        # A missing key raises KeyError(key) straight from the dict lookup.
        return self._font_maps[key]

    def get(self, key, default=None):
        """Return the entry for ``key``, or ``default`` when there is none."""
        try:
            return self[key]
        except KeyError:
            return default

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Best-effort cleanup; exceptions raised inside the block are not suppressed.
        self._del(missing_ok=True)

    def _del(self, missing_ok=False):
        """Delete cached files when auto-delete is on.

        NOTE(review): this empties the shared image and font directories, so
        files written by other instances or by download_font(to_local=True)
        are removed as well - confirm this is intended.

        @param missing_ok: swallow filesystem errors instead of raising them
        @raise OSError: on a filesystem error when ``missing_ok`` is false
        """
        # getattr: cleanup also runs (via __del__) for instances whose
        # __init__ failed before the flag was assigned.
        if not getattr(self, '_auto_delete', False):
            return

        try:
            for directory in (_image_dir, _font_dir):
                for cached_file in directory.iterdir():
                    cached_file.unlink(missing_ok=True)

            # The sub-directories are left in place, so this only succeeds
            # when the cache directory is otherwise empty.
            _cache_dir.rmdir()
        except OSError:
            if not missing_ok:
                raise

    def __del__(self):
        self._del(missing_ok=True)


# Alternative public name for the same class.
FontTranslator = ImageToText
|
|
|
+
|
|
|
+
|
|
|
def parse_font(font_file, *, ocr=False, ocr_extract=None, **kwargs):
    """Create an ImageToText for ``font_file``, parse it and return it.

    @param font_file: font passed to ImageToText as ``file``
    @param ocr: enable OCR; switched on automatically when ``ocr_extract`` is callable
    @param ocr_extract: recognition callback passed to ImageToText as ``callback``
    @param kwargs: remaining keyword arguments forwarded to ImageToText
    @return: the ImageToText instance after ``parse_font()`` has run
    """
    if callable(ocr_extract):
        # Supplying a recogniser implies the caller wants OCR.
        ocr = True

    translator = ImageToText(font_file, ocr=ocr, callback=ocr_extract, **kwargs)
    translator.parse_font()
    return translator
|