font_tool.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on 2024-10-31
  4. ---------
  5. @summary: Parse text from images
  6. ---------
  7. @author: Dzr
  8. """
  9. import io
  10. import pathlib
  11. import random
  12. import re
  13. import string
  14. from pathlib import Path
  15. from urllib.request import urlretrieve
  16. import numpy as np
  17. from PIL import Image, ImageOps
  18. from ddddocr import DdddOcr
  19. from fontTools.misc.transform import Offset
  20. from fontTools.pens.freetypePen import FreeTypePen # pip install freetype-py
  21. from fontTools.ttLib import TTFont
  22. _root = Path(__file__).parent
  23. _cache_dir = _root.joinpath('cache')
  24. _cache_dir.mkdir(exist_ok=True)
  25. _font_dir = _cache_dir.joinpath('font')
  26. _font_dir.mkdir(exist_ok=True)
  27. _image_dir = _cache_dir.joinpath('image')
  28. _image_dir.mkdir(exist_ok=True)
  29. def get_random(length=4):
  30. return ''.join(random.sample(string.ascii_letters + string.digits, length))
  31. def parse_font_url(html):
  32. result = re.search(r"'icomoon';src:url\('(.*?)'\)", html, re.S)
  33. if result is None:
  34. raise ValueError(f'字体库 url "{result}" ')
  35. return result.group(1)
  36. def create_file(filename):
  37. file = _font_dir.joinpath(filename)
  38. file.touch(exist_ok=True)
  39. return file
  40. def download_font(html, font_type='ttf', to_local=False):
  41. filename = f'{get_random(6)}.{font_type}'
  42. tmp = create_file(filename)
  43. url = parse_font_url(html)
  44. urlretrieve(url, filename=tmp)
  45. if not to_local:
  46. file_bytes = tmp.read_bytes()
  47. tmp.unlink(missing_ok=True)
  48. tmp = file_bytes
  49. return tmp
  50. def image_to_bytes(image, filetype='JPEG'):
  51. byte_stream = io.BytesIO()
  52. image.save(byte_stream, format=filetype)
  53. byte_array = byte_stream.getvalue()
  54. return byte_array
  55. class ImageToText:
  56. def __init__(self, file, cache=False, ocr=False, callback=None, image_scale=5, auto_delete=True):
  57. """
  58. @param file: 字体文件
  59. @param cache: 缓存字体图片到本地磁盘
  60. @param ocr: 图片识别启用Ocr
  61. @param image_scale: 图片缩放倍数
  62. @param callback: 图片文本识别处理的回调函数
  63. @param auto_delete: 自动清除字体图片
  64. """
  65. if not isinstance(file, (bytes, str, pathlib.PurePath)):
  66. raise TypeError("未知文件类型")
  67. if isinstance(file, bytes):
  68. self._font = TTFont(io.BytesIO(file))
  69. elif isinstance(file, str):
  70. self._font = TTFont(file)
  71. else:
  72. assert isinstance(file, pathlib.PurePath)
  73. self._font = TTFont(file)
  74. # 字体图片映射关系
  75. self._font_maps = {}
  76. self._image_scale = image_scale
  77. # 缓存
  78. self._cache_images = {}
  79. self._to_local = cache
  80. self._auto_delete = False if cache is True else auto_delete
  81. # Ocr
  82. self._callback = None
  83. self._enable_ocr = ocr
  84. if ocr is True:
  85. if callback is not None and callable(callback):
  86. self._callback = callback
  87. else:
  88. ddddocr = DdddOcr(beta=False, old=True, show_ad=False)
  89. def _classification(files):
  90. if isinstance(files, tuple):
  91. img = files[1]
  92. else:
  93. img = files
  94. return ddddocr.classification(img)
  95. self._callback = _classification
  96. def to_xml(self):
  97. filename = self._font.reader.file.name
  98. font_f = Path(filename).with_suffix('.xml')
  99. self._font.saveXML(font_f)
  100. @property
  101. def font_maps(self):
  102. return self._font_maps
  103. def parse_font(self):
  104. self._font_encode()
  105. if self._enable_ocr:
  106. self._font_draw()
  107. self._font_ocr()
  108. def _font_encode(self):
  109. for unicode, name in self._font.getBestCmap().items():
  110. code = f'&#{str(hex(unicode))[1:]}' # 0x100c4 => &#x100c4
  111. glyph = {'name': name, 'code': hex(unicode), 'zh': ''}
  112. self._font_maps[code] = glyph
  113. # print(code, glyph)
  114. def _font_draw(self):
  115. glyph_set = self._font.getGlyphSet()
  116. for code, glyph_dict in self._font_maps.items():
  117. # print(code, glyph_dict)
  118. glyph = glyph_set[glyph_dict['name']] # 获取字形
  119. pen = FreeTypePen(None) # 创建变换笔(FreeTypePen)实例,绘制字形
  120. glyph.draw(pen) # 绘制字形
  121. # 获取字形的宽度,以及从字体文件的 OS/2 表中获取推荐的上升高度和下降高度,确定图像的高度
  122. width, ascender, descender = (
  123. glyph.width,
  124. self._font['OS/2'].usWinAscent,
  125. -self._font['OS/2'].usWinDescent,
  126. )
  127. height = ascender - descender
  128. # 创建图像并转换为数组
  129. single_font_image = pen.array(
  130. width=width,
  131. height=height,
  132. transform=Offset(0, -descender),
  133. contain=False,
  134. evenOdd=False,
  135. )
  136. # 转换为灰度图像数组
  137. single_font_image = np.array(single_font_image) * 255
  138. # 反转颜色(使得黑色变为白色,白色变为黑色)
  139. single_font_image = 255 - single_font_image
  140. # 创建 PIL 图像对象
  141. single_font_image = Image.fromarray(single_font_image)
  142. # 转换为灰度模式
  143. single_font_image = single_font_image.convert("L")
  144. # 图片添加边框
  145. single_font_image = ImageOps.expand(single_font_image, border=6, fill=255)
  146. # 计算新的宽度和高度
  147. new_width = single_font_image.width // self._image_scale
  148. new_height = single_font_image.height // self._image_scale
  149. # 调整图片大小
  150. single_font_image = single_font_image.resize(
  151. (new_width, new_height),
  152. resample=Image.Resampling.LANCZOS
  153. )
  154. image_name = f'{glyph_dict["code"]}.jpg'
  155. if not self._to_local:
  156. image_bytes = image_to_bytes(single_font_image)
  157. self._cache_images[code] = (image_name, image_bytes, 'jpg')
  158. else:
  159. single_font_image.save(_image_dir.joinpath(image_name)) # 保存图像
  160. def _font_ocr(self):
  161. for code, glyph_dict in dict(self._font_maps).items():
  162. if not self._to_local:
  163. files = self._cache_images[code]
  164. text = self._callback(files)
  165. else:
  166. files = _image_dir.joinpath(f'{glyph_dict["code"]}.jpg')
  167. text = self._callback(files)
  168. self._font_maps[code]['zh'] = text
  169. def __contains__(self, key):
  170. return key in self._font_maps
  171. def __getitem__(self, key):
  172. if key in self._font_maps:
  173. return self._font_maps[key]
  174. else:
  175. raise KeyError(key)
  176. def get(self, key, default=None):
  177. try:
  178. return self.__getitem__(key)
  179. except KeyError:
  180. return default
  181. def __enter__(self):
  182. return self
  183. def __exit__(self, exc_type, exc_val, exc_tb):
  184. self.__del__()
  185. return
  186. def _del(self, missing_ok=False):
  187. if self._auto_delete:
  188. for img_f in _image_dir.iterdir():
  189. img_f.unlink(missing_ok=True)
  190. for font_f in _font_dir.iterdir():
  191. font_f.unlink(missing_ok=True)
  192. try:
  193. # _image_dir.rmdir()
  194. # _font_dir.rmdir()
  195. _cache_dir.rmdir()
  196. except OSError as e:
  197. if not missing_ok:
  198. raise e
  199. def __del__(self):
  200. self._del(missing_ok=True)
  201. FontTranslator = ImageToText
  202. def parse_font(font_file, *, ocr=False, ocr_extract=None, **kwargs):
  203. ocr = True if ocr_extract is not None and callable(ocr_extract) else ocr
  204. translator = ImageToText(font_file, ocr=ocr, callback=ocr_extract, **kwargs)
  205. translator.parse_font()
  206. return translator