# data_spider/match_spider
# -*- coding: utf-8 -*-
"""
Created on 2024-10-31 
---------
@summary:  解析图片文本
---------
@author: Dzr
"""
import io
import pathlib
import random
import re
import string
from pathlib import Path
from urllib.request import urlretrieve

import numpy as np
from PIL import Image, ImageOps
from ddddocr import DdddOcr
from fontTools.misc.transform import Offset
from fontTools.pens.freetypePen import FreeTypePen  # pip install freetype-py
from fontTools.ttLib import TTFont

# On-disk cache layout, created next to this module at import time:
#   cache/        root directory
#   cache/font/   font files (see create_file / download_font)
#   cache/image/  rendered glyph images
_root = Path(__file__).parent
_cache_dir = _root / 'cache'
_font_dir = _cache_dir / 'font'
_image_dir = _cache_dir / 'image'

# The parent is created first because mkdir() is called without parents=True.
_cache_dir.mkdir(exist_ok=True)
_font_dir.mkdir(exist_ok=True)
_image_dir.mkdir(exist_ok=True)


def get_random(length=4):
    """Return a random string of ASCII letters and digits.

    Characters are drawn with replacement, so any non-negative ``length`` is
    accepted (drawing without replacement would cap it at the 62 available
    characters). Not suitable for security-sensitive tokens.

    @param length: number of characters to generate
    @return: the generated string ('' when length is 0)
    @raise ValueError: length is negative
    """
    if length < 0:
        raise ValueError('length must be non-negative')

    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choices(alphabet, k=length))


def parse_font_url(html):
    """Extract the URL of the 'icomoon' font from a page's CSS.

    Looks for the first ``'icomoon';src:url('...')`` declaration in *html*.

    @param html: page source containing the font-face declaration
    @return: the URL found inside ``url('...')``
    @raise ValueError: no matching declaration is present
    """
    result = re.search(r"'icomoon';src:url\('(.*?)'\)", html, re.S)
    if result is None:
        # The match object is None here, so interpolating it would only
        # produce the misleading text 'url "None"'.
        raise ValueError('未找到字体库 url')

    return result.group(1)


def create_file(filename):
    """Make sure a file called *filename* exists in the font cache directory.

    An existing file is left in place; otherwise an empty one is created.

    @param filename: name of the file inside the font cache directory
    @return: path of the file
    """
    target = _font_dir / filename
    target.touch(exist_ok=True)
    return target


def download_font(html, font_type='ttf', to_local=False):
    """Download the font referenced in *html*.

    The font is always fetched into a randomly named file in the font cache
    directory. With ``to_local`` the file is kept and its path returned;
    otherwise its content is read, the file is removed, and the bytes are
    returned.

    @param html: page source containing the font URL (see parse_font_url)
    @param font_type: file extension used for the downloaded file
    @param to_local: keep the file on disk and return its path
    @return: path of the saved font when to_local is true, else the font bytes
    @raise ValueError: no font URL could be found in html
    """
    # Resolve the URL before touching the disk so a parse failure does not
    # leave an empty placeholder file behind.
    url = parse_font_url(html)
    tmp = create_file(f'{get_random(6)}.{font_type}')

    try:
        urlretrieve(url, filename=tmp)
    except Exception:
        # A failed download would otherwise leave an empty/partial font file.
        tmp.unlink(missing_ok=True)
        raise

    if to_local:
        return tmp

    try:
        return tmp.read_bytes()
    finally:
        # The caller only wants the bytes; never keep the temporary file.
        tmp.unlink(missing_ok=True)


def image_to_bytes(image, filetype='JPEG'):
    """Serialise *image* into memory and return the encoded bytes.

    @param image: object exposing ``save(stream, format=...)``
    @param filetype: format name passed through to ``image.save``
    @return: everything ``image.save`` wrote to the in-memory stream
    """
    with io.BytesIO() as buffer:
        image.save(buffer, format=filetype)
        return buffer.getvalue()


class ImageToText:
    """Map a font's code points to glyph names and, optionally, OCR text.

    ``parse_font()`` fills ``font_maps`` with one entry per code point in the
    font's best cmap, keyed by an HTML-style entity (e.g. ``'&#x100c4'``).
    When OCR is enabled each glyph is also rendered to a small greyscale
    JPEG and passed to the recognition callback; the result is stored under
    the entry's ``'zh'`` key.

    Instances support ``in``, ``[]`` and ``get()`` on the mapping and can be
    used as context managers. Clean-up (``_del``) removes every file in the
    module-level image and font cache directories, not only files created by
    this instance.
    """

    def __init__(self, file, cache=False, ocr=False, callback=None, image_scale=5, auto_delete=True):
        """

        @param file: font file, either raw bytes or a path (str / pathlib path)
        @param cache: write rendered glyph images to the local image cache
            directory instead of keeping them in memory
        @param ocr: run OCR over the rendered glyph images
        @param image_scale: factor by which each rendered glyph image is shrunk
        @param callback: callable that recognises one glyph image; it receives
            a ``(image_name, image_bytes, 'jpg')`` tuple in in-memory mode or
            the image path in cache mode. Falls back to DdddOcr when missing
            or not callable.
        @param auto_delete: delete cached font/image files on clean-up;
            forced off when ``cache`` is enabled
        @raise TypeError: file is not bytes, str or a pathlib path
        """
        if isinstance(file, bytes):
            self._font = TTFont(io.BytesIO(file))
        elif isinstance(file, (str, pathlib.PurePath)):
            self._font = TTFont(file)
        else:
            raise TypeError("未知文件类型")

        # Mapping: entity code -> {'name', 'code', 'zh'}
        self._font_maps = {}
        self._image_scale = image_scale

        # Caching: in-memory images, or files on disk when cache is enabled.
        self._cache_images = {}
        self._to_local = cache
        # Truthiness (not ``is True``) so that e.g. cache=1 cannot end up
        # caching to disk while auto-deletion is still active.
        self._auto_delete = False if cache else auto_delete

        # OCR
        self._callback = None
        self._enable_ocr = ocr
        # Truthiness again: parse_font() tests ``if self._enable_ocr`` and
        # would call a None callback if a truthy non-True value skipped this.
        if ocr:
            if callback is not None and callable(callback):
                self._callback = callback
            else:
                ddddocr = DdddOcr(beta=False, old=True, show_ad=False)

                def _classification(files):
                    # In-memory mode passes (name, bytes, ext); cache mode
                    # passes the image path itself.
                    if isinstance(files, tuple):
                        img = files[1]
                    else:
                        img = files

                    return ddddocr.classification(img)

                self._callback = _classification

    def to_xml(self):
        """Dump the font as XML next to the font file (same name, ``.xml``).

        NOTE(review): relies on ``self._font.reader.file.name``; a font loaded
        from bytes is wrapped in ``io.BytesIO``, which has no ``name``, so
        this presumably only works for fonts loaded from a path. Confirm.
        """
        filename = self._font.reader.file.name
        font_f = Path(filename).with_suffix('.xml')
        self._font.saveXML(font_f)

    @property
    def font_maps(self):
        """The entity-code -> glyph-info mapping built by ``parse_font()``."""
        return self._font_maps

    def parse_font(self):
        """Build the code-point mapping and, if OCR is enabled, fill in text."""
        self._font_encode()

        if self._enable_ocr:
            self._font_draw()
            self._font_ocr()

    def _font_encode(self):
        """Create one ``font_maps`` entry per code point in the best cmap."""
        for codepoint, name in self._font.getBestCmap().items():
            hex_code = hex(codepoint)  # e.g. '0x100c4'
            entity = f'&#{hex_code[1:]}'  # drop the leading '0': '&#x100c4'
            self._font_maps[entity] = {'name': name, 'code': hex_code, 'zh': ''}

    def _font_draw(self):
        """Render every mapped glyph to a greyscale JPEG (memory or disk)."""
        glyph_set = self._font.getGlyphSet()

        # Vertical metrics come from the OS/2 table and are the same for every
        # glyph, so they are read once instead of once per glyph.
        os2_table = self._font['OS/2']
        ascender = os2_table.usWinAscent
        descender = -os2_table.usWinDescent
        height = ascender - descender

        for code, glyph_dict in self._font_maps.items():
            glyph = glyph_set[glyph_dict['name']]

            # Trace the glyph outline into a FreeType pen.
            pen = FreeTypePen(None)
            glyph.draw(pen)

            # Rasterise; the offset shifts the outline up by the descent so
            # the part below the baseline stays inside the image.
            single_font_image = pen.array(
                width=glyph.width,
                height=height,
                transform=Offset(0, -descender),
                contain=False,
                evenOdd=False,
            )

            # Scale to 0..255 and invert the values.
            single_font_image = np.array(single_font_image) * 255
            single_font_image = 255 - single_font_image

            # Build a PIL image, convert it to mode "L" and add a border.
            single_font_image = Image.fromarray(single_font_image)
            single_font_image = single_font_image.convert("L")
            single_font_image = ImageOps.expand(single_font_image, border=6, fill=255)

            # Shrink by the configured factor; clamp to 1px so a large factor
            # cannot request a zero-sized image.
            new_width = max(1, single_font_image.width // self._image_scale)
            new_height = max(1, single_font_image.height // self._image_scale)
            single_font_image = single_font_image.resize(
                (new_width, new_height),
                resample=Image.Resampling.LANCZOS
            )

            image_name = f'{glyph_dict["code"]}.jpg'
            if not self._to_local:
                image_bytes = image_to_bytes(single_font_image)
                self._cache_images[code] = (image_name, image_bytes, 'jpg')
            else:
                single_font_image.save(_image_dir.joinpath(image_name))

    def _font_ocr(self):
        """Run the recognition callback on each glyph image; store the text."""
        # Only the nested dicts are mutated (no keys added or removed), so the
        # mapping can be iterated directly without copying it first.
        for code, glyph_dict in self._font_maps.items():
            if self._to_local:
                files = _image_dir.joinpath(f'{glyph_dict["code"]}.jpg')
            else:
                files = self._cache_images[code]

            glyph_dict['zh'] = self._callback(files)

    def __contains__(self, key):
        return key in self._font_maps

    def __getitem__(self, key):
        # A plain dict lookup already raises KeyError(key) for unknown keys.
        return self._font_maps[key]

    def get(self, key, default=None):
        """Return the entry for *key*, or *default* when it is not mapped."""
        try:
            return self[key]
        except KeyError:
            return default

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Same best-effort clean-up as __del__; returning None lets any
        # exception raised inside the ``with`` body propagate.
        self._del(missing_ok=True)
        return

    def _del(self, missing_ok=False):
        """Delete all cached image and font files when auto-delete is on.

        @param missing_ok: swallow the OSError raised when the cache root
            directory cannot be removed
        @raise OSError: the cache root could not be removed and missing_ok
            is false
        """
        # getattr: __del__ also runs for instances whose __init__ raised
        # before _auto_delete was assigned.
        if not getattr(self, '_auto_delete', False):
            return

        for directory in (_image_dir, _font_dir):
            for cached_file in directory.iterdir():
                cached_file.unlink(missing_ok=True)

        try:
            # NOTE(review): the image/ and font/ sub-directories are not
            # removed here, so this rmdir fails for as long as they exist.
            _cache_dir.rmdir()
        except OSError:
            if not missing_ok:
                raise

    def __del__(self):
        self._del(missing_ok=True)


# Alias: the same class is also available under the name FontTranslator.
FontTranslator = ImageToText


def parse_font(font_file, *, ocr=False, ocr_extract=None, **kwargs):
    """Build an ImageToText for *font_file*, parse it, and return it.

    @param font_file: font passed straight to ImageToText
    @param ocr: enable OCR; forced on when ocr_extract is callable
    @param ocr_extract: recognition callable forwarded as ``callback``
    @param kwargs: further keyword arguments forwarded to ImageToText
    @return: the ImageToText instance after ``parse_font()`` has run
    """
    # Supplying a usable extractor implies the caller wants OCR.
    if callable(ocr_extract):
        ocr = True

    parsed = ImageToText(font_file, ocr=ocr, callback=ocr_extract, **kwargs)
    parsed.parse_font()
    return parsed