dzr 8 月之前
父节点
当前提交
7dc5206a36
共有 13 个文件被更改,包括 1195 次插入0 次删除
  1. 二进制
      .DS_Store
  2. 29 0
      gys/aliyun.py
  3. 177 0
      gys/clean_html.py
  4. 85 0
      gys/db/RedisDB.py
  5. 8 0
      gys/db/__init__.py
  6. 114 0
      gys/fetch_detail.py
  7. 106 0
      gys/fetch_list.py
  8. 260 0
      gys/font_tool.py
  9. 15 0
      gys/proto/ocr.proto
  10. 131 0
      gys/proto/ocr_pb2.py
  11. 64 0
      gys/proto/ocr_pb2_grpc.py
  12. 30 0
      gys/setting.py
  13. 176 0
      gys/utils.py

二进制
.DS_Store


+ 29 - 0
gys/aliyun.py

@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-11-01 
+---------
+@summary:  
+---------
+@author: Dzr
+"""
+
+from base64 import b64encode
+
+import requests
+
+
def file_extract(path):
    """OCR a local image file via the Aliyun PAI-EAS endpoint.

    :param path: ``pathlib.Path`` of the image file to recognize.
    :return: first recognized text candidate, or ``''`` when the service
             returns no usable payload.
    :raises requests.HTTPError: when the endpoint answers with an error status.
    """
    b64_str = b64encode(path.read_bytes()).decode('utf8')
    data = {"input": {"image": b64_str}}
    url = 'http://1228910937799386.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/ms_eas_c38f0a8d_f314_463b_875d_581a/invoke'
    headers = {
        'Content-Type': 'application/json',
        # NOTE(review): hard-coded service token -- consider moving to config.
        'Authorization': 'ZDljNTk5ZTU3M2U3NzQzOGI5NzJhY2Y2OTI1M2I0NWI5NmVhZjljZA=='
    }

    response = requests.post(url, headers=headers, json=data, timeout=10)
    response.raise_for_status()  # fail fast instead of json-decoding an error body
    r_json = response.json()
    # Happy path is {'Data': {'text': [...]}}; the original indexed
    # ['text'][0] unguarded and also printed the whole response.
    try:
        return r_json['Data']['text'][0]
    except (KeyError, IndexError, TypeError):
        return ''

+ 177 - 0
gys/clean_html.py

@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+import re
+
+__all__ = ['cleaner']
+
# --- Standalone elements --------------------------------------------------
'''独立元素'''
INDEPENDENT_TAGS = {
    '<head>[\s\S]*?</head>': '',
    '<html>|<html [^>]*>|</html>': '',
    '<body>|<body [^>]*>|</body>': '',
    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # space entities
    '\\xa0|\\u3000': '',  # no-break / ideographic spaces
    '<!--[\s\S]*?-->': '',  # HTML comments
    '<style[^<>]*>[\s\S]*?</style>': '',  # style sheets
    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    '<input>': '',  # input boxes
    '</input>': '',  # input boxes
    '<img[^>]*>': '<br>',  # images
}
# --- Inline elements ------------------------------------------------------
'''行内元素'''
INLINE_TAGS = {
    '<a>|<a [^>]*>|</a>': '',  # hyperlinks
    '<link>|<link [^>]*>|</link>': '',  # link tags
    '<span>|<span [^>]*>|</span>': '',  # span
    '<label>|<label [^>]*>|</label>': '<br>',  # label
    '<font>|<font [^>]*>|</font>': '',  # font
    'data:image(.*?) ': '',  # base64 image data
}
# --- Block-level elements -------------------------------------------------
'''块级元素'''
BLOCK_TAGS = {
    '<div>\s*?</div>': '',
    '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
    '<p>|<p [^>]*>': '<br>',  # paragraph open
    '</p>': '',  # paragraph close
    '<div>|<div [^>]*>': '<br>',  # division open
    '</div>': '',  # division close
    '<o:p>|<o:p [^>]*>|</o:p>': ''  # MS Office WORD paragraphs
}
# --- Miscellaneous page boilerplate ---------------------------------------
'''其他'''
OTHER = {
    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
    '<epointform>': '',
    '<!doctype html>|<!doctype html [^>]*>': '',
    '【关闭】|关闭': '',
    '【打印】|打印本页': '',
    '【字体:[\s\S]*】': '',
    '文章来源:[\u4e00-\u9fa5]+': '',
    '浏览次数:.*[<]+': '',
    '(责任编辑:.*?)': '',
    '分享到[:]': '',
}
# --- Presentation attributes stripped wholesale ---------------------------
'''样式'''
CSS_STYLE = {
    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}
# --- Whitespace normalization (applied in declaration order) --------------
'''空白符'''
BLANKS = {
    '\n\s*\n': '\n',
    '\s*\n\s*': '\n',
    '[^\S\n]': ' ',
    '\s+': ' ',
}
# Tag names eligible for fused tag/attribute repair (see _repair_tag).
'''css标签集合'''
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
# Attribute names eligible for fused tag/attribute repair.
'''css属性集合'''
ATTRS = {'id', 'class', 'style', 'width'}
# Precompiled patterns for elements that must be dropped entirely.
'''特殊样式的标签'''
SPECIAL_TAGS = {
    re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): '<br>',
}
+
+
def _repair_tag():
    """Build the repair map for broken tag/attribute fusions.

    Non-standard pages sometimes lose the space between a tag name and its
    first attribute (e.g. ``divclass``); map every fused tag+attribute pair
    back to its spaced form (``div class``).
    """
    return {f'{tag}{attr}': f'{tag} {attr}' for tag in TAGS for attr in ATTRS}
+
+
+def _escape_character(html):
+    """转义字符"""
+    html = html.replace('&lt;', '<')
+    html = html.replace('&gt;', '>')
+    html = html.replace('&quot;', '"')
+    html = html.replace('&amp;', '&')
+    return html
+
+
def _lowercase_tag(html):
    """Lower-case every element tag (attribute values included) without
    touching the page's text content, then repair fused tag/attribute pairs."""
    unique_tags = set(re.findall("<[^>]+>", html))

    if len(unique_tags) > 10000:
        # Too many distinct tags for per-tag string replacement;
        # let BeautifulSoup re-serialize the document instead.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'lxml')
        html = str(soup.body.next_element)
    else:
        for raw in unique_tags:
            html = html.replace(raw, raw.lower())

    # Repair fused combinations such as 'divclass' -> 'div class'.
    for broken, fixed in _repair_tag().items():
        html = html.replace(broken, fixed)

    return html
+
+
def _clear_special_tag(html):
    """Strip elements matched by the precompiled SPECIAL_TAGS patterns."""
    for pattern, replacement in SPECIAL_TAGS.items():
        html = pattern.sub(replacement, html)
    return html
+
+
+def _clear_input_tag(html, display=False):
+    """提取value值,替换input标签"""
+    if not display:
+        html = html.replace('<input', '<input style="border-color: transparent;"')  # 不显示输入框边框
+
+    tag = re.compile(r'<input .*?>', re.S)
+    value = re.compile(r'value=["|\'](.*?)["|\']')
+
+    lst = re.findall(tag, html) or []
+    for ipt in lst:
+        val = re.findall(value, ipt)
+        if val and 'hidden' not in ipt and 'hide' not in ipt and 'display: none' not in ipt:
+            html = html.replace(ipt, val[0])
+    return html
+
+
def cleaner(html, special=None, completely=False, del_tag=False, **kwargs):
    """
    Clean raw page source.

    :param html: page source to clean
    :param special: extra {pattern: replacement} rules for this call only
    :param completely: also strip canvas/iframe and residual non-CJK tags
    :param del_tag: first drop elements matched by SPECIAL_TAGS
    :param kwargs: forwarded to _clear_input_tag (e.g. display=...)
    :return: cleaned page source
    """
    special = {} if special is None else dict(special)
    # Merge per-call rules without mutating the module-level OTHER dict.
    # The original did OTHER.update(special), so one call's extra rules
    # silently applied to every later call in the process.
    remove_tags = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **OTHER,
        **special,
        **CSS_STYLE,
        **BLANKS,
    }

    html = _lowercase_tag(html)
    if del_tag:
        html = _clear_special_tag(html)

    for tag, repl in remove_tags.items():
        html = re.sub(tag, repl, html)

    if completely:
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # inline frames
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)

    html = _escape_character(html)
    html = _clear_input_tag(html, **kwargs)
    return html

+ 85 - 0
gys/db/RedisDB.py

@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-02-27
+---------
+@summary: redis 去重
+---------
+@author: Lzz
+"""
+import hashlib
+
+import redis
+
+
class RedisFilter:
    """Redis-backed de-duplication filter.

    Every seen key is stored as ``pylist_<sha256>`` with an expiry, so
    membership survives process restarts.
    """

    def __init__(self, url, expire_time=None):
        self.redis_db = redis.StrictRedis.from_url(url)
        self._ex = expire_time or 86400 * 365 * 1  # default TTL: one year

    def __repr__(self):
        return "<RedisFilter: {}>".format(self.redis_db)

    def exists(self, key):
        """Return True when *key* is present in redis."""
        return self.redis_db.exists(key) > 0

    def add(self, keys):
        """
        Record keys as seen.

        @param keys: a single value or a list of values
        @return: per-key result -- True when newly added, False when the key
                 was already present (list in, list out; scalar in, scalar out)
        """
        single = not isinstance(keys, list)
        batch = [keys] if single else keys

        results = []
        for key in batch:
            pkey = "pylist_" + self.fingerprint(key)
            if self.exists(pkey):
                results.append(False)
            else:
                results.append(self.redis_db.set(pkey, 1, ex=self._ex))

        return results[0] if single else results

    def get(self, keys):
        """
        Check whether keys were seen before.

        @param keys: a single value or a list of values
        @return: True for seen/duplicated keys, False otherwise
                 (list in, list out; scalar in, scalar out)
        """
        single = not isinstance(keys, list)
        batch = [keys] if single else keys

        seen = [self.exists("pylist_" + self.fingerprint(key)) for key in batch]

        # Also flag duplicates *within* the requested batch itself.
        witnessed = set()
        for idx, key in enumerate(batch):
            if key in witnessed:
                seen[idx] = True
            else:
                witnessed.add(key)

        return seen[0] if single else seen

    def fingerprint(self, *args):
        """
        @summary: stable 64-hex-char sha256 fingerprint of *args*
        ---------
        @param args: values to fingerprint (sorted first, so the result is
                     order-independent)
        ---------
        @result: e.g. 5580c91ea29bf5bd963f4c08dfcacd983566e44ecea1735102bc380576fd6f30
        """
        digest = hashlib.sha256()
        for arg in sorted(args):
            digest.update(str(arg).encode())
        return digest.hexdigest()

+ 8 - 0
gys/db/__init__.py

@@ -0,0 +1,8 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-11-11 
+---------
+@summary:  
+---------
+@author: Dzr
+"""

+ 114 - 0
gys/fetch_detail.py

@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-11-11 
+---------
+@summary:  
+---------
+@author: Dzr
+"""
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+from bson import Int64
+from pymongo import MongoClient
+from pymongo.operations import UpdateOne
+
+import net
+import setting
+from log import logger
+
+
def spider(task):
    """Download one detail page and shape it into a storable document.

    :param task: list-collection doc; must contain 'href' plus the metadata
                 fields copied below (site/channel/spidercode/...).
    :return: (task _id, document dict) -- the dict is None when the
             download failed.
    """
    _id = task.pop('_id')  # NOTE: mutates the caller's dict
    url = task['href']
    ret = net.download_detail(url, proxies=net.get_proxy())
    if ret is None:
        logger.error(f'详情数据|下载失败|{url}')
        return _id, None

    logger.info(f'详情数据|下载成功|{url}')
    # Copy list-stage metadata verbatim and attach the downloaded content.
    data = {
        'site': task['site'],
        'channel': task['channel'],
        'spidercode': task['spidercode'],
        'area': task['area'],
        'city': task['city'],
        'district': task['district'],
        'href': url,
        'title': task['title'],
        's_title': task['title'],
        'contenthtml': ret['contenthtml'],
        'detail': ret['detail'],
        'publishtime': task['publishtime'],
        'l_np_publishtime': Int64(task['l_np_publishtime']),  # BSON int64 for Mongo
        'comeintime': Int64(int(time.time())),
        'T': task['T'],
        'infoformat': task['infoformat'],
        'sendflag': task['sendflag'],
        'iscompete': task['iscompete'],
        '_d': task['_d'],
        'publishdept': task['publishdept'],
        'type': task['type'],
        'is_mixed': task['is_mixed'],
    }
    return _id, data
+
+
def main():
    """Continuously drain undownloaded list rows, fetch details, persist.

    Runs forever: each cycle takes up to 100 pending rows, downloads them
    with 4 worker threads, inserts the results and marks rows done/failed,
    then sleeps 10 seconds.
    """
    # Connect once -- the original re-created a MongoClient on every 10s
    # cycle and never closed it, leaking connections.
    client = MongoClient(setting.MONGO_HOST, setting.MONGO_PORT)
    to_lst_coll = client[setting.MONGO_DB][setting.MONGO_LIST_COLL]
    to_data_coll = client[setting.MONGO_DB][setting.MONGO_DATA_COLL]

    while True:
        data_count = 0
        fail_count = 0
        updates = []
        inserts = []
        q = {'isdownload': None}
        with to_lst_coll.find(q, limit=100) as cursor:
            with ThreadPoolExecutor(max_workers=4) as executor:
                fs = executor.map(spider, cursor)
                for _id, result in fs:
                    condition = {'_id': _id}
                    if result is None:
                        item = {'isdownload': 1, 'isfailed': 1}
                        fail_count += 1
                    else:
                        item = {'isdownload': 1, 'isfailed': 0}
                        inserts.append(result)
                        data_count += 1

                    updates.append(UpdateOne(condition, {'$set': item}))

                    # Flush in small batches to cap memory and latency.
                    if len(inserts) == 10:
                        to_data_coll.insert_many(inserts, ordered=False)
                        logger.info(f'详情数据|数据下载|成功{len(inserts)}条')
                        inserts = []

                    if len(updates) == 10:
                        to_lst_coll.bulk_write(updates, ordered=False)
                        logger.info(f'详情数据|更新状态|完成{len(updates)}条')
                        updates = []

                # Flush whatever is left over after the cursor is exhausted.
                if inserts:
                    to_data_coll.insert_many(inserts, ordered=False)
                    logger.info(f'详情数据|数据下载|成功{len(inserts)}条')

                if updates:
                    to_lst_coll.bulk_write(updates, ordered=False)
                    logger.info(f'详情数据|更新状态|完成{len(updates)}条')

        logger.info('详情数据|数据下载|10s后执行...')
        time.sleep(10)
+
+
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Manual stop (Ctrl-C): exit quietly without alerting.
        pass

    except Exception as e:
        # Any other crash: raise a WeChat alert and log the traceback.
        net.send_wechat_warning('详情采集被中止')
        logger.exception(e)

+ 106 - 0
gys/fetch_list.py

@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-11-11 
+---------
+@summary:  
+---------
+@author: Dzr
+"""
+import time
+from concurrent.futures import ThreadPoolExecutor
+
+from bson import Int64
+from pymongo import MongoClient
+
+import net
+import setting
+from db.RedisDB import RedisFilter
+from log import logger
+
+
def spider(url):
    """Download one listing page and map each row to a list-collection doc.

    :param url: listing page url
    :return: list of docs, or None when the download yielded nothing
    """
    items = net.download_list(url, proxies=net.get_proxy())
    if not items:
        logger.error(f'列表数据|下载失败|{url}')
        return

    logger.info(f'列表数据|下载成功|{url}')

    results = []
    for item in items:
        results.append({
            # Fixed site metadata for this spider.
            'site': '供应商网',
            'channel': '最新采购',
            'spidercode': 'a_gysw_zxcg',
            # Per-row fields produced by net.download_list.
            'area': item['area'],
            'city': item['city'],
            'district': item['district'],
            'href': item['href'],
            'title': item['title'],
            'publishtime': item['publishtime'],
            'l_np_publishtime': Int64(item['l_np_publishtime']),  # BSON int64
            'comeintime': Int64(int(time.time())),
            # Fixed downstream-pipeline flags.
            'T': 'bidding',
            'infoformat': 1,
            'sendflag': 'false',
            'iscompete': True,
            '_d': 'comeintime',
            'publishdept': '',
            'type': '',
            'is_mixed': True,
        })

    return results
+
+
def main():
    """Crawl listing pages 1-100 every 10 minutes and store unseen rows.

    Rows are de-duplicated against redis by href; unseen rows are inserted
    into Mongo in batches of 50 and then recorded in the redis filter.
    """
    # Connect once -- the original rebuilt the MongoClient and the redis
    # filter on every 10-minute cycle and never closed them.
    client = MongoClient(setting.MONGO_HOST, setting.MONGO_PORT)
    to_coll = client[setting.MONGO_DB][setting.MONGO_LIST_COLL]
    to_dedup = RedisFilter(url=setting.REDIS_URL)

    while True:
        urls = (f'https://www.gys.cn/buy/purchase/{i}.html' for i in range(1, 101))
        with ThreadPoolExecutor(max_workers=4) as executor:
            fs = executor.map(spider, urls)
            for f in fs:
                items = f or []

                data_count = 0
                dedupe_count = 0
                unique = []
                inserts = []
                for item in items:
                    href = item['href']
                    if not to_dedup.get(href):
                        inserts.append(item)
                        unique.append(href)
                    else:
                        dedupe_count += 1

                    # Flush in batches of 50 to bound memory per page.
                    if len(inserts) == 50:
                        to_coll.insert_many(inserts, ordered=False)
                        to_dedup.add(unique)
                        data_count += len(inserts)
                        inserts = []
                        unique = []

                if inserts:
                    to_coll.insert_many(inserts, ordered=False)
                    to_dedup.add(unique)
                    data_count += len(inserts)

                logger.info(f'列表数据|数据处理|重复{dedupe_count}条|入库{data_count}条')

        logger.info('列表数据|数据下载|10m后执行...')
        time.sleep(600)
+
+
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Manual stop (Ctrl-C): exit quietly without alerting.
        pass

    except Exception as e:
        # Any other crash: raise a WeChat alert and log the traceback.
        net.send_wechat_warning('列表采集被中止')
        logger.exception(e)

+ 260 - 0
gys/font_tool.py

@@ -0,0 +1,260 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-31 
+---------
+@summary:  解析图片文本
+---------
+@author: Dzr
+"""
+import io
+import pathlib
+import random
+import re
+import string
+from pathlib import Path
+from urllib.request import urlretrieve
+
+import numpy as np
+from PIL import Image, ImageOps
+from ddddocr import DdddOcr
+from fontTools.misc.transform import Offset
+from fontTools.pens.freetypePen import FreeTypePen  # pip install freetype-py
+from fontTools.ttLib import TTFont
+
+_root = Path(__file__).parent
+_cache_dir = _root.joinpath('cache')
+_cache_dir.mkdir(exist_ok=True)
+
+_font_dir = _cache_dir.joinpath('font')
+_font_dir.mkdir(exist_ok=True)
+
+_image_dir = _cache_dir.joinpath('image')
+_image_dir.mkdir(exist_ok=True)
+
+
def get_random(length=4):
    """Return a random alphanumeric string of *length* characters.

    Uses random.choices (sampling WITH replacement), so characters may
    repeat and any length works. The original random.sample never repeated
    a character and raised ValueError for length > 62.
    """
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))
+
+
def parse_font_url(html):
    """Extract the 'icomoon' @font-face url from page source.

    :param html: page source containing the @font-face rule
    :return: the font file url
    :raises ValueError: when no icomoon font url is present
    """
    result = re.search(r"'icomoon';src:url\('(.*?)'\)", html, re.S)
    if result is None:
        # The original message interpolated `result`, which is always None
        # on this path, producing the useless text: 字体库 url "None".
        raise ValueError('icomoon font url not found in html')

    return result.group(1)
+
+
def create_file(filename):
    """Ensure an (empty) file named *filename* exists in the font cache dir
    and return its Path."""
    path = _font_dir / filename
    path.touch(exist_ok=True)
    return path
+
+
def download_font(html, font_type='ttf', to_local=False):
    """Download the icomoon font referenced by *html*.

    :param html: page source containing the @font-face rule
    :param font_type: file extension for the cached font file
    :param to_local: keep the file on disk and return its Path; otherwise
                     return the raw bytes and delete the temp file
    """
    target = create_file(f'{get_random(6)}.{font_type}')
    urlretrieve(parse_font_url(html), filename=target)
    if to_local:
        return target
    payload = target.read_bytes()
    target.unlink(missing_ok=True)
    return payload
+
+
def image_to_bytes(image, filetype='JPEG'):
    """Serialize a PIL image into raw bytes of the given format."""
    with io.BytesIO() as buffer:
        image.save(buffer, format=filetype)
        return buffer.getvalue()
+
+
class ImageToText:
    """Render every glyph of an obfuscation font to an image and (optionally)
    OCR it, building a map from ``&#x...`` page entities to readable text."""

    def __init__(self, file, cache=False, ocr=False, callback=None, image_scale=5, auto_delete=True):
        """

        @param file: font file (bytes, path string, or pathlib path)
        @param cache: persist rendered glyph images to local disk
        @param ocr: enable OCR of the glyph images
        @param image_scale: downscale factor applied to each glyph image
        @param callback: callable used to turn a glyph image into text
        @param auto_delete: automatically delete cached font/image files
        """
        if not isinstance(file, (bytes, str, pathlib.PurePath)):
            raise TypeError("未知文件类型")

        if isinstance(file, bytes):
            self._font = TTFont(io.BytesIO(file))
        elif isinstance(file, str):
            self._font = TTFont(file)
        else:
            assert isinstance(file, pathlib.PurePath)
            self._font = TTFont(file)

        # Entity -> glyph-info mapping ('&#x..' -> {'name','code','zh'}),
        # populated by parse_font().
        self._font_maps = {}
        self._image_scale = image_scale

        # Image cache (kept in memory unless cache=True).
        self._cache_images = {}
        self._to_local = cache
        # Never auto-delete when the caller asked to keep images on disk.
        self._auto_delete = False if cache is True else auto_delete

        # OCR setup: custom callback wins, otherwise fall back to ddddocr.
        self._callback = None
        self._enable_ocr = ocr
        if ocr is True:
            if callback is not None and callable(callback):
                self._callback = callback
            else:
                ddddocr = DdddOcr(beta=False, old=True, show_ad=False)

                def _classification(files):
                    # In-memory cache entries are (name, bytes, ext) tuples.
                    if isinstance(files, tuple):
                        img = files[1]
                    else:
                        img = files

                    return ddddocr.classification(img)

                self._callback = _classification

    def to_xml(self):
        """Dump the font tables as XML next to the source font file."""
        # NOTE(review): assumes the font was opened from a file path; fonts
        # loaded from bytes have no usable reader file name -- confirm.
        filename = self._font.reader.file.name
        font_f = Path(filename).with_suffix('.xml')
        self._font.saveXML(font_f)

    @property
    def font_maps(self):
        # Entity -> glyph-info mapping built by parse_font().
        return self._font_maps

    def parse_font(self):
        """Build the entity map; render and OCR glyphs when OCR is enabled."""
        self._font_encode()

        if self._enable_ocr:
            self._font_draw()
            self._font_ocr()

    def _font_encode(self):
        """Map every cmap code point to its HTML-entity key."""
        for unicode, name in self._font.getBestCmap().items():
            code = f'&#{str(hex(unicode))[1:]}'  # 0x100c4 => &#x100c4
            glyph = {'name': name, 'code': hex(unicode), 'zh': ''}
            self._font_maps[code] = glyph

    def _font_draw(self):
        """Render every mapped glyph into a small grayscale JPEG."""
        glyph_set = self._font.getGlyphSet()
        for code, glyph_dict in self._font_maps.items():
            glyph = glyph_set[glyph_dict['name']]  # look up the glyph outline

            pen = FreeTypePen(None)  # FreeTypePen rasterizes the outline
            glyph.draw(pen)  # draw the glyph into the pen

            # Glyph width plus the recommended ascent/descent from the
            # font's OS/2 table determine the image height.
            width, ascender, descender = (
                glyph.width,
                self._font['OS/2'].usWinAscent,
                -self._font['OS/2'].usWinDescent,
            )
            height = ascender - descender

            # Rasterize the glyph into an array.
            single_font_image = pen.array(
                width=width,
                height=height,
                transform=Offset(0, -descender),
                contain=False,
                evenOdd=False,
            )

            # Convert to grayscale pixel values.
            single_font_image = np.array(single_font_image) * 255
            # Invert colors (black <-> white).
            single_font_image = 255 - single_font_image

            # Create the PIL image object.
            single_font_image = Image.fromarray(single_font_image)
            # Convert to grayscale mode.
            single_font_image = single_font_image.convert("L")
            # Add a white border around the glyph.
            single_font_image = ImageOps.expand(single_font_image, border=6, fill=255)

            # Compute the reduced width and height.
            new_width = single_font_image.width // self._image_scale
            new_height = single_font_image.height // self._image_scale

            # Downscale the image.
            single_font_image = single_font_image.resize(
                (new_width, new_height),
                resample=Image.Resampling.LANCZOS
            )

            image_name = f'{glyph_dict["code"]}.jpg'
            if not self._to_local:
                image_bytes = image_to_bytes(single_font_image)
                self._cache_images[code] = (image_name, image_bytes, 'jpg')
            else:
                single_font_image.save(_image_dir.joinpath(image_name))  # save to disk

    def _font_ocr(self):
        """Run the OCR callback over every rendered glyph image."""
        for code, glyph_dict in dict(self._font_maps).items():
            if not self._to_local:
                files = self._cache_images[code]
                text = self._callback(files)
            else:
                files = _image_dir.joinpath(f'{glyph_dict["code"]}.jpg')
                text = self._callback(files)

            self._font_maps[code]['zh'] = text

    def __contains__(self, key):
        return key in self._font_maps

    def __getitem__(self, key):
        if key in self._font_maps:
            return self._font_maps[key]
        else:
            raise KeyError(key)

    def get(self, key, default=None):
        """Dict-style lookup returning *default* for unknown entities."""
        try:
            return self.__getitem__(key)
        except KeyError:
            return default

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Trigger cleanup explicitly when leaving a with-block.
        self.__del__()
        return

    def _del(self, missing_ok=False):
        """Delete cached images/fonts when auto-delete is enabled."""
        if self._auto_delete:
            for img_f in _image_dir.iterdir():
                img_f.unlink(missing_ok=True)

            for font_f in _font_dir.iterdir():
                font_f.unlink(missing_ok=True)

            try:
                # The sub-directories are intentionally left in place, so
                # this rmdir usually fails; the OSError is swallowed when
                # missing_ok is True.
                # _image_dir.rmdir()
                # _font_dir.rmdir()
                _cache_dir.rmdir()
            except OSError as e:
                if not missing_ok:
                    raise e

    def __del__(self):
        self._del(missing_ok=True)
+
+
# Backwards-compatible alias: existing code imports FontTranslator.
FontTranslator = ImageToText


def parse_font(font_file, *, ocr=False, ocr_extract=None, **kwargs):
    """Parse *font_file* and return the populated translator.

    :param font_file: font file (bytes, path string, or pathlib path)
    :param ocr: enable OCR of the rendered glyphs
    :param ocr_extract: optional OCR callable; supplying one implies ocr=True
    :param kwargs: forwarded to ImageToText (cache, image_scale, ...)
    """
    if ocr_extract is not None and callable(ocr_extract):
        ocr = True
    translator = ImageToText(font_file, ocr=ocr, callback=ocr_extract, **kwargs)
    translator.parse_font()
    return translator

+ 15 - 0
gys/proto/ocr.proto

@@ -0,0 +1,15 @@
syntax = "proto3";
// OCR image-to-text service
package proto;

// Request carrying the raw image bytes to recognize.
message OcrRequest {
  bytes image = 1;
}

// Response carrying the recognized text.
message OcrResponse {
    string message = 1;
}

// Unary OCR RPC: image bytes in, recognized text out.
service Ocr {
  rpc Ocr(OcrRequest) returns (OcrResponse);
}

+ 131 - 0
gys/proto/ocr_pb2.py

@@ -0,0 +1,131 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler.  DO NOT EDIT!
# source: proto/ocr.proto
# NOTE(review): legacy (pre-protobuf-3.20) generated module; regenerate with
# protoc instead of editing by hand.

from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)

_sym_db = _symbol_database.Default()




# File descriptor holding the serialized FileDescriptorProto of proto/ocr.proto.
DESCRIPTOR = _descriptor.FileDescriptor(
  name='proto/ocr.proto',
  package='proto',
  syntax='proto3',
  serialized_options=None,
  serialized_pb=b'\n\x0fproto/ocr.proto\x12\x05proto\"\x1b\n\nOcrRequest\x12\r\n\x05image\x18\x01 \x01(\x0c\"\x1e\n\x0bOcrResponse\x12\x0f\n\x07message\x18\x01 \x01(\t23\n\x03Ocr\x12,\n\x03Ocr\x12\x11.proto.OcrRequest\x1a\x12.proto.OcrResponseb\x06proto3'
)




_OCRREQUEST = _descriptor.Descriptor(
  name='OcrRequest',
  full_name='proto.OcrRequest',
  filename=None,
  file=DESCRIPTOR,
  containing_type=None,
  fields=[
    _descriptor.FieldDescriptor(
      name='image', full_name='proto.OcrRequest.image', index=0,
      number=1, type=12, cpp_type=9, label=1,
      has_default_value=False, default_value=b"",
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      serialized_options=None, file=DESCRIPTOR),
  ],
  extensions=[
  ],
  nested_types=[],
  enum_types=[
  ],
  serialized_options=None,
  is_extendable=False,
  syntax='proto3',
  extension_ranges=[],
  oneofs=[
  ],
  serialized_start=26,
  serialized_end=53,
)


_OCRRESPONSE = _descriptor.Descriptor(
  name='OcrResponse',
  full_name='proto.OcrResponse',
  filename=None,
  file=DESCRIPTOR,
  containing_type=None,
  fields=[
    _descriptor.FieldDescriptor(
      name='message', full_name='proto.OcrResponse.message', index=0,
      number=1, type=9, cpp_type=9, label=1,
      has_default_value=False, default_value=b"".decode('utf-8'),
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      serialized_options=None, file=DESCRIPTOR),
  ],
  extensions=[
  ],
  nested_types=[],
  enum_types=[
  ],
  serialized_options=None,
  is_extendable=False,
  syntax='proto3',
  extension_ranges=[],
  oneofs=[
  ],
  serialized_start=55,
  serialized_end=85,
)

DESCRIPTOR.message_types_by_name['OcrRequest'] = _OCRREQUEST
DESCRIPTOR.message_types_by_name['OcrResponse'] = _OCRRESPONSE
_sym_db.RegisterFileDescriptor(DESCRIPTOR)

# Concrete message classes assembled at import time via reflection.
OcrRequest = _reflection.GeneratedProtocolMessageType('OcrRequest', (_message.Message,), {
  'DESCRIPTOR' : _OCRREQUEST,
  '__module__' : 'proto.ocr_pb2'
  # @@protoc_insertion_point(class_scope:proto.OcrRequest)
  })
_sym_db.RegisterMessage(OcrRequest)

OcrResponse = _reflection.GeneratedProtocolMessageType('OcrResponse', (_message.Message,), {
  'DESCRIPTOR' : _OCRRESPONSE,
  '__module__' : 'proto.ocr_pb2'
  # @@protoc_insertion_point(class_scope:proto.OcrResponse)
  })
_sym_db.RegisterMessage(OcrResponse)



_OCR = _descriptor.ServiceDescriptor(
  name='Ocr',
  full_name='proto.Ocr',
  file=DESCRIPTOR,
  index=0,
  serialized_options=None,
  serialized_start=87,
  serialized_end=138,
  methods=[
  _descriptor.MethodDescriptor(
    name='Ocr',
    full_name='proto.Ocr.Ocr',
    index=0,
    containing_service=None,
    input_type=_OCRREQUEST,
    output_type=_OCRRESPONSE,
    serialized_options=None,
  ),
])
_sym_db.RegisterServiceDescriptor(_OCR)

DESCRIPTOR.services_by_name['Ocr'] = _OCR

# @@protoc_insertion_point(module_scope)

+ 64 - 0
gys/proto/ocr_pb2_grpc.py

@@ -0,0 +1,64 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
# NOTE(review): machine-generated from proto/ocr.proto -- regenerate with
# grpcio-tools instead of editing by hand.
import grpc

from proto import ocr_pb2 as proto_dot_ocr__pb2


class OcrStub(object):
    """Client-side stub for the proto.Ocr service."""

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        self.Ocr = channel.unary_unary(
                '/proto.Ocr/Ocr',
                request_serializer=proto_dot_ocr__pb2.OcrRequest.SerializeToString,
                response_deserializer=proto_dot_ocr__pb2.OcrResponse.FromString,
                )


class OcrServicer(object):
    """Server-side service interface; subclass and override Ocr."""

    def Ocr(self, request, context):
        """Recognize the image in *request* and return its text."""
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')


def add_OcrServicer_to_server(servicer, server):
    # Register the servicer's handlers under the proto.Ocr service name.
    rpc_method_handlers = {
            'Ocr': grpc.unary_unary_rpc_method_handler(
                    servicer.Ocr,
                    request_deserializer=proto_dot_ocr__pb2.OcrRequest.FromString,
                    response_serializer=proto_dot_ocr__pb2.OcrResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'proto.Ocr', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))


# This class is part of an EXPERIMENTAL API.
class Ocr(object):
    """Static convenience wrapper around the experimental gRPC API."""

    @staticmethod
    def Ocr(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        return grpc.experimental.unary_unary(request, target, '/proto.Ocr/Ocr',
            proto_dot_ocr__pb2.OcrRequest.SerializeToString,
            proto_dot_ocr__pb2.OcrResponse.FromString,
            options, channel_credentials,
            call_credentials, compression, wait_for_ready, timeout, metadata)

+ 30 - 0
gys/setting.py

@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-11-08 
+---------
+@summary:  
+---------
+@author: Dzr
+"""
+
# Redis connection for the de-dup filter.
# NOTE(review): credentials committed in plain text -- move to env/config.
REDIS_URL = 'redis://:k5ZJR5KV4q7DRZ92DQ@172.17.162.34:8361/0'

# MongoDB storage.
MONGO_HOST = '172.17.4.87'
MONGO_PORT = 27080
MONGO_DB = 'py_spider'
MONGO_LIST_COLL = 'gys_list'   # list-page staging collection
MONGO_DATA_COLL = 'data_bak'   # detail-data destination collection

# Third-party OCR (a2s).
DG_OCR_A2S_ADDRESS = '172.17.4.188:9090'
DG_OCR_TOPIC = 'dg_ocr'
DG_OCR_TIMEOUT = 20

# Jianyu OCR (a2s).
JY_OCR_A2S_ADDRESS = '172.17.4.188:9090'
JY_OCR_A2S_TIMEOUT = 60
JY_OCR_TOPIC = 'spider_ocr'

# Jianyu HTTP OCR API.
JY_API_OCR_ADDRESS = '172.17.208.114:8089'
JY_API_OCR_TIMEOUT = 20

+ 176 - 0
gys/utils.py

@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+"""
+Created on 2024-10-30 
+---------
+@summary:  
+---------
+@author: Dzr
+"""
+import datetime
+import functools
+import re
+import time
+
+from lxml.html import Element, HtmlElement, fromstring, tostring
+
+
def run_time(fun):
    """Decorator: print how many wall-clock seconds each call took, then
    return the wrapped function's result unchanged."""
    @functools.wraps(fun)
    def wrapper(*args, **kwargs):
        started = time.time()
        result = fun(*args, **kwargs)
        print(time.time() - started)
        return result
    return wrapper
+
+
def is_chinese_character(char):
    """Return True if *char* is a single CJK ideograph (numeral hanzi excluded).

    Covers CJK Unified Ideographs from U+4EA0 (the U+4E00-U+4E9F slice is
    intentionally skipped so numerals like 一/二/三 are rejected) plus
    extensions A through G.

    Bug fix: supplementary-plane codepoints need 8-digit ``\\U00XXXXXX``
    escapes.  The previous pattern wrote ``\\u20000``, which ``re`` parsed as
    U+2000 followed by a literal ``0`` — producing an accidental
    ``'0'-\\u2a6d`` range that matched ASCII letters and digits.
    """
    regex = re.compile(
        r'^[\u4ea0-\u9fff\u3400-\u4dbf'
        r'\U00020000-\U0002a6df\U0002a700-\U0002b73f\U0002b740-\U0002b81f'
        r'\U0002b820-\U0002ceaf\U0002ceb0-\U0002ebef\U00030000-\U0003134f]$'
    )
    return bool(regex.match(char))
+
+
def is_specific_number_chinese_character(char):
    """Return True if *char* is exactly one numeral hanzi (零一二三四五六七八九)."""
    pattern = r'^[\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u96f6]$'
    return re.match(pattern, char) is not None
+
+
def is_en(char):
    """Return True if *char* is exactly one ASCII letter (A-Z or a-z)."""
    return re.match(r'^[A-Za-z]$', char) is not None
+
+
def is_digit(char):
    """Return True if *char* is a single digit 1-9.

    '0' is not matched here — zero-like glyphs ('0'/'o'/'O') are handled
    separately by is_zero_or_o.
    """
    return re.match(r'^[1-9]$', char) is not None
+
+
def is_zero_or_o(char):
    """Return True if *char* is one of the zero-like characters '0', 'o', 'O'."""
    return re.match(r'^[0oO]$', char) is not None
+
+
def replace_element(old_tag: HtmlElement, new_tag: HtmlElement):
    """Swap *old_tag* for *new_tag* inside old_tag's parent node."""
    parent = old_tag.getparent()
    parent.replace(old_tag, new_tag)
+
+
def create_element(tag, attrib, text=None):
    """Build a new element named *tag* with attributes *attrib*; when *text*
    is given, set it as the element's text content."""
    element = Element(tag, **attrib)
    if text is None:
        return element
    element.text = text
    return element
+
+
def drop_element(tag: HtmlElement, feature: str):
    """Remove the first node matching the XPath *feature* from the tree.

    No-op when nothing matches.
    """
    matches = tag.xpath(feature)
    if matches:
        matches[0].drop_tree()
+
+
def translate(val, font_maps):
    """Decode a ';'-separated run of decimal character references.

    Each non-blank token looks like '&#57344'; its decimal codepoint is
    rewritten as the hex key '&#xe000' used by *font_maps*, and the mapped
    'zh' glyphs are concatenated into the readable string.
    """
    pieces = []
    for token in val.split(";"):
        if token.strip() == '':
            continue
        key = '&#' + hex(int(token[2:]))[1:]
        pieces.append(font_maps[key]['zh'])
    return "".join(pieces)
+
+
def translate_text(tag: HtmlElement, font_maps, pattern):
    """Extract *tag*'s raw obfuscated text via regex *pattern* and decode it.

    The element is serialized and matched with a regex so the '&#...;'
    entities are NOT auto-decoded by the parser; returns '' when the
    pattern finds nothing.
    """
    markup = tostring(tag, encoding='gbk').decode('gbk')
    matches = re.findall(pattern, markup, flags=re.S)
    if not matches:
        return ''
    return translate(matches[0], font_maps)
+
+
def translate_element_text(tag: HtmlElement, font_maps, feature, pattern):
    """Decode the font-obfuscated text of the element at XPath *feature* and
    replace that element in the tree with a clean copy.

    Raises ValueError when *feature* matches nothing.
    """
    hits = tag.xpath(feature)
    old = hits[0] if hits else None
    if old is None:
        raise ValueError(f'{old}')

    decoded = translate_text(old, font_maps, pattern)
    # Rebuild an identical element (same tag name and attributes) that
    # carries the decoded text, then splice it into the tree.
    replacement = create_element(old.tag, old.attrib, text=decoded)
    replace_element(old, replacement)
+
+
def parse_element(tag: HtmlElement, font_maps):
    """De-obfuscate the custom-font text inside *tag* and strip sensitive nodes."""
    # Font de-obfuscation - title block
    translate_element_text(
        tag,
        font_maps,
        '//div[@class="bw_140 secret"]',
        r'<div.*>(.*?)</div>'
    )

    # Font de-obfuscation - description block
    translate_element_text(
        tag,
        font_maps,
        '//td[@class="secret"]/div',
        r'<div.*>(.*?)</div>'
    )

    # Remove sensitive data from the page source
    drop_element(tag, '//div[@class="details_txt"]')
+
+
def extract_list(html, font_maps):
    """Parse a list page into result dicts (title, href, publish time, region).

    *font_maps* drives de-obfuscation of the custom-font titles.  The
    location text ("收货地:<area> [<city>]") defaults to '全国' when absent.
    """
    results = []
    tree = fromstring(html)
    for li_tag in tree.xpath('//ul[@class="industry_ul"]/li'):
        left_div = next(iter(li_tag.xpath('./div[@class="industry_left"]') or []), Element('div'))
        place = next(iter(left_div.xpath('./p[@class="tt"]/text()') or []), '全国').replace('收货地:', '')
        parts = str(place).split()
        if len(parts) == 2:
            area, city = parts
        else:
            area, city = parts[0], ''

        center_div = next(iter(li_tag.xpath('./div[@class="industry_cc"]') or []), Element('div'))
        a_tag = next(iter(center_div.xpath('.//h3[@class="secret"]/a') or []), Element('a'))
        title = translate_text(a_tag, font_maps, r'<a.*>(.*?)</a>')
        href = a_tag.get('href')
        publish_time = next(iter(center_div.xpath('./div/span/text()') or []), '').replace('发布时间:', '')
        # NOTE(review): strptime raises ValueError if the span text is missing
        # or not '%Y-%m-%d' — matches the original behavior.
        publish_time_ts = datetime.datetime.strptime(publish_time, '%Y-%m-%d').timestamp()
        results.append({
            'title': title,
            'href': href,
            'publishtime': publish_time,
            'l_np_publishtime': publish_time_ts,
            'area': area,
            'city': city,
            'district': '',
        })
    return results
+
+
def extract_detail_html(html, font_maps):
    """Return the cleaned detail-page markup after font de-obfuscation.

    Parses *html*, de-obfuscates/strips it via parse_element, then
    serializes the details container (an empty <div> when absent) through
    a gbk encode/decode round-trip.
    """
    tree = fromstring(html)
    parse_element(tree, font_maps)
    nodes = tree.xpath('//div[@class="details_text"]')
    target = nodes[0] if nodes else Element('div')
    return tostring(target, encoding='gbk').decode('gbk')