8 months ago · 7dc5206a36
--- a/.DS_Store
+++ b/.DS_Store
--- a/gys/aliyun.py
+++ b/gys/aliyun.py
@@ -0,0 +1,29 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-11-01 
			
 
				+---------
			
 
				+@summary:  
			
 
				+---------
			
 
				+@author: Dzr
			
 
				+"""
			
 
				+
			
 
				+from base64 import b64encode
			
 
				+
			
 
				+import requests
			
 
				+
			
 
				+
			
 
				+def file_extract(path):
			
 
				+    b64_str = b64encode(path.read_bytes()).decode('utf8')
			
 
				+    data = {"input": {"image": b64_str}}
			
 
				+    url = 'http://1228910937799386.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/ms_eas_c38f0a8d_f314_463b_875d_581a/invoke'
			
 
				+    headers = {
			
 
				+        'Content-Type': 'application/json',
			
 
				+        'Authorization': 'ZDljNTk5ZTU3M2U3NzQzOGI5NzJhY2Y2OTI1M2I0NWI5NmVhZjljZA=='
			
 
				+    }
			
 
				+    # print(b64_str)
			
 
				+    # data = {"input":{"image":"http://modelscope.oss-cn-beijing.aliyuncs.com/demo/images/image_ocr_recognition.jpg"}}
			
 
				+
			
 
				+    response = requests.post(url, headers=headers, json=data, timeout=10)
			
 
				+    r_json = response.json()
			
 
				+    print(r_json)
			
 
				+    return r_json['Data']['text'][0] if r_json and 'Data' in r_json else ''
			
--- a/gys/clean_html.py
+++ b/gys/clean_html.py
@@ -0,0 +1,177 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+import re
			
 
				+
			
 
				+__all__ = ['cleaner']
			
 
				+
			
 
'''独立元素'''
# Standalone elements. Each table below maps a regex pattern (key) to the
# replacement string (value); cleaner() merges the tables and applies every
# entry with re.sub in dict order.
INDEPENDENT_TAGS = {
    '<head>[\s\S]*?</head>': '',
    '<html>|<html [^>]*>|</html>': '',
    '<body>|<body [^>]*>|</body>': '',
    '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
    '\\xa0|\\u3000': '',  # whitespace characters (U+00A0, U+3000)
    '<!--[\s\S]*?-->': '',  # comments
    '<style[^<>]*>[\s\S]*?</style>': '',  # styles
    '<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    '<input>': '',  # input box
    '</input>': '',  # input box
    '<img[^>]*>': '<br>',  # image, replaced by a line break
}
'''行内元素'''
# Inline elements: the tags are dropped, the text they wrap is kept.
INLINE_TAGS = {
    '<a>|<a [^>]*>|</a>': '',  # hyperlink
    '<link>|<link [^>]*>|</link>': '',  # link element
    '<span>|<span [^>]*>|</span>': '',  # span
    '<label>|<label [^>]*>|</label>': '<br>',  # label
    '<font>|<font [^>]*>|</font>': '',  # font
    'data:image(.*?) ': '',  # base64 image data (up to the next space)
}
'''块级元素'''
# Block-level elements: opening <p>/<div> become <br>, closing tags vanish.
BLOCK_TAGS = {
    '<div>\s*?</div>': '',
    '<h[1-6][^>]*>|</h[1-6]>': '',  # headings
    '<p>|<p [^>]*>': '<br>',  # paragraph
    '</p>': '',  # paragraph
    '<div>|<div [^>]*>': '<br>',  # division
    '</div>': '',  # division
    '<o:p>|<o:p [^>]*>|</o:p>': ''  # Microsoft Office Word paragraph
}
			
 
				+'''其他'''
			
 
				+OTHER = {
			
 
				+    '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
			
 
				+    '<epointform>': '',
			
 
				+    '<!doctype html>|<!doctype html [^>]*>': '',
			
 
				+    '【关闭】|关闭': '',
			
 
				+    '【打印】|打印本页': '',
			
 
				+    '【字体：[\s\S]*】': '',
			
 
				+    '文章来源：[\u4e00-\u9fa5]+': '',
			
 
				+    '浏览次数：.*[<]+': '',
			
 
				+    '（责任编辑：.*?）': '',
			
 
				+    '分享到[：]': '',
			
 
				+}
			
 
'''样式'''
# Presentation attributes removed from the tags that survive cleaning.
CSS_STYLE = {
    'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}
'''空白符'''
# Whitespace normalisation. The rules run in this order; the final '\s+'
# rule also matches newlines, so every whitespace run ends up as one space.
BLANKS = {
    '\n\s*\n': '\n',
    '\s*\n\s*': '\n',
    '[^\S\n]': ' ',
    '\s+': ' ',
}
'''css标签集合'''
# Tag names combined with ATTRS by _repair_tag().
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
'''css属性集合'''
# Attribute names combined with TAGS by _repair_tag().
ATTRS = {'id', 'class', 'style', 'width'}
'''特殊样式的标签'''
# Tags with special styling: compiled pattern -> replacement, applied by
# _clear_special_tag(). The single rule targets tags carrying
# style="display: none" (case-insensitive).
SPECIAL_TAGS = {
    re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): '<br>',
}
			
 
				+
			
 
				+
			
 
				+def _repair_tag():
			
 
				+    """异常的标签组合,用来替换非标准页面的标签"""
			
 
				+    _repairs = {}
			
 
				+    for tag in TAGS:
			
 
				+        for attr in ATTRS:
			
 
				+            key = '{}{}'.format(tag, attr)
			
 
				+            val = '{} {}'.format(tag, attr)
			
 
				+            _repairs[key] = val
			
 
				+    return _repairs
			
 
				+
			
 
				+
			
 
				+def _escape_character(html):
			
 
				+    """转义字符"""
			
 
				+    html = html.replace('&lt;', '<')
			
 
				+    html = html.replace('&gt;', '>')
			
 
				+    html = html.replace('&quot;', '"')
			
 
				+    html = html.replace('&amp;', '&')
			
 
				+    return html
			
 
				+
			
 
				+
			
 
				+def _lowercase_tag(html):
			
 
				+    """元素标签转成小写，不影响页面文本"""
			
 
				+    tags = re.findall("<[^>]+>", html)
			
 
				+    tag_sets = set(tags)
			
 
				+
			
 
				+    if len(tag_sets) > 10000:
			
 
				+        from bs4 import BeautifulSoup
			
 
				+        soup = BeautifulSoup(html, 'lxml')
			
 
				+        html = str(soup.body.next_element)
			
 
				+    else:
			
 
				+        for tag in tag_sets:
			
 
				+            html = html.replace(tag, str(tag).lower())
			
 
				+
			
 
				+    repair_tags = _repair_tag()  # 标签修复
			
 
				+    for err, right in repair_tags.items():
			
 
				+        html = html.replace(err, right)
			
 
				+
			
 
				+    return html
			
 
				+
			
 
				+
			
 
				+def _clear_special_tag(html):
			
 
				+    """删除特殊元素标签"""
			
 
				+    for tag, repl in SPECIAL_TAGS.items():
			
 
				+        html = tag.sub(repl, html)
			
 
				+    return html
			
 
				+
			
 
				+
			
 
				+def _clear_input_tag(html, display=False):
			
 
				+    """提取value值，替换input标签"""
			
 
				+    if not display:
			
 
				+        html = html.replace('<input', '<input style="border-color: transparent;"')  # 不显示输入框边框
			
 
				+
			
 
				+    tag = re.compile(r'<input .*?>', re.S)
			
 
				+    value = re.compile(r'value=["|\'](.*?)["|\']')
			
 
				+
			
 
				+    lst = re.findall(tag, html) or []
			
 
				+    for ipt in lst:
			
 
				+        val = re.findall(value, ipt)
			
 
				+        if val and 'hidden' not in ipt and 'hide' not in ipt and 'display: none' not in ipt:
			
 
				+            html = html.replace(ipt, val[0])
			
 
				+    return html
			
 
				+
			
 
				+
			
 
				+def cleaner(html, special=None, completely=False, del_tag=False, **kwargs):
			
 
				+    """
			
 
				+    源码清洗
			
 
				+
			
 
				+    :param html: 清洗的页面
			
 
				+    :param special: 额外指定页面清洗规则
			
 
				+    :param completely: 是否完全清洗页面
			
 
				+    :param del_tag: 删除标签
			
 
				+    :return: 页面源码
			
 
				+    """
			
 
				+    special = set() if special is None else special
			
 
				+    OTHER.update(special)
			
 
				+    remove_tags = {
			
 
				+        **INDEPENDENT_TAGS,
			
 
				+        **INLINE_TAGS,
			
 
				+        **BLOCK_TAGS,
			
 
				+        **OTHER,
			
 
				+        **CSS_STYLE,
			
 
				+        **BLANKS,
			
 
				+    }
			
 
				+
			
 
				+    html = _lowercase_tag(html)
			
 
				+    if del_tag:
			
 
				+        html = _clear_special_tag(html)
			
 
				+
			
 
				+    for tag, repl in remove_tags.items():
			
 
				+        html = re.sub(tag, repl, html)
			
 
				+
			
 
				+    if completely:
			
 
				+        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # 画布
			
 
				+        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # 内框架
			
 
				+        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
			
 
				+
			
 
				+    html = _escape_character(html)
			
 
				+    html = _clear_input_tag(html, **kwargs)
			
 
				+    return html
			
--- a/gys/db/RedisDB.py
+++ b/gys/db/RedisDB.py
@@ -0,0 +1,85 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-02-27
			
 
				+---------
			
 
				+@summary: redis 去重
			
 
				+---------
			
 
				+@author: Lzz
			
 
				+"""
			
 
				+import hashlib
			
 
				+
			
 
				+import redis
			
 
				+
			
 
				+
			
 
				+class RedisFilter:
			
 
				+
			
 
				+    def __init__(self, url, expire_time=None):
			
 
				+        self.redis_db = redis.StrictRedis.from_url(url)
			
 
				+        self._ex = expire_time or 86400 * 365 * 1  # 1年 = 86400 * 365 * 1
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "<RedisFilter: {}>".format(self.redis_db)
			
 
				+
			
 
				+    def exists(self, key):
			
 
				+        """全量检索"""
			
 
				+        if self.redis_db.exists(key) > 0:
			
 
				+            return True
			
 
				+        return False
			
 
				+
			
 
				+    def add(self, keys):
			
 
				+        """
			
 
				+        添加数据
			
 
				+
			
 
				+        @param keys: 检查关键词在 redis 中是否存在，支持列表批量
			
 
				+        @return: list / 单个值(添加失败返回False, 添加成功返回True)
			
 
				+        """
			
 
				+        is_list = isinstance(keys, list)
			
 
				+        keys = keys if is_list else [keys]
			
 
				+
			
 
				+        is_added = []
			
 
				+        for key in keys:
			
 
				+            pkey = "pylist_" + self.fingerprint(key)
			
 
				+            if not self.exists(pkey):
			
 
				+                is_added.append(self.redis_db.set(pkey, 1, ex=self._ex))
			
 
				+            else:
			
 
				+                is_added.append(False)
			
 
				+
			
 
				+        return is_added if is_list else is_added[0]
			
 
				+
			
 
				+    def get(self, keys):
			
 
				+        """
			
 
				+        检查数据是否存在
			
 
				+        @param keys: list / 单个值
			
 
				+        @return: list / 单个值 （存在返回True 不存在返回False)
			
 
				+        """
			
 
				+        is_list = isinstance(keys, list)
			
 
				+        keys = keys if is_list else [keys]
			
 
				+
			
 
				+        is_exist = []
			
 
				+        for key in keys:
			
 
				+            pkey = "pylist_" + self.fingerprint(key)
			
 
				+            is_exist.append(self.exists(pkey))
			
 
				+
			
 
				+        # 判断数据本身是否重复
			
 
				+        temp_set = set()
			
 
				+        for i, key in enumerate(keys):
			
 
				+            if key in temp_set:
			
 
				+                is_exist[i] = True
			
 
				+            else:
			
 
				+                temp_set.add(key)
			
 
				+
			
 
				+        return is_exist if is_list else is_exist[0]
			
 
				+
			
 
				+    def fingerprint(self, *args):
			
 
				+        """
			
 
				+        @summary: 获取唯一的64位值,获取唯一数据指纹
			
 
				+        ---------
			
 
				+        @param args: 去重数据集合
			
 
				+        ---------
			
 
				+        @result: 5580c91ea29bf5bd963f4c08dfcacd983566e44ecea1735102bc380576fd6f30
			
 
				+        """
			
 
				+        args = sorted(args)
			
 
				+        sha256 = hashlib.sha256()
			
 
				+        for arg in args:
			
 
				+            sha256.update(str(arg).encode())
			
 
				+        return sha256.hexdigest()
			
--- a/gys/db/__init__.py
+++ b/gys/db/__init__.py
@@ -0,0 +1,8 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-11-11 
			
 
				+---------
			
 
				+@summary:  
			
 
				+---------
			
 
				+@author: Dzr
			
 
				+"""
			
--- a/gys/fetch_detail.py
+++ b/gys/fetch_detail.py
@@ -0,0 +1,114 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-11-11 
			
 
				+---------
			
 
				+@summary:  
			
 
				+---------
			
 
				+@author: Dzr
			
 
				+"""
			
 
				+import time
			
 
				+from concurrent.futures import ThreadPoolExecutor
			
 
				+
			
 
				+from bson import Int64
			
 
				+from pymongo import MongoClient
			
 
				+from pymongo.operations import UpdateOne
			
 
				+
			
 
				+import net
			
 
				+import setting
			
 
				+from log import logger
			
 
				+
			
 
				+
			
 
				+def spider(task):
			
 
				+    _id = task.pop('_id')
			
 
				+    url = task['href']
			
 
				+    ret = net.download_detail(url, proxies=net.get_proxy())
			
 
				+    if ret is None:
			
 
				+        logger.error(f'详情数据|下载失败|{url}')
			
 
				+        return _id, None
			
 
				+
			
 
				+    logger.info(f'详情数据|下载成功|{url}')
			
 
				+    data = {
			
 
				+        'site': task['site'],
			
 
				+        'channel': task['channel'],
			
 
				+        'spidercode': task['spidercode'],
			
 
				+        'area': task['area'],
			
 
				+        'city': task['city'],
			
 
				+        'district': task['district'],
			
 
				+        'href': url,
			
 
				+        'title': task['title'],
			
 
				+        's_title': task['title'],
			
 
				+        'contenthtml': ret['contenthtml'],
			
 
				+        'detail': ret['detail'],
			
 
				+        'publishtime': task['publishtime'],
			
 
				+        'l_np_publishtime': Int64(task['l_np_publishtime']),
			
 
				+        'comeintime': Int64(int(time.time())),
			
 
				+        'T': task['T'],
			
 
				+        'infoformat': task['infoformat'],
			
 
				+        'sendflag': task['sendflag'],
			
 
				+        'iscompete': task['iscompete'],
			
 
				+        '_d': task['_d'],
			
 
				+        'publishdept': task['publishdept'],
			
 
				+        'type': task['type'],
			
 
				+        'is_mixed': task['is_mixed'],
			
 
				+    }
			
 
				+    return _id, data
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    while True:
			
 
				+        client = MongoClient(setting.MONGO_HOST, setting.MONGO_PORT)
			
 
				+        to_lst_coll = client[setting.MONGO_DB][setting.MONGO_LIST_COLL]
			
 
				+        to_data_coll = client[setting.MONGO_DB][setting.MONGO_DATA_COLL]
			
 
				+
			
 
				+        data_count = 0
			
 
				+        fail_count = 0
			
 
				+        updates = []
			
 
				+        inserts = []
			
 
				+        q = {'isdownload': None}
			
 
				+        with to_lst_coll.find(q, limit=100) as cursor:
			
 
				+            with ThreadPoolExecutor(max_workers=4) as executor:
			
 
				+                fs = executor.map(spider, cursor)
			
 
				+                for f in fs:
			
 
				+                    _id, result = f
			
 
				+                    condition = {'_id': _id}
			
 
				+                    if result is None:
			
 
				+                        item = {'isdownload': 1, 'isfailed': 1}
			
 
				+                        fail_count += 1
			
 
				+                    else:
			
 
				+                        item = {'isdownload': 1, 'isfailed': 0}
			
 
				+                        inserts.append(result)
			
 
				+                        data_count += 1
			
 
				+
			
 
				+                    updates.append(UpdateOne(condition, {'$set': item}))
			
 
				+
			
 
				+                    if len(inserts) == 10:
			
 
				+                        to_data_coll.insert_many(inserts, ordered=False)
			
 
				+                        logger.info(f'详情数据|数据下载|成功{len(inserts)}条')
			
 
				+                        inserts = []
			
 
				+
			
 
				+                    if len(updates) == 10:
			
 
				+                        to_lst_coll.bulk_write(updates, ordered=False)
			
 
				+                        logger.info(f'详情数据|更新状态|完成{len(updates)}条')
			
 
				+                        updates = []
			
 
				+
			
 
				+                if len(inserts) > 0:
			
 
				+                    to_data_coll.insert_many(inserts, ordered=False)
			
 
				+                    logger.info(f'详情数据|数据下载|成功{len(inserts)}条')
			
 
				+
			
 
				+                if len(updates) > 0:
			
 
				+                    to_lst_coll.bulk_write(updates, ordered=False)
			
 
				+                    logger.info(f'详情数据|更新状态|完成{len(updates)}条')
			
 
				+
			
 
				+        logger.info(f'详情数据|数据下载|10s后执行...')
			
 
				+        time.sleep(10)
			
 
				+
			
 
				+
			
 
# Script entry point: run the detail-download loop until it is interrupted.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Manual stop (Ctrl-C): exit quietly, no alert.
        pass

    except Exception as e:
        # Any other failure: send the WeChat warning first, then log the traceback.
        net.send_wechat_warning('详情采集被中止')
        logger.exception(e)
			
--- a/gys/fetch_list.py
+++ b/gys/fetch_list.py
@@ -0,0 +1,106 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-11-11 
			
 
				+---------
			
 
				+@summary:  
			
 
				+---------
			
 
				+@author: Dzr
			
 
				+"""
			
 
				+import time
			
 
				+from concurrent.futures import ThreadPoolExecutor
			
 
				+
			
 
				+from bson import Int64
			
 
				+from pymongo import MongoClient
			
 
				+
			
 
				+import net
			
 
				+import setting
			
 
				+from db.RedisDB import RedisFilter
			
 
				+from log import logger
			
 
				+
			
 
				+
			
 
				+def spider(url):
			
 
				+    items = net.download_list(url, proxies=net.get_proxy())
			
 
				+    if not items:
			
 
				+        logger.error(f'列表数据|下载失败|{url}')
			
 
				+        return
			
 
				+
			
 
				+    logger.info(f'列表数据|下载成功|{url}')
			
 
				+
			
 
				+    results = []
			
 
				+    for item in items:
			
 
				+        results.append({
			
 
				+            'site': '供应商网',
			
 
				+            'channel': '最新采购',
			
 
				+            'spidercode': 'a_gysw_zxcg',
			
 
				+            'area': item['area'],
			
 
				+            'city': item['city'],
			
 
				+            'district': item['district'],
			
 
				+            'href': item['href'],
			
 
				+            'title': item['title'],
			
 
				+            'publishtime': item['publishtime'],
			
 
				+            'l_np_publishtime': Int64(item['l_np_publishtime']),
			
 
				+            'comeintime': Int64(int(time.time())),
			
 
				+            'T': 'bidding',
			
 
				+            'infoformat': 1,
			
 
				+            'sendflag': 'false',
			
 
				+            'iscompete': True,
			
 
				+            '_d': 'comeintime',
			
 
				+            'publishdept': '',
			
 
				+            'type': '',
			
 
				+            'is_mixed': True,
			
 
				+        })
			
 
				+
			
 
				+    return results
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    while True:
			
 
				+        client = MongoClient(setting.MONGO_HOST, setting.MONGO_PORT)
			
 
				+        to_coll = client[setting.MONGO_DB][setting.MONGO_LIST_COLL]
			
 
				+        to_dedup = RedisFilter(url=setting.REDIS_URL)
			
 
				+
			
 
				+        urls = (f'https://www.gys.cn/buy/purchase/{i}.html' for i in range(1, 101))
			
 
				+        with ThreadPoolExecutor(max_workers=4) as executor:
			
 
				+            fs = executor.map(spider, urls)
			
 
				+            for f in fs:
			
 
				+                items = f or []
			
 
				+
			
 
				+                data_count = 0
			
 
				+                dedupe_count = 0
			
 
				+                unique = []
			
 
				+                inserts = []
			
 
				+                for item in items:
			
 
				+                    href = item['href']
			
 
				+                    if not to_dedup.get(href):
			
 
				+                        inserts.append(item)
			
 
				+                        unique.append(href)
			
 
				+                    else:
			
 
				+                        dedupe_count += 1
			
 
				+
			
 
				+                    if len(inserts) == 50:
			
 
				+                        to_coll.insert_many(inserts, ordered=False)
			
 
				+                        to_dedup.add(unique)
			
 
				+                        data_count += len(inserts)
			
 
				+                        inserts = []
			
 
				+                        unique = []
			
 
				+
			
 
				+                if len(inserts) > 0:
			
 
				+                    to_coll.insert_many(inserts, ordered=False)
			
 
				+                    to_dedup.add(unique)
			
 
				+                    data_count += len(inserts)
			
 
				+
			
 
				+                logger.info(f'列表数据|数据处理|重复{dedupe_count}条|入库{data_count}条')
			
 
				+
			
 
				+        logger.info(f'列表数据|数据下载|10m后执行...')
			
 
				+        time.sleep(600)
			
 
				+
			
 
				+
			
 
# Script entry point: run the list-crawl loop until it is interrupted.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Manual stop (Ctrl-C): exit quietly, no alert.
        pass

    except Exception as e:
        # Any other failure: send the WeChat warning first, then log the traceback.
        net.send_wechat_warning('列表采集被中止')
        logger.exception(e)
			
--- a/gys/font_tool.py
+++ b/gys/font_tool.py
@@ -0,0 +1,260 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-10-31 
			
 
				+---------
			
 
				+@summary:  解析图片文本
			
 
				+---------
			
 
				+@author: Dzr
			
 
				+"""
			
 
				+import io
			
 
				+import pathlib
			
 
				+import random
			
 
				+import re
			
 
				+import string
			
 
				+from pathlib import Path
			
 
				+from urllib.request import urlretrieve
			
 
				+
			
 
				+import numpy as np
			
 
				+from PIL import Image, ImageOps
			
 
				+from ddddocr import DdddOcr
			
 
				+from fontTools.misc.transform import Offset
			
 
				+from fontTools.pens.freetypePen import FreeTypePen  # pip install freetype-py
			
 
				+from fontTools.ttLib import TTFont
			
 
				+
			
 
# Working directories, created at import time next to this module:
#   <module dir>/cache        - parent directory
#   <module dir>/cache/font   - downloaded font files
#   <module dir>/cache/image  - rendered glyph images
_root = Path(__file__).parent
_cache_dir = _root.joinpath('cache')
_cache_dir.mkdir(exist_ok=True)

_font_dir = _cache_dir.joinpath('font')
_font_dir.mkdir(exist_ok=True)

_image_dir = _cache_dir.joinpath('image')
_image_dir.mkdir(exist_ok=True)
			
 
				+
			
 
				+
			
 
				+def get_random(length=4):
			
 
				+    return ''.join(random.sample(string.ascii_letters + string.digits, length))
			
 
				+
			
 
				+
			
 
				+def parse_font_url(html):
			
 
				+    result = re.search(r"'icomoon';src:url\('(.*?)'\)", html, re.S)
			
 
				+    if result is None:
			
 
				+        raise ValueError(f'字体库 url "{result}" ')
			
 
				+
			
 
				+    return result.group(1)
			
 
				+
			
 
				+
			
 
				+def create_file(filename):
			
 
				+    file = _font_dir.joinpath(filename)
			
 
				+    file.touch(exist_ok=True)
			
 
				+    return file
			
 
				+
			
 
				+
			
 
				+def download_font(html, font_type='ttf', to_local=False):
			
 
				+    filename = f'{get_random(6)}.{font_type}'
			
 
				+    tmp = create_file(filename)
			
 
				+    url = parse_font_url(html)
			
 
				+    urlretrieve(url, filename=tmp)
			
 
				+    if not to_local:
			
 
				+        file_bytes = tmp.read_bytes()
			
 
				+        tmp.unlink(missing_ok=True)
			
 
				+        tmp = file_bytes
			
 
				+    return tmp
			
 
				+
			
 
				+
			
 
				+def image_to_bytes(image, filetype='JPEG'):
			
 
				+    byte_stream = io.BytesIO()
			
 
				+    image.save(byte_stream, format=filetype)
			
 
				+    byte_array = byte_stream.getvalue()
			
 
				+    return byte_array
			
 
				+
			
 
				+
			
 
				+class ImageToText:
			
 
				+    def __init__(self, file, cache=False, ocr=False, callback=None, image_scale=5, auto_delete=True):
			
 
				+        """
			
 
				+
			
 
				+        @param file: 字体文件
			
 
				+        @param cache: 缓存字体图片到本地磁盘
			
 
				+        @param ocr: 图片识别启用Ocr
			
 
				+        @param image_scale: 图片缩放倍数
			
 
				+        @param callback: 图片文本识别处理的回调函数
			
 
				+        @param auto_delete: 自动清除字体图片
			
 
				+        """
			
 
				+        if not isinstance(file, (bytes, str, pathlib.PurePath)):
			
 
				+            raise TypeError("未知文件类型")
			
 
				+
			
 
				+        if isinstance(file, bytes):
			
 
				+            self._font = TTFont(io.BytesIO(file))
			
 
				+        elif isinstance(file, str):
			
 
				+            self._font = TTFont(file)
			
 
				+        else:
			
 
				+            assert isinstance(file, pathlib.PurePath)
			
 
				+            self._font = TTFont(file)
			
 
				+
			
 
				+        # 字体图片映射关系
			
 
				+        self._font_maps = {}
			
 
				+        self._image_scale = image_scale
			
 
				+
			
 
				+        # 缓存
			
 
				+        self._cache_images = {}
			
 
				+        self._to_local = cache
			
 
				+        self._auto_delete = False if cache is True else auto_delete
			
 
				+
			
 
				+        # Ocr
			
 
				+        self._callback = None
			
 
				+        self._enable_ocr = ocr
			
 
				+        if ocr is True:
			
 
				+            if callback is not None and callable(callback):
			
 
				+                self._callback = callback
			
 
				+            else:
			
 
				+                ddddocr = DdddOcr(beta=False, old=True, show_ad=False)
			
 
				+
			
 
				+                def _classification(files):
			
 
				+                    if isinstance(files, tuple):
			
 
				+                        img = files[1]
			
 
				+                    else:
			
 
				+                        img = files
			
 
				+
			
 
				+                    return ddddocr.classification(img)
			
 
				+
			
 
				+                self._callback = _classification
			
 
				+
			
 
				+    def to_xml(self):
			
 
				+        filename = self._font.reader.file.name
			
 
				+        font_f = Path(filename).with_suffix('.xml')
			
 
				+        self._font.saveXML(font_f)
			
 
				+
			
 
				+    @property
			
 
				+    def font_maps(self):
			
 
				+        return self._font_maps
			
 
				+
			
 
				+    def parse_font(self):
			
 
				+        self._font_encode()
			
 
				+
			
 
				+        if self._enable_ocr:
			
 
				+            self._font_draw()
			
 
				+            self._font_ocr()
			
 
				+
			
 
				+    def _font_encode(self):
			
 
				+        for unicode, name in self._font.getBestCmap().items():
			
 
				+            code = f'&#{str(hex(unicode))[1:]}'  # 0x100c4 => &#x100c4
			
 
				+            glyph = {'name': name, 'code': hex(unicode), 'zh': ''}
			
 
				+            self._font_maps[code] = glyph
			
 
				+            # print(code, glyph)
			
 
				+
			
 
				+    def _font_draw(self):
			
 
				+        glyph_set = self._font.getGlyphSet()
			
 
				+        for code, glyph_dict in self._font_maps.items():
			
 
				+            # print(code, glyph_dict)
			
 
				+            glyph = glyph_set[glyph_dict['name']]  # 获取字形
			
 
				+
			
 
				+            pen = FreeTypePen(None)  # 创建变换笔(FreeTypePen)实例,绘制字形
			
 
				+            glyph.draw(pen)  # 绘制字形
			
 
				+
			
 
				+            # 获取字形的宽度，以及从字体文件的 OS/2 表中获取推荐的上升高度和下降高度,确定图像的高度
			
 
				+            width, ascender, descender = (
			
 
				+                glyph.width,
			
 
				+                self._font['OS/2'].usWinAscent,
			
 
				+                -self._font['OS/2'].usWinDescent,
			
 
				+            )
			
 
				+            height = ascender - descender
			
 
				+
			
 
				+            # 创建图像并转换为数组
			
 
				+            single_font_image = pen.array(
			
 
				+                width=width,
			
 
				+                height=height,
			
 
				+                transform=Offset(0, -descender),
			
 
				+                contain=False,
			
 
				+                evenOdd=False,
			
 
				+            )
			
 
				+
			
 
				+            # 转换为灰度图像数组
			
 
				+            single_font_image = np.array(single_font_image) * 255
			
 
				+            # 反转颜色(使得黑色变为白色，白色变为黑色)
			
 
				+            single_font_image = 255 - single_font_image
			
 
				+
			
 
				+            # 创建 PIL 图像对象
			
 
				+            single_font_image = Image.fromarray(single_font_image)
			
 
				+            # 转换为灰度模式
			
 
				+            single_font_image = single_font_image.convert("L")
			
 
				+            # 图片添加边框
			
 
				+            single_font_image = ImageOps.expand(single_font_image, border=6, fill=255)
			
 
				+
			
 
				+            # 计算新的宽度和高度
			
 
				+            new_width = single_font_image.width // self._image_scale
			
 
				+            new_height = single_font_image.height // self._image_scale
			
 
				+
			
 
				+            # 调整图片大小
			
 
				+            single_font_image = single_font_image.resize(
			
 
				+                (new_width, new_height),
			
 
				+                resample=Image.Resampling.LANCZOS
			
 
				+            )
			
 
				+
			
 
				+            image_name = f'{glyph_dict["code"]}.jpg'
			
 
				+            if not self._to_local:
			
 
				+                image_bytes = image_to_bytes(single_font_image)
			
 
				+                self._cache_images[code] = (image_name, image_bytes, 'jpg')
			
 
				+            else:
			
 
				+                single_font_image.save(_image_dir.joinpath(image_name))  # 保存图像
			
 
				+
			
 
				+    def _font_ocr(self):
			
 
				+        for code, glyph_dict in dict(self._font_maps).items():
			
 
				+            if not self._to_local:
			
 
				+                files = self._cache_images[code]
			
 
				+                text = self._callback(files)
			
 
				+            else:
			
 
				+                files = _image_dir.joinpath(f'{glyph_dict["code"]}.jpg')
			
 
				+                text = self._callback(files)
			
 
				+
			
 
				+            self._font_maps[code]['zh'] = text
			
 
				+
			
 
				+    def __contains__(self, key):
			
 
				+        return key in self._font_maps
			
 
				+
			
 
				+    def __getitem__(self, key):
			
 
				+        if key in self._font_maps:
			
 
				+            return self._font_maps[key]
			
 
				+        else:
			
 
				+            raise KeyError(key)
			
 
				+
			
 
				+    def get(self, key, default=None):
			
 
				+        try:
			
 
				+            return self.__getitem__(key)
			
 
				+        except KeyError:
			
 
				+            return default
			
 
				+
			
 
				+    def __enter__(self):
			
 
				+        return self
			
 
				+
			
 
				+    def __exit__(self, exc_type, exc_val, exc_tb):
			
 
				+        self.__del__()
			
 
				+        return
			
 
				+
			
 
				+    def _del(self, missing_ok=False):
			
 
				+        if self._auto_delete:
			
 
				+            for img_f in _image_dir.iterdir():
			
 
				+                img_f.unlink(missing_ok=True)
			
 
				+
			
 
				+            for font_f in _font_dir.iterdir():
			
 
				+                font_f.unlink(missing_ok=True)
			
 
				+
			
 
				+            try:
			
 
				+                # _image_dir.rmdir()
			
 
				+                # _font_dir.rmdir()
			
 
				+                _cache_dir.rmdir()
			
 
				+            except OSError as e:
			
 
				+                if not missing_ok:
			
 
				+                    raise e
			
 
				+
			
 
				+    def __del__(self):
			
 
				+        self._del(missing_ok=True)
			
 
				+
			
 
				+
			
 
# Alias: FontTranslator is the same class object as ImageToText.
FontTranslator = ImageToText
			
 
				+
			
 
				+
			
 
				+def parse_font(font_file, *, ocr=False, ocr_extract=None, **kwargs):
			
 
				+    ocr = True if ocr_extract is not None and callable(ocr_extract) else ocr
			
 
				+    translator = ImageToText(font_file, ocr=ocr, callback=ocr_extract, **kwargs)
			
 
				+    translator.parse_font()
			
 
				+    return translator
			
--- a/gys/proto/ocr.proto
+++ b/gys/proto/ocr.proto
@@ -0,0 +1,15 @@
 
syntax = "proto3";
// OCR image-to-text service.
package proto;

// Request: the image to recognise, as raw bytes.
message OcrRequest {
  bytes image = 1;
}

// Response: a single string; presumably the recognised text - confirm with
// the service implementation.
message OcrResponse {
    string message = 1;
}

// Unary RPC: one OcrRequest in, one OcrResponse out.
service Ocr {
  rpc Ocr(OcrRequest) returns (OcrResponse);
}
			
--- a/gys/proto/ocr_pb2.py
+++ b/gys/proto/ocr_pb2.py
@@ -0,0 +1,131 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+# Generated by the protocol buffer compiler.  DO NOT EDIT!
			
 
				+# source: proto/ocr.proto
			
 
				+
			
 
				+from google.protobuf import descriptor as _descriptor
			
 
				+from google.protobuf import message as _message
			
 
				+from google.protobuf import reflection as _reflection
			
 
				+from google.protobuf import symbol_database as _symbol_database
			
 
				+# @@protoc_insertion_point(imports)
			
 
				+
			
 
# Default protobuf symbol database; the descriptors and message classes of
# this module are registered in it further down.
_sym_db = _symbol_database.Default()




# File descriptor for proto/ocr.proto (package 'proto', proto3 syntax).
DESCRIPTOR = _descriptor.FileDescriptor(
  name='proto/ocr.proto',
  package='proto',
  syntax='proto3',
  serialized_options=None,
  serialized_pb=b'\n\x0fproto/ocr.proto\x12\x05proto\"\x1b\n\nOcrRequest\x12\r\n\x05image\x18\x01 \x01(\x0c\"\x1e\n\x0bOcrResponse\x12\x0f\n\x07message\x18\x01 \x01(\t23\n\x03Ocr\x12,\n\x03Ocr\x12\x11.proto.OcrRequest\x1a\x12.proto.OcrResponseb\x06proto3'
)
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
# Descriptor for message proto.OcrRequest: one field, 'image' (field number 1).
_OCRREQUEST = _descriptor.Descriptor(
  name='OcrRequest',
  full_name='proto.OcrRequest',
  filename=None,
  file=DESCRIPTOR,
  containing_type=None,
  fields=[
    _descriptor.FieldDescriptor(
      name='image', full_name='proto.OcrRequest.image', index=0,
      number=1, type=12, cpp_type=9, label=1,
      has_default_value=False, default_value=b"",
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      serialized_options=None, file=DESCRIPTOR),
  ],
  extensions=[
  ],
  nested_types=[],
  enum_types=[
  ],
  serialized_options=None,
  is_extendable=False,
  syntax='proto3',
  extension_ranges=[],
  oneofs=[
  ],
  serialized_start=26,
  serialized_end=53,
)
			
 
				+
			
 
				+
			
 
# Descriptor for message proto.OcrResponse: one field, 'message' (field number 1).
_OCRRESPONSE = _descriptor.Descriptor(
  name='OcrResponse',
  full_name='proto.OcrResponse',
  filename=None,
  file=DESCRIPTOR,
  containing_type=None,
  fields=[
    _descriptor.FieldDescriptor(
      name='message', full_name='proto.OcrResponse.message', index=0,
      number=1, type=9, cpp_type=9, label=1,
      has_default_value=False, default_value=b"".decode('utf-8'),
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      serialized_options=None, file=DESCRIPTOR),
  ],
  extensions=[
  ],
  nested_types=[],
  enum_types=[
  ],
  serialized_options=None,
  is_extendable=False,
  syntax='proto3',
  extension_ranges=[],
  oneofs=[
  ],
  serialized_start=55,
  serialized_end=85,
)
			
 
				+
			
 
# Make both message descriptors reachable by name from the file descriptor,
# then register the file descriptor with the symbol database.
DESCRIPTOR.message_types_by_name['OcrRequest'] = _OCRREQUEST
DESCRIPTOR.message_types_by_name['OcrResponse'] = _OCRRESPONSE
_sym_db.RegisterFileDescriptor(DESCRIPTOR)

# Concrete message class built from the OcrRequest descriptor.
OcrRequest = _reflection.GeneratedProtocolMessageType('OcrRequest', (_message.Message,), {
  'DESCRIPTOR' : _OCRREQUEST,
  '__module__' : 'proto.ocr_pb2'
  # @@protoc_insertion_point(class_scope:proto.OcrRequest)
  })
_sym_db.RegisterMessage(OcrRequest)

# Concrete message class built from the OcrResponse descriptor.
OcrResponse = _reflection.GeneratedProtocolMessageType('OcrResponse', (_message.Message,), {
  'DESCRIPTOR' : _OCRRESPONSE,
  '__module__' : 'proto.ocr_pb2'
  # @@protoc_insertion_point(class_scope:proto.OcrResponse)
  })
_sym_db.RegisterMessage(OcrResponse)
			
 
				+
			
 
				+
			
 
				+
			
 
# Descriptor for service proto.Ocr with its single method 'Ocr'
# (input OcrRequest, output OcrResponse).
_OCR = _descriptor.ServiceDescriptor(
  name='Ocr',
  full_name='proto.Ocr',
  file=DESCRIPTOR,
  index=0,
  serialized_options=None,
  serialized_start=87,
  serialized_end=138,
  methods=[
  _descriptor.MethodDescriptor(
    name='Ocr',
    full_name='proto.Ocr.Ocr',
    index=0,
    containing_service=None,
    input_type=_OCRREQUEST,
    output_type=_OCRRESPONSE,
    serialized_options=None,
  ),
])
_sym_db.RegisterServiceDescriptor(_OCR)

# Make the service descriptor reachable by name from the file descriptor.
DESCRIPTOR.services_by_name['Ocr'] = _OCR
			
 
				+
			
 
				+# @@protoc_insertion_point(module_scope)
			
--- a/gys/proto/ocr_pb2_grpc.py
+++ b/gys/proto/ocr_pb2_grpc.py
@@ -0,0 +1,64 @@
 
				+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
			
 
				+import grpc
			
 
				+
			
 
				+from proto import ocr_pb2 as proto_dot_ocr__pb2
			
 
				+
			
 
				+
			
 
class OcrStub(object):
    """Client-side stub for the ``proto.Ocr`` service."""

    def __init__(self, channel):
        """Constructor.

        Args:
            channel: A grpc.Channel.
        """
        # Unary-unary callable for '/proto.Ocr/Ocr': serialises an OcrRequest
        # and deserialises the reply into an OcrResponse.
        self.Ocr = channel.unary_unary(
                '/proto.Ocr/Ocr',
                request_serializer=proto_dot_ocr__pb2.OcrRequest.SerializeToString,
                response_deserializer=proto_dot_ocr__pb2.OcrResponse.FromString,
                )
			
 
				+
			
 
				+
			
 
class OcrServicer(object):
    """Server-side base class for the ``proto.Ocr`` service."""

    def Ocr(self, request, context):
        """Default handler: marks the call UNIMPLEMENTED and raises.

        Raises:
            NotImplementedError: Always, after setting the status code and
                details on *context*.
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
			
 
				+
			
 
				+
			
 
def add_OcrServicer_to_server(servicer, server):
    """Register *servicer*'s ``Ocr`` method on *server* under 'proto.Ocr'.

    Args:
        servicer: Object providing an ``Ocr(request, context)`` method.
        server: Object providing ``add_generic_rpc_handlers``.
    """
    # Map the RPC name to a unary-unary handler that decodes OcrRequest
    # and encodes OcrResponse.
    rpc_method_handlers = {
            'Ocr': grpc.unary_unary_rpc_method_handler(
                    servicer.Ocr,
                    request_deserializer=proto_dot_ocr__pb2.OcrRequest.FromString,
                    response_serializer=proto_dot_ocr__pb2.OcrResponse.SerializeToString,
            ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
            'proto.Ocr', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
			
 
				+
			
 
				+
			
 
# This class is part of an EXPERIMENTAL API.
class Ocr(object):
    """Namespace for calling the ``proto.Ocr`` service without building a stub."""

    @staticmethod
    def Ocr(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        """Invoke '/proto.Ocr/Ocr' on *target* via ``grpc.experimental.unary_unary``.

        All arguments are forwarded unchanged; the request is serialised as
        OcrRequest and the reply deserialised as OcrResponse.
        """
        return grpc.experimental.unary_unary(request, target, '/proto.Ocr/Ocr',
            proto_dot_ocr__pb2.OcrRequest.SerializeToString,
            proto_dot_ocr__pb2.OcrResponse.FromString,
            options, channel_credentials,
            call_credentials, compression, wait_for_ready, timeout, metadata)
			
--- a/gys/setting.py
+++ b/gys/setting.py
@@ -0,0 +1,30 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-11-08 
			
 
				+---------
			
 
				+@summary:  
			
 
				+---------
			
 
				+@author: Dzr
			
 
				+"""
			
 
				+
			
 
# NOTE(review): the Redis password is embedded in this URL and committed to
# source control; consider loading it from the environment or a secret store.
REDIS_URL = 'redis://:k5ZJR5KV4q7DRZ92DQ@172.17.162.34:8361/0'

# MongoDB connection and collection names.
MONGO_HOST = '172.17.4.87'
MONGO_PORT = 27080
MONGO_DB = 'py_spider'
MONGO_LIST_COLL = 'gys_list'
MONGO_DATA_COLL = 'data_bak'

# Third-party OCR
DG_OCR_A2S_ADDRESS = '172.17.4.188:9090'
DG_OCR_TOPIC = 'dg_ocr'
DG_OCR_TIMEOUT = 20

# Jianyu OCR
JY_OCR_A2S_ADDRESS = '172.17.4.188:9090'
JY_OCR_A2S_TIMEOUT = 60
JY_OCR_TOPIC = 'spider_ocr'

# Jianyu API OCR
JY_API_OCR_ADDRESS = '172.17.208.114:8089'
JY_API_OCR_TIMEOUT = 20
			
--- a/gys/utils.py
+++ b/gys/utils.py
@@ -0,0 +1,176 @@
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on 2024-10-30 
			
 
				+---------
			
 
				+@summary:  
			
 
				+---------
			
 
				+@author: Dzr
			
 
				+"""
			
 
				+import datetime
			
 
				+import functools
			
 
				+import re
			
 
				+import time
			
 
				+
			
 
				+from lxml.html import Element, HtmlElement, fromstring, tostring
			
 
				+
			
 
				+
			
 
				+def run_time(fun):
			
 
				+    @functools.wraps(fun)
			
 
				+    def wrapper(*args, **kwargs):
			
 
				+        time_start = time.time()
			
 
				+        result = fun(*args, **kwargs)
			
 
				+        time_end = time.time()
			
 
				+        print(time_end - time_start)
			
 
				+        return result
			
 
				+    return wrapper
			
 
				+
			
 
				+
			
 
				+def is_chinese_character(char):
			
 
				+    # Unicode范围：汉字基本区（4E00-9FFF），扩展A区（3400-4DBF），扩展B区（20000-2A6DF），扩展C区（2A700-2B73F），扩展D区（2B740-2B81F），扩展E区（2B820-2CEAF），扩展F区（2CEB0-2EBEF），扩展G区（30000-3134F）
			
 
				+    # regex = re.compile(r'^[\u4e00-\u9fff\u3400-\u4dbf\u20000-\u2a6df\u2a700-\u2b73f\u2b740-\u2b81f\u2b820-\u2ceaf\u2ceb0-\u2ebef\u30000-\u3134f]$')
			
 
				+    # 排除数字汉字（4E00-4E9F）
			
 
				+    regex = re.compile(r'^[\u4ea0-\u9fff\u3400-\u4dbf\u20000-\u2a6df\u2a700-\u2b73f\u2b740-\u2b81f\u2b820-\u2ceaf\u2ceb0-\u2ebef\u30000-\u3134f]$')
			
 
				+    return bool(regex.match(char))
			
 
				+
			
 
				+
			
 
				+def is_specific_number_chinese_character(char):
			
 
				+    # Unicode范围：特定的数字汉字：零,一,二,三,四...
			
 
				+    regex = re.compile(r'^[\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u96f6]$')
			
 
				+    return bool(regex.match(char))
			
 
				+
			
 
				+
			
 
				+def is_en(char):
			
 
				+    # Unicode范围：英文字母（大写和小写）
			
 
				+    regex = re.compile(r'^[A-Za-z]$')
			
 
				+    return bool(regex.match(char))
			
 
				+
			
 
				+
			
 
				+def is_digit(char):
			
 
				+    # Unicode范围：阿拉伯数字
			
 
				+    regex = re.compile(r'^[1-9]$')
			
 
				+    return bool(regex.match(char))
			
 
				+
			
 
				+
			
 
				+def is_zero_or_o(char):
			
 
				+    # 正则表达式匹配 '0', 'o' 或 'O'
			
 
				+    regex = re.compile(r'^[0oO]$')
			
 
				+    return bool(regex.match(char))
			
 
				+
			
 
				+
			
 
				+def replace_element(old_tag: HtmlElement, new_tag: HtmlElement):
			
 
				+    old_tag.getparent().replace(old_tag, new_tag)
			
 
				+
			
 
				+
			
 
				+def create_element(tag, attrib, text=None):
			
 
				+    element = Element(tag, **attrib)
			
 
				+    if text is not None:
			
 
				+        element.text = text
			
 
				+    return element
			
 
				+
			
 
				+
			
 
				+def drop_element(tag: HtmlElement, feature: str):
			
 
				+    element = tag.xpath(feature)
			
 
				+    element = next(iter(element or []), None)
			
 
				+    if element is not None:
			
 
				+        element.drop_tree()
			
 
				+
			
 
				+
			
 
				+def translate(val, font_maps):
			
 
				+    characters = val.split(";")
			
 
				+
			
 
				+    ret = ""
			
 
				+    characters = list(filter(lambda x: x.strip() != '', characters))
			
 
				+    for character in characters:
			
 
				+        ret += font_maps[f'&#{hex(int(character[2:]))[1:]}']['zh']
			
 
				+    return ret
			
 
				+
			
 
				+
			
 
				+def translate_text(tag: HtmlElement, font_maps, pattern):
			
 
				+    original_element_str = tostring(tag, encoding='gbk').decode('gbk')
			
 
				+    origin_text = re.findall(pattern, original_element_str, flags=re.S)  # 正则抽取保证文本不自动解码
			
 
				+    origin_text = next(iter(origin_text or []), '')
			
 
				+    if not origin_text:
			
 
				+        return origin_text
			
 
				+
			
 
				+    target_text = translate(origin_text, font_maps)
			
 
				+    # print(origin_text, ' <= ', target_text)
			
 
				+    return target_text
			
 
				+
			
 
				+
			
 
				+def translate_element_text(tag: HtmlElement, font_maps, feature, pattern):
			
 
				+    # 抽取原始文本
			
 
				+    original_element = next(iter(tag.xpath(feature) or []), None)
			
 
				+    if original_element is None:
			
 
				+        raise ValueError(f'{original_element}')
			
 
				+
			
 
				+    tag_name = original_element.tag
			
 
				+    attrib = original_element.attrib
			
 
				+    target_text = translate_text(original_element, font_maps, pattern)
			
 
				+
			
 
				+    # 创建新元素
			
 
				+    new_element = create_element(tag_name, attrib, text=target_text)
			
 
				+    # 替换旧元素
			
 
				+    replace_element(original_element, new_element)
			
 
				+
			
 
				+
			
 
				+def parse_element(tag: HtmlElement, font_maps):
			
 
				+    # 字体混淆反解析 - 标题
			
 
				+    translate_element_text(
			
 
				+        tag,
			
 
				+        font_maps,
			
 
				+        '//div[@class="bw_140 secret"]',
			
 
				+        r'<div.*>(.*?)</div>'
			
 
				+    )
			
 
				+
			
 
				+    # 字体混淆反解析 - 说明
			
 
				+    translate_element_text(
			
 
				+        tag,
			
 
				+        font_maps,
			
 
				+        '//td[@class="secret"]/div',
			
 
				+        r'<div.*>(.*?)</div>'
			
 
				+    )
			
 
				+
			
 
				+    # 删除源码中敏感数据
			
 
				+    drop_element(tag, '//div[@class="details_txt"]')
			
 
				+
			
 
				+
			
 
				+def extract_list(html, font_maps):
			
 
				+    results = []
			
 
				+    tree = fromstring(html)
			
 
				+    for li_tag in tree.xpath('//ul[@class="industry_ul"]/li'):
			
 
				+        elem1 = next(iter(li_tag.xpath('./div[@class="industry_left"]') or []), Element('div'))
			
 
				+        p_tag = next(iter(elem1.xpath('./p[@class="tt"]/text()') or []), '全国').replace('收货地：', '')
			
 
				+        args = str(p_tag).split()
			
 
				+        if len(args) == 2:
			
 
				+            area, city = args
			
 
				+        else:
			
 
				+            city = ''
			
 
				+            area = args[0]
			
 
				+
			
 
				+        elem2 = next(iter(li_tag.xpath('./div[@class="industry_cc"]') or []), Element('div'))
			
 
				+        a_tag = next(iter(elem2.xpath('.//h3[@class="secret"]/a') or []), Element('a'))
			
 
				+        title = translate_text(a_tag, font_maps, r'<a.*>(.*?)</a>')
			
 
				+        href = a_tag.get('href')
			
 
				+        publish_time = next(iter(elem2.xpath('./div/span/text()') or []), '').replace('发布时间：', '')
			
 
				+        publish_time_ts = datetime.datetime.strptime(publish_time, '%Y-%m-%d').timestamp()
			
 
				+        # print(f'{title} {href} {publish_time}')
			
 
				+        results.append({
			
 
				+            'title': title,
			
 
				+            'href': href,
			
 
				+            'publishtime': publish_time,
			
 
				+            'l_np_publishtime': publish_time_ts,
			
 
				+            'area': area,
			
 
				+            'city': city,
			
 
				+            'district': '',
			
 
				+        })
			
 
				+    return results
			
 
				+
			
 
				+
			
 
				+def extract_detail_html(html, font_maps):
			
 
				+    tree = fromstring(html)
			
 
				+    parse_element(tree, font_maps)
			
 
				+    element = tree.xpath('//div[@class="details_text"]')
			
 
				+    element = next(iter(element or []), Element('div'))
			
 
				+    source = tostring(element, encoding='gbk').decode('gbk')
			
 
				+    # print(source)
			
 
				+    return source