# -*- coding: utf-8 -*-
"""
Created on 2024-10-30
---------
@summary: Single-character classifiers and helpers that reverse
    font-obfuscated text inside lxml HTML elements.
---------
@author: Dzr
"""
import datetime
import functools
import re
import time

from lxml.html import Element, HtmlElement, fromstring, tostring

# CJK ideographs, deliberately skipping U+4E00-U+4E9F.  Code points above
# U+FFFF must be written with the eight-digit ``\UXXXXXXXX`` escape: ``\u``
# consumes exactly four hex digits, so a spelling such as ``\u20000-\u2a6df``
# is read as ``\u2000``, the range ``0-\u2a6d`` and a literal ``f`` -- which
# made the character class accept ASCII digits, Latin letters and most of the
# Basic Multilingual Plane.
_CHINESE_CHAR_RE = re.compile(
    r'^['
    r'\u4ea0-\u9fff'          # basic block (U+4E00-U+9FFF) minus U+4E00-U+4E9F
    r'\u3400-\u4dbf'          # extension A
    r'\U00020000-\U0002a6df'  # extension B
    r'\U0002a700-\U0002b73f'  # extension C
    r'\U0002b740-\U0002b81f'  # extension D
    r'\U0002b820-\U0002ceaf'  # extension E
    r'\U0002ceb0-\U0002ebef'  # extension F
    r'\U00030000-\U0003134f'  # extension G
    r']$'
)
# The ten numeral ideographs: 一 二 三 四 五 六 七 八 九 零.
_NUMBER_CHINESE_CHAR_RE = re.compile(
    r'^[\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u96f6]$'
)
_EN_CHAR_RE = re.compile(r'^[A-Za-z]$')
_DIGIT_CHAR_RE = re.compile(r'^[1-9]$')
_ZERO_OR_O_RE = re.compile(r'^[0oO]$')


def run_time(fun):
    """Decorate *fun* so that every call prints its elapsed time in seconds.

    The wrapped function's return value is passed through unchanged.  Nothing
    is printed when *fun* raises, because the exception propagates before the
    ``print`` is reached.
    """

    @functools.wraps(fun)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so clock adjustments cannot skew the result.
        started = time.perf_counter()
        result = fun(*args, **kwargs)
        print(time.perf_counter() - started)
        return result

    return wrapper


def is_chinese_character(char):
    """Return True if *char* is a single CJK ideograph outside U+4E00-U+4E9F.

    Covered ranges: the basic block from U+4EA0 to U+9FFF and extensions A-G.

    NOTE(review): U+4E00-U+4E9F is excluded wholesale to drop the numeral
    ideographs, which also excludes ordinary characters in that range such as
    U+4E2D; confirm this is acceptable for callers.
    """
    return bool(_CHINESE_CHAR_RE.match(char))


def is_specific_number_chinese_character(char):
    """Return True if *char* is one of the ten numeral ideographs (零, 一 ... 九)."""
    return bool(_NUMBER_CHINESE_CHAR_RE.match(char))


def is_en(char):
    """Return True if *char* is a single ASCII letter, upper or lower case."""
    return bool(_EN_CHAR_RE.match(char))


def is_digit(char):
    """Return True if *char* is a single digit from 1 to 9.

    ``0`` is not matched here; it is handled by :func:`is_zero_or_o`.
    """
    return bool(_DIGIT_CHAR_RE.match(char))


def is_zero_or_o(char):
    """Return True if *char* is exactly ``'0'``, ``'o'`` or ``'O'``."""
    return bool(_ZERO_OR_O_RE.match(char))


def replace_element(old_tag: HtmlElement, new_tag: HtmlElement):
    """Replace *old_tag* with *new_tag* in the parent of *old_tag*.

    *old_tag* must be attached to a parent element; the replacement is done
    through ``old_tag.getparent()``.
    """
    old_tag.getparent().replace(old_tag, new_tag)


def create_element(tag, attrib, text=None):
    """Build a new element named *tag*.

    :param tag: tag name of the element to create.
    :param attrib: mapping of attribute names to values; it is expanded into
        keyword arguments of ``Element``.
    :param text: optional text content; left unset when ``None``.
    :return: the newly created element.
    """
    element = Element(tag, **attrib)
    if text is not None:
        element.text = text
    return element


def drop_element(tag: HtmlElement, feature: str):
    """Drop the first node matched by the XPath *feature*, if there is one.

    Only the first match is removed (via ``drop_tree``); when the expression
    matches nothing the call is a no-op.
    """
    matches = tag.xpath(feature)
    first = next(iter(matches or []), None)
    if first is not None:
        first.drop_tree()


def translate(val, font_maps):
    """Map a run of ``&#NNNN;`` character references to their real characters.

    *val* is split on ``;``.  For each non-blank piece the first two
    characters (the ``&#`` prefix) are skipped, the remainder is parsed as a
    decimal code point and looked up in *font_maps* under the key ``'x'`` plus
    the lowercase hex code point (``&#58344`` -> ``'xe3e8'``).  The ``'zh'``
    values of the matching entries are concatenated in order.

    :param val: text such as ``'&#58344;&#58345;'``.
    :param font_maps: mapping from ``'x<hex>'`` keys to mappings holding a
        ``'zh'`` entry.
    :return: the decoded string, empty when *val* holds no references.
    :raises ValueError: if a piece does not end in a decimal number.
    :raises KeyError: if a code point is missing from *font_maps*.
    """
    # Strip each piece so whitespace between references cannot shift the
    # two-character prefix slice below.
    entities = (piece.strip() for piece in val.split(";"))
    return "".join(
        font_maps[hex(int(entity[2:]))[1:]]['zh']
        for entity in entities
        if entity
    )


def translate_text(tag: HtmlElement, font_maps, pattern):
    """Extract the obfuscated text of *tag* with *pattern* and decode it.

    The element is serialized with the ``gbk`` encoding and searched with
    ``re.findall(pattern, ..., flags=re.S)``; only the first match is used.

    :param tag: element whose serialized markup is searched.
    :param font_maps: lookup table handed to :func:`translate`.
    :param pattern: regular expression selecting the text to decode.
    :return: the decoded text, or ``''`` when *pattern* matches nothing.
    """
    serialized = tostring(tag, encoding='gbk').decode('gbk')
    # Pulling the text out with a regex keeps it from being decoded automatically.
    found = re.findall(pattern, serialized, flags=re.S)
    origin_text = next(iter(found or []), '')
    if not origin_text:
        return origin_text

    return translate(origin_text, font_maps)


def translate_element_text(tag: HtmlElement, font_maps, feature, pattern):
    """Swap the first element matching *feature* for a decoded copy.

    The replacement keeps the tag name and attributes of the matched element
    and carries only the decoded text returned by :func:`translate_text`.

    :param tag: element the XPath *feature* is evaluated against.
    :param font_maps: lookup table handed to :func:`translate`.
    :param feature: XPath expression locating the obfuscated element.
    :param pattern: regular expression selecting the text to decode.
    :raises ValueError: if *feature* matches nothing.
    """
    # Locate the obfuscated element.
    original_element = next(iter(tag.xpath(feature) or []), None)
    if original_element is None:
        # Name the failing XPath; the old message was just the text "None".
        raise ValueError(f'no element matches xpath {feature!r}')

    target_text = translate_text(original_element, font_maps, pattern)
    # Create the replacement element.
    new_element = create_element(
        original_element.tag,
        original_element.attrib,
        text=target_text,
    )
    # Swap it in for the original.
    replace_element(original_element, new_element)


# NOTE(review): the reviewed text continues with the opening of
# ``def parse_element(tag: HtmlElement, font_maps):``.  That definition is cut
# off in the reviewed excerpt, so it is intentionally not reproduced here.
font_maps): # 字体混淆反解析 - 标题 translate_element_text( tag, font_maps, '//div[@class="bw_140 secret"]', r'