# -*- coding: utf-8 -*-
"""
Created on 2024-10-30
---------
@summary: Decode font-obfuscated (custom glyph font) text in scraped list and
          detail pages and extract structured fields with lxml.
---------
@author: Dzr
"""
import datetime
import functools
import re
import time

from lxml.html import Element, HtmlElement, fromstring, tostring


def run_time(fun):
    """Decorator: print the wall-clock running time of the wrapped function."""

    @functools.wraps(fun)
    def wrapper(*args, **kwargs):
        time_start = time.time()
        result = fun(*args, **kwargs)
        time_end = time.time()
        print(time_end - time_start)
        return result

    return wrapper


def is_chinese_character(char):
    # Unicode ranges: CJK Unified Ideographs (4E00-9FFF), Extension A (3400-4DBF),
    # Extension B (20000-2A6DF), Extension C (2A700-2B73F), Extension D (2B740-2B81F),
    # Extension E (2B820-2CEAF), Extension F (2CEB0-2EBEF), Extension G (30000-3134F).
    # Supplementary-plane codepoints need the 8-digit \U escape (\u only covers 4 hex digits).
    # regex = re.compile(r'^[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002A6DF\U0002A700-\U0002B73F\U0002B740-\U0002B81F\U0002B820-\U0002CEAF\U0002CEB0-\U0002EBEF\U00030000-\U0003134F]$')
    # Exclude the numeral ideographs (4E00-4E9F) by starting the first range at 4EA0.
    regex = re.compile(r'^[\u4ea0-\u9fff\u3400-\u4dbf\U00020000-\U0002A6DF\U0002A700-\U0002B73F\U0002B740-\U0002B81F\U0002B820-\U0002CEAF\U0002CEB0-\U0002EBEF\U00030000-\U0003134F]$')
    return bool(regex.match(char))


def is_specific_number_chinese_character(char):
    # Matches the specific numeral ideographs: 零, 一, 二, 三, 四, 五, 六, 七, 八, 九.
    regex = re.compile(r'^[\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u96f6]$')
    return bool(regex.match(char))


def is_en(char):
    # Matches English letters (upper and lower case).
    regex = re.compile(r'^[A-Za-z]$')
    return bool(regex.match(char))


def is_digit(char):
    # Matches the Arabic digits 1-9 (zero is handled separately by is_zero_or_o).
    regex = re.compile(r'^[1-9]$')
    return bool(regex.match(char))


def is_zero_or_o(char):
    # Matches '0', 'o' or 'O'.
    regex = re.compile(r'^[0oO]$')
    return bool(regex.match(char))


def replace_element(old_tag: HtmlElement, new_tag: HtmlElement):
    old_tag.getparent().replace(old_tag, new_tag)


def create_element(tag, attrib, text=None):
    element = Element(tag, **attrib)
    if text is not None:
        element.text = text
    return element


def drop_element(tag: HtmlElement, feature: str):
    # Drop the first element matching the xpath expression, if present.
    element = tag.xpath(feature)
    element = next(iter(element or []), None)
    if element is not None:
        element.drop_tree()


def translate(val, font_maps):
    # ``val`` is a run of decimal character references, e.g. '&#58354;&#58123;'.
    # Each reference is converted to its hex form ('&#xe3f2') and looked up in
    # font_maps to recover the real character.
    characters = val.split(";")
    ret = ""
    characters = list(filter(lambda x: x.strip() != '', characters))
    for character in characters:
        ret += font_maps[f'&#{hex(int(character[2:]))[1:]}']['zh']
    return ret
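
# Illustration of the mapping performed by translate() (the key '&#xe000' and
# its 'zh' value below are hypothetical; real font_maps come from parsing the
# site's obfuscation font):
#
#     translate('&#57344;', {'&#xe000': {'zh': '零'}})  ->  '零'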


def translate_text(tag: HtmlElement, font_maps, pattern):
    # Serialize with gbk so the obfuscated glyphs stay as numeric character
    # references, then extract the raw text with a regex so those references
    # are not decoded automatically.
    original_element_str = tostring(tag, encoding='gbk').decode('gbk')
    origin_text = re.findall(pattern, original_element_str, flags=re.S)
    origin_text = next(iter(origin_text or []), '')
    if not origin_text:
        return origin_text

    target_text = translate(origin_text, font_maps)
    # print(origin_text, ' <= ', target_text)
    return target_text


def translate_element_text(tag: HtmlElement, font_maps, feature, pattern):
    # Extract the original text
    original_element = next(iter(tag.xpath(feature) or []), None)
    if original_element is None:
        raise ValueError(f'no element matched xpath: {feature}')

    tag_name = original_element.tag
    attrib = original_element.attrib
    target_text = translate_text(original_element, font_maps, pattern)

    # Create the new element
    new_element = create_element(tag_name, attrib, text=target_text)

    # Replace the old element
    replace_element(original_element, new_element)


def parse_element(tag: HtmlElement, font_maps):
    # De-obfuscate the font-encoded text - title
    translate_element_text(
        tag,
        font_maps,
        '//div[@class="bw_140 secret"]',
        r'<div.*>(.*?)</div>'
    )

    # De-obfuscate the font-encoded text - description
    translate_element_text(
        tag,
        font_maps,
        '//td[@class="secret"]/div',
        r'<div.*>(.*?)</div>'
    )

    # Remove sensitive data from the page source
    drop_element(tag, '//div[@class="details_txt"]')


def extract_list(html, font_maps):
    results = []
    tree = fromstring(html)
    for li_tag in tree.xpath('//ul[@class="industry_ul"]/li'):
        # Delivery area, e.g. '收货地:<province> <city>'
        elem1 = next(iter(li_tag.xpath('./div[@class="industry_left"]') or []), Element('div'))
        p_tag = next(iter(elem1.xpath('./p[@class="tt"]/text()') or []), '全国').replace('收货地:', '')
        args = str(p_tag).split()
        if len(args) == 2:
            area, city = args
        else:
            city = ''
            area = args[0]

        # Title (font-obfuscated), link and publish time
        elem2 = next(iter(li_tag.xpath('./div[@class="industry_cc"]') or []), Element('div'))
        a_tag = next(iter(elem2.xpath('.//h3[@class="secret"]/a') or []), Element('a'))
        title = translate_text(a_tag, font_maps, r'<a.*>(.*?)</a>')
        href = a_tag.get('href')

        publish_time = next(iter(elem2.xpath('./div/span/text()') or []), '').replace('发布时间:', '')
        publish_time_ts = datetime.datetime.strptime(publish_time, '%Y-%m-%d').timestamp()
        # print(f'{title} {href} {publish_time}')

        results.append({
            'title': title,
            'href': href,
            'publishtime': publish_time,
            'l_np_publishtime': publish_time_ts,
            'area': area,
            'city': city,
            'district': '',
        })

    return results


def extract_detail_html(html, font_maps):
    tree = fromstring(html)
    parse_element(tree, font_maps)
    element = tree.xpath('//div[@class="details_text"]')
    element = next(iter(element or []), Element('div'))
    source = tostring(element, encoding='gbk').decode('gbk')
    # print(source)
    return source
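

if __name__ == '__main__':
    # Minimal usage sketch (illustration only).  The glyph mapping below is
    # hypothetical; in real use font_maps is built from the site's custom
    # obfuscation font, and the html passed to extract_list() /
    # extract_detail_html() comes from the crawled pages.
    demo_font_maps = {
        '&#xe000': {'zh': '零'},
        '&#xe001': {'zh': '一'},
    }
    print(translate('&#57344;&#57345;', demo_font_maps))  # -> 零一
    print(is_chinese_character('汉'), is_digit('7'), is_zero_or_o('O'))  # -> True True True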