utils.py

# -*- coding: utf-8 -*-
"""
Created on 2024-10-30
---------
@summary:
---------
@author: Dzr
"""
import datetime
import functools
import re
import time

from lxml.html import Element, HtmlElement, fromstring, tostring


def run_time(fun):
    @functools.wraps(fun)
    def wrapper(*args, **kwargs):
        time_start = time.time()
        result = fun(*args, **kwargs)
        time_end = time.time()
        print(time_end - time_start)
        return result

    return wrapper


def is_chinese_character(char):
    # Unicode ranges: CJK Unified Ideographs (4E00-9FFF), Extension A (3400-4DBF), Extension B (20000-2A6DF), Extension C (2A700-2B73F), Extension D (2B740-2B81F), Extension E (2B820-2CEAF), Extension F (2CEB0-2EBEF), Extension G (30000-3134F)
    # regex = re.compile(r'^[\u4e00-\u9fff\u3400-\u4dbf\U00020000-\U0002A6DF\U0002A700-\U0002B73F\U0002B740-\U0002B81F\U0002B820-\U0002CEAF\U0002CEB0-\U0002EBEF\U00030000-\U0003134F]$')
    # Start at 4EA0 to exclude the numeric ideographs (4E00-4E9F); supplementary-plane ranges need \U with eight hex digits, since \u only takes four
    regex = re.compile(r'^[\u4ea0-\u9fff\u3400-\u4dbf\U00020000-\U0002A6DF\U0002A700-\U0002B73F\U0002B740-\U0002B81F\U0002B820-\U0002CEAF\U0002CEB0-\U0002EBEF\U00030000-\U0003134F]$')
    return bool(regex.match(char))


def is_specific_number_chinese_character(char):
    # Matches the specific numeric ideographs: 零, 一, 二, 三, 四, ...
    regex = re.compile(r'^[\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u96f6]$')
    return bool(regex.match(char))


def is_en(char):
    # English letters (upper and lower case)
    regex = re.compile(r'^[A-Za-z]$')
    return bool(regex.match(char))


def is_digit(char):
    # Arabic digits 1-9 ('0' is handled separately by is_zero_or_o)
    regex = re.compile(r'^[1-9]$')
    return bool(regex.match(char))


def is_zero_or_o(char):
    # Matches '0', 'o' or 'O'
    regex = re.compile(r'^[0oO]$')
    return bool(regex.match(char))


def replace_element(old_tag: HtmlElement, new_tag: HtmlElement):
    old_tag.getparent().replace(old_tag, new_tag)


def create_element(tag, attrib, text=None):
    element = Element(tag, **attrib)
    if text is not None:
        element.text = text
    return element


def drop_element(tag: HtmlElement, feature: str):
    element = tag.xpath(feature)
    element = next(iter(element or []), None)
    if element is not None:
        element.drop_tree()


def translate(val, font_maps):
    # val is a run of raw numeric character references, e.g. '&#58342;&#58343;'
    characters = val.split(";")
    ret = ""
    characters = list(filter(lambda x: x.strip() != '', characters))
    for character in characters:
        # '&#58342' -> 58342 -> hex 'xe3e6' -> font_maps key '&#xe3e6'
        ret += font_maps[f'&#{hex(int(character[2:]))[1:]}']['zh']
    return ret
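
# Worked example (illustration only; the font_maps entry below is made up):
#   translate('&#58342;', {'&#xe3e6': {'zh': '例'}})
#   int('58342') == 0xE3E6, so the lookup key becomes '&#xe3e6' and the call returns '例'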


def translate_text(tag: HtmlElement, font_maps, pattern):
    original_element_str = tostring(tag, encoding='gbk').decode('gbk')
    origin_text = re.findall(pattern, original_element_str, flags=re.S)  # regex extraction keeps the character references from being auto-decoded
    origin_text = next(iter(origin_text or []), '')
    if not origin_text:
        return origin_text

    target_text = translate(origin_text, font_maps)
    # print(origin_text, ' <= ', target_text)
    return target_text


def translate_element_text(tag: HtmlElement, font_maps, feature, pattern):
    # Extract the original element
    original_element = next(iter(tag.xpath(feature) or []), None)
    if original_element is None:
        raise ValueError(f'no element matched xpath: {feature}')

    tag_name = original_element.tag
    attrib = original_element.attrib
    target_text = translate_text(original_element, font_maps, pattern)

    # Create a new element carrying the decoded text
    new_element = create_element(tag_name, attrib, text=target_text)

    # Replace the old element
    replace_element(original_element, new_element)


def parse_element(tag: HtmlElement, font_maps):
    # Undo the font obfuscation - title
    translate_element_text(
        tag,
        font_maps,
        '//div[@class="bw_140 secret"]',
        r'<div.*>(.*?)</div>'
    )

    # Undo the font obfuscation - description
    translate_element_text(
        tag,
        font_maps,
        '//td[@class="secret"]/div',
        r'<div.*>(.*?)</div>'
    )

    # Drop sensitive data from the page source
    drop_element(tag, '//div[@class="details_txt"]')


def extract_list(html, font_maps):
    results = []
    tree = fromstring(html)
    for li_tag in tree.xpath('//ul[@class="industry_ul"]/li'):
        # Delivery location, formatted as '收货地:<area> <city>'; defaults to '全国'
        elem1 = next(iter(li_tag.xpath('./div[@class="industry_left"]') or []), Element('div'))
        p_tag = next(iter(elem1.xpath('./p[@class="tt"]/text()') or []), '全国').replace('收货地:', '')
        args = str(p_tag).split()
        if len(args) == 2:
            area, city = args
        else:
            city = ''
            area = args[0]

        # Title and link (the title text is font-obfuscated)
        elem2 = next(iter(li_tag.xpath('./div[@class="industry_cc"]') or []), Element('div'))
        a_tag = next(iter(elem2.xpath('.//h3[@class="secret"]/a') or []), Element('a'))
        title = translate_text(a_tag, font_maps, r'<a.*>(.*?)</a>')
        href = a_tag.get('href')

        publish_time = next(iter(elem2.xpath('./div/span/text()') or []), '').replace('发布时间:', '')
        publish_time_ts = datetime.datetime.strptime(publish_time, '%Y-%m-%d').timestamp()
        # print(f'{title} {href} {publish_time}')
        results.append({
            'title': title,
            'href': href,
            'publishtime': publish_time,
            'l_np_publishtime': publish_time_ts,
            'area': area,
            'city': city,
            'district': '',
        })

    return results


def extract_detail_html(html, font_maps):
    tree = fromstring(html)
    parse_element(tree, font_maps)
    element = tree.xpath('//div[@class="details_text"]')
    element = next(iter(element or []), Element('div'))
    source = tostring(element, encoding='gbk').decode('gbk')
    # print(source)
    return source
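

# Minimal usage sketch (illustration only). In real use, font_maps is built from the
# site's obfuscation font; the mapping and the HTML snippet below are made-up examples.
if __name__ == '__main__':
    sample_font_maps = {'&#xe3e6': {'zh': '例'}}  # hypothetical glyph-to-character mapping
    sample_html = (
        '<html><body><div class="details_text">'
        '<div class="bw_140 secret">&#58342;</div>'
        '<table><tr><td class="secret"><div>&#58342;</div></td></tr></table>'
        '<div class="details_txt">sensitive contact info</div>'
        '</div></body></html>'
    )
    print(extract_detail_html(sample_html, sample_font_maps))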