# clean_html.py
  1. # -*- coding: utf-8 -*-
  2. import re
  3. from lxml.html import fromstring, HtmlElement, tostring
  4. __all__ = ['cleaner', 'drop_tree_by_lxml']
  5. '''独立元素'''
  6. INDEPENDENT_TAGS = {
  7. '<head>[\s\S]*?</head>': '',
  8. '<html>|<html [^>]*>|</html>': '',
  9. '<body>|<body [^>]*>|</body>': '',
  10. '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '', # 元数据
  11. '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '', # 空格
  12. '\\xa0|\\u3000': '', # 空格
  13. '<!--[\s\S]*?-->': '', # 注释
  14. '<style[^<>]*>[\s\S]*?</style>': '', # 样式
  15. '<script[^<>]*>[\s\S]*?</script>': '', # JavaScript
  16. '<input>': '', # 输入框
  17. '</input>': '', # 输入框
  18. '<img[^>]*>': '<br>', # 图片
  19. }
  20. '''行内元素'''
  21. INLINE_TAGS = {
  22. '<a>|<a [^>]*>|</a>': '', # 超链接
  23. '<link>|<link [^>]*>|</link>': '', # 超链接
  24. '<span>|<span [^>]*>|</span>': '', # span
  25. '<label>|<label [^>]*>|</label>': '<br>', # label
  26. '<font>|<font [^>]*>|</font>': '', # font
  27. 'data:image(.*?) ': '', # 图片base64
  28. }
  29. '''块级元素'''
  30. BLOCK_TAGS = {
  31. '<div>\s*?</div>': '',
  32. '<h[1-6][^>]*>|</h[1-6]>': '', # 标题
  33. '<p>|<p [^>]*>': '<br>', # 段落
  34. '</p>': '', # 段落
  35. '<div>|<div [^>]*>': '<br>', # 分割
  36. '</div>': '', # 分割 division
  37. '<o:p>|<o:p [^>]*>|</o:p>': '' # OFFICE微软WORD段落
  38. }
  39. '''其他'''
  40. OTHER = {
  41. '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
  42. '<epointform>': '',
  43. '<!doctype html>|<!doctype html [^>]*>': '',
  44. '【关闭】|关闭': '',
  45. '【打印】|打印本页': '',
  46. '【字体:[\s\S]*】': '',
  47. '文章来源:[\u4e00-\u9fa5]+': '',
  48. '浏览次数:.*[<]+': '',
  49. '(责任编辑:.*?)': '',
  50. '分享到[:]': '',
  51. }
  52. '''样式'''
  53. CSS_STYLE = {
  54. 'style="[\s\S]*?"|style ="[\s\S]*?"': '',
  55. 'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
  56. 'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
  57. 'class="[\s\S]*?"|class ="[\s\S]*?"': '',
  58. 'align="[\s\S]*?"|align ="[\s\S]*?"': '',
  59. 'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
  60. }
  61. '''空白符'''
  62. BLANKS = {
  63. '\n\s*\n': '\n',
  64. '\s*\n\s*': '\n',
  65. '[^\S\n]': ' ',
  66. '\s+': ' ',
  67. }
  68. '''css标签集合'''
  69. TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
  70. '''css属性集合'''
  71. ATTRS = {'id', 'class', 'style', 'width'}
  72. '''特殊样式的标签'''
  73. SPECIAL_TAGS = {
  74. re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): '<br>',
  75. }
  76. def _repair_tag():
  77. """异常的标签组合,用来替换非标准页面的标签"""
  78. _repairs = {}
  79. for tag in TAGS:
  80. for attr in ATTRS:
  81. key = '{}{}'.format(tag, attr)
  82. val = '{} {}'.format(tag, attr)
  83. _repairs[key] = val
  84. return _repairs
  85. def _escape_character(html):
  86. """转义字符"""
  87. html = html.replace('&lt;', '<')
  88. html = html.replace('&gt;', '>')
  89. html = html.replace('&quot;', '"')
  90. html = html.replace('&amp;', '&')
  91. return html
  92. def _lowercase_tag(html):
  93. """元素标签转成小写,不影响页面文本"""
  94. tags = re.findall("<[^>]+>", html)
  95. tag_sets = set(tags)
  96. if len(tag_sets) > 10000:
  97. from bs4 import BeautifulSoup
  98. soup = BeautifulSoup(html, 'lxml')
  99. html = str(soup.body.next_element)
  100. else:
  101. for tag in tag_sets:
  102. html = html.replace(tag, str(tag).lower())
  103. repair_tags = _repair_tag() # 标签修复
  104. for err, right in repair_tags.items():
  105. html = html.replace(err, right)
  106. return html
  107. def _clear_special_tag(html):
  108. """删除特殊元素标签"""
  109. for tag, repl in SPECIAL_TAGS.items():
  110. html = tag.sub(repl, html)
  111. return html
  112. def _clear_input_tag(html, display=False):
  113. """提取value值,替换input标签"""
  114. if not display:
  115. html = html.replace('<input', '<input style="border-color: transparent;"') # 不显示输入框边框
  116. tag = re.compile(r'<input .*?>', re.S)
  117. value = re.compile(r'value=["|\'](.*?)["|\']')
  118. lst = re.findall(tag, html) or []
  119. for ipt in lst:
  120. val = re.findall(value, ipt)
  121. if val and 'hidden' not in ipt and 'hide' not in ipt and 'display: none' not in ipt:
  122. html = html.replace(ipt, val[0])
  123. return html
  124. def drop_tree_by_lxml(html, feature):
  125. tree: HtmlElement = fromstring(html)
  126. tag_lst = tree.xpath(feature)
  127. for tag in tag_lst:
  128. tag.drop_tree()
  129. html = tostring(tree, encoding='utf8').decode('utf8')
  130. return html
  131. def cleaner(html, special=None, completely=False, del_tag=False, **kwargs):
  132. """
  133. 源码清洗
  134. :param html: 清洗的页面
  135. :param special: 额外指定页面清洗规则
  136. :param completely: 是否完全清洗页面
  137. :param del_tag: 删除标签
  138. :return: 页面源码
  139. """
  140. special = set() if special is None else special
  141. OTHER.update(special)
  142. remove_tags = {
  143. **INDEPENDENT_TAGS,
  144. **INLINE_TAGS,
  145. **BLOCK_TAGS,
  146. **OTHER,
  147. **CSS_STYLE,
  148. **BLANKS,
  149. }
  150. html = _lowercase_tag(html)
  151. if del_tag:
  152. html = _clear_special_tag(html)
  153. for tag, repl in remove_tags.items():
  154. html = re.sub(tag, repl, html)
  155. if completely:
  156. html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html) # 画布
  157. html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html) # 内框架
  158. html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
  159. html = _escape_character(html)
  160. html = _clear_input_tag(html, **kwargs)
  161. return html