# clean_html.py (5.0 KB)
  1. # -*- coding: utf-8 -*-
  2. import re
  3. __all__ = ['cleaner']
  4. # 独立元素
  5. INDEPENDENT_TAGS = {
  6. '<head>[\s\S]*?</head>': '',
  7. '<html>|<html [^>]*>|</html>': '',
  8. '<body>|<body [^>]*>|</body>': '',
  9. '<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '', # 元数据
  10. '&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '', # 空格
  11. '\\xa0|\\u3000': '', # 空格
  12. '<!--[\s\S]*?-->': '', # 注释
  13. '<style[^<>]*>[\s\S]*?</style>': '', # 样式
  14. '<script[^<>]*>[\s\S]*?</script>': '', # JavaScript
  15. '<input>': '', # 输入框
  16. '</input>': '', # 输入框
  17. '<img[^>]*>': '<br>', # 图片
  18. }
  19. # 行内元素
  20. INLINE_TAGS = {
  21. '<a>|<a [^>]*>|</a>': '', # 超链接
  22. '<link>|<link [^>]*>|</link>': '', # 超链接
  23. '<span>|<span [^>]*>|</span>': '', # span
  24. '<label>|<label [^>]*>|</label>': '<br>', # label
  25. '<font>|<font [^>]*>|</font>': '', # font
  26. 'data:image(.*?) ': '', # 图片base64
  27. }
  28. # 块级元素
  29. BLOCK_TAGS = {
  30. '<div>\s*?</div>':'',
  31. '<h[1-6][^>]*>|</h[1-6]>': '', # 标题
  32. '<p>|<p [^>]*>': '<br>', # 段落
  33. '</p>': '', # 段落
  34. '<div>|<div [^>]*>': '<br>', # 分割 division
  35. '</div>': '', # 分割 division
  36. '<o:p>|<o:p [^>]*>|</o:p>': '' # OFFICE微软WORD段落
  37. }
  38. # 其他
  39. OTHER = {
  40. '<?xml[^>]*>|<?xml [^>]*>|<?xml:.*?>': '',
  41. '<epointform>': '',
  42. '<!doctype html>|<!doctype html [^>]*>': '',
  43. '【关闭】|关闭': '',
  44. '【打印】|打印本页': '',
  45. '【字体:[\s\S]*】': '',
  46. '文章来源:[\u4e00-\u9fa5]+': '',
  47. '浏览次数:.*[<]+': '',
  48. '(责任编辑:.*?)': '',
  49. '分享到[:]': '',
  50. }
  51. # 样式
  52. CSS_STYLE = {
  53. 'style="[\s\S]*?"|style ="[\s\S]*?"': '',
  54. 'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
  55. 'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
  56. 'class="[\s\S]*?"|class ="[\s\S]*?"': '',
  57. 'align="[\s\S]*?"|align ="[\s\S]*?"': '',
  58. 'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
  59. }
  60. # 空白符
  61. BLANKS = {
  62. '\n\s*\n': '\n',
  63. '\s*\n\s*': '\n',
  64. '[^\S\n]': ' ',
  65. '\s+': ' ',
  66. }
  67. # css标签集合
  68. TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
  69. # css属性集合
  70. ATTRS = {'id', 'class', 'style', 'width'}
  71. # 特殊样式+指定样式的标签
  72. SPECIAL_TAGS = {
  73. re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): "<br>",
  74. }
  75. def _repair_tag():
  76. """异常的标签组合,用来替换非标准页面的标签"""
  77. _repairs = {}
  78. for tag in TAGS:
  79. for attr in ATTRS:
  80. key = '{}{}'.format(tag, attr)
  81. val = '{} {}'.format(tag, attr)
  82. _repairs[key] = val
  83. return _repairs
  84. def _escape_character(html):
  85. """转义字符"""
  86. html = html.replace('&lt;', '<')
  87. html = html.replace('&gt;', '>')
  88. html = html.replace('&quot;', '"')
  89. html = html.replace('&amp;', '&')
  90. # 不显示输入框边框
  91. html = html.replace('<input', '<input style="border-color: transparent;"')
  92. return html
  93. def _clean_input(html):
  94. """提取value值,替换input标签"""
  95. inputTxt = re.compile(r'<input .*?>', re.S)
  96. valueTxt = re.compile(r'value=["|\'](.*?)["|\']')
  97. input_list = re.findall(inputTxt, html) or []
  98. for ipt in input_list:
  99. val = re.findall(valueTxt, ipt)
  100. if val and "hidden" not in ipt and "hide" not in ipt and "display: none" not in ipt:
  101. html = html.replace(ipt,val[0])
  102. return html
  103. def _lowercase_tag(html):
  104. """标签归一化处理(全部小写 + 标签修复)"""
  105. tags = re.findall("<[^>]+>", html)
  106. tag_sets = set(tags)
  107. if len(tag_sets) > 10000:
  108. from bs4 import BeautifulSoup
  109. soup = BeautifulSoup(html, "lxml")
  110. html = str(soup.body.next_element)
  111. else:
  112. for tag in tag_sets:
  113. html = html.replace(tag, str(tag).lower())
  114. repair_tags = _repair_tag()
  115. for err, right in repair_tags.items():
  116. html = html.replace(err, right)
  117. return html
  118. def _del_tag(html):
  119. """删除特殊+指定样式的标签"""
  120. for tag, repl in SPECIAL_TAGS.items():
  121. html = tag.sub(repl, html)
  122. return html
  123. def cleaner(html, special=None, completely=False):
  124. """
  125. 数据清洗
  126. :param html: 清洗的页面
  127. :param special: 额外指定页面清洗规则
  128. :param completely: 是否完全清洗页面
  129. :return: 清洗后的页面源码
  130. """
  131. if special is None:
  132. special = {}
  133. OTHER.update(special)
  134. remove_tags = {
  135. **INDEPENDENT_TAGS,
  136. **INLINE_TAGS,
  137. **BLOCK_TAGS,
  138. **OTHER,
  139. **CSS_STYLE,
  140. **BLANKS,
  141. }
  142. html = _lowercase_tag(html)
  143. # html = _del_tag(html) # 优先处理
  144. for tag, repl in remove_tags.items():
  145. html = re.sub(tag, repl, html)
  146. if completely:
  147. html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html) # 画布
  148. html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html) # 内框架
  149. html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)
  150. html = _escape_character(html)
  151. html = _clean_input(html) # 处理 input
  152. return html