123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177 |
- # -*- coding: utf-8 -*-
- import re
- __all__ = ['cleaner']
# Rule tables used by ``cleaner``.  Every key is a regular expression and every
# value is the text substituted for each match.  Dict insertion order is the
# order in which the rules are applied, so do not reorder entries casually.
# Patterns containing regex escapes are raw strings so Python does not warn
# about invalid string escapes such as ``\s``.

# Stand-alone elements.
INDEPENDENT_TAGS = {
    r'<head>[\s\S]*?</head>': '',
    r'<html>|<html [^>]*>|</html>': '',
    r'<body>|<body [^>]*>|</body>': '',
    r'<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    r'&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace-like entities
    r'\xa0|\u3000': '',  # non-breaking / ideographic space characters
    r'<!--[\s\S]*?-->': '',  # comments
    r'<style[^<>]*>[\s\S]*?</style>': '',  # style sheets
    r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    r'<input>': '',  # bare input box
    r'</input>': '',  # input box closing tag
    r'<img[^>]*>': '<br>',  # images become line breaks
}

# Inline elements.
INLINE_TAGS = {
    r'<a>|<a [^>]*>|</a>': '',  # hyperlinks
    r'<link>|<link [^>]*>|</link>': '',  # link elements
    r'<span>|<span [^>]*>|</span>': '',  # span
    r'<label>|<label [^>]*>|</label>': '<br>',  # label
    r'<font>|<font [^>]*>|</font>': '',  # font
    r'data:image(.*?) ': '',  # base64 image data, up to the next space
}

# Block-level elements.
BLOCK_TAGS = {
    r'<div>\s*?</div>': '',  # empty div
    r'<h[1-6][^>]*>|</h[1-6]>': '',  # headings
    r'<p>|<p [^>]*>': '<br>',  # paragraph start
    r'</p>': '',  # paragraph end
    r'<div>|<div [^>]*>': '<br>',  # division start
    r'</div>': '',  # division end
    r'<o:p>|<o:p [^>]*>|</o:p>': '',  # Microsoft Office (Word) paragraph
}

# Miscellaneous markup and boilerplate page text.
OTHER = {
    # The '?' must be escaped: unescaped it made '<' optional, so the rule
    # left '<?' behind and swallowed text from any 'xml' up to the next '>'.
    r'<\?xml[^>]*>|<\?xml [^>]*>|<\?xml:.*?>': '',
    r'<epointform>': '',
    r'<!doctype html>|<!doctype html [^>]*>': '',
    '【关闭】|关闭': '',
    '【打印】|打印本页': '',
    r'【字体:[\s\S]*】': '',
    '文章来源:[\u4e00-\u9fa5]+': '',
    r'浏览次数:.*[<]+': '',
    # NOTE(review): the parentheses form a regex group and the lazy '.*?' can
    # match nothing, so this removes only the label itself - confirm intent.
    r'(责任编辑:.*?)': '',
    r'分享到[:]': '',
}

# Presentational attributes.
CSS_STYLE = {
    r'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    r'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    r'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    r'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    r'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    r'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}

# Whitespace normalisation.  These keys embed real newline characters, so they
# stay ordinary strings with the regex backslashes doubled.
BLANKS = {
    '\n\\s*\n': '\n',
    '\\s*\n\\s*': '\n',
    '[^\\S\n]': ' ',
    r'\s+': ' ',
}

# Tag names considered when repairing fused "tagattr" spellings.
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}

# Attribute names considered when repairing fused "tagattr" spellings.
ATTRS = {'id', 'class', 'style', 'width'}

# Pre-compiled rules for specially styled tags, applied by _clear_special_tag.
SPECIAL_TAGS = {
    re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): '<br>',
}
def _repair_tag():
    """Build the lookup of fused ``tagattr`` spellings and their repairs.

    For every tag name in ``TAGS`` and attribute name in ``ATTRS`` the result
    maps the two joined without a separator (``'divclass'``) to the same pair
    separated by one space (``'div class'``).  It is used to patch tags on
    non-standard pages.

    :return: dict mapping each fused spelling to its spaced replacement
    """
    return {
        tag + attr: tag + ' ' + attr
        for tag in TAGS
        for attr in ATTRS
    }
def _escape_character(html):
    """Decode the four basic HTML character references in *html*.

    ``&lt;``, ``&gt;``, ``&quot;`` and ``&amp;`` are turned back into the
    characters they stand for.  The entity names had been lost from the
    search strings, which left every replacement a no-op.

    :param html: text that may contain escaped characters
    :return: the text with those four entities decoded
    """
    html = html.replace('&lt;', '<')
    html = html.replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    # '&amp;' goes last so that '&amp;lt;' decodes once, to '&lt;', not to '<'.
    html = html.replace('&amp;', '&')
    return html
def _lowercase_tag(html):
    """Lower-case every ``<...>`` tag and repair fused ``tagattr`` tag names.

    Text outside tags is left alone; everything between ``<`` and ``>`` is
    lower-cased, attribute values included.

    :param html: page source
    :return: page source with lower-cased, repaired tags
    """
    tag_pattern = re.compile('<[^>]+>')

    if len(set(tag_pattern.findall(html))) > 10000:
        # Pages with a huge number of distinct tags are re-serialised through
        # BeautifulSoup instead.
        # NOTE(review): only ``soup.body.next_element`` is kept, which looks
        # like just the first node inside <body> - confirm this is intended.
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'lxml')
        html = str(soup.body.next_element)
    else:
        # One pass over the document; the previous per-tag str.replace loop
        # rescanned the page once per distinct tag and, for tags nested in
        # other tags, gave results that depended on set iteration order.
        html = tag_pattern.sub(lambda match: match.group(0).lower(), html)

    # Tag repair: only touch a fused name directly after '<'.  Replacing the
    # bare string anywhere also rewrote ordinary text such as "rapid" (which
    # contains "pid") or "individual" (which contains "divid").
    for err, right in _repair_tag().items():
        html = html.replace('<' + err, '<' + right)
    return html
def _clear_special_tag(html):
    """Run every compiled ``SPECIAL_TAGS`` rule over *html*.

    :param html: page source
    :return: page source with each rule's matches replaced by its value
    """
    cleaned = html
    for pattern in SPECIAL_TAGS:
        cleaned = pattern.sub(SPECIAL_TAGS[pattern], cleaned)
    return cleaned
def _clear_input_tag(html, display=False):
    """Replace visible ``<input ...>`` tags with the text of their ``value``.

    An input is replaced only when it has a quoted ``value`` attribute and
    its markup contains none of ``hidden``, ``hide`` or ``display: none``.
    All other inputs are left in the page.

    :param html: page source
    :param display: when false (the default), every ``<input`` first gets an
        inline style that makes its border transparent
    :return: page source with visible inputs replaced by their values
    """
    if not display:
        # Do not show the border of input boxes that stay in the page.
        html = html.replace('<input', '<input style="border-color: transparent;"')

    input_tag = re.compile(r'<input .*?>', re.S)
    # The closing quote must be the same character as the opening one.  The
    # old class ["|\'] also accepted '|' as a quote and let the value end at
    # the first quote of either kind, truncating values like "it's ok".
    value_attr = re.compile(r'value=(["\'])(.*?)\1')
    hidden_markers = ('hidden', 'hide', 'display: none')

    def _inline_value(match):
        """Return the input's value, or the tag unchanged if it must stay."""
        tag = match.group(0)
        found = value_attr.search(tag)
        if found is None or any(marker in tag for marker in hidden_markers):
            return tag
        return found.group(2)

    return input_tag.sub(_inline_value, html)
def cleaner(html, special=None, completely=False, del_tag=False, **kwargs):
    """Clean page source by applying the module's regex rule tables.

    Steps, in order: lower-case and repair tags; optionally apply the
    ``SPECIAL_TAGS`` rules; apply every rule from ``INDEPENDENT_TAGS``,
    ``INLINE_TAGS``, ``BLOCK_TAGS``, ``OTHER`` (plus *special*),
    ``CSS_STYLE`` and ``BLANKS``; optionally strip canvas, iframe and
    remaining tags; decode basic entities; replace input tags by their values.

    :param html: page source to clean
    :param special: extra rules for this call only, as a mapping of regex
        pattern to replacement text; they are applied together with ``OTHER``
    :param completely: when true, also remove ``<canvas>``/``<iframe>``
        elements and any remaining tag matched by the final tag pattern
    :param del_tag: when true, apply the ``SPECIAL_TAGS`` rules first
    :param kwargs: forwarded to ``_clear_input_tag`` (e.g. ``display``)
    :return: cleaned page source
    """
    # Merge the caller's rules into a private copy.  Updating the module-level
    # OTHER table in place made one call's rules leak into all later calls.
    other_rules = dict(OTHER)
    if special:
        other_rules.update(special)

    remove_tags = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **other_rules,
        **CSS_STYLE,
        **BLANKS,
    }

    html = _lowercase_tag(html)
    if del_tag:
        html = _clear_special_tag(html)

    # Rules run in insertion order of the merged table.
    for tag, repl in remove_tags.items():
        html = re.sub(tag, repl, html)

    if completely:
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # inline frame
        # Drop remaining tags: '<...>' spans made of non-CJK characters or of
        # the listed font names.
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)

    html = _escape_character(html)
    html = _clear_input_tag(html, **kwargs)
    return html
|