|
]*>': '
', # 分割 division
'
': '', # 分割 division
'
|]*>|': '' # OFFICE微软WORD段落
}
# Miscellaneous page boilerplate: regex pattern -> replacement.
# Every rule in this table replaces its match with the empty string.
# NOTE(review): the first three keys look truncated (they start mid-pattern or
# are empty). As written, ']*>|]*>|' also matches a bare '>' character.
# Confirm these against the upstream rule definitions before relying on them.
OTHER = {
']*>|]*>|': '',
'': '',
'|]*>': '',
'【关闭】|关闭': '',  # "close" link text
'【打印】|打印本页': '',  # "print" / "print this page" link text
'【字体:[\s\S]*】': '',  # "font:" selector enclosed in 【...】
'文章来源:[\u4e00-\u9fa5]+': '',  # "article source:" followed by Chinese characters
'浏览次数:.*[<]+': '',  # "view count:" up to and including the following '<'
'(责任编辑:.*?)': '',  # "editor in charge:" text
'分享到[:]': '',  # "share to:" label
}
# Presentation attributes to strip: regex pattern -> replacement.
# The patterns are raw strings so that \s, \S and \d reach the regex engine
# as written; in a plain literal they are invalid string escapes, which newer
# Python versions report with a warning.
CSS_STYLE = {
    r'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    r'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    r'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    r'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    r'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    r'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}
# Whitespace normalisation: regex pattern -> replacement, in insertion order.
# Backslashes that belong to the regex (\s, \S) are doubled so the literals no
# longer contain invalid string escapes, while the real newline characters in
# the keys are kept; the resulting runtime strings are unchanged.
BLANKS = {
    '\n\\s*\n': '\n',    # a run of blank lines -> one newline
    '\\s*\n\\s*': '\n',  # whitespace around a newline -> that newline
    '[^\\S\n]': ' ',     # any whitespace character except newline -> a space
    '\\s+': ' ',         # any remaining whitespace run -> a single space
}
# Tag names that _repair_tag() pairs with every entry of ATTRS
TAGS = {
    'table',
    'tr',
    'td',
    'div',
    'span',
    'p',
}
# Attribute names that _repair_tag() pairs with every entry of TAGS
ATTRS = {
    'id',
    'class',
    'style',
    'width',
}
# Tags selected by a specific inline style: compiled pattern -> replacement.
# The one rule here matches (case-insensitively) a tag carrying
# style="display: none".
# NOTE(review): the replacement literal below is split across two lines, which
# is not a valid string literal; its intended content cannot be determined
# from this copy. Confirm against the upstream source.
SPECIAL_TAGS = {
re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): "
",
}
def _repair_tag():
    """Build the table used to repair tag names fused with an attribute name.

    Each (tag, attribute) pair taken from ``TAGS`` and ``ATTRS`` contributes
    one entry that maps the glued spelling to the same two words separated by
    a single space, for example ``'divclass'`` -> ``'div class'``.

    :return: dict mapping each fused spelling to its repaired spelling
    """
    return {
        f'{tag}{attr}': f'{tag} {attr}'
        for tag in TAGS
        for attr in ATTRS
    }
# Entity -> character pairs, applied in this order. '&amp;' is decoded last so
# that a double-escaped entity such as '&amp;lt;' ends up as '&lt;', not '<'.
_HTML_ENTITIES = (
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&quot;', '"'),
    ('&amp;', '&'),
)
_INPUT_TAG_RE = re.compile(r'<input.*?>', re.S)
_INPUT_VALUE_RE = re.compile(r'value=["|\'](.*?)["|\']')
# An <input> whose markup contains any of these is left untouched.
_HIDDEN_MARKERS = ('hidden', 'hide', 'display: none')


def _escape_character(html):
    """Decode the entities ``&lt;``, ``&gt;``, ``&quot;`` and ``&amp;``.

    In the damaged copy of this function every replacement mapped a character
    to itself, so nothing was decoded; the entity names are restored here.

    :param html: page source
    :return: page source with the four entities replaced by their characters
    """
    for entity, char in _HTML_ENTITIES:
        html = html.replace(entity, char)
    # NOTE(review): a further statement followed here under the comment
    # "do not show input box borders"; its arguments are not recoverable from
    # this copy, so it is intentionally not reproduced. Restore from upstream.
    return html


def _clean_input(html):
    """Replace each visible ``<input>`` tag with the text of its ``value``.

    An input is replaced only when it has a ``value`` attribute and its markup
    contains none of ``hidden``, ``hide`` or ``display: none``.

    NOTE(review): the ``def`` line and the ``<input`` pattern were missing in
    the damaged copy and are reconstructed from the surviving body and from
    the call in ``cleaner``; confirm the exact pattern against upstream.

    :param html: page source
    :return: page source with qualifying inputs replaced by their value
    """
    for input_tag in _INPUT_TAG_RE.findall(html):
        values = _INPUT_VALUE_RE.findall(input_tag)
        if not values:
            continue
        if any(marker in input_tag for marker in _HIDDEN_MARKERS):
            continue
        html = html.replace(input_tag, values[0])
    return html
def _lowercase_tag(html):
    """Normalise tags: lowercase them, then repair fused tag/attribute names.

    :param html: page source
    :return: page source with normalised tags
    """
    distinct_tags = set(re.findall("<[^>]+>", html))
    if len(distinct_tags) <= 10000:
        # Swap each distinct tag text for its lowercase spelling.
        for original in distinct_tags:
            html = html.replace(original, original.lower())
    else:
        # More than 10000 distinct tags: parse the page with BeautifulSoup
        # (lxml) and keep the string form of the node that follows <body>.
        from bs4 import BeautifulSoup
        html = str(BeautifulSoup(html, "lxml").body.next_element)
    # Re-insert the space between a tag name and an attribute name glued to it.
    for broken, fixed in _repair_tag().items():
        html = html.replace(broken, fixed)
    return html
def _del_tag(html):
    """Apply every ``SPECIAL_TAGS`` rule to the page.

    :param html: page source
    :return: page source after each compiled pattern has been substituted
        with its mapped replacement
    """
    cleaned = html
    for pattern in SPECIAL_TAGS:
        cleaned = pattern.sub(SPECIAL_TAGS[pattern], cleaned)
    return cleaned
def cleaner(html, special=None, completely=False):
    """Clean a page's HTML source with the module's regex rule tables.

    :param html: page source to clean
    :param special: optional extra ``pattern -> replacement`` rules; they are
        layered over ``OTHER`` for this call only
    :param completely: when true, also remove canvas and iframe elements and
        every remaining tag that consists only of non-Chinese characters
        and/or the font names listed in the final pattern
    :return: cleaned page source
    """
    # Work on a copy: updating OTHER in place made one caller's special rules
    # apply to every later call as well.
    other_rules = dict(OTHER)
    if special is not None:
        other_rules.update(special)

    # Later tables win on identical keys; rules run in insertion order.
    rules = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **other_rules,
        **CSS_STYLE,
        **BLANKS,
    }

    html = _lowercase_tag(html)
    # html = _del_tag(html)  # would run before the rule tables (disabled)
    for pattern, repl in rules.items():
        html = re.sub(pattern, repl, html)

    if completely:
        # NOTE(review): both patterns were empty strings in this copy (a
        # no-op); they are reconstructed from their inline comments
        # ("canvas", "inline frame"). Confirm against the upstream source.
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # iframe
        # Drop any tag whose content is only non-Chinese characters and/or
        # one of the listed font names.
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)

    html = _escape_character(html)
    html = _clean_input(html)  # replace visible inputs with their value
    return html