|
@@ -1,90 +1,146 @@
|
|
|
-# HTML 替换
|
|
|
import re
|
|
|
|
|
|
+__all__ = ['cleaner', 'clean_js']
|
|
|
+
|
|
|
# Every table below maps a regular expression to its replacement text.  The
# patterns are written as raw strings so that ``\s``, ``\d`` ... reach the
# regex engine without relying on Python's deprecated handling of unknown
# string escapes.

# Standalone elements
INDEPENDENT_TAGS = {
    r'<head>[\s\S]*?</head>': '',
    r'<html>|<html [^>]*>|</html>': '',
    r'<body>|<body [^>]*>|</body>': '',
    r'<meta[^<>]*>|<meta [^<>]*>|<meta[^<>]*>[\s\S]*?</meta>|</meta>': '',  # metadata
    r'&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
    r'\xa0|\u3000': '',  # no-break space / ideographic space
    r'<!--[\s\S]*?-->': '',  # comments
    r'<style[^<>]*>[\s\S]*?</style>': '',  # style sheets
    r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
    # NOTE(review): only the bare tag is matched; an <input> that carries
    # attributes is left in place -- confirm this is intended.
    r'<input>': '',  # input box
    r'<img[^>]*>': '<br>',  # images
}
# Inline elements
INLINE_TAGS = {
    r'<a>|<a [^>]*>|</a>': '',  # hyperlinks
    r'<span>|<span [^>]*>|</span>': '',  # span
    r'<label>|<label [^>]*>|</label>': '<br>',  # label
    r'<font>|<font [^>]*>|</font>': '',  # font
}
# Block-level elements
BLOCK_TAGS = {
    r'<h[1-6][^>]*>[\s\S]*?</h[1-6]>': '',  # headings, text included
    # Alternative rule that strips only the heading tags and keeps the text:
    # r'<h[1-6][^>]*>|</h[1-6]>': '',
    r'<p>|<p [^>]*>|</p>': '<br>',  # paragraphs
    r'<div>|<div [^>]*>|</div>': '<br>',  # divisions
    r'<o:p>|<o:p [^>]*>|</o:p>': '',  # Microsoft Office (Word) paragraphs
}
# Miscellaneous
OTHER = {
    # '?' must be escaped: unescaped it made the leading '<' optional, so the
    # rule matched from "xml" onwards and left a stray "<?" in the output.
    r'<\?xml[^>]*>': '',
    r'<epointform>': '',
    r'<!doctype html>|<!doctype html [^>]*>': '',
    r'【关闭】|关闭': '',
    r'【打印】|打印本页': '',
    # Lazy quantifier: the greedy form removed everything up to the LAST
    # closing bracket of the whole document.
    r'【字体:[\s\S]*?】': '',
    '文章来源:[\u4e00-\u9fa5]+': '',
    # NOTE(review): greedy; consumes up to and including the last '<' on the
    # line, which can truncate the following tag -- confirm the intent.
    r'浏览次数:.*[<]+': '',
    # The parentheses are literal (ASCII or full-width).  Unescaped they formed
    # a group whose lazy tail matched nothing, so only the label was removed.
    r'[(\uff08]责任编辑[:\uff1a].*?[)\uff09]': '',
    r'分享到[:\uff1a]': '',  # ASCII or full-width colon
    r'阅读数[:\uff1a]\d+': '',  # ASCII or full-width colon
}
# Presentational attributes
CSS_STYLE = {
    r'style="[\s\S]*?"|style ="[\s\S]*?"': '',
    r'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '',
    r'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '',
    r'class="[\s\S]*?"|class ="[\s\S]*?"': '',
    r'align="[\s\S]*?"|align ="[\s\S]*?"': '',
    r'cellpadding="(\d+)"|cellspacing="(\d+)"': '',
}
# Whitespace (applied in this order; the last rule collapses every remaining
# run of whitespace, newlines included, into a single space)
BLANKS = {
    r'\n\s*\n': '\n',
    r'\s*\n\s*': '\n',
    r'[^\S\n]': ' ',
    r'\s+': ' ',
}
# Tag names combined with ATTRS to build the malformed-tag repair table
TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'}
# Attribute names combined with TAGS to build the malformed-tag repair table
ATTRS = {'id', 'class', 'style', 'width'}
|
|
|
+
|
|
|
+
|
|
|
def _repair_tag():
    """Build the lookup of malformed tag spellings and their corrections.

    Non-standard pages sometimes glue a tag name to its attribute
    (``divclass``).  Each pairing of ``TAGS`` with ``ATTRS`` is mapped from
    the glued spelling to the one separated by a space.

    :return: dict of ``{'<tag><attr>': '<tag> <attr>'}`` entries
    """
    return {
        tag + attr: tag + ' ' + attr
        for tag in TAGS
        for attr in ATTRS
    }
|
|
|
|
|
|
-def th(neirong):
|
|
|
- tihuan = {
|
|
|
- '<!--.*?-->': '',
|
|
|
- '"': "'",
|
|
|
- '\n': '',
|
|
|
- '\xa0': "",
|
|
|
- '<script .*?>': '',
|
|
|
- '</script>': '',
|
|
|
- '<span .*?>': '',
|
|
|
- '</span> ': '',
|
|
|
- '<p.*?>': '<br>',
|
|
|
- '</p>': '<br>',
|
|
|
- '<div>': '<br>',
|
|
|
- '<div .*?>': '<br>',
|
|
|
- '<img .*?>': '<br>',
|
|
|
- '</div>': '<br>',
|
|
|
- '<style.*?</style>': '',
|
|
|
- '<EpointForm>': '',
|
|
|
- '<html.*?</head>': '',
|
|
|
- '<input .*?>': '',
|
|
|
- '<!DOCTYPE.*?>': '',
|
|
|
- '</meta>': '',
|
|
|
- '<?xml:.*?>': '',
|
|
|
- '<label.*?>': '<br>',
|
|
|
- '</label>': '',
|
|
|
- 'style=".*?"': '',
|
|
|
- "style='.*?'": '',
|
|
|
- 'class=".*?"': '',
|
|
|
- "class='.*?'": '',
|
|
|
- "bordercolor='.*?'": '',
|
|
|
- 'bgcolor=".*?"': '',
|
|
|
- 'BORDERCOLOR=".*?"': '',
|
|
|
- 'width=".*?"': '',
|
|
|
- '<a name=".*?">': '',
|
|
|
- '<o:p>': '',
|
|
|
- '</o:p>': '',
|
|
|
- '<A name=.*?>': '',
|
|
|
- '<a .*?>': '',
|
|
|
- '</a>': '',
|
|
|
- '<font .*?>': '',
|
|
|
- '</font>': '',
|
|
|
- '<body>': '',
|
|
|
- '</body>': '',
|
|
|
- '<h\d{1}\s{0,10}>.*</h\d{1}\s{0,10}>': '',
|
|
|
- '</h\d{1}\s{0,10}>': '',
|
|
|
- '<h\d{1}\s{0,10}}>': '',
|
|
|
- '【关闭】': '',
|
|
|
- '【打印】': '',
|
|
|
- }
|
|
|
|
|
|
- nr = neirong
|
|
|
def _escape_character(html):
    """Decode the basic HTML entities into their literal characters.

    Handles ``lt``, ``gt``, ``quot`` and ``amp``.  The previous body replaced
    each character with itself and therefore changed nothing.

    :param html: page source that may contain escaped characters
    :return: the source with the four entities decoded exactly once
    """
    # Entity name -> literal character.  The ampersand is decoded last so
    # that a doubly escaped sequence is unwrapped by one level only.
    entities = (
        ('lt', '<'),
        ('gt', '>'),
        ('quot', '"'),
        ('amp', '&'),
    )
    for name, char in entities:
        html = html.replace('&' + name + ';', char)
    return html
|
|
|
|
|
|
- all_tag = re.findall("<[^>]+>", nr)
|
|
|
- for tag in all_tag:
|
|
|
- nr = nr.replace(tag, str(tag).lower())
|
|
|
|
|
|
- def thh(k, v, c):
|
|
|
- return re.sub(k, v, c)
|
|
|
def _lowercase_tag(html):
    """Normalise tags to lower case and repair glued tag/attribute pairs.

    :param html: page source
    :return: the source with everything between ``<`` and ``>`` lower-cased
        and malformed openings such as ``<divclass=`` rewritten to
        ``<div class=``
    """
    # Single pass over the document.  The earlier find-all followed by one
    # str.replace per tag rescanned the whole page for every tag found.
    html = re.sub(r'<[^>]+>', lambda match: match.group(0).lower(), html)

    # Repair only where a tag opens.  Replacing the bare key anywhere in the
    # page also rewrote ordinary text ("individual" -> "indiv idual",
    # "rapid" -> "rap id").
    for err, right in _repair_tag().items():
        html = html.replace('<' + err, '<' + right)

    return html
|
|
|
|
|
|
-def th_1(neirong):
|
|
|
- tihuan = {
|
|
|
- '<!--.*?-->': '',
|
|
|
- '"': "'",
|
|
|
- '\n': '',
|
|
|
- '\xa0': "",
|
|
|
- '<script .*?>': '',
|
|
|
- '</script>': '',
|
|
|
+
|
|
|
def cleaner(html, special=None, completely=False):
    """
    Clean a page's HTML source.

    Tags are lower-cased first, then every rule table is applied in order
    (standalone, inline, block, miscellaneous plus ``special``, style
    attributes, whitespace), and finally the basic entities are decoded.

    :param html: page source to clean
    :param special: extra ``{pattern: replacement}`` rules for this call only;
        they are applied together with the miscellaneous rules
    :param completely: when true, additionally remove ``<canvas>`` and
        ``<iframe>`` elements and every remaining tag
    :return: the cleaned page source
    """
    # Work on a copy: updating the module-level OTHER table in place made the
    # rules of one call leak into every later call.
    other_rules = dict(OTHER)
    if special is not None:
        other_rules.update(special)

    remove_tags = {
        **INDEPENDENT_TAGS,
        **INLINE_TAGS,
        **BLOCK_TAGS,
        **other_rules,
        **CSS_STYLE,
        **BLANKS,
    }
    html = _lowercase_tag(html)
    for tag, repl in remove_tags.items():
        html = re.sub(tag, repl, html)

    if completely:
        html = re.sub(r'<canvas[^<>]*>[\s\S]*?</canvas>', '', html)  # canvas
        html = re.sub(r'<iframe[^<>]*>[\s\S]*?</iframe>', '', html)  # inline frame
        # Any remaining tag: no CJK characters between the brackets, apart
        # from the three font names listed in the pattern.
        html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html)

    html = _escape_character(html)
    return html
|
|
|
|
|
|
- def thh(k, v, c):
|
|
|
- return re.sub(k, v, c)
|
|
|
|
|
|
- for k, v in tihuan.items():
|
|
|
- nr = re.sub(k, v, thh(k, v, nr), re.S, re.M)
|
|
|
- return nr
|
|
|
def clean_js(html):
    """
    Remove JavaScript and whitespace noise while keeping the other markup.

    Tags are lower-cased first; then whitespace entities, no-break and
    ideographic spaces and ``<script>`` elements are removed, and the
    ``BLANKS`` rules collapse the remaining whitespace.

    :param html: page source
    :return: the page source without ``<script>`` elements
    """
    # Raw strings: '\s' and '\S' are not valid Python string escapes.
    remove_tags = {
        r'&(nbsp|e[mn]sp|thinsp|zwn?j|#13);': '',  # whitespace entities
        r'\xa0|\u3000': '',  # no-break space / ideographic space
        r'<script[^<>]*>[\s\S]*?</script>': '',  # JavaScript
        **BLANKS,
    }
    html = _lowercase_tag(html)
    for tag, repl in remove_tags.items():
        html = re.sub(tag, repl, html)
    return html
|