]*>': '
', # 分割 division '

': '', # 分割 division '|]*>|': '' # OFFICE微软WORD段落 } # 其他 OTHER = { ']*>|]*>|': '', '': '', '|]*>': '', '【关闭】|关闭': '', '【打印】|打印本页': '', '【字体：[\s\S]*】': '', '文章来源：[\u4e00-\u9fa5]+': '', '浏览次数：.*[<]+': '', '（责任编辑：.*?）': '', '分享到[：]': '', } # 样式 CSS_STYLE = { 'style="[\s\S]*?"|style ="[\s\S]*?"': '', 'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '', 'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '', 'class="[\s\S]*?"|class ="[\s\S]*?"': '', 'align="[\s\S]*?"|align ="[\s\S]*?"': '', 'cellpadding="(\d+)"|cellspacing="(\d+)"': '', } # 空白符 BLANKS = { '\n\s*\n': '\n', '\s*\n\s*': '\n', '[^\S\n]': ' ', '\s+': ' ', } # css标签集合 TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'} # css属性集合 ATTRS = {'id', 'class', 'style', 'width'} def _repair_tag(): """异常的标签组合,用来替换非标准页面的标签""" _repairs = {} for tag in TAGS: for attr in ATTRS: key = '{}{}'.format(tag, attr) val = '{} {}'.format(tag, attr) _repairs[key] = val return _repairs def _escape_character(html): """转义字符""" html = html.replace('<', '<') html = html.replace('>', '>') html = html.replace('"', '"') html = html.replace('&', '&') return html def _lowercase_tag(html): """标签归一化处理（全部小写）""" tags = re.findall("<[^>]+>", html) for tag in tags: html = html.replace(tag, str(tag).lower()) repair_tags = _repair_tag() for err, right in repair_tags.items(): html = html.replace(err, right) return html def cleaner(html, special=None, completely=False): """ 数据清洗 :param html: 清洗的页面 :param special: 额外指定页面清洗规则 :param completely: 是否完全清洗页面 :return: 清洗后的页面源码 """ if special is None: special = {} OTHER.update(special) remove_tags = { **INDEPENDENT_TAGS, **INLINE_TAGS, **BLOCK_TAGS, **OTHER, **CSS_STYLE, **BLANKS, } html = _lowercase_tag(html) for tag, repl in remove_tags.items(): html = re.sub(tag, repl, html) if completely: html = re.sub(r']*>[\s\S]*?', '', html) # 画布 html = re.sub(r']*>[\s\S]*?', '', html) # 内框架 html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html) html = _escape_character(html) return html