]*>': '
', # 分割 division '

': '', # 分割 division '|]*>|': '' # OFFICE微软WORD段落 } # 其他 OTHER = { ']*>|]*>|': '', '': '', '|]*>': '', '【关闭】|关闭': '', '【打印】|打印本页': '', '【字体：[\s\S]*】': '', '文章来源：[\u4e00-\u9fa5]+': '', '浏览次数：.*[<]+': '', '（责任编辑：.*?）': '', '分享到[：]': '', } # 样式 CSS_STYLE = { 'style="[\s\S]*?"|style ="[\s\S]*?"': '', 'bgcolor="[\s\S]*?"|bgcolor ="[\s\S]*?"': '', 'bordercolor="[\s\S]*?"|bordercolor ="[\s\S]*?"': '', 'class="[\s\S]*?"|class ="[\s\S]*?"': '', 'align="[\s\S]*?"|align ="[\s\S]*?"': '', 'cellpadding="(\d+)"|cellspacing="(\d+)"': '', } # 空白符 BLANKS = { '\n\s*\n': '\n', '\s*\n\s*': '\n', '[^\S\n]': ' ', '\s+': ' ', } # css标签集合 TAGS = {'table', 'tr', 'td', 'div', 'span', 'p'} # css属性集合 ATTRS = {'id', 'class', 'style', 'width'} # 特殊样式+指定样式的标签 SPECIAL_TAGS = { re.compile('(?i)<[^>]+style="display: none".*[^>]+>'): "
", } def _repair_tag(): """异常的标签组合,用来替换非标准页面的标签""" _repairs = {} for tag in TAGS: for attr in ATTRS: key = '{}{}'.format(tag, attr) val = '{} {}'.format(tag, attr) _repairs[key] = val return _repairs def _escape_character(html): """转义字符""" html = html.replace('<', '<') html = html.replace('>', '>') html = html.replace('"', '"') html = html.replace('&', '&') # 不显示输入框边框 html = html.replace('', re.S) valueTxt = re.compile(r'value=["|\'](.*?)["|\']') input_list = re.findall(inputTxt, html) or [] for ipt in input_list: val = re.findall(valueTxt, ipt) if val and "hidden" not in ipt and "hide" not in ipt and "display: none" not in ipt: html = html.replace(ipt,val[0]) return html def _lowercase_tag(html): """标签归一化处理（全部小写 + 标签修复）""" tags = re.findall("<[^>]+>", html) tag_sets = set(tags) if len(tag_sets) > 10000: from bs4 import BeautifulSoup soup = BeautifulSoup(html, "lxml") html = str(soup.body.next_element) else: for tag in tag_sets: html = html.replace(tag, str(tag).lower()) repair_tags = _repair_tag() for err, right in repair_tags.items(): html = html.replace(err, right) return html def _del_tag(html): """删除特殊+指定样式的标签""" for tag, repl in SPECIAL_TAGS.items(): html = tag.sub(repl, html) return html def cleaner(html, special=None, completely=False): """ 数据清洗 :param html: 清洗的页面 :param special: 额外指定页面清洗规则 :param completely: 是否完全清洗页面 :return: 清洗后的页面源码 """ if special is None: special = {} OTHER.update(special) remove_tags = { **INDEPENDENT_TAGS, **INLINE_TAGS, **BLOCK_TAGS, **OTHER, **CSS_STYLE, **BLANKS, } html = _lowercase_tag(html) # html = _del_tag(html) # 优先处理 for tag, repl in remove_tags.items(): html = re.sub(tag, repl, html) if completely: html = re.sub(r']*>[\s\S]*?', '', html) # 画布 html = re.sub(r']*>[\s\S]*?', '', html) # 内框架 html = re.sub('<([^<>\u4e00-\u9fa5]|微软雅黑|宋体|仿宋)+>', '', html) html = _escape_character(html) html = _clean_input(html) # 处理 input return html