import re

# Matches one whole tag, from "<" up to the first following ">".
_TAG_RE = re.compile(r"<[^>]+>")

# Literal site names / hostnames that get masked with "___".  They are plain
# text, not regular expressions, so they are applied with str.replace().
# The longer variant of each name is listed before the shorter one it contains.
_MASKED_LITERALS = (
    '中国采购与招标网',
    '采购与招标网',
    'www.chinabidding.com.cn',
    'www.chinabidding.cn',
)

# Ordered (pattern, replacement) table.  Order matters: double quotes are
# rewritten to single quotes before the attribute patterns run, so the
# double-quoted attribute patterns only ever see text that is still
# double-quoted at that point.
#
# NOTE(review): in the reviewed copy of this file many entries of this table
# had keys that were empty or consisted only of line breaks -- presumably HTML
# tag patterns that were lost before review.  They could not be recovered and
# are not reproduced here; restore them from version control if they exist.
_SUBSTITUTIONS = tuple(
    (re.compile(pattern, re.S | re.M), replacement)
    for pattern, replacement in (
        ('"', "'"),
        ('\n', ''),
        ('\xa0', ''),
        # NOTE(review): this removes every plain space, including the ones
        # separating a tag name from its attributes -- confirm that the
        # original key was a space and not an entity such as "&nbsp;".
        (' ', ''),
        ('style=".*?"', ''),
        ("style='.*?'", ''),
        ('class=".*?"', ''),
        ("class='.*?'", ''),
        ("align='.*?'", ''),
        ('align=".*?"', ''),
        ('border=".*?"', ''),
        ("border='.*?'", ''),
        ('cellpadding=".*?"', ''),
        ("cellpadding='.*?'", ''),
        ('cellspacing=".*?"', ''),
        ("cellspacing='.*?'", ''),
        ('center=".*?"', ''),
        ("center='.*?'", ''),
        ('width=".*?"', ''),
        ("width='.*?'", ''),
        ("bordercolor='.*?'", ''),
        ('bgcolor=".*?"', ''),
        ('BORDERCOLOR=".*?"', ''),
        ('【关闭】', ''),
        ('【打印】', ''),
        ('若附件无法下载,你可以尝试使用360极速浏览器进行下载!', ''),
    )
)


def clean_html(html_str: str) -> str:
    """Normalise an HTML fragment by lower-casing tags and stripping noise.

    Steps, in order:

    1. Every ``<...>`` tag is lower-cased as a whole (tag name, attribute
       names and attribute values alike).
    2. Every occurrence of the names in ``_MASKED_LITERALS`` is replaced
       with ``"___"``.
    3. Each entry of ``_SUBSTITUTIONS`` is applied once, in table order.

    Args:
        html_str: The HTML text to clean.

    Returns:
        The cleaned text.
    """
    # One pass with a callback instead of one str.replace() per tag found.
    html_str = _TAG_RE.sub(lambda match: match.group(0).lower(), html_str)

    # str.replace() masks *all* occurrences and treats "." literally.  The
    # previous re.sub(..., re.S | re.M) call passed the flags as ``count``,
    # which capped the masking at the first 24 hits.
    for literal in _MASKED_LITERALS:
        html_str = html_str.replace(literal, '___')

    # Flags are bound at compile time, so they can no longer be mistaken for
    # the positional ``count`` argument of re.sub().
    for pattern, replacement in _SUBSTITUTIONS:
        html_str = pattern.sub(replacement, html_str)

    return html_str