# coding:utf-8
import re
# Pre-compiled patterns shared by the HTML cleaning helpers in this module.

# Line-break tags: <br>, <br/>, <br /> and <br ...attributes>.
# NOTE(review): the original literal was split across two physical lines
# (a syntax error); this pattern is reconstructed from how br_reg is used
# (each match is replaced by '\n') -- confirm against the upstream source.
br_reg = re.compile(r'<br\b[^>]*>', re.I)
# Opening/closing <table ...> tags; group 1 is the tag text without '<' and '>'.
table_reg = re.compile('<([/]*table[^>]*)>', re.I)
# Opening/closing <tbody ...> tags; group 1 is the tag text without '<' and '>'.
tablebody_reg = re.compile('<([/]*tbody[^>]*)>', re.I)
# <input ... value="..."/> tags; group 1 captures the value attribute.
# The tag must end with '/>' to match.
input_reg = re.compile(r'<[/]*input[^>].*?value="(.*?)"[/]>', re.I)
# Tags whose name starts with tr / th / td (opening or closing); group 1 is
# the tag text without the angle brackets.
tr_reg = re.compile('<([/]*tr[^>]*)>', re.I)
th_reg = re.compile('<([/]*th[^>]*)>', re.I)
td_reg = re.compile('<([/]*td[^>]*)>', re.I)
# Bare <p> and </p> only (a <p> carrying attributes does not match).
p_reg = re.compile('<[/]?p>', re.I)
# Any remaining '<...>' tag.
othertag_reg = re.compile('<[^>]+>', re.I | re.M)
# Runs of tab, '|' or space characters ('|' is a literal inside the class).
other_symbol_reg = re.compile('[\t| ]*')
# One or more newlines plus any whitespace that follows them.
seg_first_space_reg = re.compile('\n+\\s*', re.M)
# Runs of consecutive newlines.
mul_crcf_reg = re.compile('\n+', re.M)
# Runs of any whitespace (despite the name, this does not match brackets).
brackets_reg = re.compile('\\s+')
# Bracket-form table spans "[table ...] ... [/table]": group 1 is the opening
# marker, group 2 the content (may span lines because of re.S), group 3 the
# closing marker.
table_fk_reg = re.compile('(\\[table[^\\]]*\\])(.*?)(\\[/table\\])', re.M | re.S | re.I)
## Strip HTML tags, leaving plain text
def Clean(html: str):
    """Remove HTML markup from *html* and return the remaining text.

    The substitutions run in a fixed order:

    1. ``br_reg`` matches and ``tr_reg`` matches become newlines;
       ``table_reg`` / ``tablebody_reg`` matches are deleted.
    2. ``td_reg`` matches become a single space, bare ``<p>``/``</p>``
       become newlines, and every other ``<...>`` tag is deleted.
    3. All tabs, ``|`` characters and spaces are removed (this also removes
       the spaces inserted for table cells in step 2).
    4. Newline runs, together with any whitespace following them, collapse
       to a single newline.
    """
    # Order matters: specific tags are handled before the catch-all tag
    # stripper, and whitespace normalisation runs last.
    substitutions = (
        (br_reg, '\n'),
        (table_reg, ''),
        (tablebody_reg, ''),
        (tr_reg, '\n'),
        (td_reg, ' '),
        (p_reg, '\n'),
        (othertag_reg, ''),
        (other_symbol_reg, ''),
        (seg_first_space_reg, '\n'),
        (mul_crcf_reg, '\n'),
    )
    cleaned = html
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
def ClearSpace(txt: str):
    """Return *txt* with every run matched by ``brackets_reg`` removed."""
    # Splitting on the pattern and re-joining the pieces drops each match,
    # which is the same as substituting it with the empty string.
    pieces = brackets_reg.split(txt)
    return ''.join(pieces)
## Strip HTML tags, but keep the table markup
def CleanKeepTable(html: str):
html = br_reg.sub('\n', html)
html = table_reg.sub(subFunc4Match, html)
html = tablebody_reg.sub(subFunc4Match, html)
html = tr_reg.sub(subFunc4Match, html)
html = td_reg.sub(subFunc4Match, html)
html = th_reg.sub(subFunc4Match, html)
html = p_reg.sub('\n', html)
html = othertag_reg.sub('', html)
# html = other_symbol_reg.sub('',html)
html = seg_first_space_reg.sub('\n', html)
# print("-->", html)
html = table_fk_reg.sub(lambda x: x.group(1) + mul_crcf_reg.sub(' ', x.group(2)) + x.group(3), html)
html = mul_crcf_reg.sub('\n', html)
# 清理table标签中的空格
html = html.replace('[', '<').replace(']', '>')
html = html.replace('