# coding:utf-8
"""Regex helpers that reduce HTML fragments to plain text.

``Clean`` drops every tag, ``CleanKeepTable`` drops every tag except table
markup, and ``ClearSpace`` removes all whitespace.
"""
import re

# NOTE(review): ``br_reg`` and ``zhushi_reg`` were compiled from empty patterns
# (their tag-like text appears to have been stripped at some point). An empty
# pattern matches at every position, so ``sub`` inserted the replacement
# between every character. The patterns below are reconstructions: a ``<br>``
# matcher and an HTML-comment matcher ("zhushi" = comment). Confirm against
# the intended originals.
br_reg = re.compile(r'<br\b[^>]*>', re.I)
table_reg = re.compile(r'<([/]*table[^>]*)>', re.I)
tablebody_reg = re.compile(r'<([/]*tbody[^>]*)>', re.I)
input_reg = re.compile(r'<[/]*input[^>].*?value="(.*?)"[/]>', re.I)
tr_reg = re.compile(r'<([/]*tr[^>]*)>', re.I)
th_reg = re.compile(r'<([/]*th[^>]*)>', re.I)
td_reg = re.compile(r'<([/]*td[^>]*)>', re.I)
p_reg = re.compile(r'<[/]?p>', re.I)
othertag_reg = re.compile(r'<[^>]+>', re.I | re.M)
# Tabs and spaces only. The previous class ``[\t| ]`` also contained a literal
# ``|`` and therefore deleted pipe characters from the text.
other_symbol_reg = re.compile(r'[\t ]+')
seg_first_space_reg = re.compile(r'\n+\s*', re.M)
mul_crcf_reg = re.compile(r'\n+', re.M)
brackets_reg = re.compile(r'\s+')
table_fk_reg = re.compile(
    r'(\[table[^\]]*\])(.*?)(\[/table\])', re.M | re.S | re.I
)
zhushi_reg = re.compile(r'<!--.*?-->', re.I | re.S)

# Private-use code points that stand in for literal "[" / "]" while
# ``CleanKeepTable`` uses real brackets as temporary table-tag placeholders.
_LBRACKET_MARK = '\ue000'
_RBRACKET_MARK = '\ue001'


def Clean(html: str) -> str:
    """Strip all HTML tags from *html* and return the remaining text.

    ``<br>``, ``<tr>`` and ``<p>`` tags become newlines, ``<td>`` tags become
    spaces, and every other tag is removed. Afterwards all tabs and spaces
    are deleted (including the ones produced from ``<td>``), whitespace that
    follows a newline is dropped, and runs of newlines collapse to one.
    """
    html = br_reg.sub('\n', html)
    html = table_reg.sub('', html)
    html = tablebody_reg.sub('', html)
    html = tr_reg.sub('\n', html)
    html = td_reg.sub(' ', html)
    html = p_reg.sub('\n', html)
    html = othertag_reg.sub('', html)
    html = other_symbol_reg.sub('', html)
    html = seg_first_space_reg.sub('\n', html)
    html = mul_crcf_reg.sub('\n', html)
    return html


def ClearSpace(txt: str) -> str:
    """Return *txt* with every whitespace character removed."""
    return brackets_reg.sub('', txt)


def CleanKeepTable(html: str) -> str:
    """Strip HTML tags from *html* but keep table markup.

    Tags matched by the table, tbody, tr, td and th patterns are temporarily
    rewritten as ``[...]`` so they survive the generic tag removal, and are
    turned back into ``<...>`` at the end. Newlines inside a table are
    replaced by spaces so each table ends up on a single line. HTML comments
    are removed, and ``<br>`` / ``<p>`` become newlines.
    """
    # Hide literal brackets first; otherwise the final "[" -> "<" conversion
    # would also rewrite ordinary text such as "a[1]" into "a<1>".
    html = html.replace('[', _LBRACKET_MARK).replace(']', _RBRACKET_MARK)

    html = zhushi_reg.sub('', html)
    html = br_reg.sub('\n', html)
    for table_tag_reg in (table_reg, tablebody_reg, tr_reg, td_reg, th_reg):
        html = table_tag_reg.sub(subFunc4Match, html)
    html = p_reg.sub('\n', html)
    html = othertag_reg.sub('', html)
    html = seg_first_space_reg.sub('\n', html)

    # Flatten each table: newlines between its placeholders become spaces.
    html = table_fk_reg.sub(
        lambda m: m.group(1) + mul_crcf_reg.sub(' ', m.group(2)) + m.group(3),
        html,
    )
    html = mul_crcf_reg.sub('\n', html)

    # Turn the placeholders back into real tags, then restore literal brackets.
    html = html.replace('[', '<').replace(']', '>')
    html = html.replace(_LBRACKET_MARK, '[').replace(_RBRACKET_MARK, ']')

    # NOTE(review): a final ``html.replace('', '\n')`` used to run here. With
    # an empty search string it inserts a newline between every character; the
    # token it was meant to replace is not recoverable from this file, so the
    # step is omitted rather than guessed. Restore it with the real token if
    # it is known.
    return html


def subFunc4Match(strmatch):
    """Return group 1 of *strmatch* wrapped in square brackets.

    Used as the replacement callback that turns ``<tag ...>`` into the
    ``[tag ...]`` placeholder. Returns an empty string when *strmatch* is
    falsy, is not a match object, or has no group 1.
    """
    if not strmatch:
        return ""
    try:
        return '[%s]' % (strmatch.group(1))
    except (AttributeError, IndexError):
        # Not a match object, or the pattern has no capture group.
        return ""
# One complete <input ...> tag. ``[^>]*`` keeps the match inside a single tag,
# and the closing slash is optional, so both <input ...> and <input .../> match.
_INPUT_TAG_RE = re.compile(r'<input\b[^>]*>', re.I)
# A double-quoted ``value`` attribute. The lookbehind rejects attributes that
# merely end in "value", such as ``data-value``.
_INPUT_VALUE_RE = re.compile(r'(?<![\w-])value\s*=\s*"([^"]*)"', re.I)


def extract_input_value(html):
    """Replace each ``<input>`` tag in *html* with its ``value`` text.

    Args:
        html: Markup to process.

    Returns:
        *html* with every ``<input ... value="...">`` tag replaced by the
        content of its double-quoted ``value`` attribute. Input tags without
        such an attribute are left unchanged.

    A ``>`` inside an attribute value ends the tag match early, in which case
    the tag is left untouched.
    """
    def _value_or_tag(tag_match):
        # Keep the tag verbatim when it carries no value attribute.
        tag = tag_match.group(0)
        value_match = _INPUT_VALUE_RE.search(tag)
        return value_match.group(1) if value_match else tag

    return _INPUT_TAG_RE.sub(_value_or_tag, html)