123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- # coding:utf-8
- import re
# Pre-compiled patterns used by the HTML cleaning helpers in this module.

# <br>, <br/>, <br//>... (no attributes, no whitespace inside the tag).
br_reg = re.compile(r'<br[/]*>', flags=re.IGNORECASE)

# Opening/closing table-structure tags; group 1 is the tag content without
# the angle brackets (slash, tag name and any attributes).
table_reg = re.compile(r'<([/]*table[^>]*)>', flags=re.IGNORECASE)
tablebody_reg = re.compile(r'<([/]*tbody[^>]*)>', flags=re.IGNORECASE)

# Self-closing <input .../> whose value="..." is directly followed by "/>";
# group 1 is the value text.
input_reg = re.compile(
    r'<[/]*input[^>].*?value="(.*?)"[/]>',
    flags=re.IGNORECASE,
)

# NOTE: these are prefix matches on the tag name, so e.g. th_reg also matches
# any tag whose name merely starts with "th".
tr_reg = re.compile(r'<([/]*tr[^>]*)>', flags=re.IGNORECASE)
th_reg = re.compile(r'<([/]*th[^>]*)>', flags=re.IGNORECASE)
td_reg = re.compile(r'<([/]*td[^>]*)>', flags=re.IGNORECASE)

# Bare <p> or </p> only (a <p> carrying attributes does not match).
p_reg = re.compile(r'<[/]?p>', flags=re.IGNORECASE)

# Any remaining tag: "<", one or more non-">" characters, ">".
othertag_reg = re.compile(r'<[^>]+>', flags=re.IGNORECASE | re.MULTILINE)

# Runs of tab, "|" and space characters. The "|" sits inside a character
# class, so it is a literal pipe rather than an alternation.
other_symbol_reg = re.compile('[\t| ]*')

# One or more newlines followed by any amount of whitespace.
seg_first_space_reg = re.compile('\n+\\s*', flags=re.MULTILINE)

# One or more consecutive newlines.
mul_crcf_reg = re.compile('\n+', flags=re.MULTILINE)

# One or more whitespace characters (despite the name, not brackets).
brackets_reg = re.compile(r'\s+')

# A "[table ...] ... [/table]" span written with square brackets; groups are
# (opening marker, inner content, closing marker). DOTALL lets the inner
# content span several lines.
table_fk_reg = re.compile(
    r'(\[table[^\]]*\])(.*?)(\[/table\])',
    flags=re.MULTILINE | re.DOTALL | re.IGNORECASE,
)
# Strip HTML tags.
def Clean(html: str):
    """Flatten an HTML fragment into plain text.

    The substitutions below are applied in order:

    1. ``<br>`` variants become a newline.
    2. ``table`` / ``tbody`` tags are removed.
    3. ``tr`` tags become a newline, ``td`` tags become a space.
    4. Bare ``<p>`` / ``</p>`` become a newline.
    5. Every remaining tag is removed.
    6. All tab, space and ``|`` characters are removed.
    7. A run of newlines plus any following whitespace becomes one newline,
       and any remaining newline runs are collapsed to a single newline.

    NOTE(review): step 6 deletes every space, so the space substituted for
    ``td`` tags in step 3 does not survive into the result -- confirm this is
    the intended behaviour.
    """
    # (pattern, replacement) pairs; the order matters because later patterns
    # operate on the output of earlier ones.
    pipeline = (
        (br_reg, '\n'),
        (table_reg, ''),
        (tablebody_reg, ''),
        (tr_reg, '\n'),
        (td_reg, ' '),
        (p_reg, '\n'),
        (othertag_reg, ''),
        (other_symbol_reg, ''),
        (seg_first_space_reg, '\n'),
        (mul_crcf_reg, '\n'),
    )
    text = html
    for pattern, replacement in pipeline:
        text = pattern.sub(replacement, text)
    return text
def ClearSpace(txt: str):
    """Return *txt* with every run of whitespace removed."""
    # Splitting on whitespace runs and gluing the pieces back together drops
    # exactly the characters the pattern matches.
    fragments = brackets_reg.split(txt)
    return ''.join(fragments)
# Strip HTML tags, but keep the table markup.
def CleanKeepTable(html: str):
    """Flatten an HTML fragment to text while preserving table tags.

    Tags matched by ``table_reg``, ``tablebody_reg``, ``tr_reg``, ``td_reg``
    and ``th_reg`` are temporarily rewritten from ``<...>`` to ``[...]``
    (attributes included) so that the generic tag stripper leaves them alone;
    they are turned back into ``<...>`` at the end. All other tags are
    removed, ``<br>`` and bare ``<p>`` / ``</p>`` become newlines, newlines
    inside a table are replaced by spaces, and each table is put on its own
    line.

    Square brackets that are already present in the input are shielded with
    placeholder characters during processing, so literal text such as
    ``[1]`` is returned unchanged instead of being turned into ``<1>``.

    Args:
        html: The HTML fragment to clean.

    Returns:
        The cleaned text with table markup kept.
    """
    # Private-use code points standing in for literal "[" and "]" while the
    # square brackets are used as temporary tag delimiters.
    literal_open, literal_close = '\ue000', '\ue001'
    html = html.replace('[', literal_open).replace(']', literal_close)

    html = br_reg.sub('\n', html)
    # Protect the table-structure tags from othertag_reg by re-wrapping them
    # in square brackets.
    for keep_reg in (table_reg, tablebody_reg, tr_reg, td_reg, th_reg):
        html = keep_reg.sub(subFunc4Match, html)
    html = p_reg.sub('\n', html)
    html = othertag_reg.sub('', html)
    # Unlike Clean(), spaces/tabs are not stripped here (other_symbol_reg is
    # deliberately not applied).
    html = seg_first_space_reg.sub('\n', html)
    # Inside each [table]...[/table] span, turn newline runs into one space
    # so that a table's content stays on a single line.
    html = table_fk_reg.sub(
        lambda m: m.group(1) + mul_crcf_reg.sub(' ', m.group(2)) + m.group(3),
        html,
    )
    html = mul_crcf_reg.sub('\n', html)

    # Turn the protected tags back into real tags, then restore the literal
    # brackets that were shielded at the start.
    html = html.replace('[', '<').replace(']', '>')
    html = html.replace(literal_open, '[').replace(literal_close, ']')
    # Put every table on its own line.
    html = html.replace('<table', '\n<table').replace('</table>', '</table>\n')
    return html
def subFunc4Match(strmatch):
    """Replacement callback for ``re.sub``: wrap group 1 in square brackets.

    Args:
        strmatch: A regex match object whose first capture group holds the
            text to wrap, or a falsy value.

    Returns:
        ``'[<group 1>]'`` for a match. An empty string when *strmatch* is
        falsy or when group 1 cannot be read (the error is printed and
        otherwise ignored, keeping the substitution best-effort).
    """
    if not strmatch:
        return ""
    try:
        return '[%s]' % (strmatch.group(1),)
    except (AttributeError, IndexError, TypeError) as e:
        # Not a match object, or the pattern has no group 1. Report it and
        # fall back to an empty replacement rather than returning None.
        print(e)
        return ""
# One <input ...> tag carrying a double-quoted value attribute. Every part of
# the pattern excludes ">" (or '"' inside the value), so a match can never run
# past the end of the tag. Group 1 is the value text.
_INPUT_VALUE_REG = re.compile(
    r'<[/]*input\b[^>]*?\svalue="([^"]*)"[^>]*>',
    re.I,
)


def extract_input_value(html: str) -> str:
    """Replace each ``<input>`` tag that has a value with a new table cell.

    Every ``<input ... value="X" ...>`` tag (self-closing or not, with the
    ``value`` attribute in any position) is replaced by ``</td><td>X``.
    ``<input>`` tags without a double-quoted ``value`` attribute and all other
    markup are left untouched.

    Args:
        html: The HTML fragment to process.

    Returns:
        The fragment with the matching ``<input>`` tags replaced.
    """
    # A function replacement is used so that backslashes in the value are
    # inserted literally instead of being read as regex escapes.
    return _INPUT_VALUE_REG.sub(
        lambda m: f"</td><td>{m.group(1)}",
        html,
    )
|