# HTML replacement (regex-based clean-up of HTML strings)
- import re
# Ordered (pattern, replacement) rules applied by th().
#
# Order matters: double quotes are normalised to single quotes and newlines
# are dropped near the top, and th() lower-cases every tag before any rule
# runs.  All tag/attribute rules below are therefore written for lower-case,
# single-quoted markup (upper-case or double-quoted patterns could never
# match at that point).
_TH_RULES = (
    (r'<!--.*?-->', ''),
    ('"', "'"),
    (r'\n', ''),
    (r'\xa0', ''),
    (r'<script .*?>', ''),
    (r'</script>', ''),
    (r'<span .*?>', ''),
    # Trailing space is optional so a closing span is never left dangling.
    (r'</span> ?', ''),
    (r'<p.*?>', '<br>'),
    (r'</p>', '<br>'),
    (r'<div>', '<br>'),
    (r'<div .*?>', '<br>'),
    (r'<img .*?>', '<br>'),
    (r'</div>', '<br>'),
    (r'<style.*?</style>', ''),
    (r'<epointform>', ''),
    (r'<html.*?</head>', ''),
    (r'<input .*?>', ''),
    (r'<!doctype.*?>', ''),
    (r'</meta>', ''),
    # "?" must be escaped; unescaped it makes "<" optional and leaves "<?".
    (r'<\?xml:.*?>', ''),
    (r'<label.*?>', '<br>'),
    (r'</label>', ''),
    (r"style='.*?'", ''),
    (r"class='.*?'", ''),
    (r"bordercolor='.*?'", ''),
    (r"bgcolor='.*?'", ''),
    (r"width='.*?'", ''),
    (r'<o:p>', ''),
    (r'</o:p>', ''),
    # Also covers named anchors (<a name=...>).
    (r'<a .*?>', ''),
    (r'</a>', ''),
    (r'<font .*?>', ''),
    (r'</font>', ''),
    (r'<body>', ''),
    (r'</body>', ''),
    # Non-greedy: drop each heading with its text, not everything between
    # the first opening and the last closing heading tag.
    (r'<h\d\s{0,10}>.*?</h\d\s{0,10}>', ''),
    (r'</h\d\s{0,10}>', ''),
    (r'<h\d\s{0,10}>', ''),
    ('【关闭】', ''),
    ('【打印】', ''),
)


def th(neirong: str) -> str:
    """Strip or simplify HTML markup in *neirong* using ``_TH_RULES``.

    Steps:
      1. Every ``<...>`` tag is lower-cased.  This includes attribute
         values, so case-sensitive values inside tags are not preserved.
      2. Each rule in ``_TH_RULES`` is applied once, in order, as a regex
         substitution over the whole string.

    Block-level tags (``p``, ``div``, ``img``, ``label``) become ``<br>``;
    comments, scripts tags, spans, anchors, fonts, headings (with their
    text), presentation attributes and the "close"/"print" link captions
    are removed.  Tags without a rule (e.g. tables) are kept.

    Args:
        neirong: HTML text to clean.

    Returns:
        The cleaned string.
    """
    # Single pass over the text instead of one str.replace per tag found.
    nr = re.sub(r'<[^>]+>', lambda m: m.group(0).lower(), neirong)
    for pattern, replacement in _TH_RULES:
        # flags must be passed by keyword: the 4th positional argument of
        # re.sub is ``count``.  DOTALL lets the comment rule (which runs
        # before newlines are removed) span multiple lines.
        nr = re.sub(pattern, replacement, nr, flags=re.S)
    return nr
# Ordered (pattern, replacement) rules applied by th_1().  The comment rule
# runs before newlines are removed, so it relies on DOTALL to span lines.
_TH_1_RULES = (
    (r'<!--.*?-->', ''),
    ('"', "'"),
    (r'\n', ''),
    (r'\xa0', ''),
    (r'<script .*?>', ''),
    (r'</script>', ''),
)


def th_1(neirong: str) -> str:
    """Lightly normalise HTML in *neirong* while keeping its tags.

    Every ``<...>`` tag is lower-cased (attribute values included), then
    the rules in ``_TH_1_RULES`` are applied once each, in order: HTML
    comments are removed, double quotes become single quotes, newlines and
    non-breaking spaces are dropped, and ``<script ...>`` / ``</script>``
    tags are stripped (text between them is left in place).

    Args:
        neirong: HTML text to normalise.

    Returns:
        The normalised string.
    """
    # Single pass over the text instead of one str.replace per tag found.
    nr = re.sub(r'<[^>]+>', lambda m: m.group(0).lower(), neirong)
    for pattern, replacement in _TH_1_RULES:
        # flags must be passed by keyword: the 4th positional argument of
        # re.sub is ``count``, so positional flags were silently ignored.
        nr = re.sub(pattern, replacement, nr, flags=re.S)
    return nr
|