12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- # coding:utf-8
- from bs4 import BeautifulSoup
- from util.htmltag import CleanKeepTable
- from typing import AnyStr
- from loguru import logger
- from util.htmltag import extract_input_value
- from lxml import etree
class HtmlHelper(object):
    """Normalise a raw HTML fragment before further processing.

    :meth:`preprocess` is the only public entry point. It chains the private
    steps below: pad table tags, repair the markup, drop script/style
    elements and finally run the tag clean-up helpers.
    """

    @staticmethod
    def __repair_label(content_html: AnyStr) -> AnyStr:
        """Repair malformed HTML markup.

        The content is parsed with BeautifulSoup using the ``html5lib``
        builder and serialised again with ``prettify()``.

        :param content_html: HTML content to repair.
        :return: The prettified markup, or ``content_html`` unchanged when
            ``prettify()`` raises ``ValueError`` (the error is logged as a
            warning).
        """
        soup = BeautifulSoup(content_html, 'html5lib')
        try:
            return soup.prettify()
        except ValueError as e:
            # Best effort: keep the caller's markup rather than failing.
            logger.warning(e)
            return content_html

    def __clean_script(self, Html):
        """Remove ``<script>``, ``<noscript>`` and ``<style>`` elements.

        :param Html: HTML text to clean.
        :return: The document serialised back to a ``str`` with those
            elements (and everything inside them) removed.
        """
        tree = etree.HTML(Html)
        # ``with_tail=False`` keeps the text that follows a removed element.
        # Removing via ``parent.remove(child)`` would also discard that tail
        # text, i.e. visible content located right after a script/style tag.
        etree.strip_elements(tree, 'script', 'noscript', 'style', with_tail=False)
        return etree.tostring(tree, encoding="utf-8").decode("utf-8")

    @staticmethod
    def __clear_tag(content_html: AnyStr) -> AnyStr:
        """Clean tags out of the markup.

        Whitespace is normalised first (newline -> space, tab and
        non-breaking space removed), then the text is passed through
        ``extract_input_value`` and ``CleanKeepTable`` and a few final
        character replacements are applied.

        :param content_html: HTML text to clean.
        :return: The cleaned text. If any step raises, the exception is
            logged as a warning and ``content_html`` is returned instead
            (whitespace-normalised if that first step had already succeeded).
        """
        try:
            content_html = content_html.replace("\n", " ").replace("\t", "").replace("\xa0", "")
            # NOTE(review): as written this strips every '>' character.
            # Confirm whether an HTML entity was intended here.
            tag_html = content_html.replace(">", "")
            tag_html = extract_input_value(tag_html)
            tag_html = CleanKeepTable(tag_html)  # keep table tags
            # NOTE(review): the next replacements look like no-ops as written;
            # presumably they map full-width/special characters - confirm.
            tag_html = tag_html.strip().replace(":", ":").replace(" ", " ")
            tag_html = tag_html.replace(" ", " ").replace("▲", "")
        except Exception as e:
            # Deliberate best effort: never let clean-up abort preprocessing.
            logger.warning(e)
            return content_html
        return tag_html

    @staticmethod
    def __add_space(content_html):
        """Insert a space after the name of table-related opening tags.

        ``<table``, ``<tbody``, ``<th``, ``<tr`` and ``<td`` each get a
        trailing space (``<td>`` becomes ``<td >``). Bare ``<thead>`` opening
        tags are removed. Content without ``<table`` is returned untouched,
        and so is content that already contains ``<table ``, ``<td `` and
        ``<tr `` (after the ``<thead>`` removal).

        :param content_html: HTML text that may contain tables.
        :return: The adjusted HTML text.
        """
        import re  # local import keeps this method self-contained

        if "<table" not in content_html:
            return content_html

        content_html = content_html.replace('<thead>', '')

        already_padded = (
            "<table " in content_html
            and "<td " in content_html
            and "<tr " in content_html
        )
        if already_padded:
            return content_html

        # The word boundary makes sure only the exact tag names are padded.
        # A plain substring replace of '<th' / '<tr' would also rewrite
        # '<thead ...>' into '<th ead ...>' and '<track' into '<tr ack'.
        return re.sub(r'<(tbody|table|th|tr|td)\b', r'<\1 ', content_html)

    def preprocess(self, content: AnyStr) -> AnyStr:
        """Run the full preprocessing pipeline on an HTML fragment.

        Steps, in order: pad table tags, repair the markup, remove
        script/noscript/style elements (only when the repaired text contains
        the substring ``script``), then clean the tags.

        :param content: Raw HTML content.
        :return: The preprocessed content.
        """
        content = self.__add_space(content)
        content = self.__repair_label(content)
        # NOTE(review): style elements are only stripped when "script" occurs
        # somewhere in the text - confirm this gating is intended.
        if "script" in content:
            content = self.__clean_script(content)
        content = self.__clear_tag(content)
        return content
|