# coding:utf-8
from bs4 import BeautifulSoup
from util.htmltag import CleanKeepTable
from typing import AnyStr
from loguru import logger
from util.htmltag import extract_input_value
from lxml import etree


class HtmlHelper(object):
    """Helpers that repair an HTML string, drop script/style nodes and strip tags."""

    @staticmethod
    def __repair_label(content_html: AnyStr) -> AnyStr:
        """Repair HTML markup by round-tripping it through BeautifulSoup.

        :param content_html: HTML content
        :return: the ``prettify()`` output of the html5lib-parsed document, or
            the input unchanged when ``prettify()`` raises ``ValueError``
        """
        soup = BeautifulSoup(content_html, 'html5lib')
        try:
            fixed_html = soup.prettify()
        except ValueError as e:
            logger.warning(e)
            fixed_html = content_html
        return fixed_html

    def __clean_script(self, Html):
        """Remove ``script``, ``noscript`` and ``style`` elements.

        :param Html: HTML text, parsed with ``lxml.etree.HTML``
        :return: the remaining document serialised and decoded as UTF-8
        """
        tree = etree.HTML(Html)
        for node in tree.xpath('//script | //noscript | //style'):
            parent = node.getparent()
            # lxml stores the text that FOLLOWS an element in ``node.tail`` and
            # ``remove()`` discards it together with the element.  Re-attach
            # the tail first so visible text after a removed tag is kept.
            if node.tail:
                previous = node.getprevious()
                if previous is not None:
                    previous.tail = (previous.tail or "") + node.tail
                else:
                    parent.text = (parent.text or "") + node.tail
            parent.remove(node)
        html_str = etree.tostring(tree, encoding="utf-8").decode("utf-8")
        return html_str

    @staticmethod
    def __clear_tag(content_html: AnyStr) -> AnyStr:
        """Strip tags from the HTML while keeping table markup.

        :param content_html: HTML content
        :return: the cleaned text; if any step raises, the warning is logged
            and ``content_html`` is returned instead (it may already have had
            its newlines, tabs and ``\\xa0`` characters replaced at that point)
        """
        try:
            content_html = content_html.replace("\n", " ").replace("\t", "").replace("\xa0", "")
            # NOTE(review): as written this deletes every ">" character; the
            # literal looks truncated (possibly a tag such as a line break) -
            # confirm against the original file.
            tag_html = content_html.replace(">", "")
            tag_html = extract_input_value(tag_html)
            tag_html = CleanKeepTable(tag_html)  # keep table tags
            # NOTE(review): the two space-for-space replacements below look
            # like no-ops; presumably the first argument was originally a
            # different whitespace character or entity - confirm.
            tag_html = tag_html.strip().replace(":", "：").replace(" ", " ")
            tag_html = tag_html.replace(" ", " ").replace("▲", "")
        except Exception as e:
            # Deliberate best-effort: never let cleaning failures propagate.
            logger.warning(e)
            return content_html
        return tag_html

    @staticmethod
    def __add_space(content_html):
        """Add spacing for tables.

        :param content_html: HTML content
        :return:
        """
        # NOTE(review): the literals below are empty strings, so the test is
        # always true and the replace changes nothing; presumably they once
        # held table tags - confirm.  The remainder of this method is not
        # present in the visible source.
        if "" in content_html:
            content_html = content_html.replace('', '')
        # if "
AnyStr: """ 处理开始 :param content: :return: """ content = self.__add_space(content) content = self.__repair_label(content) if "script" in content: content = self.__clean_script(content) content = self.__clear_tag(content) return content