# QA / data_quality_server
# coding:utf-8
import re
from typing import AnyStr

from bs4 import BeautifulSoup
from loguru import logger
from lxml import etree

from util.htmltag import CleanKeepTable
from util.htmltag import extract_input_value


class HtmlHelper(object):
    """
    Pre-process raw HTML before further handling.

    ``preprocess`` is the entry point. It pads table-related tags with a
    trailing space, repairs the markup with BeautifulSoup, drops
    script/noscript/style elements with lxml and finally runs the project's
    tag-cleaning helpers.
    """

    # Opening of a table-related tag whose name is complete: the lookahead
    # rejects longer tag names such as <thead> or <track>, which a plain
    # ``str.replace('<th', ...)`` would corrupt into ``<th ead``.
    _TABLE_TAG_RE = re.compile(r'<(tbody|table|th|tr|td)(?![A-Za-z0-9])')

    @staticmethod
    def __repair_label(content_html: AnyStr) -> AnyStr:
        """
        Repair HTML tags by round-tripping the markup through BeautifulSoup.

        :param content_html: HTML content
        :return: the prettified markup, or the input unchanged when
                 ``prettify`` raises ``ValueError``
        """
        soup = BeautifulSoup(content_html, 'html5lib')
        try:
            fixed_html = soup.prettify()
        except ValueError as e:
            # Best effort: keep the unrepaired markup rather than failing.
            logger.warning(e)
            fixed_html = content_html
        return fixed_html

    def __clean_script(self, Html):
        """
        Remove <script>, <noscript> and <style> elements from the markup.

        Text that directly follows a removed element (its lxml "tail") is
        kept by re-attaching it to the preceding sibling or to the parent.

        :param Html: HTML text to clean
        :return: the document serialised by lxml without those elements
        """
        html = etree.HTML(Html)
        if html is None:
            # NOTE(review): guards against the parser yielding no root for
            # input without parsable content -- confirm against lxml docs.
            return Html
        for element in html.xpath('//script | //noscript | //style'):
            parent = element.getparent()
            if parent is None:
                continue
            # remove() also discards the element's tail text, so move the
            # tail to where it would be rendered once the element is gone.
            tail = element.tail
            if tail:
                previous = element.getprevious()
                if previous is not None:
                    previous.tail = (previous.tail or "") + tail
                else:
                    parent.text = (parent.text or "") + tail
            parent.remove(element)
        html_str = etree.tostring(html, encoding="utf-8").decode("utf-8")
        return html_str

    @staticmethod
    def __clear_tag(content_html: AnyStr) -> AnyStr:
        """
        Clean tags and normalise whitespace and punctuation.

        Newlines become spaces, tabs and non-breaking spaces are dropped,
        ``&gt;`` entities are removed, then the project helpers
        ``extract_input_value`` and ``CleanKeepTable`` are applied. Finally
        ASCII colons are replaced by full-width colons, double spaces are
        collapsed twice and the "▲" marker is removed.

        :param content_html: HTML content
        :return: the cleaned text; if any step raises, the warning is logged
                 and the content as it stood before tag cleaning is returned
        """
        try:
            content_html = content_html.replace("\n", " ").replace("\t", "").replace("\xa0", "")
            tag_html = content_html.replace("&gt;", "")
            tag_html = extract_input_value(tag_html)
            tag_html = CleanKeepTable(tag_html)  # keep table tags
            tag_html = tag_html.strip().replace(":", "：").replace("  ", " ")
            tag_html = tag_html.replace("  ", " ").replace("▲", "")
        except Exception as e:
            # Deliberate best effort: never let cleaning abort the pipeline.
            logger.warning(e)
            return content_html
        return tag_html

    @staticmethod
    def __add_space(content_html):
        """
        Add a space after the name of table-related opening tags.

        ``<tbody``, ``<table``, ``<th``, ``<tr`` and ``<td`` each get a
        trailing space (``<td>`` becomes ``<td >``). Tags whose name merely
        starts with one of these, e.g. ``<thead class="x">``, are left
        untouched.

        :param content_html: HTML content
        :return: the content unchanged when it holds no ``<table``; otherwise
                 the content with literal ``<thead>`` removed and, unless
                 ``<table ``, ``<td `` and ``<tr `` are all already present,
                 with the table tags padded
        """
        if "<table" not in content_html:
            return content_html
        if "<thead>" in content_html:
            content_html = content_html.replace('<thead>', '')
        # Padded table, row and cell tags are all present: treat the markup
        # as already prepared and leave it alone.
        if "<table " in content_html and "<td " in content_html and "<tr " in content_html:
            return content_html
        return HtmlHelper._TABLE_TAG_RE.sub(r'<\1 ', content_html)

    def preprocess(self, content: AnyStr) -> AnyStr:
        """
        Run the full pre-processing pipeline on one HTML document.

        Steps, in order: pad table tags, repair the markup, strip
        script/noscript/style elements, clean tags.

        :param content: raw HTML content
        :return: the cleaned content
        """
        content = self.__add_space(content)
        content = self.__repair_label(content)
        # NOTE(review): the guard only looks for "script", so a document that
        # contains <style> but no "script" substring skips this step --
        # confirm whether that is intended.
        if "script" in content:
            content = self.__clean_script(content)
        content = self.__clear_tag(content)
        return content