html_preprocess.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. # coding:utf-8
import re
from typing import AnyStr

from bs4 import BeautifulSoup
from loguru import logger
from lxml import etree

from util.htmltag import CleanKeepTable
from util.htmltag import extract_input_value
  8. class HtmlHelper(object):
  9. @staticmethod
  10. def __repair_label(content_html: AnyStr) -> AnyStr:
  11. """
  12. html标签修复
  13. :param content_html: html内容
  14. :return:
  15. """
  16. soup = BeautifulSoup(content_html, 'html5lib')
  17. try:
  18. fixed_html = soup.prettify()
  19. except ValueError as e:
  20. logger.warning(e)
  21. fixed_html = content_html
  22. return fixed_html
  23. def __clean_script(self, Html):
  24. '''
  25. clear js/style
  26. :param Html:
  27. :return:
  28. '''
  29. html = etree.HTML(Html)
  30. ele = html.xpath('//script | //noscript | //style')
  31. for e in ele:
  32. e.getparent().remove(e)
  33. html_str = etree.tostring(html, encoding="utf-8").decode("utf-8")
  34. return html_str
  35. @staticmethod
  36. def __clear_tag(content_html: AnyStr) -> AnyStr:
  37. """
  38. 清理标签
  39. :param content_html:
  40. :return:
  41. """
  42. try:
  43. content_html = content_html.replace("\n", " ").replace("\t", "").replace("\xa0", "")
  44. tag_html = content_html.replace(">", "")
  45. tag_html = extract_input_value(tag_html)
  46. tag_html = CleanKeepTable(tag_html) # 保留 table标签
  47. tag_html = tag_html.strip().replace(":", ":").replace(" ", " ")
  48. tag_html = tag_html.replace(" ", " ").replace("▲", "")
  49. except Exception as e:
  50. logger.warning(e)
  51. return content_html
  52. return tag_html
  53. @staticmethod
  54. def __add_space(content_html):
  55. """
  56. 为表格添加空格
  57. :param content_html:
  58. :return:
  59. """
  60. if "<table" not in content_html:
  61. return content_html
  62. if "<thead>" in content_html:
  63. content_html = content_html.replace('<thead>', '')
  64. # if "<br/" in content_html:
  65. # content_html = content_html.replace(r'<br/', '')
  66. if "<table " in content_html and "<td " in content_html and "<tr " in content_html:
  67. return content_html
  68. content_html = content_html.replace('<tbody', '<tbody ')
  69. content_html = content_html.replace('<table', '<table ')
  70. content_html = content_html.replace('<th', '<th ')
  71. content_html = content_html.replace('<tr', '<tr ')
  72. content_html = content_html.replace('<td', '<td ')
  73. return content_html
  74. def preprocess(self, content: AnyStr) -> AnyStr:
  75. """
  76. 处理开始
  77. :param content:
  78. :return:
  79. """
  80. content = self.__add_space(content)
  81. content = self.__repair_label(content)
  82. if "script" in content:
  83. content = self.__clean_script(content)
  84. content = self.__clear_tag(content)
  85. return content