# table_extract.py
# coding:utf-8
  2. class TableStruct(object):
  3. def __init__(self):
  4. """
  5. 定义表格属性
  6. """
  7. self.min_x = None
  8. self.max_x = None
  9. self.min_y = None
  10. self.max_y = None
  11. self.contents = ""
  12. def parse(self, table):
  13. """
  14. 解析表格结构
  15. :param table:
  16. :return:
  17. """
  18. contents = table.extract()
  19. if contents:
  20. self.contents = self.__table_format(contents)
  21. self.min_x, self.min_y, self.max_x, self.max_y = table.bbox
  22. @staticmethod
  23. def __table_format(data: list):
  24. """
  25. 生成table标签数据,结构化数据
  26. :param data:
  27. :return:
  28. """
  29. if not data:
  30. return ""
  31. table = '<table border=1>'
  32. for row in data:
  33. table += '<tr>'
  34. for col in row:
  35. if col is None:
  36. col = ''
  37. table += '<td>%s</td>' % col
  38. table += '</tr>'
  39. table += "</table>"
  40. return table