file_docx.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. """
Parse .docx files: extract paragraphs, tables and OCR text of embedded images.
  3. """
  4. import os
  5. import docx
  6. from file_processing.file_picture import ocr_patch
  7. from docx.document import Document
  8. from docx.oxml.table import CT_Tbl
  9. from docx.oxml.text.paragraph import CT_P
  10. from docx.table import _Cell, Table
  11. from docx.text.paragraph import Paragraph
  12. from loguru import logger
  13. from docs.config import error_number
  14. def iter_block_items(parent):
  15. if isinstance(parent, Document):
  16. parent_elm = parent.element.body
  17. elif isinstance(parent, _Cell):
  18. parent_elm = parent._tc
  19. else:
  20. raise ValueError("something's not right")
  21. for child in parent_elm.iterchildren():
  22. if isinstance(child, CT_P):
  23. yield Paragraph(child, parent)
  24. elif isinstance(child, CT_Tbl):
  25. yield Table(child, parent)
  26. try:
  27. if str(child.xml).find('<w:drawing>') > 0:
  28. yield '图像'
  29. except Exception as e:
  30. yield ''
  31. def read_table(table):
  32. return [[cell.text for cell in row.cells] for row in table.rows]
  33. def save_picture(word_path, doc):
  34. """
  35. .docx文件中的图片和该文件在同目录,返回图片路径列表(有序)
  36. :param word_path:
  37. :param doc:
  38. :return:
  39. """
  40. img_path = []
  41. for i, shape in enumerate(doc.inline_shapes):
  42. content_id = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
  43. content_type = doc.part.related_parts[content_id].content_type
  44. if not content_type.startswith('image'):
  45. continue
  46. img_name = os.path.join(os.path.dirname(word_path), './%d.png' % (i))
  47. img_data = doc.part.related_parts[content_id]._blob
  48. with open(img_name, 'wb') as fp:
  49. fp.write(img_data)
  50. img_path.append(os.path.abspath(img_name))
  51. return img_path
  52. def table_add_html(block):
  53. text = '<table>'
  54. tables = read_table(block)
  55. for index_row, row in enumerate(tables): # 行tr/列td
  56. text = text + '<tr>'
  57. for index_col, line in enumerate(row):
  58. text = text + '<td>' + line + '</td>'
  59. text = text + '</tr>'
  60. text = text + '</table>'
  61. return text
  62. def read_docx(word_path):
  63. try:
  64. doc = docx.Document(word_path)
  65. segments = []
  66. imagepos = 0
  67. for block in iter_block_items(doc):
  68. if isinstance(block, Paragraph):
  69. if block.text == '' or block.text == '\n': continue
  70. segments.append((str(block.text), 0))
  71. elif isinstance(block, Table):
  72. string = table_add_html(block)
  73. segments.append((string, 0))
  74. else:
  75. segments.append(('图像%d' % imagepos, imagepos))
  76. imagepos += 1
  77. # 调用保存图像的函数,返回图片的路径
  78. picture_path = save_picture(word_path, doc=doc)
  79. logger.debug('picture_path:::' + str(picture_path))
  80. # 返回图片路径,调用OCR模块,返回组织好的字符串列表,依次取列表元素和图像0,图像1,图像2..对应替换
  81. image_list = []
  82. for i, pic_path in enumerate(picture_path):
  83. image_list.append(('图像%d' % i, pic_path))
  84. ret_dict = ocr_patch(image_list)
  85. for i, val in enumerate(segments):
  86. if val[0] in ret_dict:
  87. segments[i] = (ret_dict[val[0]], val[1])
  88. segments = [s for s, i in segments]
  89. return '\n'.join(segments), error_number["成功"]
  90. except Exception as e:
  91. logger.error(e)
  92. return '', error_number["解析错误"]