123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108 |
- """
- .docx文件解析
- """
- import os
- import docx
- from file_processing.file_picture import ocr_patch
- from docx.document import Document
- from docx.oxml.table import CT_Tbl
- from docx.oxml.text.paragraph import CT_P
- from docx.table import _Cell, Table
- from docx.text.paragraph import Paragraph
- from loguru import logger
- from docs.config import error_number
def iter_block_items(parent):
    """Yield the block-level items of *parent* in document order.

    Every ``CT_P`` child is yielded as a ``Paragraph`` and every ``CT_Tbl``
    child as a ``Table``. In addition, when the serialized XML of a child
    contains a ``<w:drawing>`` tag, the marker string ``'图像'`` is yielded
    right after that child so the caller can insert an image placeholder.

    :param parent: a ``Document`` or a table ``_Cell``.
    :raises ValueError: when iteration starts and *parent* is neither type.
    """
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)

        # NOTE(review): the drawing check is assumed to run for every child
        # (paragraphs, tables and anything else) -- confirm against the
        # intended nesting of the original code.
        try:
            has_drawing = '<w:drawing>' in str(child.xml)
        except Exception as exc:
            # A child whose XML cannot be read is not an image. Yielding
            # anything here would be indistinguishable from an image marker
            # for consumers that treat every non-Paragraph/non-Table item as
            # one, so just skip it.
            logger.debug('skipping drawing check for {!r}: {}', child, exc)
            continue
        if has_drawing:
            yield '图像'
def read_table(table):
    """Return the ``text`` of every cell of *table*, grouped row by row.

    :param table: object exposing ``rows``; each row exposes ``cells``.
    :return: list with one inner list of cell texts per row.
    """
    grid = []
    for row in table.rows:
        row_texts = [cell.text for cell in row.cells]
        grid.append(row_texts)
    return grid
def save_picture(word_path, doc):
    """
    Save the embedded inline pictures of a .docx document next to the file.

    Every inline shape whose related part has an ``image*`` content type is
    written to ``<directory of word_path>/<inline shape index>.png``. Inline
    shapes without an embedded picture (no ``pic`` element, no ``embed``
    relationship id, or an id that is not among the related parts) are
    skipped instead of aborting the whole extraction.

    :param word_path: path of the .docx file; only its directory is used.
    :param doc: opened document exposing ``inline_shapes`` and
        ``part.related_parts``.
    :return: absolute paths of the written images, in inline-shape order.
    """
    out_dir = os.path.dirname(word_path)
    related_parts = doc.part.related_parts
    img_path = []
    for i, shape in enumerate(doc.inline_shapes):
        pic = shape._inline.graphic.graphicData.pic
        if pic is None:
            # Inline shape that holds no picture element: nothing to save.
            continue
        content_id = pic.blipFill.blip.embed
        try:
            image_part = related_parts[content_id]
        except KeyError:
            # No embedded part for this id (also covers a missing id).
            continue
        if not image_part.content_type.startswith('image'):
            continue
        # The file name always ends in .png, whatever the content type is;
        # the number is the shape's position among all inline shapes.
        img_name = os.path.join(out_dir, './%d.png' % i)
        with open(img_name, 'wb') as fp:
            fp.write(image_part.blob)
        img_path.append(os.path.abspath(img_name))
    return img_path
def table_add_html(block):
    """Render a table as a flat ``<table>/<tr>/<td>`` markup string.

    Cell texts are taken from ``read_table(block)`` and inserted verbatim
    (no escaping), one ``<tr>`` per row and one ``<td>`` per cell.

    :param block: the table to render.
    :return: the markup string.
    """
    rendered_rows = []
    for cells in read_table(block):
        # one <td> per cell, wrapped in the row's <tr>
        row_markup = ''.join('<td>' + cell_text + '</td>' for cell_text in cells)
        rendered_rows.append('<tr>' + row_markup + '</tr>')
    return '<table>' + ''.join(rendered_rows) + '</table>'
def read_docx(word_path):
    """Extract the textual content of a .docx file.

    Paragraphs are taken as plain text, tables are rendered through
    ``table_add_html`` and every image marker coming from
    ``iter_block_items`` becomes a ``图像N`` placeholder. The pictures are
    then written to disk by ``save_picture`` and handed to ``ocr_patch``;
    every placeholder for which the OCR result has an entry is replaced by
    that entry.

    :param word_path: path of the .docx file.
    :return: ``(text, status)`` where *text* is the segments joined with
        newlines and *status* is ``error_number["成功"]``; on any failure
        ``('', error_number["解析错误"])``.
    """
    try:
        doc = docx.Document(word_path)
        segments = []        # text pieces in document order
        placeholder_at = {}  # placeholder name -> its index in ``segments``
        for block in iter_block_items(doc):
            if isinstance(block, Paragraph):
                text = block.text
                if text in ('', '\n'):
                    continue
                segments.append(str(text))
            elif isinstance(block, Table):
                segments.append(table_add_html(block))
            elif block:
                # Only non-empty markers denote an image; an empty string
                # must not consume an image number.
                name = '图像%d' % len(placeholder_at)
                placeholder_at[name] = len(segments)
                segments.append(name)

        # Save the pictures and get their paths back.
        picture_path = save_picture(word_path, doc=doc)
        logger.debug('picture_path:::' + str(picture_path))

        # Pair every saved picture with its placeholder name (图像0, 图像1, ...)
        # and let the OCR module produce the replacement text.
        image_list = [('图像%d' % i, pic_path)
                      for i, pic_path in enumerate(picture_path)]
        if image_list:
            ret_dict = ocr_patch(image_list)
            # Replace by recorded position, not by segment text, so that a
            # paragraph that merely reads like a placeholder is left alone.
            for name, pos in placeholder_at.items():
                if name in ret_dict:
                    segments[pos] = ret_dict[name]
        return '\n'.join(segments), error_number["成功"]
    except Exception as e:
        # Boundary handler: callers get a status code, the log keeps the
        # traceback.
        logger.exception(e)
        return '', error_number["解析错误"]
|