""" .docx文件解析 """ import os import docx from file_processing.file_picture import ocr_patch from docx.document import Document from docx.oxml.table import CT_Tbl from docx.oxml.text.paragraph import CT_P from docx.table import _Cell, Table from docx.text.paragraph import Paragraph from loguru import logger from docs.config import error_number def iter_block_items(parent): if isinstance(parent, Document): parent_elm = parent.element.body elif isinstance(parent, _Cell): parent_elm = parent._tc else: raise ValueError("something's not right") for child in parent_elm.iterchildren(): if isinstance(child, CT_P): yield Paragraph(child, parent) elif isinstance(child, CT_Tbl): yield Table(child, parent) try: if str(child.xml).find('') > 0: yield '图像' except Exception as e: yield '' def read_table(table): return [[cell.text for cell in row.cells] for row in table.rows] def save_picture(word_path, doc): """ .docx文件中的图片和该文件在同目录,返回图片路径列表(有序) :param word_path: :param doc: :return: """ img_path = [] for i, shape in enumerate(doc.inline_shapes): content_id = shape._inline.graphic.graphicData.pic.blipFill.blip.embed content_type = doc.part.related_parts[content_id].content_type if not content_type.startswith('image'): continue img_name = os.path.join(os.path.dirname(word_path), './%d.png' % (i)) img_data = doc.part.related_parts[content_id]._blob with open(img_name, 'wb') as fp: fp.write(img_data) img_path.append(os.path.abspath(img_name)) return img_path def table_add_html(block): text = '' tables = read_table(block) for index_row, row in enumerate(tables): # 行tr/列td text = text + '' for index_col, line in enumerate(row): text = text + '' text = text + '' text = text + '
' + line + '
' return text def read_docx(word_path): try: doc = docx.Document(word_path) segments = [] imagepos = 0 for block in iter_block_items(doc): if isinstance(block, Paragraph): if block.text == '' or block.text == '\n': continue segments.append((str(block.text), 0)) elif isinstance(block, Table): string = table_add_html(block) segments.append((string, 0)) else: segments.append(('图像%d' % imagepos, imagepos)) imagepos += 1 # 调用保存图像的函数,返回图片的路径 picture_path = save_picture(word_path, doc=doc) logger.debug('picture_path:::' + str(picture_path)) # 返回图片路径,调用OCR模块,返回组织好的字符串列表,依次取列表元素和图像0,图像1,图像2..对应替换 image_list = [] for i, pic_path in enumerate(picture_path): image_list.append(('图像%d' % i, pic_path)) ret_dict = ocr_patch(image_list) for i, val in enumerate(segments): if val[0] in ret_dict: segments[i] = (ret_dict[val[0]], val[1]) segments = [s for s, i in segments] return '\n'.join(segments), error_number["成功"] except Exception as e: logger.error(e) return '', error_number["解析错误"]