# ai / nsq_convert2txt
"""
Parsing of .docx files.

"""

import os
import docx
from file_processing.file_picture import ocr_patch
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from loguru import logger
from docs.config import error_number


def iter_block_items(parent):
    """Yield the block-level items of *parent* in document order.

    For every direct child element of the container this generator yields:

    * a ``Paragraph`` when the child is a ``CT_P``;
    * a ``Table`` when the child is a ``CT_Tbl``;
    * additionally, the marker string ``'图像'`` when the child's subtree
      contains a ``w:drawing`` element. At most one marker is produced per
      child, and it comes after the paragraph/table object for that child.

    :param parent: a ``Document`` or a table ``_Cell``.
    :raises ValueError: when iteration starts, if *parent* is neither a
        ``Document`` nor a ``_Cell``.
    """
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    # Clark-notation tag of <w:drawing> in the WordprocessingML main namespace.
    drawing_tag = (
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing'
    )

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)
        # Search the element tree directly instead of string-matching the
        # serialized XML: no serialization is needed, it cannot fail for
        # children that lack an ``xml`` attribute, and therefore no bogus
        # empty item is emitted for consumers to mistake for an image.
        if next(child.iter(drawing_tag), None) is not None:
            yield '图像'


def read_table(table):
    """Collect the ``text`` of every cell of *table*, row by row.

    :param table: object exposing ``rows``; each row exposes ``cells``.
    :return: a list with one inner list per row, holding the ``text``
        attribute of each of that row's cells in order.
    """
    grid = []
    for row in table.rows:
        texts = []
        for cell in row.cells:
            texts.append(cell.text)
        grid.append(texts)
    return grid


def save_picture(word_path, doc):
    """Save the document's embedded inline images next to the .docx file.

    Every inline shape that refers to an embedded image part is written to
    ``<directory of word_path>/<i>.png``, where ``i`` is the shape's position
    in ``doc.inline_shapes``. The bytes are written unchanged; the ``.png``
    suffix is used whatever the part's actual image content type is.

    Inline shapes are skipped (instead of aborting the whole export) when
    they have no picture element, when their relationship id does not
    resolve to a related part, or when that part is not an image.

    :param word_path: path of the .docx file; images go into its directory.
    :param doc: opened document providing ``inline_shapes`` and
        ``part.related_parts``.
    :return: absolute paths of the written images, in inline-shape order.
    """
    out_dir = os.path.dirname(word_path)
    related_parts = doc.part.related_parts
    img_path = []
    for i, shape in enumerate(doc.inline_shapes):
        pic = shape._inline.graphic.graphicData.pic
        if pic is None:
            # Inline shape without a picture element: nothing to save.
            continue
        try:
            part = related_parts[pic.blipFill.blip.embed]
        except KeyError:
            # No embedded part behind this shape (e.g. missing/absent rId).
            continue
        if not part.content_type.startswith('image'):
            continue
        img_name = os.path.join(out_dir, '%d.png' % i)
        with open(img_name, 'wb') as fp:
            fp.write(part._blob)
        img_path.append(os.path.abspath(img_name))
    return img_path


def table_add_html(block):
    """Render a table as a minimal HTML string.

    The result has the form ``<table><tr><td>..</td>..</tr>..</table>`` with
    one ``<tr>`` per row and one ``<td>`` per cell, using the cell texts
    returned by ``read_table``. Cell text is inserted verbatim; it is not
    HTML-escaped.

    :param block: the table to render (passed straight to ``read_table``).
    :return: the HTML string.
    """
    # Collect the pieces and join once instead of re-building the string
    # on every cell.
    pieces = ['<table>']
    for row in read_table(block):  # one <tr> per row, one <td> per cell
        pieces.append('<tr>')
        pieces.extend('<td>' + cell + '</td>' for cell in row)
        pieces.append('</tr>')
    pieces.append('</table>')
    return ''.join(pieces)


def read_docx(word_path):
    """Extract the content of a .docx file as one newline-joined string.

    The document body is walked in order, producing one segment per
    non-empty paragraph, one HTML string per table (``table_add_html``) and
    one placeholder ``'图像<n>'`` per image marker. The inline images are then
    saved with ``save_picture`` and handed to ``ocr_patch`` as
    ``(placeholder, path)`` pairs; every image placeholder present in the
    object returned by ``ocr_patch`` is replaced by the value stored under it
    (the exact contract of ``ocr_patch`` is defined elsewhere).

    :param word_path: path of the .docx file.
    :return: ``(text, code)``. On success ``code`` is ``error_number["成功"]``;
        on any exception the error is logged and
        ``('', error_number["解析错误"])`` is returned.
    """
    try:
        doc = docx.Document(word_path)
        # Each entry is (text, is_image_placeholder).
        segments = []
        image_count = 0
        for block in iter_block_items(doc):
            if isinstance(block, Paragraph):
                text = block.text
                if text in ('', '\n'):
                    continue
                segments.append((text, False))
            elif isinstance(block, Table):
                segments.append((table_add_html(block), False))
            elif block == '图像':
                # Only the explicit image marker counts as an image; any
                # other non-block value is ignored so that it cannot shift
                # the numbering of the real images.
                segments.append(('图像%d' % image_count, True))
                image_count += 1

        # Save the images to disk and get their paths back.
        picture_path = save_picture(word_path, doc=doc)
        logger.debug('picture_path:::' + str(picture_path))
        # Run OCR on the saved images; the results are keyed by the
        # placeholders 图像0, 图像1, 图像2, ... in saved-image order.
        image_list = [('图像%d' % i, pic_path)
                      for i, pic_path in enumerate(picture_path)]
        ret_dict = ocr_patch(image_list)
        # Replace only real image placeholders, so a paragraph whose own text
        # happens to equal a placeholder is left untouched.
        resolved = [
            ret_dict[text] if is_image and text in ret_dict else text
            for text, is_image in segments
        ]
        return '\n'.join(resolved), error_number["成功"]
    except Exception as e:
        # Top-level boundary: report a parse error to the caller, but keep
        # the traceback in the log for diagnosis.
        logger.exception(e)
        return '', error_number["解析错误"]