file_doc.py 818 B

1234567891011121314151617181920212223
  1. """
  2. .doc file parsing
  3. """
  4. import subprocess, os
  5. from loguru import logger
  6. from file_processing.file_docx import read_docx
  7. from docs.config import error_number
  8. def read_doc(file_path):
  9. out_dir = os.path.dirname(file_path)
  10. try:
  11. args = 'soffice --headless --convert-to docx %s --outdir %s' % (file_path, out_dir)
  12. output = subprocess.check_output(args, shell=True)
  13. logger.debug(str(output))
  14. except subprocess.CalledProcessError as e:
  15. logger.error('doc文件转换出错')
  16. logger.error(e)
  17. return '', error_number["解析错误"]
  18. file_name = os.path.basename(file_path).split('.')[0] + '.docx'
  19. logger.debug('doc文件转docx后文件路径>>>' + os.path.join(out_dir, file_name))
  20. docx_path = os.path.join(out_dir, file_name)
  21. return read_docx(docx_path)