parse_file.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. #!/usr/bin/python3.6
  2. # -*- coding: utf-8 -*-
  3. # @Author : lijunliang
  4. # @Email : lijunliang@topnet.net.cn
  5. # @File : parse_file.py
  6. # @Software: PyCharm
  7. from util.convert2text import pdf_get_pages, word_get_text
  8. from util.file_operations import save_file
  9. from util.convert2pdf import lib2pdf, html2pdf
  10. from util.file_operations import file_copy
  11. from module.convertPdfServer import convert_pdf_start
  12. from module.convertTxtServer import convert_txt_start
  13. from loguru import logger
  14. import os
  15. # docType = ['doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'txt', 'html', 'htm']
  16. def parse_file_start(file_path: str) -> (str, int):
  17. """
  18. 解析文本
  19. :param file_path:
  20. :return: 文本,长度,pdf_path路径
  21. """
  22. # 获取后缀判断
  23. prefix_suffix = file_path.split(".")
  24. suffix = prefix_suffix[-1]
  25. suffix = suffix.lower()
  26. suffix = suffix.strip()
  27. prefix_suffix[-2] = prefix_suffix[-2] + "1"
  28. prefix_suffix[-1] = "pdf"
  29. pdf_path = ".".join(prefix_suffix)
  30. # 后缀为pdf
  31. print(f"--->{suffix}")
  32. if suffix == "pdf": # 解析pdf
  33. state = file_copy(file_path, pdf_path)
  34. if not state: # 拷贝失败
  35. return "", 0, pdf_path
  36. # 后缀为html
  37. elif suffix in ["html", "htm"]:
  38. state = html2pdf(file_path, pdf_path)
  39. if not state:
  40. return "", 0, pdf_path
  41. # 后缀为doc,docx,'xls', 'xlsx' 由于兼容问题调用windows服务速度慢
  42. elif suffix in ["doc", "docx"]:
  43. stat, request_ret = convert_pdf_start(file_path, suffix)
  44. if not stat:
  45. return "", 0, pdf_path
  46. state = save_file(request_ret, pdf_path)
  47. if not state: # 接收转存失败
  48. return "", 0, pdf_path
  49. # 后缀为'ppt', 'pptx'调用linux服务速度快
  50. elif suffix in ['ppt', 'pptx', "txt"]:
  51. pdf_path = lib2pdf(file_path) # libreoffice
  52. else:
  53. return "", 0, ""
  54. if not pdf_path:
  55. return "", 0, pdf_path
  56. # 解析pdf
  57. pages,textes = pdf_get_pages(pdf_path)
  58. if not pages:
  59. return "", 0, pdf_path
  60. stat, text = convert_txt_start(pdf_path)
  61. if not stat:
  62. text=textes
  63. new_text = text.replace("\n", " ").replace("\r", "")
  64. new_text=new_text.strip()
  65. return new_text, pages, pdf_path
  66. def get_property(file_path: str) -> (str, str, float):
  67. '''
  68. 获取文件属性
  69. :param file_path: 文件类型
  70. :return:文件类型,后缀,大小字节
  71. '''
  72. suffix = file_path.split(".")[-1]
  73. suffix = suffix.lower()
  74. doctype = suffix.strip() # 文件类型
  75. suffix = "." + doctype # 文件后缀
  76. fileSize = os.path.getsize(file_path) #
  77. return doctype, suffix, fileSize
  78. if __name__ == '__main__':
  79. import chardet
  80. ret = parse_file_start("../data/1234.pdf")
  81. # print(chardet.detect(ret[0])['encoding'])
  82. print(ret)