jy_ocr.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. # -*- coding: utf-8 -*-
  2. import base64
  3. import io
  4. import pathlib
  5. import time
  6. import requests
  7. from a2s.a2s_client import a2s_execute
  8. from a2s.tools import grpc_deserialize
  9. from a2s.tools import json_serialize, json_deserialize
  10. from ddddocr import DdddOcr
  11. import setting
  12. from proto import ocr_pb2
  13. from utils import (
  14. is_specific_number_chinese_character,
  15. is_en,
  16. is_zero_or_o,
  17. is_digit
  18. )
  19. def dg_ocr_client(image, retry=5, a2s_ip=None, topic=None, timeout=None):
  20. """
  21. 读光ocr
  22. :param bytes image: 图片二进制
  23. :param int retry: 重试次数
  24. :param str a2s_ip: 服务host
  25. :param str topic: 调用功能的主题名称
  26. :param int timeout: 超时时间
  27. :return:
  28. """
  29. if a2s_ip is None:
  30. a2s_ip = setting.DG_OCR_A2S_ADDRESS
  31. if topic is None:
  32. topic = setting.DG_OCR_TOPIC
  33. if timeout is None:
  34. timeout = setting.DG_OCR_TIMEOUT
  35. image_str = base64.b64encode(image).decode('utf-8') # 将bytes转换为base64字符串
  36. data = json_serialize({'image': image_str})
  37. for _ in range(retry):
  38. resp = a2s_execute(a2s_ip, topic, bytes_data=data, timeout=timeout)
  39. if resp is None:
  40. # 超时,链接异常断开重试
  41. time.sleep(5)
  42. continue
  43. # 将Base64编码的字符串解码为二进制数据
  44. resp_json = json_deserialize(resp)
  45. state = resp_json.get("state", 0)
  46. if state != 200:
  47. # 解析异常失败,重试or结束
  48. return None
  49. return resp_json.get("output", [])
  50. def jy_ocr_client(data, retry=5, a2s_ip=None, topic=None, timeout=None, missing_ok=False):
  51. """
  52. jy-ocr
  53. @param bytes data:
  54. @param int retry:
  55. @param str a2s_ip:
  56. @param str topic:
  57. @param int timeout:
  58. @param bool missing_ok:
  59. @return:
  60. """
  61. if a2s_ip is None:
  62. a2s_ip = setting.JY_OCR_A2S_ADDRESS
  63. if topic is None:
  64. topic = setting.JY_OCR_TOPIC
  65. if timeout is None:
  66. timeout = setting.JY_OCR_A2S_TIMEOUT
  67. for r in range(retry):
  68. try:
  69. rsp = ocr_pb2.OcrRequest(image=data)
  70. rsp = rsp.SerializeToString()
  71. resp_data = a2s_execute(a2s_ip, topic, timeout, rsp)
  72. if resp_data is None:
  73. continue
  74. resp = ocr_pb2.OcrResponse()
  75. resp = grpc_deserialize(resp, resp_data)
  76. result = str(resp.message).strip()
  77. return result
  78. except Exception as e:
  79. if not missing_ok:
  80. raise e
  81. def jy_api_ocr_client(data, retry=5, timeout=None, missing_ok=False):
  82. if not isinstance(data, (str, bytes)):
  83. raise ValueError('未知数据类型')
  84. if isinstance(data, bytes):
  85. data = io.BytesIO(data)
  86. else:
  87. assert isinstance(data, str)
  88. with open(data, 'rb') as f:
  89. data = f.read()
  90. url = f'http://{setting.JY_API_OCR_ADDRESS}/api/tr-run/'
  91. if timeout is None:
  92. timeout = setting.JY_API_OCR_TIMEOUT
  93. files = {'file': data}
  94. for _ in range(retry):
  95. request_param = dict(files=files, data={'compress': 0})
  96. try:
  97. response = requests.post(url, timeout=timeout, **request_param)
  98. if response.status_code != 200:
  99. continue
  100. raw_out = response.json()['data']['raw_out']
  101. data = '' if not raw_out else raw_out[0][1]
  102. return [data]
  103. except requests.exceptions.RequestException as e:
  104. if not missing_ok:
  105. raise e
  106. def dg_ocr(image, default=''):
  107. return next(iter(dg_ocr_client(image=image)), default)
  108. def jy_ocr(image, default=''):
  109. return next(iter(jy_ocr_client(data=image)), default)
  110. def jy_api_ocr(image, default=''):
  111. return next(iter(jy_api_ocr_client(data=image)), default)
# Module-level ddddocr recognizer, created once at import time and used by
# mix_ocr_image_extract via ocr.classification().
# show_ad=False presumably suppresses the library's startup banner - confirm.
ocr = DdddOcr(show_ad=False)
def mix_ocr_image_extract(files):
    """
    Recognize the text of an image with the local ddddocr model first, then
    fall back to the remote OCR helpers for doubtful results.

    :param files: a ``pathlib.Path`` whose bytes are read, or any other object
        whose item ``[1]`` is the image (presumably a ``(name, bytes)`` pair;
        confirm against callers)
    :return: the recognized text
    """
    image = files.read_bytes() if isinstance(files, pathlib.Path) else files[1]
    file_content = ocr.classification(image)
    # Re-run with dg_ocr when the local result is empty, longer than one
    # character, or flagged by the is_specific_number_chinese_character /
    # is_en helpers.
    if (
        not file_content
        or len(file_content) > 1
        or is_specific_number_chinese_character(file_content)
        or is_en(file_content)
    ):
        file_content = dg_ocr(image)
    # NOTE(review): the indentation of this copy was lost; this check is
    # reconstructed at function level. Confirm it was not nested under the
    # dg_ocr fallback above.
    if is_zero_or_o(file_content):
        file_content = jy_ocr(image)
    return file_content
  132. def dg_ocr_image_extract(files):
  133. image = files.read_bytes() if isinstance(files, pathlib.Path) else files[1]
  134. file_content = dg_ocr(image)
  135. if (
  136. not file_content
  137. or len(file_content) > 1
  138. or is_zero_or_o(file_content)
  139. or is_en(file_content)
  140. or is_digit(file_content)
  141. ):
  142. file_content = jy_ocr(image)
  143. # file_content = jy_api_ocr(image)
  144. # _ = ' ' * (len(filename))
  145. # print(f'{_} ', 'jy-ocr => ', file_content)
  146. return file_content
  147. if __name__ == '__main__':
  148. image_path = "./cache/image/0x100c3.jpg"
  149. with open(image_path, "rb") as rp:
  150. texts = dg_ocr_client(rp.read())
  151. print(texts)