# 乱码识别.py -- garbled-text (mojibake) detection experiments.
import binascii

import chardet
  3. def check_is_encode_error(string):
  4. try:
  5. string.encode('gbk')
  6. except UnicodeEncodeError:
  7. return True
  8. return False
  9. sss= '銆愭潗鏂欒澶囥?戠鍥涘笀鍙厠杈炬媺甯?220kV鍙樼數绔欏強71鍥?220kV鍙樼數绔欎富鍙樺瀹规墿寤哄伐绋?锛堢患鍚堣嚜鍔ㄥ寲锛夋嫑鏍囧叕鍛?'
  10. # sss ='cdsc你家妇女的可能'
  11. is_code = check_is_encode_error(sss)
  12. print(is_code)
  13. print(sss.encode('utf-8'))
  14. # print (bytes(sss.encode('utf-8')).decode('Big5'))
  15. # hex = sss.encode('utf-8')
  16. # print(hex.decode("ASCII"))
  17. # print(binascii.unhexlify(hex.decode("ASCII")))
  18. # print (chardet.UniversalDetector.ESC_DETECTOR())
  19. '''EUC-JP, SHIFT_JIS, and ISO-2022-JP (Japanese 日文)
  20. EUC-KR and ISO-2022-KR (Korean 韩文)
  21. KOI8-R, MacCyrillic, IBM855, IBM866, ISO-8859-5, and windows-1251 (Russian 俄文)
  22. ISO-8859-2 and windows-1250 (Hungarian 匈牙利文)
  23. ISO-8859-5 and windows-1251 (Bulgarian 保加利亚文)
  24. ISO-8859-1 and windows-1252 (Western European languages 西欧文字)
  25. ISO-8859-7 and windows-1253 (Greek 希腊文)
  26. ISO-8859-8 and windows-1255 (Visual and Logical Hebrew 视觉顺序和逻辑顺序的希伯来文)
  27. ASCII'''
  28. chart = ["EUC-JP","ASCII","UTF-8","UTF-16","UTF-32","TIS-620","ISO-8859-8","ISO-8859-7","ISO-8859-1","ISO-8859-5","ISO-8859-2",
  29. "windows-1250","windows-1251","windows-1252","windows-1253","windows-1255","KOI8-R","MacCyrillic","IBM855","IBM866","EUC-KR",
  30. "ISO-2022-KR","SHIFT_JIS","ISO-2022-JP"]
  31. for char in chart:
  32. try:
  33. print(bytes(sss.encode('utf-8')).decode(char))
  34. except:
  35. pass