spacepaste

  1.  
  2. # -*- coding: UTF-8 -*-
  3. def recode_utf8(data):
  4. """
  5. Given a string which is either:
  6. * unicode
  7. * well-encoded utf8
  8. * well-encoded latin1
  9. * poorly-encoded utf8+latin1
  10. Return the equivalent utf8-encoded byte string.
  11. """
  12. if isinstance(data, unicode):
  13. # The input is already decoded. Just return the utf8.
  14. return data.encode('UTF-8')
  15. try:
  16. decoded = data.decode('UTF-8')
  17. except UnicodeDecodeError:
  18. # Indicates latin1 encoded bytes.
  19. decoded = data.decode('latin1')
  20. while True:
  21. # Check if the data is poorly-encoded as utf8+latin1
  22. try:
  23. encoded = decoded.encode('latin1')
  24. except UnicodeEncodeError:
  25. # Indicates non-latin1-encodable characters; it's not utf8+latin1.
  26. return decoded.encode('UTF-8')
  27. try:
  28. decoded = encoded.decode('UTF-8')
  29. except UnicodeDecodeError:
  30. # Can't decode the latin1 as utf8; it's not utf8+latin1.
  31. return decoded.encode('UTF-8')
  32. import unittest as T
  33. class TestRecodeUtf8(T.TestCase):
  34. latin1 = u'München' # encodable to latin1
  35. utf8 = u'Łódź' # not encodable to latin1
  36. def test_unicode(self):
  37. "An un-encoded unicode string should just become utf8-encoded"
  38. self.assertEqual(
  39. recode_utf8(self.utf8),
  40. self.utf8.encode('UTF-8'),
  41. )
  42. def test_utf8(self):
  43. "A utf8-encoded string should be unchanged"
  44. utf8 = self.utf8.encode('UTF-8')
  45. self.assertEqual(
  46. recode_utf8(utf8),
  47. utf8,
  48. )
  49. def test_latin1(self):
  50. "A latin1-encoded string should become utf8-encoded"
  51. self.assertEqual(
  52. recode_utf8(self.latin1.encode('latin1')),
  53. self.latin1.encode('UTF-8'),
  54. )
  55. def test_utf8_plus_latin1(self):
  56. "A poorly-encoded utf8+latin1 string should become utf8-encoded"
  57. utf8 = self.utf8.encode('UTF-8')
  58. poorly_encoded = utf8.decode('latin1').encode('UTF-8')
  59. self.assertEqual(
  60. recode_utf8(poorly_encoded),
  61. utf8,
  62. )
  63. def test_utf8_plus_latin1_several_times(self):
  64. "A string mangled by utf8+latin1 several times should become utf8-encoded"
  65. utf8 = self.utf8.encode('UTF-8')
  66. poorly_encoded = utf8
  67. for x in range(10):
  68. poorly_encoded = poorly_encoded.decode('latin1').encode('UTF-8')
  69. self.assertEqual(
  70. recode_utf8(poorly_encoded),
  71. utf8,
  72. )
  73. if __name__ == '__main__':
  74. T.main()
  75.