-
- # -*- coding: UTF-8 -*-
def recode_utf8(data):
    """
    Return the utf8-encoded byte string equivalent to ``data``.

    ``data`` may be any one of:
      * a text (unicode) string
      * well-encoded utf8 bytes
      * well-encoded latin1 bytes
      * poorly-encoded utf8+latin1 bytes, i.e. utf8 bytes that were wrongly
        decoded as latin1 and then re-encoded as utf8, one or more times

    Text input is simply encoded. Byte input is decoded (utf8 first, latin1
    as the fallback) and then the utf8+latin1 mangling is undone repeatedly
    for as long as doing so succeeds and actually changes the text.

    Note that this is a heuristic: text made only of latin1-encodable
    characters whose latin1 bytes happen to form valid utf8 is
    indistinguishable from mangled text and will be "repaired" as well.
    """
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # check works on both without referring to the ``unicode`` name directly.
    if isinstance(data, type(u'')):
        # The input is already decoded. Just return the utf8.
        return data.encode('UTF-8')

    try:
        decoded = data.decode('UTF-8')
    except UnicodeDecodeError:
        # Not valid utf8, so treat the bytes as latin1.
        decoded = data.decode('latin1')

    while True:
        try:
            # Undo one round of "decoded as latin1, re-encoded as utf8".
            unmangled = decoded.encode('latin1').decode('UTF-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Either the text has non-latin1 characters or its latin1 bytes
            # are not valid utf8; in both cases it is not utf8+latin1.
            return decoded.encode('UTF-8')

        if unmangled == decoded:
            # Fixed point: the round trip changed nothing (the text is pure
            # ASCII or empty). Without this exit the loop would spin forever.
            return decoded.encode('UTF-8')

        decoded = unmangled
-
-
- import unittest as T
class TestRecodeUtf8(T.TestCase):
    """Check recode_utf8 against each kind of input its docstring lists."""

    latin1 = u'München'  # encodable to latin1
    utf8 = u'Łódź'  # not encodable to latin1

    def test_unicode(self):
        """An un-encoded unicode string should just become utf8-encoded"""
        text = self.utf8
        self.assertEqual(recode_utf8(text), text.encode('UTF-8'))

    def test_utf8(self):
        """A utf8-encoded string should be unchanged"""
        well_encoded = self.utf8.encode('UTF-8')
        self.assertEqual(recode_utf8(well_encoded), well_encoded)

    def test_latin1(self):
        """A latin1-encoded string should become utf8-encoded"""
        raw_latin1 = self.latin1.encode('latin1')
        self.assertEqual(
            recode_utf8(raw_latin1),
            self.latin1.encode('UTF-8'),
        )

    def test_utf8_plus_latin1(self):
        """A poorly-encoded utf8+latin1 string should become utf8-encoded"""
        expected = self.utf8.encode('UTF-8')
        # One round of mangling: utf8 bytes misread as latin1, then re-encoded.
        misread = expected.decode('latin1')
        mangled = misread.encode('UTF-8')
        self.assertEqual(recode_utf8(mangled), expected)

    def test_utf8_plus_latin1_several_times(self):
        """A string mangled by utf8+latin1 several times should become utf8-encoded"""
        expected = self.utf8.encode('UTF-8')
        mangled = expected
        rounds_left = 10
        while rounds_left:
            # Apply the same misread-and-re-encode step once more.
            mangled = mangled.decode('latin1').encode('UTF-8')
            rounds_left -= 1

        self.assertEqual(recode_utf8(mangled), expected)
-
-
-
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    T.main()
-