Use utf_8 'merge' encoding for all locales.

author Zac Medico <zmedico@gentoo.org>
Fri, 9 Sep 2011 20:47:30 +0000 (13:47 -0700)

committer Zac Medico <zmedico@gentoo.org>
Fri, 9 Sep 2011 20:47:30 +0000 (13:47 -0700)
diff --git a/pym/portage/__init__.py b/pym/portage/__init__.py

index 789d04307ae48ba693b5bfc40e119be963348f91..d3df6e3231061f2d6f12836f52b06be8e504978f 100644 (file)
--- a/pym/portage/__init__.py
+++ b/pym/portage/__init__.py
@@ -148,31 +148,35 @@ if sys.hexversion >= 0x3000000:
         basestring = str
         long = int
  
-# Assume utf_8 fs encoding everywhere except in merge code, where the
-# user's locale is respected.
+# We use utf_8 encoding everywhere. Previously, we used
+# sys.getfilesystemencoding() for the 'merge' encoding, but that had
+# various problems:
+#
+#   1) If the locale is ever changed then it can cause orphan files due
+#      to changed character set translation.
+#
+#   2) Ebuilds typically install files with utf_8 encoded file names,
+#      and then portage would be forced to rename those files to match
+#      sys.getfilesystemencoding(), possibly breaking things.
+#
+#   3) Automatic translation between encodings can lead to nonsensical
+#      file names when the source encoding is unknown by portage.
+#
+#   4) It's inconvenient for ebuilds to convert the encodings of file
+#      names to match the current locale, and upstreams typically encode
+#      file names with utf_8 encoding.
+#
+# So, instead of relying on sys.getfilesystemencoding(), we avoid the above
+# problems by using a constant utf_8 'merge' encoding for all locales, as
+# discussed in bug #382199 and bug #381509.
  _encodings = {
         'content'                : 'utf_8',
         'fs'                     : 'utf_8',
-       'merge'                  : sys.getfilesystemencoding(),
+       'merge'                  : 'utf_8',
         'repo.content'           : 'utf_8',
         'stdio'                  : 'utf_8',
  }
  
-# sys.getfilesystemencoding() can return None if python is built with
-# USE=build (stage 1). If the filesystem encoding is undefined or is a
-# subset of utf_8, then we default to utf_8 encoding for merges, since
-# it probably won't hurt, and forced conversion to ascii encoding is
-# known to break some packages that install file names with utf_8
-# encoding (see bug #381509). The ascii aliases are borrowed from
-# python's encodings.aliases.aliases dict.
-if _encodings['merge'] is None or \
-       _encodings['merge'].lower().replace('-', '_') in \
-       ('ascii', '646', 'ansi_x3.4_1968', 'ansi_x3_4_1968',
-       'ansi_x3.4_1986', 'cp367', 'csascii', 'ibm367', 'iso646_us',
-       'iso_646.irv_1991', 'iso_ir_6', 'us', 'us_ascii'):
-
-       _encodings['merge'] = 'utf_8'
-
  if sys.hexversion >= 0x3000000:
         def _unicode_encode(s, encoding=_encodings['content'], errors='backslashreplace'):
                 if isinstance(s, str):
author	Zac Medico <zmedico@gentoo.org>
	Fri, 9 Sep 2011 20:47:30 +0000 (13:47 -0700)
committer	Zac Medico <zmedico@gentoo.org>
	Fri, 9 Sep 2011 20:47:30 +0000 (13:47 -0700)