From 1007e8518133efcab25d7ad713262a7660c5bf95 Mon Sep 17 00:00:00 2001 From: stevenknight Date: Mon, 2 Mar 2009 19:11:09 +0000 Subject: [PATCH] Fix handling of both UTF_16_LE and UTF_16_BE. Add an actual test for scanning Unicode files for implicit dependencies. Clean up how we handle decoding, and wrap it for earlier Python versions. git-svn-id: http://scons.tigris.org/svn/scons/trunk@4061 fdb21ef1-2011-0410-befe-b5e4ea1792b1 --- src/CHANGES.txt | 2 + src/engine/SCons/Node/FS.py | 65 +++++++- test/Scanner/unicode.py | 319 ++++++++++++++++++++++++++++++++++++ 3 files changed, 379 insertions(+), 7 deletions(-) create mode 100644 test/Scanner/unicode.py diff --git a/src/CHANGES.txt b/src/CHANGES.txt index 0defff82..58972402 100644 --- a/src/CHANGES.txt +++ b/src/CHANGES.txt @@ -15,6 +15,8 @@ RELEASE X.X.X - XXX - Fix the -n option when used with VariantDir(duplicate=1) and the variant directory doesn't already exist. + - Fix scanning of Unicode files for both UTF-16 endian flavors. + RELEASE 1.2.0.d20090223 - Mon, 23 Feb 2009 08:41:06 -0800 diff --git a/src/engine/SCons/Node/FS.py b/src/engine/SCons/Node/FS.py index a4036ab3..bd8314b6 100644 --- a/src/engine/SCons/Node/FS.py +++ b/src/engine/SCons/Node/FS.py @@ -58,12 +58,46 @@ else: except AttributeError: codecs.BOM_UTF8 = '\xef\xbb\xbf' try: - codecs.BOM_UTF16 + codecs.BOM_UTF16_LE + codecs.BOM_UTF16_BE except AttributeError: - if sys.byteorder == 'little': - codecs.BOM_UTF16 = '\xff\xfe' + codecs.BOM_UTF16_LE = '\xff\xfe' + codecs.BOM_UTF16_BE = '\xfe\xff' + + # Provide a wrapper function to handle decoding differences in + # different versions of Python. Normally, we'd try to do this in the + # compat layer (and maybe it still makes sense to move there?) but + # that doesn't provide a way to supply the string class used in + # pre-2.3 Python versions with a .decode() method that all strings + # naturally have. Plus, the 2.[01] encodings behave differently + # enough that we have to settle for a lowest-common-denominator + # wrapper approach. + # + # Note that the 2.[012] implementations below may be inefficient + # because they perform an explicit look up of the encoding for every + # decode, but they're old enough (and we want to stop supporting + # them soon enough) that it's not worth complicating the interface. + # Think of it as additional incentive for people to upgrade... + try: + ''.decode + except AttributeError: + # 2.0 through 2.2: strings have no .decode() method + try: + codecs.lookup('ascii').decode + except AttributeError: + # 2.0 and 2.1: encodings are a tuple of functions, and the + # decode() function returns a (result, length) tuple. + def my_decode(contents, encoding): + return codecs.lookup(encoding)[1](contents)[0] else: - codecs.BOM_UTF16 = '\xfe\xff' + # 2.2: encodings are an object with methods, and the + # .decode() method returns just the decoded bytes. + def my_decode(contents, encoding): + return codecs.lookup(encoding).decode(contents) + else: + # 2.3 or later: use the .decode() string method + def my_decode(contents, encoding): + return contents.decode(encoding) import SCons.Action from SCons.Debug import logInstanceCreation @@ -2309,10 +2343,27 @@ class File(Base): # it's a valid python string. def get_text_contents(self): contents = self.get_contents() + # The behavior of various decode() methods and functions + # w.r.t. the initial BOM bytes is different for different + # encodings and/or Python versions. 
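+        # An illustrative sketch (exact results vary by codec and
+        # Python version; treat these lines as assumptions, not a spec):
+        #   >>> (codecs.BOM_UTF8 + 'x').decode('utf-8')
+        #   u'\ufeffx'    # utf-8 leaves the BOM in the result
+        #   >>> (codecs.BOM_UTF16_LE + 'x\x00').decode('utf-16')
+        #   u'x'          # utf-16 consumes the leading BOM itself
+        # In other words: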
('utf-8' does not strip + # them, but has a 'utf-8-sig' which does; 'utf-16' seems to + # strip them; etc.) Just side step all the complication by + # explicitly stripping the BOM before we decode(). if contents.startswith(codecs.BOM_UTF8): - contents = contents.decode('utf-8') - elif contents.startswith(codecs.BOM_UTF16): - contents = contents.decode('utf-16') + contents = contents[len(codecs.BOM_UTF8):] + # TODO(2.2): Remove when 2.3 becomes floor. + #contents = contents.decode('utf-8') + contents = my_decode(contents, 'utf-8') + elif contents.startswith(codecs.BOM_UTF16_LE): + contents = contents[len(codecs.BOM_UTF16_LE):] + # TODO(2.2): Remove when 2.3 becomes floor. + #contents = contents.decode('utf-16-le') + contents = my_decode(contents, 'utf-16-le') + elif contents.startswith(codecs.BOM_UTF16_BE): + contents = contents[len(codecs.BOM_UTF16_BE):] + # TODO(2.2): Remove when 2.3 becomes floor. + #contents = contents.decode('utf-16-be') + contents = my_decode(contents, 'utf-16-be') return contents def get_content_hash(self): diff --git a/test/Scanner/unicode.py b/test/Scanner/unicode.py new file mode 100644 index 00000000..b895caae --- /dev/null +++ b/test/Scanner/unicode.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python +# +# __COPYRIGHT__ +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# + +""" +Verify that we can scan Unicode-encoded files for implicit +dependencies. +""" + +__revision__ = "__FILE__ __REVISION__ __DATE__ __DEVELOPER__" + +import TestSCons + +_python_ = TestSCons._python_ + +test = TestSCons.TestSCons() + +try: + unicode +except NameError: + import sys + msg = "Unicode not supported by Python version %s; skipping test\n" + test.skip_test(msg % sys.version[:3]) + +import codecs + +test.write('build.py', r""" +import codecs +import sys + +# TODO(2.2): Remove when 2.3 becomes the minimal supported version. +try: + codecs.BOM_UTF8 +except AttributeError: + codecs.BOM_UTF8 = '\xef\xbb\xbf' +try: + codecs.BOM_UTF16_LE + codecs.BOM_UTF16_BE +except AttributeError: + codecs.BOM_UTF16_LE = '\xff\xfe' + codecs.BOM_UTF16_BE = '\xfe\xff' + +try: + ''.decode +except AttributeError: + # 2.0 through 2.2: strings have no .decode() method + try: + codecs.lookup('ascii').decode + except AttributeError: + # 2.0 and 2.1: encodings are a tuple of functions, and the + # decode() function returns a (result, length) tuple. 
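+        # (Illustrative sketch of why the indexing below works: under
+        # 2.0/2.1, codecs.lookup() returns a 4-tuple of
+        # (encoder, decoder, streamreader, streamwriter), so [1] picks
+        # the decoder function, e.g.:
+        #     decoder = codecs.lookup('utf-8')[1]
+        #     decoder('abc')    # roughly -> (u'abc', 3)
+        # hence the trailing [0] to keep just the decoded string.)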
+ def my_decode(contents, encoding): + return codecs.lookup(encoding)[1](contents)[0] + else: + # 2.2: encodings are an object with methods, and the .decode() + # and .decode() returns just the decoded bytes. + def my_decode(contents, encoding): + return codecs.lookup(encoding).decode(contents) +else: + # 2.3 or later: use the .decode() string method + def my_decode(contents, encoding): + return contents.decode(encoding) + +def process(outfp, infile): + contents = open(infile, 'rb').read() + if contents.startswith(codecs.BOM_UTF8): + contents = contents[len(codecs.BOM_UTF8):] + # TODO(2.2): Remove when 2.3 becomes the minimal supported version. + #contents = contents.decode('utf-8') + contents = my_decode(contents, 'utf-8') + elif contents.startswith(codecs.BOM_UTF16_LE): + contents = contents[len(codecs.BOM_UTF16_LE):] + # TODO(2.2): Remove when 2.3 becomes the minimal supported version. + #contents = contents.decode('utf-16-le') + contents = my_decode(contents, 'utf-16-le') + elif contents.startswith(codecs.BOM_UTF16_BE): + contents = contents[len(codecs.BOM_UTF16_BE):] + # TODO(2.2): Remove when 2.3 becomes the minimal supported version. + #contents = contents.decode('utf-16-be') + contents = my_decode(contents, 'utf-16-be') + for line in contents.split('\n')[:-1]: + if line[:8] == 'include ': + process(outfp, line[8:]) + elif line[:8] == 'getfile ': + outfp.write('include ' + line[8:] + '\n') + # note: converted, but not acted upon + else: + outfp.write(line + '\n') + +output = open(sys.argv[2], 'wb') +process(output, sys.argv[1]) + +sys.exit(0) +""") + +test.write('SConstruct', """ +import re + +include_re = re.compile(r'^include\s+(\S+)$', re.M) + +def kfile_scan(node, env, scanpaths, arg): + contents = node.get_text_contents() + includes = include_re.findall(contents) + return includes + +kscan = Scanner(name = 'kfile', + function = kfile_scan, + argument = None, + skeys = ['.k'], + recursive = True) + +env = Environment() +env.Append(SCANNERS = kscan) + +env.Command('foo', 'foo.k', r'%(_python_)s build.py $SOURCES $TARGET') +""" % locals()) + +test.write('foo.k', """\ +foo.k 1 line 1 +include ascii.k +include utf8.k +include utf16le.k +include utf16be.k +foo.k 1 line 4 +""") + +contents = unicode("""\ +ascii.k 1 line 1 +include ascii.inc +ascii.k 1 line 3 +""") +test.write('ascii.k', contents.encode('ascii')) + +contents = unicode("""\ +utf8.k 1 line 1 +include utf8.inc +utf8.k 1 line 3 +""") +test.write('utf8.k', codecs.BOM_UTF8 + contents.encode('utf-8')) + +contents = unicode("""\ +utf16le.k 1 line 1 +include utf16le.inc +utf16le.k 1 line 3 +""") +test.write('utf16le.k', codecs.BOM_UTF16_LE + contents.encode('utf-16-le')) + +contents = unicode("""\ +utf16be.k 1 line 1 +include utf16be.inc +utf16be.k 1 line 3 +""") +test.write('utf16be.k', codecs.BOM_UTF16_BE + contents.encode('utf-16-be')) + +test.write('ascii.inc', "ascii.inc 1\n") +test.write('utf8.inc', "utf8.inc 1\n") +test.write('utf16le.inc', "utf16le.inc 1\n") +test.write('utf16be.inc', "utf16be.inc 1\n") + +test.run(arguments='foo') + +expect = """\ +foo.k 1 line 1 +ascii.k 1 line 1 +ascii.inc 1 +ascii.k 1 line 3 +utf8.k 1 line 1 +utf8.inc 1 +utf8.k 1 line 3 +utf16le.k 1 line 1 +utf16le.inc 1 +utf16le.k 1 line 3 +utf16be.k 1 line 1 +utf16be.inc 1 +utf16be.k 1 line 3 +foo.k 1 line 4 +""" + +test.must_match('foo', expect) + +test.up_to_date(arguments='foo') + + + +test.write('ascii.inc', "ascii.inc 2\n") + +test.not_up_to_date(arguments = 'foo') + +expect = """\ +foo.k 1 line 1 +ascii.k 1 line 1 +ascii.inc 2 +ascii.k 1 
line 3 +utf8.k 1 line 1 +utf8.inc 1 +utf8.k 1 line 3 +utf16le.k 1 line 1 +utf16le.inc 1 +utf16le.k 1 line 3 +utf16be.k 1 line 1 +utf16be.inc 1 +utf16be.k 1 line 3 +foo.k 1 line 4 +""" + +test.must_match('foo', expect) + +test.up_to_date(arguments = 'foo') + + + +test.write('utf8.inc', "utf8.inc 2\n") + +test.not_up_to_date(arguments = 'foo') + +expect = """\ +foo.k 1 line 1 +ascii.k 1 line 1 +ascii.inc 2 +ascii.k 1 line 3 +utf8.k 1 line 1 +utf8.inc 2 +utf8.k 1 line 3 +utf16le.k 1 line 1 +utf16le.inc 1 +utf16le.k 1 line 3 +utf16be.k 1 line 1 +utf16be.inc 1 +utf16be.k 1 line 3 +foo.k 1 line 4 +""" + +test.must_match('foo', expect) + +test.up_to_date(arguments = 'foo') + + + +test.write('utf16le.inc', "utf16le.inc 2\n") + +test.not_up_to_date(arguments = 'foo') + +expect = """\ +foo.k 1 line 1 +ascii.k 1 line 1 +ascii.inc 2 +ascii.k 1 line 3 +utf8.k 1 line 1 +utf8.inc 2 +utf8.k 1 line 3 +utf16le.k 1 line 1 +utf16le.inc 2 +utf16le.k 1 line 3 +utf16be.k 1 line 1 +utf16be.inc 1 +utf16be.k 1 line 3 +foo.k 1 line 4 +""" + +test.must_match('foo', expect) + +test.up_to_date(arguments = 'foo') + + + +test.write('utf16be.inc', "utf16be.inc 2\n") + +test.not_up_to_date(arguments = 'foo') + +expect = """\ +foo.k 1 line 1 +ascii.k 1 line 1 +ascii.inc 2 +ascii.k 1 line 3 +utf8.k 1 line 1 +utf8.inc 2 +utf8.k 1 line 3 +utf16le.k 1 line 1 +utf16le.inc 2 +utf16le.k 1 line 3 +utf16be.k 1 line 1 +utf16be.inc 2 +utf16be.k 1 line 3 +foo.k 1 line 4 +""" + +test.must_match('foo', expect) + +test.up_to_date(arguments = 'foo') + + + +test.pass_test() + +# Local Variables: +# tab-width:4 +# indent-tabs-mode:nil +# End: +# vim: set expandtab tabstop=4 shiftwidth=4: -- 2.26.2