From 1007e8518133efcab25d7ad713262a7660c5bf95 Mon Sep 17 00:00:00 2001 From: stevenknight Date: Mon, 2 Mar 2009 19:11:09 +0000 Subject: [PATCH] Fix handling of both UTF_16_LE and UTF_16_BE. Add an actual test for scanning Unicode files for implicit dependencies. Clean up how we handle decoding, and wrap it for earlier Python versions. git-svn-id: http://scons.tigris.org/svn/scons/trunk@4061 fdb21ef1-2011-0410-befe-b5e4ea1792b1 --- src/CHANGES.txt | 2 + src/engine/SCons/Node/FS.py | 65 +++++++- test/Scanner/unicode.py | 319 ++++++++++++++++++++++++++++++++++++ 3 files changed, 379 insertions(+), 7 deletions(-) create mode 100644 test/Scanner/unicode.py diff --git a/src/CHANGES.txt b/src/CHANGES.txt index 0defff82..58972402 100644 --- a/src/CHANGES.txt +++ b/src/CHANGES.txt @@ -15,6 +15,8 @@ RELEASE X.X.X - XXX - Fix the -n option when used with VariantDir(duplicate=1) and the variant directory doesn't already exist. + - Fix scanning of Unicode files for both UTF-16 endian flavors. + RELEASE 1.2.0.d20090223 - Mon, 23 Feb 2009 08:41:06 -0800 diff --git a/src/engine/SCons/Node/FS.py b/src/engine/SCons/Node/FS.py index a4036ab3..bd8314b6 100644 --- a/src/engine/SCons/Node/FS.py +++ b/src/engine/SCons/Node/FS.py @@ -58,12 +58,46 @@ else: except AttributeError: codecs.BOM_UTF8 = '\xef\xbb\xbf' try: - codecs.BOM_UTF16 + codecs.BOM_UTF16_LE + codecs.BOM_UTF16_BE except AttributeError: - if sys.byteorder == 'little': - codecs.BOM_UTF16 = '\xff\xfe' + codecs.BOM_UTF16_LE = '\xff\xfe' + codecs.BOM_UTF16_BE = '\xfe\xff' + + # Provide a wrapper function to handle decoding differences in + # different versions of Python. Normally, we'd try to do this in the + # compat layer (and maybe it still makes sense to move there?) but + # that doesn't provide a way to supply the string class used in + # pre-2.3 Python versions with a .decode() method that all strings + # naturally have. Plus, the 2.[01] encodings behave differently + # enough that we have to settle for a lowest-common-denominator + # wrapper approach. + # + # Note that the 2.[012] implementations below may be inefficient + # because they perform an explicit look up of the encoding for every + # decode, but they're old enough (and we want to stop supporting + # them soon enough) that it's not worth complicating the interface. + # Think of it as additional incentive for people to upgrade... + try: + ''.decode + except AttributeError: + # 2.0 through 2.2: strings have no .decode() method + try: + codecs.lookup('ascii').decode + except AttributeError: + # 2.0 and 2.1: encodings are a tuple of functions, and the + # decode() function returns a (result, length) tuple. + def my_decode(contents, encoding): + return codecs.lookup(encoding)[1](contents)[0] else: - codecs.BOM_UTF16 = '\xfe\xff' + # 2.2: encodings are an object with methods, and the + # .decode() method returns just the decoded bytes. + def my_decode(contents, encoding): + return codecs.lookup(encoding).decode(contents) + else: + # 2.3 or later: use the .decode() string method + def my_decode(contents, encoding): + return contents.decode(encoding) import SCons.Action from SCons.Debug import logInstanceCreation @@ -2309,10 +2343,27 @@ class File(Base): # it's a valid python string. def get_text_contents(self): contents = self.get_contents() + # The behavior of various decode() methods and functions + # w.r.t. the initial BOM bytes is different for different + # encodings and/or Python versions. 
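+        # An illustrative sketch (exact results vary by codec and
+        # Python version; treat these lines as assumptions, not a spec):
+        #   >>> (codecs.BOM_UTF8 + 'x').decode('utf-8')
+        #   u'\ufeffx'    # utf-8 leaves the BOM in the result
+        #   >>> (codecs.BOM_UTF16_LE + 'x\x00').decode('utf-16')
+        #   u'x'          # utf-16 consumes the leading BOM itself
+        # In other words: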
('utf-8' does not strip + # them, but has a 'utf-8-sig' which does; 'utf-16' seems to + # strip them; etc.) Just side step all the complication by + # explicitly stripping the BOM before we decode(). if contents.startswith(codecs.BOM_UTF8): - contents = contents.decode('utf-8') - elif contents.startswith(codecs.BOM_UTF16): - contents = contents.decode('utf-16') + contents = contents[len(codecs.BOM_UTF8):] + # TODO(2.2): Remove when 2.3 becomes floor. + #contents = contents.decode('utf-8') + contents = my_decode(contents, 'utf-8') + elif contents.startswith(codecs.BOM_UTF16_LE): + contents = contents[len(codecs.BOM_UTF16_LE):] + # TODO(2.2): Remove when 2.3 becomes floor. + #contents = contents.decode('utf-16-le') + contents = my_decode(contents, 'utf-16-le') + elif contents.startswith(codecs.BOM_UTF16_BE): + contents = contents[len(codecs.BOM_UTF16_BE):] + # TODO(2.2): Remove when 2.3 becomes floor. + #contents = contents.decode('utf-16-be') + contents = my_decode(contents, 'utf-16-be') return contents def get_content_hash(self): diff --git a/test/Scanner/unicode.py b/test/Scanner/unicode.py new file mode 100644 index 00000000..b895caae --- /dev/null +++ b/test/Scanner/unicode.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python +# +# __COPYRIGHT__ +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# + +""" +Verify that we can scan Unicode-encoded files for implicit +dependencies. +""" + +__revision__ = "__FILE__ __REVISION__ __DATE__ __DEVELOPER__" + +import TestSCons + +_python_ = TestSCons._python_ + +test = TestSCons.TestSCons() + +try: + unicode +except NameError: + import sys + msg = "Unicode not supported by Python version %s; skipping test\n" + test.skip_test(msg % sys.version[:3]) + +import codecs + +test.write('build.py', r""" +import codecs +import sys + +# TODO(2.2): Remove when 2.3 becomes the minimal supported version. +try: + codecs.BOM_UTF8 +except AttributeError: + codecs.BOM_UTF8 = '\xef\xbb\xbf' +try: + codecs.BOM_UTF16_LE + codecs.BOM_UTF16_BE +except AttributeError: + codecs.BOM_UTF16_LE = '\xff\xfe' + codecs.BOM_UTF16_BE = '\xfe\xff' + +try: + ''.decode +except AttributeError: + # 2.0 through 2.2: strings have no .decode() method + try: + codecs.lookup('ascii').decode + except AttributeError: + # 2.0 and 2.1: encodings are a tuple of functions, and the + # decode() function returns a (result, length) tuple. 
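+        # (Illustrative sketch of why the indexing below works: under
+        # 2.0/2.1, codecs.lookup() returns a 4-tuple of
+        # (encoder, decoder, streamreader, streamwriter), so [1] picks
+        # the decoder function, e.g.:
+        #     decoder = codecs.lookup('utf-8')[1]
+        #     decoder('abc')    # roughly -> (u'abc', 3)
+        # hence the trailing [0] to keep just the decoded string.)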
+ def my_decode(contents, encoding): + return codecs.lookup(encoding)[1](contents)[0] + else: + # 2.2: encodings are an object with methods, and the .decode() + # and .decode() returns just the decoded bytes. + def my_decode(contents, encoding): + return codecs.lookup(encoding).decode(contents) +else: + # 2.3 or later: use the .decode() string method + def my_decode(contents, encoding): + return contents.decode(encoding) + +def process(outfp, infile): + contents = open(infile, 'rb').read() + if contents.startswith(codecs.BOM_UTF8): + contents = contents[len(codecs.BOM_UTF8):] + # TODO(2.2): Remove when 2.3 becomes the minimal supported version. + #contents = contents.decode('utf-8') + contents = my_decode(contents, 'utf-8') + elif contents.startswith(codecs.BOM_UTF16_LE): + contents = contents[len(codecs.BOM_UTF16_LE):] + # TODO(2.2): Remove when 2.3 becomes the minimal supported version. + #contents = contents.decode('utf-16-le') + contents = my_decode(contents, 'utf-16-le') + elif contents.startswith(codecs.BOM_UTF16_BE): + contents = contents[len(codecs.BOM_UTF16_BE):] + # TODO(2.2): Remove when 2.3 becomes the minimal supported version. + #contents = contents.decode('utf-16-be') + contents = my_decode(contents, 'utf-16-be') + for line in contents.split('\n')[:-1]: + if line[:8] == 'include ': + process(outfp, line[8:]) + elif line[:8] == 'getfile ': + outfp.write('include ' + line[8:] + '\n') + # note: converted, but not acted upon + else: + outfp.write(line + '\n') + +output = open(sys.argv[2], 'wb') +process(output, sys.argv[1]) + +sys.exit(0) +""") + +test.write('SConstruct', """ +import re + +include_re = re.compile(r'^include\s+(\S+)$', re.M) + +def kfile_scan(node, env, scanpaths, arg): + contents = node.get_text_contents() + includes = include_re.findall(contents) + return includes + +kscan = Scanner(name = 'kfile', + function = kfile_scan, + argument = None, + skeys = ['.k'], + recursive = True) + +env = Environment() +env.Append(SCANNERS = kscan) + +env.Command('foo', 'foo.k', r'%(_python_)s build.py $SOURCES $TARGET') +""" % locals()) + +test.write('foo.k', """\ +foo.k 1 line 1 +include ascii.k +include utf8.k +include utf16le.k +include utf16be.k +foo.k 1 line 4 +""") + +contents = unicode("""\ +ascii.k 1 line 1 +include ascii.inc +ascii.k 1 line 3 +""") +test.write('ascii.k', contents.encode('ascii')) + +contents = unicode("""\ +utf8.k 1 line 1 +include utf8.inc +utf8.k 1 line 3 +""") +test.write('utf8.k', codecs.BOM_UTF8 + contents.encode('utf-8')) + +contents = unicode("""\ +utf16le.k 1 line 1 +include utf16le.inc +utf16le.k 1 line 3 +""") +test.write('utf16le.k', codecs.BOM_UTF16_LE + contents.encode('utf-16-le')) + +contents = unicode("""\ +utf16be.k 1 line 1 +include utf16be.inc +utf16be.k 1 line 3 +""") +test.write('utf16be.k', codecs.BOM_UTF16_BE + contents.encode('utf-16-be')) + +test.write('ascii.inc', "ascii.inc 1\n") +test.write('utf8.inc', "utf8.inc 1\n") +test.write('utf16le.inc', "utf16le.inc 1\n") +test.write('utf16be.inc', "utf16be.inc 1\n") + +test.run(arguments='foo') + +expect = """\ +foo.k 1 line 1 +ascii.k 1 line 1 +ascii.inc 1 +ascii.k 1 line 3 +utf8.k 1 line 1 +utf8.inc 1 +utf8.k 1 line 3 +utf16le.k 1 line 1 +utf16le.inc 1 +utf16le.k 1 line 3 +utf16be.k 1 line 1 +utf16be.inc 1 +utf16be.k 1 line 3 +foo.k 1 line 4 +""" + +test.must_match('foo', expect) + +test.up_to_date(arguments='foo') + + + +test.write('ascii.inc', "ascii.inc 2\n") + +test.not_up_to_date(arguments = 'foo') + +expect = """\ +foo.k 1 line 1 +ascii.k 1 line 1 +ascii.inc 2 +ascii.k 1 
line 3 +utf8.k 1 line 1 +utf8.inc 1 +utf8.k 1 line 3 +utf16le.k 1 line 1 +utf16le.inc 1 +utf16le.k 1 line 3 +utf16be.k 1 line 1 +utf16be.inc 1 +utf16be.k 1 line 3 +foo.k 1 line 4 +""" + +test.must_match('foo', expect) + +test.up_to_date(arguments = 'foo') + + + +test.write('utf8.inc', "utf8.inc 2\n") + +test.not_up_to_date(arguments = 'foo') + +expect = """\ +foo.k 1 line 1 +ascii.k 1 line 1 +ascii.inc 2 +ascii.k 1 line 3 +utf8.k 1 line 1 +utf8.inc 2 +utf8.k 1 line 3 +utf16le.k 1 line 1 +utf16le.inc 1 +utf16le.k 1 line 3 +utf16be.k 1 line 1 +utf16be.inc 1 +utf16be.k 1 line 3 +foo.k 1 line 4 +""" + +test.must_match('foo', expect) + +test.up_to_date(arguments = 'foo') + + + +test.write('utf16le.inc', "utf16le.inc 2\n") + +test.not_up_to_date(arguments = 'foo') + +expect = """\ +foo.k 1 line 1 +ascii.k 1 line 1 +ascii.inc 2 +ascii.k 1 line 3 +utf8.k 1 line 1 +utf8.inc 2 +utf8.k 1 line 3 +utf16le.k 1 line 1 +utf16le.inc 2 +utf16le.k 1 line 3 +utf16be.k 1 line 1 +utf16be.inc 1 +utf16be.k 1 line 3 +foo.k 1 line 4 +""" + +test.must_match('foo', expect) + +test.up_to_date(arguments = 'foo') + + + +test.write('utf16be.inc', "utf16be.inc 2\n") + +test.not_up_to_date(arguments = 'foo') + +expect = """\ +foo.k 1 line 1 +ascii.k 1 line 1 +ascii.inc 2 +ascii.k 1 line 3 +utf8.k 1 line 1 +utf8.inc 2 +utf8.k 1 line 3 +utf16le.k 1 line 1 +utf16le.inc 2 +utf16le.k 1 line 3 +utf16be.k 1 line 1 +utf16be.inc 2 +utf16be.k 1 line 3 +foo.k 1 line 4 +""" + +test.must_match('foo', expect) + +test.up_to_date(arguments = 'foo') + + + +test.pass_test() + +# Local Variables: +# tab-width:4 +# indent-tabs-mode:nil +# End: +# vim: set expandtab tabstop=4 shiftwidth=4: -- 2.26.2